Publications
2024
@InProceedings{Naguib2024b,
  author    = {Naguib, Marco and Tannier, Xavier and Névéol, Aurélie},
  title     = {{Few-shot clinical entity recognition in English, French and Spanish: masked language models outperform generative model prompting}},
  booktitle = {Proceedings of the Findings of the Association for Computational Linguistics: EMNLP 2024},
  publisher = {Association for Computational Linguistics},
  address   = {Miami, Florida, USA},
  year      = {2024},
  month     = nov
}
Objective: The aim of the study is to review research efforts and technical approaches in prompt engineering for medical applications as well as provide an overview of opportunities and challenges for clinical practice.
Methods: Databases indexing the fields of medicine, computer science, and medical informatics were queried in order to identify relevant published papers. Since prompt engineering is an emerging field, preprint databases were also considered. Multiple data were extracted, such as the prompt paradigm, the involved LLMs, the languages of the study, the domain of the topic, the baselines, and several learning, design, and architecture strategies specific to prompt engineering. We include studies that apply prompt engineering–based methods to the medical domain, published between 2022 and 2024, and covering multiple prompt paradigms such as prompt learning (PL), prompt tuning (PT), and prompt design (PD).
Results: We included 114 recent prompt engineering studies. Among the 3 prompt paradigms, we have observed that PD is the most prevalent (78 papers). In 12 papers, PD, PL, and PT terms were used interchangeably. While ChatGPT is the most commonly used LLM, we have identified 7 studies using this LLM on a sensitive clinical data set. Chain-of-thought, present in 17 studies, emerges as the most frequent PD technique. While PL and PT papers typically provide a baseline for evaluating prompt-based approaches, 61% (48/78) of the PD studies do not report any nonprompt-related baseline. Finally, we individually examine each of the key prompt engineering–specific information reported across papers and find that many studies neglect to explicitly mention them, posing a challenge for advancing prompt engineering research.
Conclusions: In addition to reporting on trends and the scientific landscape of prompt engineering, we provide reporting guidelines for future studies to help advance research in the medical field. We also disclose tables and figures summarizing medical prompt engineering papers available and hope that future contributions will leverage these existing works to better advance the field.
@Article{Zaghir2024,
  author  = {Zaghir, Jamil and Naguib, Marco and Bjelogrlic, Mina and Névéol, Aurélie and Tannier, Xavier and Lovis, Christian},
  title   = {{Prompt Engineering Paradigms for Medical Applications: Scoping Review}},
  journal = {Journal of Medical Internet Research},
  year    = {2024},
  month   = sep,
  doi     = {10.2196/60501}
}
@InProceedings{ElGhosh2024,
  author    = {El Ghosh, Mirna and Kalokyri, Varvara and Sambres, Mélanie and Vaterkowski, Morgan and Duclos, Catherine and Tannier, Xavier and Tsiknakis, Manolis and Daniel, Christel and Dhombres, Ferdinand},
  title     = {{Towards Semantic Interoperability among Heterogeneous Cancer Image Data Models using a Layered Modular Hyperontology}},
  booktitle = {Proceedings of the 14th Formal Ontology in Information Systems Conference (FOIS 2024)},
  address   = {Enschede, Netherlands},
  year      = {2024},
  month     = jul
}
@InProceedings{Cohen2024,
  author    = {Cohen, Ariel and Lanson, Alexandrine and Kempf, Emmanuelle and Tannier, Xavier},
  title     = {{Leveraging Information Redundancy of Real-World Data through Distant Supervision}},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
  publisher = {ELRA and ICCL},
  address   = {Torino, Italia},
  pages     = {10352--10364},
  year      = {2024},
  month     = may
}
@InProceedings{Bannour2024,
  author    = {Bannour, Nesrine and Servan, Christophe and Névéol, Aurélie and Tannier, Xavier},
  title     = {{A Benchmark Evaluation of Clinical Named Entity Recognition in French}},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
  publisher = {ELRA and ICCL},
  address   = {Torino, Italia},
  pages     = {14--21},
  year      = {2024},
  month     = may
}
@InProceedings{Chossegros2024,
  author    = {Chossegros, Manon and Tannier, Xavier and Stockholm, Daniel},
  title     = {{Improving Interpretability of Leucocyte Classification with Multimodal Network}},
  booktitle = {Proceedings of Medical Informatics Europe 2024 (MIE -- published in Studies in Health Technology and Informatics, Volume 316: Digital Health and Informatics Innovations for Sustainable Health Care Systems)},
  address   = {Athens, Greece},
  pages     = {1098--1102},
  year      = {2024},
  month     = aug
}
@Article{Acherar2024,
  author  = {Acherar, Aniss and Tannier, Xavier and Tantaoui, Ilhame and Brossas, Jean-Yves and Thellier, Marc and Piarroux, Renaud},
  title   = {{Evaluating Plasmodium falciparum automatic detection and parasitemia estimation: A comparative study on thin blood smear images}},
  journal = {PLOS One},
  volume  = {19},
  number  = {6},
  year    = {2024},
  month   = jun,
  doi     = {10.1371/journal.pone.0304789}
}
Objective: The aim of our study is to determine whether the use of English tools to extract and normalize French medical concepts on translations provides comparable performance to that of French models trained on a set of annotated French clinical notes.
Methods: We compare two methods: one involving French-language models and one involving English-language models. For the native French method, the Named Entity Recognition (NER) and normalization steps are performed separately. For the translated English method, after the first translation step, we compare a two-step method and a terminology-oriented method that performs extraction and normalization at the same time. We used French, English and bilingual annotated datasets to evaluate all stages (NER, normalization and translation) of our algorithms.
Results: The native French method outperformed the translated English method, with an overall F1 score of 0.51 [0.47;0.55], compared with 0.39 [0.34;0.44] and 0.38 [0.36;0.40] for the two English methods tested.
Conclusions: Despite recent improvements in translation models, there is a significant difference in performance between the two approaches in favor of the native French method, which is more effective on French medical texts, even with few annotated documents.
@Article{Gerardin2024,
  author  = {Gérardin, Christel and Xiong, Yuhan and Wajsbürt, Perceval and Carrat, Fabrice and Tannier, Xavier},
  title   = {{Impact of translation on biomedical information extraction: an experiment on real-life clinical notes}},
  journal = {JMIR Medical Informatics},
  year    = {2024},
  month   = jan,
  doi     = {10.2196/49607}
}
Materials and Methods: The detection pipeline relied on both rule-based and machine learning algorithms for named entity recognition and entity qualification, respectively. We used a large language model pre-trained on millions of clinical notes along with annotated clinical notes in the context of 3 cohort studies related to oncology, cardiology, and rheumatology. The overall workflow was conceived to foster collaboration between studies while respecting the privacy constraints of the data warehouse. We estimated the added values of the advanced technologies and of the collaborative setting.
Results: The pipeline reached macro-averaged F1-score, positive predictive value, sensitivity, and specificity of 95.7 (95%CI 94.5-96.3), 95.4 (95%CI 94.0-96.3), 96.0 (95%CI 94.0-96.7), and 99.2 (95%CI 99.0-99.4), respectively. F1-scores were superior to those observed using alternative technologies or non-collaborative settings. The models were shared through a secured registry.
Conclusions: We demonstrated that a community of investigators working on a common clinical data warehouse could efficiently and securely collaborate to develop, validate and use sensitive artificial intelligence models. In particular, we provided an efficient and robust NLP pipeline that detects conditions mentioned in clinical notes.
@Article{PetitJean2024,
  author  = {Petit-Jean, Thomas and Gérardin, Christel and Berthelot, Emmanuelle and Chatellier, Gilles and Frank, Marie and Tannier, Xavier and Kempf, Emmanuelle and Bey, Romain},
  title   = {{Collaborative and privacy-enhancing workflows on a clinical data warehouse: an example developing natural language processing pipelines to detect medical conditions}},
  journal = {Journal of the American Medical Informatics Association},
  volume  = {31},
  number  = {6},
  year    = {2024},
  month   = apr,
  doi     = {10.1093/jamia/ocae069}
}
Methods: We annotated a corpus of clinical documents according to 12 types of identifying entities and built a hybrid system, merging the results of a deep learning model as well as manual rules.
Results and Discussion: Our results show an overall F1-score of 0.99. We discuss implementation choices and present experiments to better understand the effort involved in such a task, including dataset size, document types, language models, or rule addition. We share guidelines and code under a 3-Clause BSD license.
@Article{Tannier2024,
  author  = {Tannier, Xavier and Wajsbürt, Perceval and Calliger, Alice and Dura, Basile and Mouchet, Alexandre and Hilka, Martin and Bey, Romain},
  title   = {{Development and Validation of a Natural Language Processing Algorithm to Pseudonymize Documents in the Context of a Clinical Data Warehouse}},
  journal = {Methods of Information in Medicine},
  volume  = {63},
  number  = {01/02},
  year    = {2024},
  month   = mar,
  doi     = {10.1055/s-0044-1778693}
}
@Article{Mohammad2024b,
  author  = {Mohammad, Noshine and Naudon, Pauline and Kane Dia, Abdoulaye and Boëlle, Pierre-Yves and Konaté, Abdoulaye and Konaté, Lassana and Niang, El Hadji Amadou and Piarroux, Renaud and Tannier, Xavier and Nabet, Cécile},
  title   = {{Predicting the age of field Anopheles mosquitoes using mass spectrometry and deep learning}},
  journal = {Science Advances},
  volume  = {10},
  number  = {19},
  year    = {2024},
  month   = may,
  doi     = {10.1126/sciadv.adj6990}
}
@InProceedings{ElGhosh2024b,
  author    = {El Ghosh, Mirna and Kalokyri, Varvara and Sambrès, Mélanie and Vaterkowski, Morgan and Duclos, Catherine and Tannier, Xavier and Tsakou, Gianna and Tsiknakis, Manolis and Daniel, Christel and Dhombres, Ferdinand},
  title     = {{From Syntactic to Semantic Interoperability Using a Hyperontology in the Oncology Domain}},
  booktitle = {Proceedings of Medical Informatics Europe 2024 (MIE -- published in Studies in Health Technology and Informatics, Volume 316: Digital Health and Informatics Innovations for Sustainable Health Care Systems)},
  address   = {Athens, Greece},
  pages     = {1385--1389},
  year      = {2024},
  month     = aug
}
@Article{Bey2024,
  author  = {Romain Bey and Ariel Cohen and Vincent Trebossen and Basile Dura and Pierre-Alexis Geoffroy and Charline Jean and Benjamin Landman and Thomas Petit-Jean and Gilles Chatellier and Kankoe Sallah and Xavier Tannier and Aurelie Bourmaud and Richard Delorme},
  title   = {{Natural language processing of multi-hospital electronic health records for public health surveillance of suicidality}},
  journal = {npj Mental Health Research},
  volume  = {3},
  number  = {6},
  year    = {2024},
  month   = feb,
  doi     = {10.1038/s44184-023-00046-7}
}
On the test dataset using the CNN model, all 31 non-clonal isolates were correctly classified, 2/3 clonal isolates were unambiguously correctly classified whereas the third strain was undetermined (i.e., the CNN model was unable to discriminate between GT8 and non-GT8). Clonal strains of A. flavus have persisted in the neonatal intensive care unit for several years. Indeed, two strains of A. flavus isolated from incubators in September 2007 are identical to the strain responsible for the second case that occurred 3 years later.
MALDI-TOF is a promising tool for detecting clonal isolates of A. flavus using CNN even with a limited training set for limited cost and handling time.
@Article{Mohammad2024,
  author  = {Mohammad, Noshine and Huguenin, Antoine and Lefebvre, Annick and Menvielle, Laura and Toubas, Dominique and Ranque, Stéphane and Villena, Isabelle and Tannier, Xavier and Normand, Anne-Cécile and Piarroux, Renaud},
  title   = {{Nosocomial transmission of Aspergillus flavus in a neonatal intensive care unit: long term persistence in environment and interest of MALDI-ToF Mass-Spectrometry coupled with Convolutional Neural Network (CNN) for rapid clone recognition}},
  journal = {Medical Mycology},
  year    = {2024},
  month   = jan,
  doi     = {10.1093/mmy/myad136}
}
@Article{Demirkol2024,
  author  = {Denizhan Demirkol and Çiğdem Selçukcan Erol and Xavier Tannier and Tuncay Özcan and Şamil Aktaş},
  title   = {{Prediction of amputation risk of patients with diabetic foot using classification algorithms: A clinical study from a tertiary center}},
  journal = {International Wound Journal},
  volume  = {21},
  number  = {1},
  year    = {2024},
  month   = jan,
  doi     = {10.1111/iwj.14556}
}
@InProceedings{Naguib2024,
  author    = {Naguib, Marco and Névéol, Aurélie and Tannier, Xavier},
  title     = {{Reconnaissance d’entités cliniques en few-shot en trois langues}},
  booktitle = {Actes de la 31ème conférence Traitement Automatique des Langues Naturelles (TALN 2024)},
  address   = {Toulouse, France},
  year      = {2024},
  month     = jul
}
@InProceedings{Delourme2024,
  author    = {Delourme, Solène and Remaki, Adam and Gérardin, Christel and Vaillant, Pascal and Tannier, Xavier and Séroussi, Brigitte and Redjdal, Akram},
  title     = {{LIMICS@DEFT'24 : Un mini-LLM peut-il tricher aux QCM de pharmacie en fouillant dans Wikipédia et NACHOS ?}},
  booktitle = {Défi Fouille de Texte (DEFT), Traitement Automatique des Langues Naturelles, 2024},
  address   = {Toulouse, France},
  year      = {2024},
  month     = jul
}
@Article{Kempf2024b,
  author  = {Kempf, Emmanuelle and Priou, Sonia and Redjdal, Akram and Guével, Étienne and Tannier, Xavier},
  title   = {{The More, the Better? Modalities of Metastatic Status Extraction on Free Medical Reports Based on Natural Language Processing (Response to Ahumada et al on Methodological and Practical Aspects of a Distant Metastasis Detection Model)}},
  journal = {JCO Clinical Cancer Informatics},
  volume  = {8},
  year    = {2024},
  month   = aug,
  doi     = {10.1200/CCI.24.00026}
}
@InProceedings{Gerardin2024b,
  author    = {Gérardin, Christel and Remaki, Adam and Ung, Jacques and Pagès, P and Wajsbürt, Perceval and Faure, Guillaume and Petit-Jean, Thomas and Tannier, Xavier},
  title     = {{Améliorer la caractérisation phénotypique des patients atteints de maladies inflammatoires à médiation immunitaire par l’analyse automatique des comptes-rendus hospitaliers}},
  booktitle = {89ème congrès français de médecine interne, Revue de Médecine Interne},
  year      = {2024},
  month     = mar
}
@InProceedings{Verdoux2024,
  author    = {Verdoux, Marie and La Rosa, Ambre and Lolli, Isabelle and Tannier, Xavier and Baujat, Bertrand and Kempf, Emmanuelle},
  title     = {{Identification multimodale d'une cohorte de patients porteurs de cancers rares de la tête et du cou au sein de l'Entrepôt de données de santé (EDS) de l'AP-HP}},
  booktitle = {Congrès ÉMOIS, Special Issue of the Journal of Epidemiology and Population Health},
  year      = {2024},
  month     = mar
}
@InProceedings{Kempf2024,
  author    = {Kempf, Emmanuelle and Priou, Sonia and Dura, Basile and Calderaro, Julien and Brones, Clara and Wajsbürt, Perceval and Bennani, Lina and Tannier, Xavier},
  title     = {{Structuration des critères histopronostiques tumoraux par traitement automatique du langage naturel - Une comparaison entre apprentissage machine et règles}},
  booktitle = {Congrès ÉMOIS, Special Issue of the Journal of Epidemiology and Population Health},
  year      = {2024},
  month     = mar
}
@InProceedings{Girault2024,
  author    = {Girault, Aude and Linares, Maximino and Tannier, Xavier},
  title     = {{460 Enhancing neonatal acidosis prediction: A Machine Learning approach using CTG features and clinical characteristics}},
  booktitle = {SMFM 44th Annual Meeting: The Pregnancy Meeting (American Journal of Obstetrics and Gynecology)},
  year      = {2024},
  month     = jan
}
2023
Method: A bibliographic search using a combination of Medical Subject Headings (MeSH) descriptors and free-text terms on CRI was performed using PubMed, followed by a double-blind review in order to select a list of candidate best papers to be then peer-reviewed by external reviewers. After peer-review ranking, a consensus meeting between the two section editors and the editorial team was organized to finally conclude on the selected three best papers.
Results: Among the 1,324 papers returned by the search, published in 2022, that were in the scope of the various areas of CRI, the full review process selected four best papers. The first best paper describes the process undertaken in Germany, under the national Medical Informatics Initiative, to define a process and to gain multi-decision-maker acceptance of broad consent for the reuse of health data for research whilst remaining compliant with the European General Data Protection Regulation. The authors of the second-best paper present a federated architecture for the conduct of clinical trial feasibility queries that utilizes HL7 Fast Healthcare Interoperability Resources and an HL7 standard query representation. The third best paper aligns with the overall theme of this Yearbook, the inclusivity of potential participants in clinical trials, with recommendations to ensure greater equity. The fourth proposes a multi-modal modelling approach for large scale phenotyping from electronic health record information. This year's survey paper has also examined equity, along with data bias, and found that the relevant publications in 2022 have focused almost exclusively on the issue of bias in Artificial Intelligence (AI).
Conclusions: The literature relevant to CRI in 2022 has largely been dominated by publications that seek to maximise the reusability of wide scale and representative electronic health record information for research, either as big data for distributed analysis or as a source of information from which to identify suitable patients accurately and equitably for invitation to participate in clinical trials.
@Article{Tannier2023,
  author  = {Tannier, Xavier and Kalra, Dipak},
  title   = {{Clinical Research Informatics: Contributions from 2022}},
  journal = {Yearbook of Medical Informatics},
  volume  = {32},
  year    = {2023},
  month   = dec,
  doi     = {10.1055/s-0043-1768748}
}
@InProceedings{Tahri2023,
  author    = {Tahri, Chyrine and Bochnakian, Aurore and Haouat, Patrick and Tannier, Xavier},
  title     = {{Transitioning from benchmarks to a real-world case of information-seeking in Scientific Publications}},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2023},
  publisher = {Association for Computational Linguistics},
  address   = {Toronto, Canada},
  pages     = {1066--1076},
  year      = {2023},
  month     = jul
}
@InProceedings{Wajsburt2023,
  author    = {Wajsbürt, Perceval and Tannier, Xavier},
  title     = {{An end-to-end neural model based on cliques and scopes for frame extraction in long breast radiology reports}},
  booktitle = {The 22nd Workshop on Biomedical Natural Language Processing and BioNLP Shared Tasks},
  publisher = {Association for Computational Linguistics},
  address   = {Toronto, Canada},
  pages     = {156--170},
  year      = {2023},
  month     = jul
}
@InProceedings{Bannour2023b,
  author    = {Bannour, Nesrine and Rance, Bastien and Tannier, Xavier and Névéol, Aurélie},
  title     = {{Event-independent temporal positioning: application to French clinical text}},
  booktitle = {The 22nd Workshop on Biomedical Natural Language Processing and BioNLP Shared Tasks},
  publisher = {Association for Computational Linguistics},
  address   = {Toronto, Canada},
  pages     = {191--205},
  year      = {2023},
  month     = jul
}
@Article{Doutreligne2023,
  author  = {Doutreligne, Matthieu and Degremont, Adeline and Jachiet, Pierre-Alain and Lamer, Antoine and Tannier, Xavier},
  title   = {{Good practices for clinical data warehouse implementation: A case study in France}},
  journal = {PLOS Digital Health},
  volume  = {2},
  number  = {7},
  year    = {2023},
  month   = jul,
  doi     = {10.1371/journal.pdig.0000298}
}
Material and methods: A retrospective study was conducted at a tertiary maternity hospital between June 2017 and July 2021. Patients with at least one FSBS during labor for category II fetal heart rate and delivery of a singleton cephalic infant were included. The rate of change in pH value between consecutive samples for each patient was calculated and plotted as a function of pH value. Linear regression models were used to model the evolution of the pH drop rate estimating slope and standard errors across predefined pH intervals. Exploration of alternative pH action thresholds was conducted. To explore the independence of the association between pH value and pH drop rate, multiple linear regression adjusted on age, body mass index, parity, oxytocin stimulation and suspected small for gestational age was performed.
Results: We included 2047 patients with at least one FSBS (total FSBS 3467); with 2047 umbilical cord blood pH, and a total of 5514 pH samples. Median pH values were 7.29 1 h before delivery and 7.26 30 min before delivery. The pH drop was slow between 7.40 and 7.30, then became more pronounced, with median rates of 0.0005 units/min at 7.25 and 0.0013 units/min at 7.20. Out of the alternative pH thresholds, 7.26 and 7.20 demonstrated the best alignment with our dataset. Multiple linear regression revealed that only pH value was significantly associated with the rate of pH change.
Conclusions: Our study confirms the validity and reliability of current guideline thresholds for fetal scalp pH in category II fetal heart rate.
@Article{Girault2023,
  author  = {Girault, Aude and Le Ray, Camille and Garabedian, Charles and Goffinet, François and Tannier, Xavier},
  title   = {{Re-evaluating fetal scalp pH thresholds: An examination of fetal pH variations during labor}},
  journal = {Acta Obstetricia et Gynecologica Scandinavica},
  year    = {2023},
  month   = dec,
  doi     = {10.1111/aogs.14739}
}
Methods: We performed a retrospective multicenter cohort study on the data warehouse of Greater Paris University Hospitals (AP-HP). We identified all female patients newly referred with a BC in 2019 and 2020. We assessed the timeline of their care trajectories, initial tumor stage, and treatment received: BC resection, exclusive systemic therapy, exclusive radiation therapy, or exclusive best supportive care (BSC). We calculated patients' 1-year overall survival (OS) and compared indicators in 2019 and 2020.
Results: In 2019 and 2020, 2055 and 1988 new BC patients underwent cancer treatment, and during the two lockdowns, the BC diagnoses varied by −18% and by +23% compared to 2019. De novo metastatic tumors (15% and 15%, p = 0.95), pTNM and ypTNM distributions of 1332 cases with upfront resection and of 296 cases with neoadjuvant therapy did not differ (p = 0.37, p = 0.3). The median times from first multidisciplinary meeting and from diagnosis to treatment of 19 days (interquartile 11–39 days) and 35 days (interquartile 22–65 days) did not differ. Access to plastic surgery (15% and 17%, p = 0.08) and to treatment categories did not vary: tumor resection (73% and 72%), exclusive systemic therapy (13% and 14%), exclusive radiation therapy (9% and 9%), exclusive BSC (5% and 5%) (p = 0.8). Among resected patients, the neoadjuvant therapy rate was lower in 2019 (16%) versus 2020 (20%) (p = 0.02). One-year OS rates were 99.3% versus 98.9% (HR = 0.96; 95% CI, 0.77–1.2), 72.6% versus 76.6% (HR = 1.28; 95% CI, 0.95–1.72), 96.6% versus 97.8% (HR = 1.09; 95% CI, 0.61–1.94), and 15.5% versus 15.1% (HR = 0.99; 95% CI, 0.72–1.37), in the treatment groups.
Conclusions: Despite a decrease in the number of new BCs, there was no tumor stage shift, and OS did not vary.
@Article{Guevel2023b,
  author  = {Guével, Étienne and Priou, Sonia and Lamé, Guillaume and Wassermann, Johanna and Bey, Romain and Uzan, Catherine and Chatellier, Gilles and Belkacemi, Yazid and Tannier, Xavier and Guillerm, Sophie and Flicoteaux, Rémi and Gligorov, Joseph and Cohen, Ariel and Benderra, Marc-Antoine and Teixeira, Luis and Daniel, Christel and Hersant, Barbara and Tournigand, Christophe and Kempf, Emmanuelle},
  title   = {{Impact of the COVID-19 pandemic on clinical presentation, treatments, and outcomes of new breast cancer patients: A retrospective multicenter cohort study}},
  journal = {Cancer Medicine},
  year    = {2023},
  month   = nov,
  doi     = {10.1002/cam4.6637}
}
@Article{Kempf2023b,
  author  = {Kempf, Emmanuelle and Priou, Sonia and Lamé, Guillaume and Laurent, Alexis and Guével, Étienne and Tzedakis, Stylianos and Bey, Romain and Fuks, David and Chatellier, Gilles and Tannier, Xavier and Galula, Gilles and Flicoteaux, Rémi and Daniel, Christel and Tournigand, Christophe},
  title   = {{No changes in clinical presentation, treatment strategies and survival of pancreatic cancer cases during the SARS-COV-2 outbreak: A retrospective multicenter cohort study on real-world data}},
  journal = {International Journal of Cancer},
  year    = {2023},
  month   = aug,
  doi     = {10.1002/ijc.34675}
}
Materials and methods: We identified the prescreening information items being relevant for prescreening of patients with cancer. We randomly selected 15 academic and industry-sponsored urology phase I-IV clinical trials (CTs) launched at APHP between 2016 and 2021. The computability of the related prescreening criteria (PC) was defined by their translation rate in OMOP-compliant queries and by their execution rate on the APHP clinical data warehouse (CDW) containing data of 205,977 patients with cancer. The overall performance of the prescreening tool was assessed by the rate of true- and false-positive cases of three randomly selected CTs.
Results: We defined a list of 15 minimal information items being relevant for patients' prescreening. We identified 83 PC of the 534 eligibility criteria from the 15 CTs. We translated 33 and 62 PC in queries on the basis of OMOP CDM v5.3 and v5.4, respectively (translation rates of 40% and 75%, respectively). Of the 33 PC translated in the v5.3 of the OMOP CDM, 19 could be executed on the APHP CDW (execution rate of 58%). Of 83 PC, the computability rate on the APHP CDW reached 23%. On the basis of three CTs, we identified 17, 32, and 63 patients as being potentially eligible for inclusion in those CTs, resulting in positive predictive values of 53%, 41%, and 21%, respectively.
Conclusion: We showed that PC could be formalized according to the OMOP CDM and that the oncology extension increased their translation rate through better representation of cancer natural history.
@Article{Kempf2023a,
  author  = {Emmanuelle Kempf and Morgan Vaterkowski and Damien Leprovost and Nicolas Griffon and David Ouagne and Stéphane Bréant and Patricia Serre and Alexandre Mouchet and Bastien Rance and Gilles Chatellier and Ali Bellamine and Marie Frank and Julien Guerin and Xavier Tannier and Alain Livartowski and Martin Hilka and Christel Daniel},
  title   = {{How to Improve Cancer Patients ENrollment in Clinical Trials From rEal-Life Databases Using the Observational Medical Outcomes Partnership Oncology Extension: Results of the PENELOPE Initiative in Urologic Cancers}},
  journal = {JCO Clinical Cancer Informatics},
  volume  = {7},
  year    = {2023},
  month   = may,
  doi     = {10.1200/CCI.22.00179}
}
@Article{Mohammad2023,
  author  = {Mohammad, Noshine and Normand, Anne-Cécile and Nabet, Cécile and Godmer, Alexandre and Brossas, Jean-Yves and Blaize, Marion and Bonnal, Christine and Fekkar, Arnaud and Imbert, Sébastien and Tannier, Xavier and Piarroux, Renaud},
  title   = {{Improving the Detection of Epidemic Clones in Candida parapsilosis Outbreaks by Combining MALDI-TOF Mass Spectrometry and Deep Learning Approaches}},
  journal = {Microorganisms},
  volume  = {11},
  number  = {4},
  year    = {2023},
  month   = apr,
  doi     = {10.3390/microorganisms11041071}
}
@InProceedings{Naguib2023,
  author    = {Naguib, Marco and Névéol, Aurélie and Tannier, Xavier},
  title     = {{Stratégies d'apprentissage actif pour la reconnaissance d'entités nommées en français}},
  booktitle = {Actes de la 30ème conférence Traitement Automatique des Langues Naturelles (TALN 2023)},
  address   = {Paris, France},
  year      = {2023},
  month     = jun
}
@InProceedings{Bannour2023,
  author    = {Bannour, Nesrine and Tannier, Xavier and Rance, Bastien and Névéol, Aurélie},
  title     = {{Positionnement temporel indépendant des évènements : application à des textes cliniques en français}},
  booktitle = {Actes de la 30ème conférence Traitement Automatique des Langues Naturelles (TALN 2023)},
  address   = {Paris, France},
  year      = {2023},
  month     = jun
}
@InProceedings{Guevel2023,
  author    = {Guével, Étienne and Priou, Sonia and Flicoteaux, Rémi and Bey, Romain and Tannier, Xavier and Cohen, Ariel and Chatellier, Gilles and Daniel, Christel and Tournigand, Christophe and Kempf, Emmanuelle},
  title     = {{Development of a Natural Language Processing Model for deriving breast cancer quality indicators: A cross-sectional, multicenter study}},
  booktitle = {Revue d'Épidémiologie et de Santé Publique},
  year      = {2023},
  month     = dec
}
@InProceedings{Gerardin2023,
  author    = {Gérardin, Christel and Xiong, Yuhan and Mekinian, Arsène and Carrat, Fabrice and Tannier, Xavier},
  title     = {{AB1767-HPR Document Search In Large Rheumatology Databases: Advanced Keyword Queries To Select Homogeneous Phenotypes}},
  booktitle = {Annals of the Rheumatic Diseases, Health Professionals in Rheumatology Abstracts},
  pages     = {2117--2118},
  year      = {2023},
  month     = may
}
@InProceedings{Priou2023,
  author    = {Priou, S. and Guével, E. and Lamé, G. and Wassermann, J. and Bey, R. and Uzan, C. and Chatellier, G. and Belkacémi, Y. and Tannier, X. and Guillerm, S. and Flicoteaux, R. and Gligorov, J. and Cohen, A. and Benderra, M-A. and Teixeira, L. and Daniel, C. and Tournigand, C. and Kempf, E.},
  title     = {{463P Impact of two waves of Sars-Cov-2 outbreak on the clinical presentation and outcomes of newly referred breast cancer cases at AP-HP: A retrospective multicenter cohort study}},
  booktitle = {Annals of Oncology, European Society for Medical Oncology Abstracts},
  volume    = {34},
  year      = {2023},
  month     = oct,
  doi       = {10.1016/j.annonc.2023.09.639}
}
2022
@InProceedings{Tahri2022,
  author    = {Tahri, Chyrine and Tannier, Xavier and Haouat, Patrick},
  title     = {{On the portability of extractive Question-Answering systems on scientific papers to real-life application scenarios}},
  booktitle = {Proceedings of the AACL Workshop on Information Extraction from Scientific Publications},
  publisher = {Association for Computational Linguistics},
  pages     = {67--77},
  year      = {2022},
  month     = nov
}
Objective: We aimed to provide an automated end-to-end extraction of cohorts of similar patients from electronic health records for systemic diseases.
Methods: Our multistep algorithm includes a named-entity recognition step, a multilabel classification using medical subject headings ontology, and the computation of patient similarity. A selection of cohorts of similar patients on a priori annotated phenotypes was performed. Six phenotypes were selected for their clinical significance: P1, osteoporosis; P2, nephritis in systemic lupus erythematosus; P3, interstitial lung disease in systemic sclerosis; P4, lung infection; P5, obstetric antiphospholipid syndrome; and P6, Takayasu arteritis. We used a training set of 151 clinical notes and an independent validation set of 256 clinical notes, with annotated phenotypes, both extracted from the Assistance Publique-Hôpitaux de Paris data warehouse. We evaluated the precision of the 3 patients closest to the index patient for each phenotype with precision-at-3 and recall and average precision.
Results: For P1-P4, the precision-at-3 ranged from 0.85 (95% CI 0.75-0.95) to 0.99 (95% CI 0.98-1), the recall ranged from 0.53 (95% CI 0.50-0.55) to 0.83 (95% CI 0.81-0.84), and the average precision ranged from 0.58 (95% CI 0.54-0.62) to 0.88 (95% CI 0.85-0.90). P5-P6 phenotypes could not be analyzed due to the limited number of phenotypes.
Conclusions: Using a method close to clinical reasoning, we built a scalable and interpretable end-to-end algorithm for extracting cohorts of similar patients.
@Article{Gérardin2022b,
  author  = {Gérardin, Christel and Mageau, Arthur and Mékinian, Arsène and Tannier, Xavier and Carrat, Fabrice},
  title   = {{Construction of Cohorts of Similar Patients From Automatic Extraction of Medical Concepts: Phenotype Extraction Study}},
  journal = {JMIR Medical Informatics},
  year    = {2022},
  month   = dec,
  volume  = {10},
  number  = {12},
  doi     = {10.2196/42379}
}
- We built a new dataset of real-life P. falciparum-infected red blood cells and uninfected blood components.
- We cross-validated deep learning models for the classification of P. falciparum-infected red blood cells using two datasets.
- We performed a patient-level validation to assess the generalizability of the models in real-life conditions.
- We demonstrated that our dataset generalizes better than the National Institute of Health (NIH) malaria dataset.
Malaria is a fatal disease transmitted by bites from mosquito-type vectors. Biologists examined blood smears under a microscope at high magnification (1000×) to identify the presence of parasites in red blood cells (RBCs). Such an examination is laborious and time-consuming. Moreover, microscopists sometimes have difficulty identifying parasitized RBCs due to a lack of skill or practice. Deep learning, especially convolutional neural networks (CNNs) applied for malaria diagnosis, are able to identify complex features of a large number of medical images. The proposed work focuses on the construction of a dataset of blood components images representative of the diagnostic reality captured from 202 patients at 500x magnification. We evaluated through a cross-validation study different deep learning networks for the classification of Plasmodium falciparum-infected RBCs and uninfected blood components. These models include a custom-built CNN, VGG-19, ResNet-50 and EfficientNet-B7. In addition, we conducted the same experiments on a public dataset and compared the performance of the resultant models through a patient-level inference including 200 extra patients. The models trained on our dataset show better performance in terms of generalization and achieved better accuracy, sensitivity and specificity scores of 99.7%, 77.9% and 99.8%, respectively.
@Article{Acherar2022,
  author  = {Acherar, Aniss and Tantaoui, Ilhame and Thellier, Marc and Lampros, Alexandre and Piarroux, Renaud and Tannier, Xavier},
  title   = {{Real-life evaluation of deep learning models trained on two datasets for Plasmodium falciparum detection with thin blood smear images at 500x magnification}},
  journal = {Informatics in Medicine Unlocked},
  year    = {2022},
  month   = nov,
  volume  = {35},
  pages   = {101132},
  doi     = {10.1016/j.imu.2022.101132}
}
- Extracting key information from clinical narratives is an NLP challenge.
- There is a particular need to improve NLP tasks in languages other than English.
- Our approach allows automatic pathological domains detection from clinical notes.
- Using multilingual vocabularies and multilingual model leads to better results.
Background: The development of electronic health records has provided a large volume of unstructured biomedical information. Extracting patient characteristics from these data has become a major challenge, especially in languages other than English.
Objective: We developed a methodology able to explore and target topics of interest via an interactive user interface for health professionals with limited computer science knowledge. We aim to reach near state-of-the-art performance while reducing memory consumption, increasing scalability, and minimizing user interaction effort to improve the clinical decision-making process. The performance was evaluated on diabetes-related abstracts from PubMed.
Methods: Inspired by the French Text Mining Challenge (DEFT 2021) [1] in which we participated, our study proposes a multilabel classification of clinical narratives, allowing us to automatically extract the main features of a patient report. Our system is an end-to-end pipeline from raw text to labels with two main steps: named entity recognition and multilabel classification. Both steps are based on a neural network architecture based on transformers. To train our final classifier, we extended the dataset with all English and French Unified Medical Language System (UMLS) vocabularies related to human diseases. We focus our study on the multilingualism of training resources and models, with experiments combining French and English in different ways (multilingual embeddings or translation).
Results: We obtained an overall average micro-F1 score of 0.811 for the multilingual version, 0.807 for the French-only version and 0.797 for the translated version.
Conclusion: Our study proposes an original multilabel classification of French clinical notes for patient phenotyping. We show that a multilingual algorithm trained on annotated real clinical notes and UMLS vocabularies leads to the best results.
@Article{Gérardin2022,
  author  = {Gérardin, Christel and Wajsbürt, Perceval and Vaillant, Pascal and Bellamine, Ali and Carrat, Fabrice and Tannier, Xavier},
  title   = {{Multilabel classification of medical concepts for patient clinical profile identification}},
  journal = {Artificial Intelligence in Medicine},
  year    = {2022},
  month   = jun,
  volume  = {128},
  doi     = {10.1016/j.artmed.2022.102311}
}
- We propose Privacy-Preserving Mimic Models for clinical named entity recognition.
- Models are trained without processing any sensitive data or private model weights.
- Mimic models achieve up to 0.706 macro exact F-measure on 15 clinical entity types.
- Our approach offers a good compromise between performance and privacy preservation.
A vast amount of crucial information about patients resides solely in unstructured clinical narrative notes. There has been a growing interest in clinical Named Entity Recognition (NER) task using deep learning models. Such approaches require sufficient annotated data. However, there is little publicly available annotated corpora in the medical field due to the sensitive nature of the clinical text. In this paper, we tackle this problem by building privacy-preserving shareable models for French clinical Named Entity Recognition using the mimic learning approach to enable the knowledge transfer through a teacher model trained on a private corpus to a student model. This student model could be publicly shared without any access to the original sensitive data. We evaluated three privacy-preserving models using three medical corpora and compared the performance of our models to those of baseline models such as dictionary-based models. An overall macro F-measure of 70.6% could be achieved by a student model trained using silver annotations produced by the teacher model, compared to 85.7% for the original private teacher model. Our results revealed that these privacy-preserving mimic learning models offer a good compromise between performance and data privacy preservation.
@Article{Bannour2022,
  author  = {Bannour, Nesrine and Wajsbürt, Perceval and Rance, Bastien and Tannier, Xavier and Névéol, Aurélie},
  title   = {{Privacy-Preserving Mimic Models for clinical Named Entity Recognition in French}},
  journal = {Journal of Biomedical Informatics},
  year    = {2022},
  month   = jun,
  volume  = {130},
  doi     = {10.1016/j.jbi.2022.104073}
}
@Article{Normand2022,
  author  = {Normand, Anne-Cécile and Chaline, Aurélien and Mohammad, Noshine and Godmer, Alexandre and Acherar, Aniss and Huguenin, Antoine and Ranque, Stéphane and Tannier, Xavier and Piarroux, Renaud},
  title   = {{Identification of a clonal population of Aspergillus flavus by MALDI-TOF mass spectrometry using deep learning}},
  journal = {Scientific Reports},
  year    = {2022},
  month   = jan,
  volume  = {12},
  number  = {1},
  pages   = {1575},
  doi     = {10.1038/s41598-022-05647-4}
}
Objective: We developed a methodology able to explore and target topics of interest via an interactive user interface for health professionals with limited computer science knowledge. We aim to reach near state-of-the-art performance while reducing memory consumption, increasing scalability, and minimizing user interaction effort to improve the clinical decision-making process. The performance was evaluated on diabetes-related abstracts from PubMed.
Methods: The methodology consists of 4 parts: (1) a novel interpretable hierarchical clustering of documents where each node is defined by headwords (words that best represent the documents in the node), (2) an efficient classification system to target topics, (3) minimized user interaction effort through active learning, and (4) a visual user interface. We evaluated our approach on 50,911 diabetes-related abstracts providing a hierarchical Medical Subject Headings (MeSH) structure, a unique identifier for a topic. Hierarchical clustering performance was compared against the implementation in the machine learning library scikit-learn. On a subset of 2000 randomly chosen diabetes abstracts, our active learning strategy was compared against 3 other strategies: random selection of training instances, uncertainty sampling that chooses instances about which the model is most uncertain, and an expected gradient length strategy based on convolutional neural networks (CNNs).
Results: For the hierarchical clustering performance, we achieved an F1 score of 0.73 compared to 0.76 achieved by scikit-learn. Concerning active learning performance, after 200 chosen training samples based on these strategies, the weighted F1 score of all MeSH codes resulted in a satisfying 0.62 F1 score using our approach, 0.61 using the uncertainty strategy, 0.63 using the CNN, and 0.45 using the random strategy. Moreover, our methodology showed a constant low memory use with increased number of documents.
Conclusions: We proposed an easy-to-use tool for health professionals with limited computer science knowledge who combine their domain knowledge with topic exploration and target specific topics of interest while improving transparency. Furthermore, our approach is memory efficient and highly parallelizable, making it interesting for large Big Data sets. This approach can be used by health professionals to gain deep insights into biomedical literature to ultimately improve the evidence-based clinical decision making process.
@Article{Ahne2022,
  author  = {Ahne, Adrian and Fagherazzi, Guy and Tannier, Xavier and Czernichow, Thomas and Orchard, Francisco},
  title   = {{Improving Diabetes-Related Biomedical Literature Exploration in the Clinical Decision-making Process via Interactive Classification and Topic Discovery: Methodology Development Study}},
  journal = {Journal of Medical Internet Research},
  year    = {2022},
  month   = jan,
  volume  = {24},
  number  = {1},
  doi     = {10.2196/27434}
}
@Article{TorresAguilar2022,
  author  = {Torres Aguilar, Sergio and Chastang, Pierre and Tannier, Xavier},
  title   = {{Automatic medieval charters structure detection : A Bi-LSTM linear segmentation approach}},
  journal = {Journal of Data Mining \& Digital Humanities},
  year    = {2022},
  month   = oct,
  volume  = {2022},
  doi     = {10.46298/jdmdh.8646}
}
Method: Using PubMed, we did a bibliographic search using a combination of MeSH descriptors and free-text terms on CRI, followed by a double-blind review in order to select a list of candidate best papers to be peer-reviewed by external reviewers. After peer-review ranking, three section editors met for a consensus meeting and the editorial team was organized to finally conclude on the selected three best papers.
Results: Among the 1,096 papers (published in 2021) returned by the search and in the scope of the various areas of CRI, the full review process selected three best papers. The first best paper describes an operational and scalable framework for generating EHR datasets based on a detailed clinical model with an application in the domain of the COVID-19 pandemics. The authors of the second best paper present a secure and scalable platform for the preprocessing of biomedical data for deep data-driven health management applied for the detection of pre-symptomatic COVID-19 cases and for biological characterization of insulin-resistance heterogeneity. The third best paper provides a contribution to the integration of care and research activities with the REDCap Clinical Data and Interoperability Services (CDIS) module improving the accuracy and efficiency of data collection.
Conclusions: The COVID-19 pandemic is still significantly stimulating research efforts in the CRI field to improve the process deeply and widely for conducting real-world studies as well as for optimizing clinical trials, the duration and cost of which are constantly increasing. The current health crisis highlights the need for healthcare institutions to continue the development and deployment of Big Data spaces, to strengthen their expertise in data science and to implement efficient data quality evaluation and improvement programs.
@Article{Daniel2022,
  author  = {Daniel, Christel and Tannier, Xavier and Kalra, Dipak},
  title   = {{Clinical Research Informatics}},
  journal = {Yearbook of Medical Informatics},
  year    = {2022},
  volume  = {31},
  doi     = {10.1055/s-0042-1742530}
}
@InCollection{Nabet2022,
  author    = {Nabet, Cécile and Acherar, Aniss and Huguenin, Antoine and Tannier, Xavier and Piarroux, Renaud},
  title     = {{Artificial Intelligence and Malaria}},
  booktitle = {Artificial Intelligence in Medicine},
  editor    = {Lidströmer, Niklas and Ashrafian, Hutan},
  publisher = {Springer International Publishing},
  year      = {2022},
  month     = aug,
  pages     = {1353--1368},
  doi       = {10.1007/978-3-030-64573-1_273}
}
Background: Intervening in and preventing diabetes distress requires an understanding of its causes and, in particular, from a patient's perspective. Social media data provide direct access to how patients see and understand their disease and consequently show the causes of diabetes distress.
Objective: Leveraging machine learning methods, we aim to extract both explicit and implicit cause-effect relationships in patient-reported diabetes-related tweets and provide a methodology to better understand the opinions, feelings, and observations shared within the diabetes online community from a causality perspective.
Methods: More than 30 million diabetes-related tweets in English were collected between April 2017 and January 2021. Deep learning and natural language processing methods were applied to focus on tweets with personal and emotional content. A cause-effect tweet data set was manually labeled and used to train (1) a fine-tuned BERTweet model to detect causal sentences containing a causal relation and (2) a conditional random field model with Bidirectional Encoder Representations from Transformers (BERT)-based features to extract possible cause-effect associations. Causes and effects were clustered in a semisupervised approach and visualized in an interactive cause-effect network.
Results: Causal sentences were detected with a recall of 68% in an imbalanced data set. A conditional random field model with BERT-based features outperformed a fine-tuned BERT model for cause-effect detection with a macro recall of 68%. This led to 96,676 sentences with cause-effect relationships. "Diabetes" was identified as the central cluster followed by "death" and "insulin." Insulin pricing-related causes were frequently associated with death.
Conclusions: A novel methodology was developed to detect causal sentences and identify both explicit and implicit, single and multiword cause, and the corresponding effect, as expressed in diabetes-related tweets leveraging BERT-based architectures and visualized as cause-effect network. Extracting causal associations in real life, patient-reported outcomes in social media data provide a useful complementary source of information in diabetes research.
@Article{Ahne2022b,
  author  = {Ahne, Adrian and Khetan, Vivek and Tannier, Xavier and Rizvi, Md Imbesat Hassan and Czernichow, Thomas and Orchard, Francisco and Bour, Charline and Fano, Andrew and Fagherazzi, Guy},
  title   = {{Extraction of Explicit and Implicit Cause-Effect Relationships in Patient-Reported Diabetes-Related Tweets From 2017 to 2021: Deep Learning Approach}},
  journal = {JMIR Medical Informatics},
  year    = {2022},
  month   = jul,
  volume  = {10},
  number  = {7},
  doi     = {10.2196/37201}
}
- During the 1st SARS-CoV2 lockdown, the number of new lung cancers decreased by 32%.
- In 6,240 cases, initial tumor stage, treatment categories did not vary (2018–2021).
- Delay between multidisciplinary boards and cancer treatments did not vary over time.
- Overall survival of patients diagnosed after the outbreak was not impaired.
- COVID was associated with poorer OS in patients with systemic anticancer therapy.
Introduction: The SARS-CoV-2 pandemic has impacted the care of cancer patients. This study sought to assess the pandemic’s impact on the clinical presentations and outcomes of newly referred patients with lung cancer from the Greater Paris area.
Methods: We retrospectively retrieved the electronic health records and administrative data of 11.4 million patients pertaining to Greater Paris University Hospital (AP-HP). We compared indicators for the 2018-2019 period to those of 2020 in regard to newly referred lung cancer cases. We assessed initial tumor stage, delay between first multidisciplinary tumor board (MTB) and anticancer treatment initiation, and 6-month overall survival (OS) rates depending on the anticancer treatment including surgery, palliative systemic treatment, and best supportive care (BSC).
Results: Among 6,240 patients with lung cancer, 2,179 (35%) underwent tumor resection, 2,069 (33%) systemic anticancer therapy, 775 (12%) BSC, whereas 1,217 (20%) did not receive any treatment. During the first lockdown, the rate of new diagnoses decreased by 32% compared with that recorded in 2018-2019. Initial tumor stage, repartition of patients among treatment categories, and MTB-related delays remained unchanged. The 6-month OS rates of patients diagnosed in 2018-2019 who underwent tumor resection were 98% vs. 97% (HR=1.2; 95% CI: 0.7-2.0) for those diagnosed in 2020; the respective rates for patients who underwent systemic anticancer therapy were 78% vs. 79% (HR=1.0; 95% CI: 0.8-1.2); these rates were 20% vs. 13% (HR=1.3; 95% CI: 1.1-1.6) for those who received BSC. COVID-19 was associated with poorer OS rates (HR=2.1; 95% CI: 1.6-3.0) for patients who received systemic anticancer therapy.
Conclusions: The SARS-CoV-2 pandemic has not exerted any deleterious impact on 6-month OS of new lung cancer patients that underwent active anticancer therapy in Greater Paris University hospitals.
@Article{Priou2022,
  author  = {Priou, Sonia and Lamé, Guillaume and Zalcman, Gérard and Wislez, Marie and Bey, Romain and Chatellier, Gilles and Cadranel, Jacques and Tannier, Xavier and Zelek, Laurent and Daniel, Christel and Tournigand, Christophe and Kempf, Emmanuelle},
  title   = {{Influence of the SARS-CoV-2 outbreak on management and prognosis of new lung cancer cases, a retrospective multicenter real-life cohort study}},
  journal = {European Journal of Cancer},
  year    = {2022},
  month   = sep,
  volume  = {173},
  pages   = {33--40},
  doi     = {10.1016/j.ejca.2022.06.018}
}
Methods: 407 patients with DFUs (286 male, 121 female; mean age = 60, age range = 32-92) who were managed in a tertiary care centre from 2009 to 2019 were retrospectively identified and included in the study. DFUs were categorized based on the Meggit-Wagner, PEDIS, S(AD)SAD, and University of Texas (UT) classification systems. To identify amputation risk-related factors, results of patients with DFUs who underwent amputations (minor or major) were compared to those who received other adjunctive treatments using Chi-Square, one-way analysis of variance (ANOVA) and Spearman correlation analysis.
Results: The mean C-reactive protein (CRP) and White Blood Cell (WBC) values were significantly higher in patients with major or minor amputation than in those without amputation. The mean Neutrophil (PNL), Platelets (PLT), wound width, creatinine and sedimentation (ESR) values were significantly higher in patients with major amputation compared to other groups of patients. Elevated levels of High-density lipoprotein (HDL), Hemoglobin (HGB) and albumin were determined to be protective factors against the risk of amputation. Spearman correlation analysis revealed a positive-sided, strong-levelled, significant relation between Wagner grades and amputation status of patients.
Conclusion: This study has identified specific factors for major and minor amputation risk of patients with DFUs. Especially infection markers such as CRP, WBC, ESR and PNL were higher in the amputation group. Most importantly, Meggit Wagner, one of the four different classification systems used in the DFUs, was determined to be highly associated with patients’ amputation risk.
@Article{Demirkol2022,
  author  = {Demirkol, Denizhan and Aktaş, Şamil and Özcan, Tuncay and Tannier, Xavier and Erol, Çiğdem Selçukcan},
  title   = {{Analysis of risk factors for amputation in patients with diabetic foot ulcers: a cohort study from a tertiary center}},
  journal = {Acta Orthopaedica et Traumatologica Turcica},
  year    = {2022},
  month   = sep,
  volume  = {56},
  number  = {5},
  doi     = {10.5152/j.aott.2022.22052}
}
In this retrospective cohort study, we collected prospectively the clinical data of the 11.4 million of patients referred to the Greater Paris University Hospitals (AP HP). We identified new CRC cases between January first 2018 and December 31st 2020, and compared indicators for 2018-2019 to 2020. pTNM tumor stage was extracted from postoperative pathology reports for localized colon cancer, and metastatic status was extracted from CT-scan baseline text reports.
Between 2018 and 2020, 3602 and 1083 new colon and rectal cancers were referred to the APHP, respectively.
The 1-year OS rates reached 94%, 93% and 76% for new CRC patients undergoing a resection of the primary tumor, in 2018-2019, in 2020 without any Sars-Cov2 infection and in 2020 with a Sars-Cov2 infection, respectively (HR 3.78, 95%CI 2.1-7.1). For patients undergoing other kind of anticancer treatment, the percentages are 64%, 66% and 27% (HR 2.1, 95%CI 1.4-3.3).
Tumor stage at initial presentation, emergency level of primary tumor resection, delays between the first multidisciplinary meeting and the first anticancer treatment did not differ over time.
The SARS-Cov2 pandemic has been associated with less newly diagnosed CRC patients and worse 1-yr OS rates attributable to the infection itself rather than to its impact on hospital care delivery or tumor stage at initial presentation.
@Article{Kempf2022,
  author  = {Kempf, Emmanuelle and Priou, Sonia and Lamé, Guillaume and Daniel, Christel and Bellamine, Ali and Sommacale, Daniele and Belkacemi, Yazid and Bey, Romain and Galula, Gilles and Taright, Namik and Tannier, Xavier and Rance, Bastien and Flicoteaux, Rémi and Hemery, François and Audureau, Etienne and Chatellier, Gilles and Tournigand, Christophe},
  title   = {{Impact of two waves of Sars-Cov2 outbreak on the number, clinical presentation, care trajectories and survival of patients newly referred for a colorectal cancer: A French multicentric cohort study from a large group of University hospitals}},
  journal = {International Journal of Cancer},
  year    = {2022},
  month   = jan,
  volume  = {150},
  number  = {10},
  doi     = {10.1002/ijc.33928}
}
@TechReport{Doutreligne2022,
  author      = {Doutreligne, Matthieu and Degremont, Adeline and Jachiet, Pierre-Alain and Tannier, Xavier and Lamer, Antoine},
  title       = {{Entrepôts de données de santé hospitaliers en France : Quel potentiel pour la Haute Autorité de santé ?}},
  institution = {Haute Autorité de Santé (HAS)},
  year        = {2022},
  pages       = {55}
}
@InProceedings{Bannour2022b,
  author    = {Bannour, Nesrine and Wajsbürt, Perceval and Rance, Bastien and Tannier, Xavier and Névéol, Aurélie},
  title     = {{Modèles préservant la confidentialité des données par mimétisme pour la reconnaissance d’entités nommées en français}},
  booktitle = {Actes de la journée d’étude sur la robustesse des systèmes de TAL},
  address   = {Paris, France},
  year      = {2022},
  month     = dec
}
2021
- We train a model to normalize medical entities in French with a very large list of concepts.
- Our method is a neural network model that requires no prior translation.
- Multilingual training data improves the performance of medical normalization in French.
- Multilingual embeddings are of less importance than multilingual data.
Objective: We present a system for concept normalization in French. We consider textual mentions already extracted and labeled by a named entity recognition system, and we classify these mentions with a UMLS concept unique identifier. We take advantage of the multilingual nature of available terminologies and embedding models to improve concept normalization in French without translation nor direct supervision.
Materials and methods: We consider the task as a highly-multiclass classification problem. The terms are encoded with contextualized embeddings and classified via cosine similarity and softmax. A first step uses a subset of the terminology to finetune the embeddings and train the model. A second step adds the entire target terminology, and the model is trained further with hard negative selection and softmax sampling.
Results: On two corpora from the Quaero FrenchMed benchmark, we show that our approach can lead to good results even with no labeled data at all; and that it outperforms existing supervised methods with labeled data.
Discussion: Training the system with both French and English terms improves by a large margin the performance of the system on a French benchmark, regardless of the way the embeddings were pretrained (French, English, multilingual). Our distantly supervised method can be applied to any kind of documents or medical domain, as it does not require any concept-labeled documents.
Conclusion: These experiments pave the way for simpler and more effective multilingual approaches to processing medical texts in languages other than English.
@Article{Wajsburt2021,
  author  = {Wajsbürt, Perceval and Sarfati, Arnaud and Tannier, Xavier},
  title   = {{Medical concept normalization in French using multilingual terminologies and contextual embeddings}},
  journal = {Journal of Biomedical Informatics},
  year    = {2021},
  month   = jan,
  volume  = {114},
  doi     = {10.1016/j.jbi.2021.103684}
}
@Article{Chastang2021,
  author  = {Chastang, Pierre and Torres Aguilar, Sergio and Tannier, Xavier},
  title   = {{A Named Entity Recognition Model for Medieval Latin Charters}},
  journal = {Digital Humanities Quarterly},
  year    = {2021},
  month   = nov,
  volume  = {15},
  number  = {4}
}
Methods: We collected prospectively the clinical data of the 11.4 million of patients referred to the Assistance Publique Hôpitaux de Paris Teaching Hospital. We identified new cancer cases between January 1st 2018 and September 31st 2020, and compared indicators for 2018 and 2019 to 2020 with a focus on the French lockdown (March 17th to May 11th, 2020), across cancer types and patient age classes.
Results: Between January and September, 28,348, 27,272 and 23,734 new cancer cases were identified in 2018, 2019 and 2020, respectively. The monthly median number of new cases reached 3,168 (interquartile range, IQR, 3,027; 3,282), 3,054 (IQR 2,945; 3,127) and 2,723 (IQR 2,085; 2,863) in 2018, 2019 and 2020, respectively. From March 1st to May 31st, new cancer decreased by 30% in 2020 compared to the 2018-19 average; then by 9% from June 1st to September 31st. This evolution was consistent across all tumor types: -30% and -9% for colon, -27% and -6% for lung, -29% and -14% for breast, -33% and -12% for prostate cancers, respectively. For patients aged < 70 years, the decrease of colorectal and breast new cancers in April between 2018-2019 average and 2020 reached 41 % and 39%, respectively.
Conclusion: The SARS-Cov2 pandemic led to a substantial decrease of new cancer cases. Delays in cancer diagnoses may affect clinical outcomes in the coming years.
@Article{Kempf2021,
  author  = {Kempf, Emmanuelle and Lamé, Guillaume and Layese, Richard and Priou, Sonia and Chatellier, Gilles and Chaieb, Hedi and Benderra, Marc-Antoine and Bellamine, Ali and Bey, Romain and Bréant, Stéphane and Galula, Gilles and Taright, Namik and Tannier, Xavier and Guyet, Thomas and Salamanca, Elisa and Audureau, Etienne and Daniel, Christel and Tournigand, Christophe},
  title   = {{New cancer cases at the time of SARS-Cov2 pandemic and related public health policies: A persistent and concerning decrease long after the end of national lockdown}},
  journal = {European Journal of Cancer},
  year    = {2021},
  month   = feb,
  volume  = {150},
  pages   = {260--267},
  doi     = {10.1016/j.ejca.2021.02.015}
}
@InProceedings{Wajsburt2021b,
  author    = {Wajsbürt, Perceval and Taillé, Yoann and Tannier, Xavier},
  title     = {{Effect of depth order on iterative nested named entity recognition models}},
  booktitle = {Conference on Artificial Intelligence in Medicine (AIME 2021)},
  address   = {Porto, Portugal},
  year      = {2021},
  month     = jun
}
@InProceedings{Gerardin2021,
  author    = {Gérardin, Christel and Vaillant, Pascal and Wajsbürt, Perceval and Gilavert, Clément and Bellamine, Ali and Kempf, Emmanuelle and Tannier, Xavier},
  title     = {{Classification multilabel de concepts médicaux pour l’identification du profil clinique du patient}},
  booktitle = {Défi Fouille de Texte (DEFT), Traitement Automatique des Langues Naturelles, 2021},
  address   = {Lille, France},
  year      = {2021},
  month     = jun
}
Patients et méthodes : Le développement de l’outil de détection automatique s’appuie sur une chaîne de traitement d’algorithmes utilisant des techniques de traitement automatique du langage et d’apprentissage automatique (Natural Language processing, Machine Learning and Rule-based solutions). Le développement de l’outil et sa validation ont été réalisés à partir des comptes rendus médicaux des départements des urgences et d’orthopédie de l’entrepôt de données de santé (EDS) de l’Assistance publique–Hôpitaux de Paris (AP–HP). L’outil a été développé à partir d’un échantillon aléatoire de 4917 documents issus d’un centre hospitalier. Les documents qui ont servi aux développements des algorithmes sont différents de ceux qui ont servi à leurs entraînements. La validation externe a été réalisée sur l’ensemble des comptes rendus médicaux d’orthopédie et des urgences recueillies en 3 mois dans l’EDS soit 154 031 documents. Les performances de l’outil (Sensibilité Se, Spécificité Sp, valeur prédictive positive VPP, valeur prédictive négative VPN) ont été calculées pour le développement et la validation de l’outil.
Résultats : L’outil a été développé à partir de 3913 documents des Urgences et 1004 documents d’orthopédie. Les performances des différents algorithmes conduisant à l’outil sont : Se comprise entre 80 et 93 %, Sp entre 62 et 99 %, VPP entre 90 et 96 % et VPN entre 69 et 99 %. L’outil a été validé dans une base de 154 031 documents (148 423 des urgences et 5608 d’orthopédie) (46 % de femmes, âge moyen 67 ans). L’outil a permis d’identifier 4 % de documents des urgences avec fracture susceptible d’être ostéoporotique (n = 5806) et 27 % des documents d’orthopédie (n = 1503), soit une population âgée de 74 ans en moyenne avec 68 % de femmes. Une validation manuelle par un expert a été réalisée sur 1000 documents avec fracture identifiée et 1000 documents sans fracture, sélectionnés au hasard. Les Se, Sp, VPP et VPN sont de 68 %, 100 %, 78 % et 99 % pour les comptes rendus des urgences et 84 %, 97 %, 92 % et 93 % pour les comptes rendus d’orthopédie.
Conclusion : Cette étude est le premier travail montrant qu’un outil d’identification automatique basé sur le traitement automatique du langage et l’apprentissage automatique permet d’identifier des patients avec des fractures susceptibles d’être ostéoporotiques sur des comptes rendus médicaux des urgences et d’orthopédie. Les performances de l’outil sont bonnes et permettent de répondre au besoin d’assistance à l’identification des patients dans le cadre de parcours de soins post fracture.
@InProceedings{Bellamine2021,
  author    = {Ali Bellamine and Christel Daniel and Perceval Wajsbürt and Christian Roux and Xavier Tannier and Karine Briot},
  title     = {{Identification automatique des patients avec fractures ostéoporotiques à partir de comptes rendus médicaux}},
  booktitle = {34e Congrès Français de Rhumatologie},
  year      = {2021},
  month     = dec,
  address   = {Paris, France}
}
@InProceedings{Bannour2021,
  author    = {Bannour, Nesrine and Névéol, Aurélie and Tannier, Xavier and Rance, Bastien},
  title     = {{Traitement Automatique de la Langue et Intégration de Données pour les Réunions de Concertations Pluridisciplinaires en Oncologie}},
  booktitle = {Journée AFIA/ATALA "la santé et le langage"},
  year      = {2021},
  month     = feb
}
2020
@Article{Nabet2020,
  author  = {Nabet, Cécile and Chaline, Aurélien and Franetich, Jean-François and Brossas, Jean-Yves and Shahmirian, Noémie and Silvie, Olivier and Tannier, Xavier and Piarroux, Renaud},
  title   = {{Prediction of malaria transmission drivers in Anopheles mosquitoes using artificial intelligence coupled to MALDI-TOF mass spectrometry}},
  journal = {Scientific Reports},
  volume  = {10},
  number  = {1},
  year    = {2020},
  month   = jul,
  doi     = {10.1038/s41598-020-68272-z}
}
Research design and methods A total of 11.7 million diabetes-related tweets in English were collected between April 2017 and July 2019. Machine learning methods were used to filter tweets with personal content, to geolocate (to the USA) and to identify clusters of tweets with emotional elements. A sentiment analysis was then applied to each cluster.
Results We identified 46 407 tweets with emotional elements in the USA from which 30 clusters were identified; 5 clusters (18% of tweets) were related to insulin pricing with both positive emotions (joy, love) referring to advocacy for affordable insulin and sadness emotions related to the frustration of insulin prices, 5 clusters (12% of tweets) to solidarity and support with a majority of joy and love emotions expressed. The most negative topics (10% of tweets) were related to diabetes distress (24% sadness, 27% anger, 21% fear elements), to diabetic and insulin shock (45% anger, 46% fear) and comorbidities (40% sadness).
Conclusions Using social media data, we have been able to describe key diabetes-related concerns and their associated emotions. More specifically, we were able to highlight the real-world concerns of insulin pricing and its negative impact on mood. Using such data can be a useful addition to current measures that inform public decision making around topics of concern and burden among people with diabetes.
@Article{Ahne2020,
  author  = {Adrian Ahne and Francisco Orchard and Xavier Tannier and Camille Perchoux and Beverley Balkau and Sherry Pagoto and Jessica Lee Harding and Thomas Czernichow and Guy Fagherazzi},
  title   = {{Insulin pricing and other major diabetes-related concerns in the USA: a study of 46,407 tweets between 2017 and 2019}},
  journal = {BMJ Open Diabetes Research \& Care},
  volume  = {8},
  number  = {1},
  year    = {2020},
  month   = jun,
  doi     = {10.1136/bmjdrc-2020-001190}
}
- We have built APcNER, a French corpus for clinical named-entity recognition.
- It includes a large variety of document types, and required 28 hours of annotation.
- We achieved on average 84% non-exact F-measure on five types of clinical entities.
- We give insight into the complementarity of terminology with a supervised model.
Methods We used a terminology-based system as baseline, built upon UMLS and SNOMED. Then, we evaluated a biGRU-CRF, and a hybrid system using the prediction of the terminology-based system as feature for the biGRU-CRF. In French, we built APcNER, a corpus of 147 documents annotated for 5 entities (Drug names, Signs or symptoms, Diseases or disorders, Diagnostic procedures or lab tests and Therapeutic procedures). We evaluated each NER system using exact and partial match definition of F-measure for NER. The APcNER contains 4,837 entities, which took 28 hours to annotate. The inter-annotator agreement as measured by Cohen’s Kappa was substantial for non-exact match (κ = 0.61) and moderate considering exact match (κ = 0.42). In English, we evaluated the NER systems on the i2b2-2009 Medication Challenge for Drug name recognition, which contained 8,573 entities for 268 documents, and i2b2-small, a version reduced to match the number of entities in APcNER.
Results For drug name recognition on both i2b2-2009 and APcNER, the biGRU-CRF performed better than the terminology-based system, with an exact-match F-measure of 91.1% versus 73% and 81.9% versus 75% respectively. For i2b2-small and APcNER, the hybrid system outperformed the biGRU-CRF, with an exact-match F-measure of 85.6% versus 87.8% and 88.4% versus 81.9% respectively. On the APcNER corpus, the micro-average F-measure of the hybrid system on the 5 entities was 69.5% in exact match and 84.1% in non-exact match.
Conclusion APcNER is a French corpus for clinical-NER of five types of entities which covers a large variety of document types. The extension of the supervised model with terminology has allowed an easy increase in performance, especially for rare entities, and established near state of the art results on the i2b2-2009 corpus.
@Article{Lerner2020,
  author  = {Lerner, Ivan and Paris, Nicolas and Tannier, Xavier},
  title   = {{Terminologies augmented recurrent neural network model for clinical named entity recognition}},
  journal = {Journal of Biomedical Informatics},
  volume  = {102},
  year    = {2020},
  month   = feb,
  doi     = {10.1016/j.jbi.2019.103356}
}
@InProceedings{Tourille2020,
  author    = {Tourille, Julien and Ferret, Olivier and Névéol, Aurélie and Tannier, Xavier},
  title     = {{Modèle neuronal pour la résolution de la coréférence dans les dossiers médicaux électroniques}},
  booktitle = {Actes de la 27ème conférence Traitement Automatique des Langues Naturelles (TALN 2020)},
  year      = {2020},
  month     = jun,
  address   = {Nancy, France}
}
@InProceedings{Charlet2020,
  author    = {Jean Charlet and Xavier Tannier},
  title     = {{Élémentaire mon cher Watson ?}},
  booktitle = {Journée IA et Santé},
  year      = {2020},
  month     = jun,
  address   = {Angers, France}
}
@InProceedings{Wajsburt2020,
  author    = {Perceval Wajsbürt and Yoann Taillé and Guillaume Lainé and Xavier Tannier},
  title     = {{Participation de l'équipe du LIMICS à DEFT 2020}},
  booktitle = {Défi Fouille de Texte (DEFT) 2020},
  year      = {2020},
  month     = jun,
  address   = {Nancy, France}
}
2019
@InProceedings{Cao2019b,
  author    = {Tien-Duc Cao and Ludivine Duroyon and François Goasdoué and Ioana Manolescu and Xavier Tannier},
  title     = {{BeLink: Querying Networks of Facts, Statements and Beliefs}},
  booktitle = {Proceedings of the 28th ACM International Conference on Information and Knowledge Management (CIKM, demo session)},
  year      = {2019},
  month     = nov,
  address   = {Beijing, China}
}
@InProceedings{Cao2019a,
  author    = {Tien-Duc Cao and Ioana Manolescu and Xavier Tannier},
  title     = {{Extracting statistical mentions from textual claims to provide trusted content}},
  booktitle = {Proceedings of the 24th International Conference on Applications of Natural Language to Information Systems (NLDB 2019)},
  year      = {2019},
  month     = nov,
  address   = {Salford, UK}
}
@Misc{Tannier2019,
  author = {Xavier Tannier and Nicolas Paris and Hugo Cisneros and Christel Daniel and Matthieu Doutreligne and Catherine Duclos and Nicolas Griffon and Claire Hassen-Khodja and Ivan Lerner and Adrien Parrot and Éric Sadou and Cyril Saussol and Pascal Vaillant},
  title  = {{Hybrid Approaches for our Participation to the n2c2 Challenge on Cohort Selection for Clinical Trials}},
  year   = {2019},
  month  = mar,
  note   = {arXiv}
}
@InProceedings{Rudnik2019,
  author    = {Rudnik, Charlotte and Ehrhart, Thibault and Ferret, Olivier and Teyssou, Denis and Troncy, Raphaël and Tannier, Xavier},
  title     = {{Searching News Articles Using an Event Knowledge Graph Leveraged by Wikidata}},
  booktitle = {Proceedings of the Wiki Workshop 2019 (The Web Conference)},
  year      = {2019},
  month     = may,
  address   = {San Francisco, USA}
}
@InProceedings{Paris2019,
  author    = {Nicolas Paris and Matthieu Doutreligne and Adrien Parrot and Xavier Tannier},
  title     = {{Désidentification de comptes-rendus hospitaliers dans une base de données OMOP}},
  booktitle = {Actes de TALMED 2019 : Symposium satellite francophone sur le traitement automatique des langues dans le domaine biomédical},
  year      = {2019},
  month     = aug,
  address   = {Lyon, France}
}
@InProceedings{Hilbey2019,
  author    = {Jacques Hilbey and Louise Deléger and Xavier Tannier},
  title     = {{Participation de l’équipe LAI à DEFT 2019}},
  booktitle = {Défi Fouille de Texte (DEFT) 2019},
  year      = {2019},
  month     = jul,
  address   = {Toulouse, France}
}
2018
@InProceedings{Paris2018,
  author    = {Nicolas Paris and Michael Mendis and Shawn Murphy and Christel Daniel and Xavier Tannier and Pierre Zweigenbaum},
  title     = {{i2b2 implemented over SMART-on-FHIR}},
  booktitle = {Proceedings of the AMIA 2018 Informatics Summit},
  year      = {2018},
  month     = mar,
  address   = {San Francisco, USA}
}
@InProceedings{Tourille2018,
  author    = {Julien Tourille and Matthieu Doutreligne and Olivier Ferret and Nicolas Paris and Aurélie Névéol and Xavier Tannier},
  title     = {{Evaluation of a Sequence Tagging Tool for Biomedical Texts}},
  booktitle = {Proceedings of the EMNLP Workshop on Health Text Mining and Information Analysis (LOUHI 2018)},
  year      = {2018},
  month     = oct,
  address   = {Brussels, Belgium}
}
- Outlines the current state of affairs in the area of digital (or computational) fact-checking in newsrooms, by journalists, NGO workers, scientists and IT companies;
- Shows which areas of digital content management research, in particular those relying on the Web, can be leveraged to help fact-checking, and gives a comprehensive survey of efforts in this area;
- Highlights ongoing trends, unsolved problems, and areas where we envision future scientific and practical advances.
@Misc{Cazalens2018b,
  author  = {Sylvie Cazalens and Philippe Lamarre and Julien Leblay and Ioana Manolescu and Xavier Tannier},
  title   = {{Computational fact-checking: a content management perspective}},
  year    = {2018},
  month   = aug,
  address = {Rio de Janeiro, Brazil},
  note    = {Tutorial presented at the conference VLDB.}
}
@InProceedings{Cazalens2018,
  author    = {Sylvie Cazalens and Philippe Lamarre and Julien Leblay and Ioana Manolescu and Xavier Tannier},
  title     = {{A Content Management Perspective on Fact-Checking}},
  booktitle = {Proceedings of the Web Conference 2018},
  year      = {2018},
  month     = apr,
  address   = {Lyon, France}
}
- Outlines the current state of affairs in the area of digital (or computational) fact-checking in newsrooms, by journalists, NGO workers, scientists and IT companies;
- Shows which areas of digital content management research, in particular those relying on the Web, can be leveraged to help fact-checking, and gives a comprehensive survey of efforts in this area;
- Highlights ongoing trends, unsolved problems, and areas where we envision future scientific and practical advances.
@Misc{Leblay2018,
  author  = {Julien Leblay and Ioana Manolescu and Xavier Tannier},
  title   = {{Computational fact-checking: problems, state of the art, and perspectives}},
  year    = {2018},
  month   = apr,
  address = {Lyon, France},
  note    = {Tutorial presented at the Web Conference 2018.}
}
@InProceedings{Cao2018,
  author    = {Cao, Tien Duc and Manolescu, Ioana and Tannier, Xavier},
  title     = {{Searching for Truth in a Database of Statistics}},
  booktitle = {Proceedings of the 21st International Workshop on the Web and Databases (WebDB 2018)},
  year      = {2018},
  month     = jun,
  address   = {Houston, USA}
}
@InProceedings{Andrew2018,
  author    = {Andrew, Judith Jeyafreeda and Tannier, Xavier},
  title     = {{Automatic Extraction of Entities and Relation from Legal Documents}},
  booktitle = {Proceedings of the ACL Named Entities Workshop (NEWS 2018)},
  pages     = {1--8},
  year      = {2018},
  month     = jul,
  address   = {Melbourne, Australia}
}
@InProceedings{Cao2018b,
  author    = {Cao, Tien Duc and Manolescu, Ioana and Tannier, Xavier},
  title     = {{Extracting Linked Data from statistic spreadsheets}},
  booktitle = {34ème Conférence sur la Gestion de Données – Principes, Technologies et Applications (BDA 2018)},
  year      = {2018},
  month     = oct,
  address   = {Bucharest, Romania}
}
2017
@InProceedings{Tourille2017b,
  author    = {Tourille, Julien and Ferret, Olivier and Tannier, Xavier and Névéol, Aurélie},
  title     = {{Neural Architecture for Temporal Relation Extraction: A Bi-LSTM Approach for Detecting Narrative Containers}},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (ACL 2017, short paper)},
  year      = {2017},
  month     = aug,
  address   = {Vancouver, Canada}
}
@InProceedings{Moreno2017,
  author    = {Jose Moreno and Romaric Besançon and Romain Beaumont and Eva D'Hondt and Anne-Laure Ligozat and Sophie Rosset and Xavier Tannier and Brigitte Grau},
  title     = {{Combining Word and Entity Embeddings for Entity Linking}},
  booktitle = {Proceedings of the 14th Extended Semantic Web Conference (ESWC 2017)},
  year      = {2017},
  month     = may,
  address   = {Portorož, Slovenia}
}
@InProceedings{Tourille2017,
  author    = {Tourille, Julien and Ferret, Olivier and Tannier, Xavier and Névéol, Aurélie},
  title     = {{Temporal information extraction from clinical text}},
  booktitle = {Proceedings of the European Chapter of the ACL (EACL 2017, short paper)},
  year      = {2017},
  month     = apr,
  address   = {Valencia, Spain}
}
@InProceedings{Ribeiro2017,
  author    = {Ribeiro, Swen and Ferret, Olivier and Tannier, Xavier},
  title     = {{Unsupervised Event Clustering and Aggregation from Newswire and Web Articles}},
  booktitle = {Proceedings of the 2nd workshop "Natural Language meets Journalism" (EMNLP 2017)},
  year      = {2017},
  month     = sep,
  address   = {Copenhagen, Denmark}
}
@InProceedings{Tourille2017c,
  author    = {Tourille, Julien and Ferret, Olivier and Tannier, Xavier and Névéol, Aurélie},
  title     = {{LIMSI-COT at SemEval-2017 Task 12: Neural Architecture for Temporal Information Extraction from Clinical Narratives}},
  booktitle = {Proceedings of the 11th International Workshop on Semantic Evaluation (SemEval 2017)},
  year      = {2017},
  month     = aug,
  address   = {Vancouver, Canada}
}
@InProceedings{Cao2017,
  author    = {Cao, Tien Duc and Manolescu, Ioana and Tannier, Xavier},
  title     = {{Extracting Linked Data from statistic spreadsheets}},
  booktitle = {Proceedings of the SIGMOD workshop on Semantic Big Data (SBD 2017)},
  year      = {2017},
  month     = may,
  address   = {Chicago, USA}
}
@InProceedings{Moreno2017b,
  author    = {José Moreno and Romaric Besançon and Romain Beaumont and Eva D'Hondt and Anne-Laure Ligozat and Sophie Rosset and Xavier Tannier and Brigitte Grau},
  title     = {{Apprendre des représentations jointes de mots et d'entités pour la désambiguïsation d'entités}},
  booktitle = {Actes de la Conférence Traitement Automatique des Langues Naturelles (TALN 2017)},
  year      = {2017},
  month     = jun,
  address   = {Orléans, France}
}
2016
@Article{Bellot2016,
  author  = {Patrice Bellot and Véronique Moriceau and Josiane Mothe and Éric SanJuan and Xavier Tannier},
  title   = {{INEX Tweet Contextualization Task: Evaluation, Results and Lessons Learned}},
  journal = {Information Processing and Management},
  year    = {2016}
}
@InProceedings{Tannier2016a,
  author    = {Xavier Tannier},
  title     = {{NLP-driven Data Journalism: Time-Aware Mining and Visualization of International Alliances}},
  booktitle = {Proceedings of "Natural Language meets Journalism", workshop of the International Joint Conference on Artificial Intelligence (IJCAI 2016)},
  year      = {2016},
  month     = jul,
  address   = {New York, USA}
}
@InProceedings{Tannier2016b,
  author    = {Xavier Tannier and Frédéric Vernier},
  title     = {{Creation, Visualization and Edition of Timelines for Journalistic Use}},
  booktitle = {Proceedings of "Natural Language meets Journalism", workshop of the International Joint Conference on Artificial Intelligence (IJCAI 2016)},
  year      = {2016},
  month     = jul,
  address   = {New York, USA}
}
@InProceedings{Torres2016,
  author    = {Torres Aguilar, Sergio and Tannier, Xavier and Chastang, Pierre},
  title     = {{Named entity recognition applied on a data base of Medieval Latin charters. The case of chartae burgundiae}},
  booktitle = {Proceedings of the 3rd International Workshop on Computational History (HistoInformatics 2016)},
  year      = {2016},
  month     = jul,
  address   = {Krakow, Poland}
}
@InProceedings{Pontiki2016,
  author    = {Pontiki, Maria and Galanis, Dimitrios and Papageorgiou, Haris and Androutsopoulos, Ion and Manandhar, Suresh and Al-Smadi, Mohammad and Al-Ayyoub, Mahmoud and Zhao, Yanyan and Qin, Bing and De Clercq, Orphée and Hoste, Véronique and Apidianaki, Marianna and Tannier, Xavier and Loukachevitch, Natalia and Kotelnikov, Evgeny and Bel, Nuria and Jiménez-Zafra, Salud María and Eryiğit, Gülşen},
  title     = {{SemEval-2016 Task 5: Aspect Based Sentiment Analysis}},
  booktitle = {Proceedings of the 10th International Workshop on Semantic Evaluation (SemEval 2016)},
  year      = {2016},
  month     = jun,
  address   = {San Diego, USA}
}
@InProceedings{Tourille2016b,
  author    = {Tourille, Julien and Ferret, Olivier and Névéol, Aurélie and Tannier, Xavier},
  title     = {{LIMSI-COT at SemEval-2016 Task 12: Temporal relation identification using a pipeline of classifiers}},
  booktitle = {Proceedings of the 10th International Workshop on Semantic Evaluation (SemEval 2016)},
  year      = {2016},
  month     = jun,
  address   = {San Diego, USA}
}
@InProceedings{Tourille2016a,
  author    = {Tourille, Julien and Ferret, Olivier and Névéol, Aurélie and Tannier, Xavier},
  title     = {{Extraction de relations temporelles dans des dossiers électroniques patient}},
  booktitle = {Actes de la Conférence Traitement Automatique des Langues Naturelles (TALN 2016, article court)},
  year      = {2016},
  month     = jul,
  address   = {Paris, France}
}
@InProceedings{Nguyen2016,
  author    = {Kiem-Hieu Nguyen and Xavier Tannier and Olivier Ferret and Romaric Besançon},
  title     = {{A Dataset for Open Event Extraction in English}},
  booktitle = {Proceedings of the 10th International Conference on Language Resources and Evaluation (LREC 2016)},
  year      = {2016},
  month     = may,
  address   = {Portorož, Slovenia}
}
@InProceedings{Apidianaki2016,
  author    = {Marianna Apidianaki and Xavier Tannier and Cécile Richart},
  title     = {{Datasets for Aspect-Based Sentiment Analysis in French}},
  booktitle = {Proceedings of the 10th International Conference on Language Resources and Evaluation (LREC 2016)},
  year      = {2016},
  month     = may,
  address   = {Portorož, Slovenia}
}
@InProceedings{Bonaque2016,
  author    = {Bonaque, Raphaël and Cao, Tien-Duc and Cautis, Bogdan and Goasdoué, François and Letelier, Javier and Manolescu, Ioana and Mendoza, Oscar and Ribeiro, Swen and Tannier, Xavier and Thomazo, Michael},
  title     = {{Mixed-instance querying: a lightweight integration architecture for data journalism}},
  booktitle = {Proceedings of the Conference on Very Large Databases (Demonstrations Track, PVLDB 2016)},
  year      = {2016},
  month     = sep,
  address   = {New Delhi, India}
}
@InProceedings{Neveol2016,
  author    = {Névéol, Aurélie and Cohen, K. Bretonnel and Grouin, Cyril and Hamon, Thierry and Lavergne, Thomas and Kelly, Liadh and Goeuriot, Lorraine and Rey, Grégoire and Robert, Aude and Tannier, Xavier and Zweigenbaum, Pierre},
  title     = {{Clinical Information Extraction at the CLEF eHealth Evaluation lab 2016}},
  booktitle = {CLEF 2016 (online working notes)},
  year      = {2016},
  month     = sep,
  address   = {Evora, Portugal}
}
2015
@InProceedings{Nguyen2015,
  author    = {Kiem-Hieu Nguyen and Xavier Tannier and Olivier Ferret and Romaric Besançon},
  title     = {{Generative Event Schema Induction with Entity Disambiguation}},
  booktitle = {Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and 7th International Joint Conference on Natural Language Processing of the Asian Federation of Natural Language Processing (ACL-IJCNLP 2015)},
  year      = {2015},
  month     = jul,
  address   = {Beijing, China}
}
@InProceedings{TapiNzali2015b,
  author    = {Tapi Nzali, Mike Donald and Névéol, Aurélie and Tannier, Xavier},
  title     = {{Automatic Extraction of Time Expressions Accross Domains in French Narratives}},
  booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP 2015, short paper)},
  year      = {2015},
  month     = sep,
  address   = {Lisbon, Portugal}
}
@InProceedings{Dhondt2015,
  author    = {D'hondt, Eva and Tannier, Xavier and Névéol, Aurélie},
  title     = {{Redundancy in French Electronic Health Records: A preliminary study}},
  booktitle = {Proceedings of the EMNLP International Workshop on Health Text Mining and Information Analysis (LOUHI 2015)},
  year      = {2015},
  month     = sep,
  address   = {Lisbon, Portugal}
}
@InProceedings{Arnulphy2015,
  author    = {Béatrice Arnulphy and Vincent Claveau and Xavier Tannier and Anne Vilnat},
  title     = {{Supervised Machine Learning Techniques to Detect TimeML Events in French and English}},
  booktitle = {Proceedings of the 20th International Conference on Applications of Natural Language to Information Systems (NLDB 2015)},
  year      = {2015},
  month     = jun,
  address   = {Passau, Germany}
}
@InProceedings{Neveol2015,
  author    = {Aurélie Névéol and Cyril Grouin and Xavier Tannier and Thierry Hamon and Liadh Kelly and Lorraine Goeuriot and Pierre Zweigenbaum},
  title     = {{CLEF eHealth Evaluation Lab 2015 Task 1b: clinical named entity recognition}},
  booktitle = {Working Notes of the Conference and Labs of the Evaluation Forum (CLEF 2015, CEUR-WS 1391)},
  year      = {2015},
  month     = sep,
  address   = {Toulouse, France}
}
@Article{Bellot2015a,
  author  = {Patrice Bellot and Véronique Moriceau and Josiane Mothe and Éric SanJuan and Xavier Tannier},
  title   = {{Mesures d’informativité et de lisibilité pour un cadre d’évaluation de la contextualisation de tweets}},
  journal = {Document Numérique},
  volume  = {18},
  number  = {1},
  pages   = {55--73},
  year    = {2015}
}
@InProceedings{Nguyen2015a,
  author    = {Kiem-Hieu Nguyen and Xavier Tannier and Olivier Ferret and Romaric Besançon},
  title     = {{Désambiguïsation d'entités pour l'induction non supervisée de schémas événementiels}},
  booktitle = {Actes de la Conférence Traitement Automatique des Langues Naturelles (TALN 2015)},
  year      = {2015},
  month     = jun,
  address   = {Caen, France}
}
@InProceedings{TapiNzali2015,
  author    = {Mike Donald Tapi Nzali and Aurélie Névéol and Xavier Tannier},
  title     = {{Analyse d'expressions temporelles dans les dossiers électroniques patients}},
  booktitle = {Actes de la Conférence Traitement Automatique des Langues Naturelles (TALN 2015)},
  year      = {2015},
  month     = jun,
  address   = {Caen, France}
}
2014
@InProceedings{Nguyen2014a,
  author    = {Kiem-Hieu Nguyen and Xavier Tannier and Véronique Moriceau},
  title     = {{Ranking Multidocument Event Descriptions for Building Thematic Timelines}},
  booktitle = {Proceedings of the 30th International Conference on Computational Linguistics (Coling 14)},
  year      = {2014},
  month     = aug,
  address   = {Dublin, Ireland}
}
@Misc{Tannier2014b,
  author       = {Xavier Tannier},
  title        = {{Traitement des événements et ciblage d'information}},
  howpublished = {Habilitation à Diriger des Recherches (HDR)},
  school       = {Université Paris-Sud, École Doctorale d'Informatique},
  year         = {2014},
  month        = jun
}
@InProceedings{Bellot2014,
  author    = {Patrice Bellot and Toine Bogers and Shlomo Geva and Mark Hall and Hugo Huurdeman and Jaap Kamps and Gabriella Kazai and Marijn Koolen and Véronique Moriceau and Josiane Mothe and Michael Preminger and Eric SanJuan and Ralf Schenkel and Mette Skov and Xavier Tannier and David Walsh},
  title     = {{Overview of INEX 2014}},
  booktitle = {Information Access Evaluation. Multilinguality, Multimodality, and Interaction, 5th International Conference of the CLEF Initiative, CLEF 2014},
  series    = {Lecture Notes in Computer Science},
  volume    = {8685},
  pages     = {212--228},
  year      = {2014},
  publisher = {Springer}
}
@InProceedings{DeGroc2014a,
  author    = {Clément De Groc and Xavier Tannier and Claude De Loupy},
  title     = {{Thematic Cohesion: Measuring Terms Discriminatory Power Toward Themes}},
  booktitle = {Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC 2014)},
  year      = {2014},
  month     = may,
  address   = {Reykjavík, Iceland}
}
@InProceedings{DeGroc2014b,
  author    = {Clément De Groc and Xavier Tannier},
  title     = {{Evaluating Web-as-corpus Topical Document Retrieval with an Index of the OpenDirectory}},
  booktitle = {Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC 2014)},
  year      = {2014},
  month     = may,
  address   = {Reykjavík, Iceland}
}
@InProceedings{Moriceau2014a,
  author    = {Véronique Moriceau and Xavier Tannier},
  title     = {{French Resources for Extraction and Normalization of Temporal Expressions with HeidelTime}},
  booktitle = {Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC 2014)},
  year      = {2014},
  month     = may,
  address   = {Reykjavík, Iceland}
}
DCTFinder is made freely available on http://sourceforge.net/projects/dctfinder/, as well as all resources (vocabulary and annotated documents) built for training and evaluating the system in English and French, and the English trained model itself.
@InProceedings{Tannier2014a,
  author    = {Xavier Tannier},
  title     = {{Extracting News Web Page Creation Time with DCTFinder}},
  booktitle = {Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC 2014)},
  year      = {2014},
  month     = may,
  address   = {Reykjavík, Iceland}
}
@InProceedings{Filhol2014a,
  author    = {Michael Filhol and Xavier Tannier},
  title     = {{Construction of a French-LSF corpus}},
  booktitle = {Proceedings of the 7th Workshop on Building and Using Comparable Corpora Building Resources for Machine Translation Research (BUCC 2014)},
  year      = {2014},
  month     = may,
  address   = {Reykjavík, Iceland}
}
@InProceedings{Grouin2014,
  author    = {Cyril Grouin and Louise Deléger and Jean-Baptiste Escudié and Gregory Groisy and Anne-Sophie Jeannot and Bastien Rance and Xavier Tannier and Aurélie Névéol},
  title     = {{How to de-identify a large clinical corpus in 10 days}},
  booktitle = {Proceedings of the AMIA 2014 Annual Symposium (short paper)},
  year      = {2014},
  month     = nov,
  address   = {Washington DC, USA}
}
@InProceedings{Arnulphy2014,
  author    = {Béatrice Arnulphy and Vincent Claveau and Xavier Tannier and Anne Vilnat},
  title     = {{Techniques d’apprentissage supervisé pour l’extraction d’événements TimeML en anglais et français}},
  booktitle = {Actes de la COnférence en Recherche d'Information et ses Applications (CORIA 2014)},
  year      = {2014},
  month     = mar,
  address   = {Nancy, France}
}
@InProceedings{deGroc2014,
  author    = {Clément de Groc and Xavier Tannier},
  title     = {{Apprendre à ordonner la frontière de crawl pour le crawling orienté}},
  booktitle = {Actes de la COnférence en Recherche d'Information et ses Applications (CORIA 2014)},
  year      = {2014},
  month     = mar,
  address   = {Nancy, France}
}
2013
In order to build these event threads, we use a cascade of classifiers and other modules, taking advantage of the redundancy of information in the newswire corpus.
We also share interesting comments concerning our manual annotation procedure for building a training and testing set.
@InProceedings{Tannier2013b,
  author    = {Xavier Tannier and Véronique Moriceau},
  title     = {{Building Event Threads out of Multiple News Articles}},
  booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP 2013)},
  year      = {2013},
  address   = {Seattle, USA}
}
Design. To detect clinical events, we used rules and Conditional Random Fields. We built Random Forest models to identify event modality and polarity. To identify temporal expressions we built on the HeidelTime system. To detect temporal relations, we systematically studied their breakdown into distinct situations; we designed an oracle method to determine the most prominent situations and the most suitable associated classifiers, and combined their results.
Results. We achieved F-measures of 0.8307 for event identification, based on rules, and 0.8385 for temporal expression identification. In the temporal relation task, we identified nine main situations in three groups, experimentally confirming shared intuitions: within-sentence relations, section-related time, and across-sentence relations. Logistic regression and Naïve Bayes performed best on the first and third groups, and decision trees on the second. We reached a 0.6231 global F-measure, improving by 7.5 points our official submission.
Conclusions. Carefully hand-crafted rules obtained good results for the detection of events and temporal expressions, while a combination of classifiers improved temporal link prediction. The characterization of the oracle recall of situations allowed us to point at directions where further work would be most useful for temporal relation detection: within-sentence relations and linking History of Present Illness events to the admission date. We suggest that the systematic situation breakdown proposed in this paper could also help improve other systems addressing this task.
@Article{Grouin2013,
  author  = {Cyril Grouin and Natalia Grabar and Thierry Hamon and Sophie Rosset and Xavier Tannier and Pierre Zweigenbaum},
  title   = {{Eventual situations for timeline extraction from clinical reports}},
  journal = {Journal of the American Medical Informatics Association},
  year    = {2013},
  month   = apr
}
@Article{Kessler2013,
  author  = {Rémy Kessler and Xavier Tannier and Caroline Hagège and Véronique Moriceau and André Bittar},
  title   = {{Extraction de dates saillantes pour la construction de chronologies thématiques}},
  journal = {Traitement Automatique des Langues, numéro spécial sur le traitement automatique des informations temporelles et spatiales},
  volume  = {53},
  number  = {2},
  year    = {2013}
}
@InProceedings{Bellot2013a,
  author    = {Patrice Bellot and Véronique Moriceau and Josiane Mothe and Eric SanJuan and Xavier Tannier},
  title     = {{Évaluation de la contextualisation de tweets}},
  booktitle = {Actes de la COnférence en Recherche d'Information et ses Applications (CORIA 2013, article court)},
  year      = {2013},
  month     = apr,
  address   = {Neuchâtel, Suisse}
}
@InProceedings{Zweigenbaum13a,
  author    = {Pierre Zweigenbaum and Xavier Tannier},
  title     = {{Extraction des relations temporelles entre événements médicaux dans des comptes rendus hospitaliers}},
  booktitle = {Actes de la Conférence Traitement Automatique des Langues Naturelles (TALN 2013, article court)},
  year      = {2013},
  month     = jun,
  address   = {Les Sables d'Olonne, France}
}
@InProceedings{Tannier2013a,
  author    = {Xavier Tannier and Véronique Moriceau and Erwan le Flem},
  title     = {{Une interface pour la validation et l’évaluation de chronologies thématiques}},
  booktitle = {Actes de la Conférence Traitement Automatique des Langues Naturelles (TALN 2013), session Démonstrations},
  year      = {2013},
  month     = jun,
  address   = {Les Sables d'Olonne, France}
}
@InProceedings{Bellot2013b,
  author    = {Patrice Bellot and Véronique Moriceau and Josiane Mothe and Éric SanJuan and Xavier Tannier},
  title     = {{Overview of the INEX Tweet Contextualization 2013 Track}},
  booktitle = {Working Notes for the CLEF 2013 Workshop},
  year      = {2013},
  month     = sep,
  address   = {Valencia, Spain}
}
@InProceedings{Bellot2013d,
  author    = {Patrice Bellot and Antoine Doucet and Shlomo Geva and Sairam Gurajada and Jaap Kamps and Gabriella Kazai and Marijn Koolen and Arunav Mishra and Véronique Moriceau and Josiane Mothe and Michael Preminger and Eric SanJuan and Ralf Schenkel and Xavier Tannier and Martin Theobald and Matthew Trappett and Qiuyue Wang},
  title     = {{Overview of INEX 2013}},
  booktitle = {Information Access Evaluation. Multilinguality, Multimodality, and Interaction, 4th International Conference of the CLEF Initiative, CLEF 2013},
  series    = {Lecture Notes in Computer Science},
  volume    = {8138},
  pages     = {269--281},
  year      = {2013},
  publisher = {Springer}
}
@Article{Bellot2013c,
  author  = {P. Bellot and G. Kazai and M. Preminger and M. Trappett and A. Doucet and M. Koolen and E. SanJuan and A. Trotman and S. Geva and A. Mishra and R. Schenkel and M. Sanderson and S. Gurajada and V. Moriceau and X. Tannier and F. Scholer and J. Kamps and J. Mothe and M. Theobald and Q. Wang},
  title   = {{Report on INEX 2013}},
  journal = {SIGIR Forum},
  volume  = {47},
  number  = {2},
  year    = {2013},
  month   = dec
}
@TechReport{Arnulphy2013,
  author      = {Arnulphy, Béatrice and Tannier, Xavier},
  title       = {{Entités Nommées Événement : guide d’annotation}},
  institution = {LIMSI-CNRS},
  number      = {2013-12},
  year        = {2013},
}
2012
@InProceedings{Kessler2012a,
  author    = {Kessler, Rémy and Tannier, Xavier and Hagège, Caroline and Moriceau, Véronique and Bittar, André},
  title     = {{Finding Salient Dates for Building Thematic Timelines}},
  booktitle = {Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (ACL 2012)},
  pages     = {730--739},
  publisher = {Association for Computational Linguistics},
  address   = {Jeju Island, Republic of Korea},
  year      = {2012},
  month     = jul,
}
@Article{Bernhard2012,
  author  = {Bernhard, Delphine and de Viron, Louis and Moriceau, Véronique and Tannier, Xavier},
  title   = {{Question Generation for French: Collating Parsers and Paraphrasing Questions}},
  journal = {Dialogue \& Discourse, Special Issue on Question Generation},
  volume  = {3},
  number  = {2},
  pages   = {43--74},
  year    = {2012},
}
@InProceedings{Arnulphy12a,
  author    = {Arnulphy, Béatrice and Tannier, Xavier and Vilnat, Anne},
  title     = {{Automatically Generated Noun Lexicons for Event Extraction}},
  booktitle = {Proceedings of the 13th International Conference on Intelligent Text Processing and Computational Linguistics (CICLing 2012)},
  volume    = {2},
  pages     = {219--231},
  address   = {New Delhi, India},
  year      = {2012},
  month     = mar,
}
@InProceedings{DeGroc12b,
  author    = {de Groc, Clément and Tannier, Xavier},
  title     = {{Experiments on Pseudo Relevance Feedback using Graph Random Walks}},
  booktitle = {Proceedings of the 19th International Symposium on String Processing and Information Retrieval (SPIRE 2012, Short Paper)},
  series    = {Lecture Notes in Computer Science},
  volume    = {7608},
  pages     = {193--198},
  address   = {Cartagena, Colombia},
  year      = {2012},
  month     = oct,
}
@InProceedings{SanJuan2012b,
  author        = {SanJuan, {\'E}ric and Moriceau, Véronique and Tannier, Xavier and Bellot, Patrice and Mothe, Josiane},
  title         = {{Overview of the INEX 2011 Question Answering Track (QA@INEX)}},
  booktitle     = {Focused Retrieval of Content and Structure, 10th International Workshop of the Initiative for the Evaluation of XML Retrieval, INEX 2011},
  series        = {Lecture Notes in Computer Science},
  volume        = {7424},
  pages         = {269--281},
  editor        = {Geva, Shlomo and Kamps, Jaap and Schenkel, Ralf},
  year          = {2012},
  internal-note = {TODO(review): the page range 269--281 is identical to the one recorded for SanJuan2011a and Bellot2013d; possibly a copy-paste error, verify against the LNCS 7424 table of contents},
}
@InProceedings{SanJuan2012a,
  author    = {SanJuan, {\'E}ric and Moriceau, Véronique and Tannier, Xavier and Bellot, Patrice and Mothe, Josiane},
  title     = {{Overview of the INEX 2012 Tweet Contextualization Track}},
  booktitle = {Working Notes for the CLEF 2012 Workshop},
  address   = {Rome, Italy},
  year      = {2012},
  month     = sep,
}
@Article{Bellot12b,
  author  = {Bellot, P. and Chappell, T. and Doucet, A. and Geva, S. and Gurajada, S. and Kamps, J. and Kazai, G. and Koolen, M. and Landoni, M. and Marx, M. and Mishra, A. and Moriceau, V. and Mothe, J. and Preminger, M. and Ramírez, G. and Sanderson, M. and SanJuan, E. and Scholer, F. and Tannier, X. and Theobald, M. and Trappett, M. and Trotman, A. and Wang, Q.},
  title   = {{Report on INEX 2012}},
  journal = {SIGIR Forum},
  volume  = {46},
  number  = {2},
  pages   = {50--59},
  year    = {2012},
  month   = dec,
}
@Article{Bellot12a,
  author  = {Bellot, P. and Chappell, T. and Doucet, A. and Geva, S. and Kamps, J. and Kazai, G. and Koolen, M. and Landoni, M. and Marx, M. and Moriceau, V. and Mothe, J. and Ramírez, G. and Sanderson, M. and SanJuan, E. and Scholer, F. and Tannier, X. and Theobald, M. and Trappett, M. and Trotman, A. and Wang, Q.},
  title   = {{Report on INEX 2011}},
  journal = {SIGIR Forum},
  volume  = {46},
  number  = {1},
  pages   = {33--42},
  year    = {2012},
  month   = jun,
}
@InProceedings{DeGroc12a,
  author    = {de Groc, Clément and Tannier, Xavier and de Loupy, Claude},
  title     = {{Un critère de cohésion thématique fondé sur un graphe de cooccurrences}},
  booktitle = {Actes de la Conférence Traitement Automatique des Langues Naturelles (TALN 2012)},
  pages     = {183--195},
  address   = {Grenoble, France},
  year      = {2012},
  month     = jun,
}
@InProceedings{Tannier2012a,
  author    = {Tannier, Xavier and Moriceau, Véronique and Arnulphy, Béatrice and He, Ruixin},
  title     = {{Evolution of Event Designation in Media: Preliminary Study}},
  booktitle = {Proceedings of the 8th International Conference on Language Resources and Evaluation (LREC 2012)},
  address   = {Istanbul, Turkey},
  year      = {2012},
  month     = may,
}
In this paper, we present a typology and annotation guidelines for event nominals annotation. We applied these materials to French newswire articles and produced an annotated corpus. We present observations about the designations used in our manually annotated corpus and the behavior of their triggers. We provide statistics concerning word ambiguity and context of use of event nominals, as well as machine learning experiments showing the difficulty of using lexicons for extracting events.
@InProceedings{Arnulphy2012b,
  author    = {Arnulphy, Béatrice and Tannier, Xavier and Vilnat, Anne},
  title     = {{Event Nominals: Annotation Guidelines and a Manually Annotated Corpus in French}},
  booktitle = {Proceedings of the 8th International Conference on Language Resources and Evaluation (LREC 2012)},
  address   = {Istanbul, Turkey},
  year      = {2012},
  month     = may,
}
@InProceedings{Bittar2012,
  author    = {Bittar, André and Hagège, Caroline and Moriceau, Véronique and Tannier, Xavier and Teissèdre, Charles},
  title     = {{Temporal Annotation: A Proposal for Guidelines and an Experiment with Inter-annotator Agreement}},
  booktitle = {Proceedings of the 8th International Conference on Language Resources and Evaluation (LREC 2012)},
  address   = {Istanbul, Turkey},
  year      = {2012},
  month     = may,
}
Official web page
Manual
@InProceedings{Tannier2012b,
  author    = {Tannier, Xavier},
  title     = {{WebAnnotator, an Annotation Tool for Web Pages}},
  booktitle = {Proceedings of the 8th International Conference on Language Resources and Evaluation (LREC 2012)},
  address   = {Istanbul, Turkey},
  year      = {2012},
  month     = may,
}
@InProceedings{Paroubek2012,
  author    = {Paroubek, Patrick and Tannier, Xavier},
  title     = {{A Rough Set Formalization of Quantitative Evaluation with Ambiguity}},
  booktitle = {Proceedings of the 8th International Conference on Language Resources and Evaluation (LREC 2012)},
  address   = {Istanbul, Turkey},
  year      = {2012},
  month     = may,
}
2011
@Article{Tannier2011a,
  author  = {Tannier, Xavier and Muller, Philippe},
  title   = {{Evaluating Temporal Graphs Built from Texts via Transitive Reduction}},
  journal = {Journal of Artificial Intelligence Research},
  volume  = {40},
  pages   = {375--413},
  year    = {2011},
}
@InProceedings{DeGroc11a,
  author    = {de Groc, Clément and Tannier, Xavier and Couto, Javier},
  title     = {{GrawlTCQ: Terminology and Corpora Building by Ranking Simultaneously Terms, Queries and Documents using Graph Random Walks}},
  booktitle = {Proceedings of ACL Workshop on Graph-based Methods for Natural Language Processing (TextGraph 2011)},
  pages     = {37--41},
  address   = {Portland, Oregon, USA},
  year      = {2011},
  month     = jul,
}
@InProceedings{Arnulphy11a,
  author    = {Arnulphy, Béatrice and Tannier, Xavier and Vilnat, Anne},
  title     = {{Un lexique pondéré des noms d'événements en français}},
  booktitle = {Actes de la Conférence Traitement Automatique des Langues Naturelles (TALN 2011, article court)},
  address   = {Montpellier, France},
  year      = {2011},
  month     = jul,
}
@InProceedings{SanJuan2011b,
  author    = {SanJuan, {\'E}ric and Moriceau, Véronique and Tannier, Xavier and Bellot, Patrice and Mothe, Josiane},
  title     = {{Overview of the INEX 2011 Question Answering Track (QA@INEX)}},
  booktitle = {Pre-proceedings of the INitiative for the Evaluation of XML retrieval workshop (INEX 2011)},
  pages     = {145--153},
  address   = {Saarbrücken, Germany},
  year      = {2011},
  month     = dec,
}
@InProceedings{DeViron11,
  author    = {de Viron, Louis and Bernhard, Delphine and Moriceau, Véronique and Tannier, Xavier},
  title     = {{Génération automatique de questions à partir de textes en français}},
  booktitle = {Actes de la Conférence Traitement Automatique des Langues Naturelles (TALN 2011, article court)},
  address   = {Montpellier, France},
  year      = {2011},
  month     = jul,
}
@InProceedings{SanJuan2011a,
  author    = {SanJuan, {\'E}ric and Bellot, Patrice and Moriceau, Véronique and Tannier, Xavier},
  title     = {{Overview of the INEX 2010 Question Answering Track (QA@INEX)}},
  booktitle = {Comparative Evaluation of Focused Retrieval, 9th International Workshop of the Initiative for the Evaluation of XML Retrieval, INEX 2010},
  series    = {Lecture Notes in Computer Science},
  volume    = {6932},
  pages     = {269--281},
  editor    = {Geva, Shlomo and Kamps, Jaap and Schenkel, Ralf and Trotman, Andrew},
  address   = {Vught, The Netherlands},
  year      = {2011},
}
@Article{Alexander11,
  author  = {Alexander, D. and Arvola, P. and Beckers, T. and Bellot, P. and Chappell, T. and De Vries, C. M. and Doucet, A. and Fuhr, N. and Geva, S. and Kamps, J. and Kazai, G. and Koolen, M. and Kutty, S. and Landoni, M. and Moriceau, V. and Nayak, R. and Nordlie, R. and Pharo, N. and SanJuan, E. and Schenkel, R. and Tagarelli, A. and Tannier, X. and Thom, J. A. and Trotman, A. and Vainio, J. and Wang, Q. and Wu, C.},
  title   = {{Report on INEX 2010}},
  journal = {SIGIR Forum},
  volume  = {45},
  number  = {1},
  pages   = {2--17},
  year    = {2011},
  month   = jun,
}
@Misc{Arnulphy11b,
  author       = {Arnulphy, Béatrice and Tannier, Xavier and Vilnat, Anne},
  title        = {{Vers une extraction automatique des événements dans les textes}},
  howpublished = {Colloque international - Langage, discours, événements, Firenze, Italy},
  year         = {2011},
  month        = may,
}
@InProceedings{Campion11,
  author    = {Campion, N. and Closson, J. and Ferret, O. and Besançon, R. and Wang, W. and Shin, J. and Floret, J.-M. and Grau, B. and Tannier, X. and Mezaour, A.-D. and Lazard, J.-M.},
  title     = {{FILTRAR-S : Nouveaux développements}},
  booktitle = {Actes du Workshop Sécurité Globale (WISG' 2011)},
  address   = {Troyes, France},
  year      = {2011},
}
2010
@Article{Moriceau2010a,
  author    = {Moriceau, Véronique and Tannier, Xavier},
  title     = {{FIDJI: Using Syntax for Validating Answers in Multiple Documents}},
  journal   = {Information Retrieval, Special Issue on Focused Information Retrieval},
  volume    = {13},
  number    = {5},
  pages     = {507--533},
  publisher = {Springer},
  year      = {2010},
  month     = oct,
}
@InProceedings{SanJuan2010,
  author    = {SanJuan, {\'E}ric and Bellot, Patrice and Moriceau, Véronique and Tannier, Xavier},
  title     = {{Overview of the 2010 QA Track: Preliminary results}},
  booktitle = {Pre-proceedings of the INitiative for the Evaluation of XML retrieval workshop (INEX 2010)},
  pages     = {209--213},
  address   = {Vught, The Netherlands},
  year      = {2010},
  month     = dec,
}
@InProceedings{Moriceau2010b,
  author    = {Moriceau, Véronique and SanJuan, {\'E}ric and Tannier, Xavier and Bellot, Patrice},
  title     = {{Overview of the 2009 QA Track: Towards a Common Task for QA, Focused IR and Automatic Summarization Systems}},
  booktitle = {Focused Retrieval and Evaluation, 8th International Workshop of the Initiative for the Evaluation of XML Retrieval, INEX 2009},
  series    = {Lecture Notes in Computer Science},
  volume    = {6203},
  pages     = {355--365},
  publisher = {Springer Verlag},
  address   = {Brisbane, Australia},
  year      = {2010},
}
@InProceedings{Tannier2010b,
  author    = {Tannier, Xavier and Moriceau, Véronique},
  title     = {{FIDJI@ResPubliQA 2010}},
  booktitle = {Proceedings of Multiple Language Question Answering 2010 (MLQA10)},
  address   = {Padua, Italy},
  year      = {2010},
  month     = sep,
}
@InProceedings{Arnulphy10,
  author    = {Arnulphy, Béatrice and Tannier, Xavier and Vilnat, Anne},
  title     = {{Les entités nommées événement et les verbes de cause-conséquence}},
  booktitle = {Actes de la Conférence Traitement Automatique des Langues Naturelles (TALN 2010, article court)},
  address   = {Montréal, Canada},
  year      = {2010},
  month     = jul,
}
@InProceedings{Moriceau2010c,
  author    = {Moriceau, Véronique and Tannier, Xavier and Falco, Mathieu},
  title     = {{Une étude des questions "complexes" en question-réponse}},
  booktitle = {Actes de la Conférence Traitement Automatique des Langues Naturelles (TALN 2010, article court)},
  address   = {Montréal, Canada},
  year      = {2010},
  month     = jul,
}
@Article{Beckers2010,
  author  = {Beckers, Thomas and Bellot, Patrice and Demartini, Gianluca and Denoyer, Ludovic and De Vries, Christopher M. and Doucet, Antoine and Fachry, Khairun Nisa and Fuhr, Norbert and Gallinari, Patrick and Geva, Shlomo and Huang, Wei-Che and Iofciu, Tereza and Kamps, Jaap and Kazai, Gabriella and Koolen, Marijn and Kutty, Sangeetha and Landoni, Monica and Lehtonen, Miro and Moriceau, Véronique and Nayak, Richi and Nordlie, Ragnar and Pharo, Nils and SanJuan, {\'E}ric and Schenkel, Ralf and Tannier, Xavier and Theobald, Martin and Thom, James A. and Trotman, Andrew and de Vries, Arjen P.},
  title   = {{Report on INEX 2009}},
  journal = {SIGIR Forum},
  volume  = {44},
  number  = {1},
  pages   = {38--57},
  year    = {2010},
}
@InProceedings{Tannier2010a,
  author    = {Tannier, Xavier and Moriceau, Véronique},
  title     = {{FIDJI: Web Question-Answering at Quaero 2009}},
  booktitle = {Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC'10)},
  publisher = {ELRA},
  address   = {Valletta, Malta},
  year      = {2010},
  month     = may,
}
@InProceedings{Galibert2010b,
  author    = {Galibert, Olivier and Rosset, Sophie and Tannier, Xavier and Grandry, Fanny},
  title     = {{Hybrid Citation Extraction from Patents}},
  booktitle = {Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC'10)},
  publisher = {ELRA},
  address   = {Valletta, Malta},
  year      = {2010},
  month     = may,
}
@InProceedings{Grappy2010,
  author    = {Grappy, Arnaud and Grau, Brigitte and Ferret, Olivier and Grouin, Cyril and Moriceau, Véronique and Robba, Isabelle and Tannier, Xavier and Vilnat, Anne and Barbier, Vincent},
  title     = {{A corpus for studying full answer justification}},
  booktitle = {Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC'10)},
  publisher = {ELRA},
  address   = {Valletta, Malta},
  year      = {2010},
  month     = may,
}
@InProceedings{Quintard2010,
  author    = {Quintard, Ludovic and Galibert, Olivier and Adda, Gilles and Grau, Brigitte and Laurent, Dominique and Moriceau, Véronique and Rosset, Sophie and Tannier, Xavier and Vilnat, Anne},
  title     = {{Question Answering on web data: the QA evaluation in Quaero}},
  booktitle = {Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC'10)},
  publisher = {ELRA},
  address   = {Valletta, Malta},
  year      = {2010},
  month     = may,
}
@InProceedings{Galibert2010a,
  author    = {Galibert, Olivier and Quintard, Ludovic and Rosset, Sophie and Zweigenbaum, Pierre and Nédellec, Claire and Aubin, Sophie and Gillard, Laurent and Raysz, Jean-Pierre and Pois, Delphine and Tannier, Xavier and Deléger, Louise and Laurent, Dominique},
  title     = {{Named and specific entity detection in varied data: The Quaero Named Entity baseline evaluation}},
  booktitle = {Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC'10)},
  publisher = {ELRA},
  address   = {Valletta, Malta},
  year      = {2010},
  month     = may,
}
@InProceedings{Campion10,
  author    = {Campion, Nicolas and Closson, Jacques and Ferret, Olivier and Lahbib, Dhafer and Besançon, Romaric and Shin, Jin and Floret, Jean-Marc and Grau, Brigitte and Tannier, Xavier and Lazard, Jean-Marc and Mezaour, Amar-Djalil},
  title     = {{FILTRAR-S : un outil de filtrage sémantique et de fouille de textes pour la veille}},
  booktitle = {Actes du Colloque international Veille Stratégique Scientifique \& Technologique (VSST'2010)},
  address   = {Toulouse, France},
  year      = {2010},
  month     = oct,
}
@InProceedings{Tannier2010d,
  author    = {Tannier, Xavier and Moriceau, Véronique},
  title     = {{Studying Syntactic Analysis in a QA System: FIDJI@ResPubliQA'09}},
  booktitle = {Multilingual Information Access Evaluation I. Text Retrieval Experiments},
  series    = {Lecture Notes in Computer Science},
  volume    = {6241},
  pages     = {237--244},
  publisher = {Springer},
  address   = {Padua, Italy},
  year      = {2010},
}
@Book{Tannier2010c,
  author    = {Tannier, Xavier},
  title     = {{Se protéger sur Internet. Conseils pour la vie en ligne}},
  publisher = {Eyrolles},
  year      = {2010},
  month     = aug,
}
2009
@Patent{Roux09,
  author      = {Roux, Claude and Tannier, Xavier},
  title       = {{Event Extraction System for Electronic Messages}},
  number      = {US Patent 20090235280},
  assignee    = {Xerox Corporation},
  nationality = {US},
  yearfiled   = {2008},
  year        = {2009},
}
@InProceedings{Moriceau2009c,
  author    = {Moriceau, Véronique and SanJuan, {\'E}ric and Tannier, Xavier},
  title     = {{QA@INEX 2009: A common task for QA, focused IR and automatic summarization systems}},
  booktitle = {Pre-proceedings of the INitiative for the Evaluation of XML retrieval workshop (INEX 2009)},
  pages     = {334--338},
  editor    = {Geva, Shlomo and Kamps, Jaap and Trotman, Andrew},
  address   = {Brisbane, Australia},
  year      = {2009},
}
Our answer validation approach assumes that the different entities of the question can be retrieved, properly connected, either in a sentence, in a passage or in multiple documents. FIDJI has to detect syntactic implications between questions and passages containing the answers. Our system relies on syntactic analysis provided by XIP, which is used to parse both the questions and the documents from which answers are extracted. We designed the system so that no particular linguistic-oriented pre-processing is needed, and as few semantic resources as possible.
Given the differences between ResPubliQA and more traditional question-answering campaigns, our aim was to estimate whether using syntactic analysis was as useful in this context as it proved to be in more focused QA. We obtained 30% of correct answers, with good scores for complex questions ('how', 'why') but lower than usual for factual and definition questions.
@InProceedings{Tannier09,
  author    = {Tannier, Xavier and Moriceau, Véronique},
  title     = {{FIDJI in ResPubliQA 2009}},
  booktitle = {Working Notes for the CLEF 2009 Workshop},
  address   = {Corfu, Greece},
  year      = {2009},
  month     = sep,
}
Document selection, answer extraction, and system behaviour on different types of questions have been studied experimentally.
La sélection des documents, l'extraction de la réponse ainsi que le comportement selon les différents types de questions ont été étudiés.
@InProceedings{Moriceau09b,
  author    = {Moriceau, Véronique and Tannier, Xavier},
  title     = {{Étude de l'apport de la syntaxe dans un système de question-réponse}},
  booktitle = {Actes de la Conférence Traitement Automatique des Langues Naturelles (TALN 2009, poster)},
  address   = {Senlis, France},
  year      = {2009},
  month     = jun,
}
@InProceedings{Moriceau09a,
  author    = {Moriceau, Véronique and Tannier, Xavier and Grau, Brigitte},
  title     = {{Utilisation de la syntaxe pour valider les réponses à des questions par plusieurs documents}},
  booktitle = {Actes de la 6ème conférence en Recherche d'Information et Applications (CORIA 09)},
  address   = {Presqu'île de Giens, France},
  year      = {2009},
  month     = may,
}
@TechReport{Tannier09b,
  author      = {Tannier, Xavier and Muller, Philippe},
  title       = {{Evaluating Temporal Graphs built from Texts via Transitive Reduction}},
  institution = {LIMSI-CNRS},
  number      = {2009-21},
  year        = {2009},
}
2008
This paper presents LIMSI results in Answer Validation Exercise (AVE) 2008 for French. In this task, systems have to consider triplets (question, answer, supporting text) and decide whether the answer to the question is correct and supported or not according to the given supporting text.
We tested two approaches during this campaign:
- A syntax-based strategy, where the system decides whether the supporting text is a reformulation of the question.
- A machine learning strategy, where several features are combined in order to validate answers: presence of common words in the question and in the text, word distance, etc.
The first system, called FIDJI, uses a syntactic parser on questions and provided passages. The approach is to detect, for a given tuple question/answer/supporting text, if all the characteristics of the question can be retrieved in the text. As in other works, some rewriting rules have been set up in order to account for syntactic variations such as passive/active voice, nominalization of verbs, appositions, coordinations, etc. Documents are also tagged with named entity types; combined with the analysis of the question, this can be used to check that the answer corresponds to the expected type. A few heuristics are then applied to validate the answer.
The second strategy follows a machine learning approach and applies the question-answering system FRASQUES in order to compute some of the learning features. The learning set is extracted from the data provided by AVE 2006 and contains 75% of the total data. The chosen classifier is a combination of decision trees with the bagging method. It is provided by the WEKA program, which makes it possible to test many classifiers. Features are terms in common between the passage and the answer (and especially the focus (main word), the answer type, the main verb and bi-terms), the answer given by our existing system FRASQUES, the longest common chain of words, the answer type checking with Wikipedia, as well as answers given by the FIDJI system.
The first system leads to a very good precision (88%) but quite a low recall (42%), while the second one improves recall and reaches an F-measure of 61%. These results must be put into perspective because of the low number of answers, and especially positive answers, provided by AVE for French this year.
@InProceedings{Tannier08d,
  author    = {Moriceau, Véronique and Tannier, Xavier and Grappy, Arnaud and Grau, Brigitte},
  title     = {{Justification of Answers by Verification of Dependency Relations - The French AVE Task}},
  booktitle = {Working Notes of CLEF Workshop},
  address   = {Aarhus, Denmark},
  year      = {2008},
}
Après avoir présenté la tâche de questions-réponses et les enjeux qu'elle soulève, nous examinons jusqu'où on peut la réaliser avec très peu de connaissances linguistiques. Nous passons ensuite en revue les différents types de connaissances linguistiques que les équipes ont été amenées à mobiliser : connaissances syntaxiques et sémantiques pour l'analyse de phrases, rôle de la reconnaissance d'"entités nommées", prise en compte de la dimension textuelle des documents. Une discussion sur les contributions respectives des méthodes linguistiques et non linguistiques clôt l'article.
@Article{Tannier08c,
  author  = {Zweigenbaum, Pierre and Grau, Brigitte and Ligozat, Anne-Laure and Robba, Isabelle and Rosset, Sophie and Tannier, Xavier and Vilnat, Anne and Bellot, Patrice},
  title   = {{Apports de la linguistique dans les systèmes de recherche d'informations précises}},
  journal = {Revue Française de Linguistique Appliquée},
  volume  = {XIII},
  number  = {1},
  pages   = {41--62},
  year    = {2008},
}
@InProceedings{Tannier08b,
  author    = {Tannier, Xavier and Muller, Philippe},
  title     = {{Evaluation Metrics for Automatic Temporal Annotation of Texts}},
  booktitle = {Proceedings of the Sixth International Conference on Language Resources and Evaluation (LREC'08)},
  editor    = {{European Language Resources Association (ELRA)}},
  publisher = {ELRA},
  address   = {Marrakech, Morocco},
  year      = {2008},
}
@InProceedings{Tannier08a,
  author    = {Hagège, Caroline and Tannier, Xavier},
  title     = {{XTM: A Robust Temporal Text Processor}},
  booktitle = {Computational Linguistics and Intelligent Text Processing, proceedings of 9th International Conference CICLing 2008},
  series    = {Lecture Notes in Computer Science},
  volume    = {4919},
  pages     = {231--240},
  publisher = {Springer Verlag},
  address   = {Haifa, Israel},
  year      = {2008},
  month     = feb,
}
2007
@Article{Tannier07a,
  author  = {Tannier, Xavier},
  title   = {{Traiter les documents XML avec les "contextes de lecture"}},
  journal = {Traitement Automatique des Langues},
  volume  = {47},
  number  = {1},
  year    = {2007},
}
@InProceedings{Tannier07b,
  author    = {Hagège, Caroline and Tannier, Xavier},
  title     = {{XRCE-T: XIP temporal module for TempEval campaign}},
  booktitle = {Proceedings of SemEval workshop at ACL 2007},
  pages     = {492--495},
  publisher = {Association for Computational Linguistics},
  address   = {Prague, Czech Republic},
  year      = {2007},
  month     = jun,
}
2006
Many problems arise at this level, notably for document content analysis and querying. We studied the specific solutions that natural language processing (NLP) techniques could bring. We proposed a theoretical framework and a practical approach that allow traditional textual analysis techniques to be used on XML documents, disregarding the structure. We also designed an interface for querying XML documents in natural language, and proposed methods that use the structure to improve the retrieval of relevant elements.
Les problèmes posés par ces caractéristiques sont nombreux, que ce soit au niveau du pré-traitement des documents ou de leur interrogation. Face à ces problèmes, nous avons étudié les solutions spécifiques que pouvait apporter le traitement automatique de la langue (TAL). Nous avons ainsi proposé un cadre théorique et une approche pratique pour permettre l'utilisation des techniques d'analyse textuelle en faisant abstraction de la structure. Nous avons également conçu une interface d'interrogation en langage naturel pour la RI dans les documents XML, et proposé des méthodes tirant profit de la structure pour améliorer la recherche des éléments pertinents.
@PhDThesis{Tannier06d,
  author = {Tannier, Xavier},
  title  = {{Extraction et recherche d'information en langage naturel dans les documents semi-structurés}},
  school = {Ecole Nationale Supérieure des Mines de Saint-Etienne},
  year   = {2006},
  month  = sep,
}
@InProceedings{Tannier06e,
  author    = {Geva, Shlomo and Hassler, Marcus and Tannier, Xavier},
  title     = {{XOR - XML Oriented Retrieval Language}},
  booktitle = {Proceedings of ACM SIGIR 2006 Workshop on XML Element Retrieval Methodology},
  pages     = {5--12},
  publisher = {ACM Press, New York City, NY, USA},
  address   = {Seattle, WA, USA},
  year      = {2006},
  month     = aug,
}
@InProceedings{Tannier06h,
  author    = {Woodley, Alan and Tannier, Xavier and Hassler, Marcus and Geva, Shlomo},
  title     = {{Natural Language Processing and XML Retrieval}},
  booktitle = {Proceedings of the Australasian Language Technology Workshop (ALTW 2006), short paper},
  address   = {Sydney, Australia},
  year      = {2006},
}
@InProceedings{Tannier06a,
  author    = {Tannier, Xavier},
  title     = {{From natural language to NEXI, an interface for INEX 2005 queries}},
  booktitle = {Advances in XML Information Retrieval: Fourth Workshop of the Initiative for the Evaluation of XML retrieval (INEX)},
  series    = {Lecture Notes in Computer Science},
  volume    = {3977},
  editor    = {Fuhr, Norbert and Lalmas, Mounia and Malik, Saadia and Kazai, Gabriella},
  publisher = {Springer Verlag},
  address   = {Schloss Dagstuhl, Germany},
  year      = {2006},
  month     = nov,
}
@TechReport{Tannier06g,
  author      = {Tannier, Xavier and Woodley, Alan and Geva, Shlomo and Hassler, Marcus},
  title       = {{Approaches to Translating Natural Language Queries for use in XML Information Retrieval Systems}},
  institution = {Ecole Nationale Supérieure des Mines de Saint-Etienne},
  number      = {2006-400-008},
  year        = {2006},
  month       = jul,
}
Cet état de l'art n'est donc en aucun cas une description exhaustive des applications des techniques de traitement automatique des langues, mais une introduction à certaines problématiques, choisies au départ pour correspondre avec le sujet général de la thèse présentée.
@TechReport{Tannier06c,
  author      = {Tannier, Xavier},
  title       = {{Traitement automatique du langage naturel pour l'extraction et la recherche d'information}},
  institution = {Ecole Nationale Supérieure des Mines de Saint-Etienne},
  number      = {2006-400-006},
  year        = {2006},
  month       = mar,
}
Cet état de l'art n'est donc en aucun cas une description exhaustive des applications des techniques de recherche d'information, mais une introduction à certaines problématiques, choisies au départ pour correspondre avec le sujet général de la thèse présentée.
@TechReport{Tannier06b,
  author      = {Tannier, Xavier},
  title       = {{Recherche d'information dans les documents XML}},
  institution = {Ecole Nationale Supérieure des Mines de Saint-Etienne},
  number      = {2006-400-007},
  year        = {2006},
  month       = jun,
}
@TechReport{Tannier06f,
  author      = {Imafouo, Amélie and Tannier, Xavier},
  title       = {{Retrieval Status Values in Information Retrieval Evaluation}},
  institution = {Ecole Nationale Supérieure des Mines de Saint-Etienne},
  number      = {2006-400-002},
  year        = {2006},
  month       = mar,
}
2005
@InProceedings{Tannier05g,
  author    = {Tannier, Xavier and Geva, Shlomo},
  title     = {{XML Retrieval with a Natural Language Interface}},
  booktitle = {String Processing and Information Retrieval: 12th International Conference, SPIRE 2005},
  series    = {Lecture Notes in Computer Science},
  pages     = {29--40},
  editor    = {Consens, Mariano and Navarro, Gonzalo},
  publisher = {Springer-Verlag},
  address   = {Buenos Aires, Argentina},
  year      = {2005},
  month     = nov,
}
@InProceedings{Tannier05f,
  author    = {Imafouo, Amélie and Tannier, Xavier},
  title     = {{Retrieval Status Values in Information Retrieval Evaluation}},
  booktitle = {String Processing and Information Retrieval: 12th International Conference, SPIRE 2005},
  series    = {Lecture Notes in Computer Science},
  pages     = {222--227},
  editor    = {Consens, Mariano and Navarro, Gonzalo},
  publisher = {Springer-Verlag},
  address   = {Buenos Aires, Argentina},
  year      = {2005},
  month     = nov,
}
@InProceedings{Tannier05h,
  author    = {Tannier, Xavier and Girardot, Jean-Jacques and Mathieu, Mihaela},
  title     = {{Classifying XML Tags through ``Reading Contexts''}},
  booktitle = {Proceedings of the 2005 ACM Symposium on Document Engineering},
  pages     = {143--145},
  editor    = {King, Peter R.},
  publisher = {ACM Press, New York City, NY, USA},
  address   = {Bristol, United Kingdom},
  year      = {2005},
  month     = nov,
}
This paper first describes the different steps that we perform in order to transform (in an information retrieval framework) the natural language request into a context-free semantic representation. Some structure- and domain-specific rules are then applied, in order to obtain a final form, adapted to a conversion into a formal query language. Finally, we describe our first experiments and discuss different aspects of our approach.
L'article décrit dans un premier temps les différentes phases qui permettent de transformer (dans un cadre de recherche d'information) la requête en langage naturel en une représentation sémantique indépendante du contexte. Des règles de simplification adaptées à la structure et au domaine du corpus sont ensuite appliquées, permettant d'obtenir une forme finale, adaptée à une conversion vers un langage de requête formel. L'article décrit enfin les expérimentations effectuées et tire les premières conclusions sur divers aspects de cette approche.
@InProceedings{Tannier05a,
  author    = {Tannier, Xavier and Girardot, Jean-Jacques and Mathieu, Mihaela},
  title     = {{Utilisation de la langue naturelle pour l'interrogation de documents structurés}},
  booktitle = {Actes de la 2ème conférence en Recherche d'Information et Applications (CORIA 05)},
  pages     = {135--150},
  address   = {Grenoble, France},
  year      = {2005},
}
@InProceedings{Tannier04e,
  author    = {Tannier, Xavier and Girardot, Jean-Jacques and Mathieu, Mihaela},
  title     = {{Analysing Natural Language Queries at INEX 2004}},
  booktitle = {Advances in XML Information Retrieval: Third Workshop of the Initiative for the Evaluation of XML retrieval (INEX)},
  series    = {Lecture Notes in Computer Science},
  volume    = {3493},
  pages     = {395--409},
  editor    = {Fuhr, Norbert and Lalmas, Mounia and Malik, Saadia and Szl{\'a}vik, Zolt{\'a}n},
  address   = {Schloss Dagstuhl, Germany},
  year      = {2005},
  month     = dec,
}
We also present the concept of "reading contexts" and show how our tool deals with them.
@InProceedings{Tannier04f,
  author    = {Tannier, Xavier and Girardot, Jean-Jacques and Mathieu, Mihaela},
  title     = {{XGTagger, an open-source interface dealing with XML contents}},
  booktitle = {Proceedings of the workshop on Open Source Web Information Retrieval (OSWIR 2005)},
  editor    = {Beigbeder, Michel and Yee, Wai Gen},
  address   = {Compiègne, France},
  year      = {2005},
}
This article revisits an existing tag categorization that makes it possible to distinguish different ways to manage the textual content of XML elements. It gives a clear definition of the tag classes, through the introduction of a new concept of "reading contexts". Furthermore, it proposes a method that uses natural language processing techniques in order to find the class of XML tag names automatically. This work notably makes it possible to recognize emphasis tags in a text, to define a new concept of term logical proximity in structured documents, and to improve indexing techniques; it also opens the way to advanced linguistic analyses of XML corpora.
@TechReport{Tannier05b,
  author      = {Tannier, Xavier},
  title       = {{Dealing with XML structure through ``Reading Contexts''}},
  institution = {Ecole Nationale Supérieure des Mines de Saint-Etienne},
  number      = {2005-400-007},
  year        = {2005}
}
@TechReport{Tannier05c,
  author      = {Tannier, Xavier and Garnier, Aude},
  title       = {{XGTagger, a generic interface for analysing XML content}},
  institution = {Ecole Nationale Supérieure des Mines de Saint-Etienne},
  number      = {2005-400-008},
  year        = {2005}
}
2004
@InProceedings{Tannier04c,
  author    = {Tannier, Xavier and Girardot, Jean-Jacques and Mathieu, Mihaela},
  title     = {{Natural Language Queries for Information Retrieval in Structured Documents}},
  booktitle = {Proceedings of the International Conference on Advances in Intelligent Systems - Theory and Applications (AISTA 2004)},
  address   = {Luxembourg},
  year      = {2004}
}
@InProceedings{Tannier04b,
  author    = {Muller, Philippe and Tannier, Xavier},
  title     = {{Annotating and measuring temporal relations in texts}},
  booktitle = {Proceedings of the 20th International Conference on Computational Linguistics (Coling 04)},
  address   = {Genève, Suisse},
  year      = {2004},
  month     = aug,
  pages     = {50--56}
}
@InProceedings{Tannier04a,
  author    = {Muller, Philippe and Tannier, Xavier},
  title     = {{Une méthode pour l'annotation de relations temporelles dans des textes et son évaluation}},
  booktitle = {Actes de la 11ème Conférence annuelle de Traitement Automatique des Langues Naturelles},
  address   = {Fès, Maroc},
  year      = {2004},
  month     = apr,
  pages     = {319--328}
}
This paper first describes the different steps that we perform in order to transform (in an information retrieval framework) the natural language request into a context-free semantic representation. Some structure- and domain-specific rules are then applied in order to obtain a final form suited to conversion into a formal query language. Finally, we describe our first experiments and discuss different aspects of our approach.
L'article décrit dans un premier temps les différentes phases qui permettent de transformer (dans un cadre de recherche d'information) la requête en langage naturel en une représentation sémantique indépendante du contexte. Des règles de simplification adaptées à la structure et au domaine du corpus sont ensuite appliquées, permettant d'obtenir une forme finale, adaptée à une conversion vers un langage de requête formel. L'article décrit enfin les expérimentations effectuées et tire les premières conclusions sur divers aspects de cette approche.
@TechReport{Tannier04d,
  author      = {Tannier, Xavier and Girardot, Jean-Jacques and Mathieu, Mihaela},
  title       = {{Utilisation de la langue naturelle pour l'interrogation de documents structurés}},
  institution = {Ecole Nationale Supérieure des Mines de Saint-Etienne},
  number      = {2004-400-010},
  year        = {2004},
  month       = dec
}
2003
This paper first describes the different steps that we perform in order to transform (in an information retrieval framework) the natural language request into a context-free semantic representation. Some structure- and domain-specific rules are then applied in order to obtain a final form suited to conversion into a formal query language. Finally, we describe our first experiments and discuss different aspects of our approach.
L'article décrit dans un premier temps les différentes phases qui permettent de transformer (dans un cadre de recherche d'information) la requête en langage naturel en une représentation sémantique indépendante du contexte. Des règles de simplification adaptées à la structure et au domaine du corpus sont ensuite appliquées, permettant d'obtenir une forme finale, adaptée à une conversion vers un langage de requête formel. L'article décrit enfin les expérimentations effectuées et tire les premières conclusions sur divers aspects de cette approche.
@MastersThesis{Tannier03,
  author = {Tannier, Xavier},
  title  = {{Calcul des relations temporelles du discours}},
  school = {INSA Toulouse -- IRIT, France},
  year   = {2003}
}