@article{TEXTUAL,
      recid         = {9823},
      author        = {Beaulieu-Jones, Brett K. and Villamar, Mauricio F. and Scordis, Phil and Bartmann, Ana Paula and Ali, Waqar and Wissel, Benjamin D. and Alsentzer, Emily and de Jong, Johann and Patra, Arijit and Kohane, Isaac},
      title         = {Predicting seizure recurrence after an initial seizure-like episode from routine clinical notes using large language models: A retrospective cohort study},
      journal       = {The Lancet Digital Health},
      year          = {2023},
      month         = nov,
      abstract      = {Background: The evaluation and management of first-time seizure-like events in children can be difficult because these episodes are not always directly observed and might be epileptic seizures or other conditions (seizure mimics). We aimed to evaluate whether machine learning models using real-world data could predict seizure recurrence after an initial seizure-like event. Methods: This retrospective cohort study compared models trained and evaluated on two separate datasets between Jan 1, 2010, and Jan 1, 2020: electronic medical records (EMRs) at Boston Children's Hospital and de-identified, patient-level, administrative claims data from the IBM MarketScan research database. The study population comprised patients with an initial diagnosis of either epilepsy or convulsions before the age of 21 years, based on International Classification of Diseases, Clinical Modification (ICD-CM) codes. We compared machine learning-based predictive modelling using structured data (logistic regression and XGBoost) with emerging techniques in natural language processing by use of large language models. Findings: The primary cohort comprised 14 021 patients at Boston Children's Hospital matching inclusion criteria with an initial seizure-like event and the comparison cohort comprised 15 062 patients within the IBM MarketScan research database. Seizure recurrence based on a composite expert-derived definition occurred in 57\% of patients at Boston Children's Hospital and 63\% of patients within IBM MarketScan. Large language models with additional domain-specific and location-specific pre-training on patients excluded from the study (F1-score 0·826 [95\% CI 0·817-0·835], AUC 0·897 [95\% CI 0·875-0·913]) performed best. All large language models, including the base model without additional pre-training (F1-score 0·739 [95\% CI 0·738-0·741], AUROC 0·846 [95\% CI 0·826-0·861]) outperformed models trained with structured data. With structured data only, XGBoost outperformed logistic regression and XGBoost models trained with the Boston Children's Hospital EMR (logistic regression: F1-score 0·650 [95\% CI 0·643-0·657], AUC 0·694 [95\% CI 0·685-0·705], XGBoost: F1-score 0·679 [0·676-0·683], AUC 0·725 [0·717-0·734]) performed similarly to models trained on the IBM MarketScan database (logistic regression: F1-score 0·596 [0·590-0·601], AUC 0·670 [0·664-0·675], XGBoost: F1-score 0·678 [0·668-0·687], AUC 0·710 [0·703-0·714]). Interpretation: Physician's clinical notes about an initial seizure-like event include substantial signals for prediction of seizure recurrence, and additional domain-specific and location-specific pre-training can significantly improve the performance of clinical large language models, even for specialised cohorts.},
      url           = {http://knowledge.uchicago.edu/record/9823},
      internal-note = {review 2024: moved publication date 2023-11-22 out of misused address field into year/month; dropped number={TEXTUAL} (repository export artifact, duplicated the record key, not an issue number); stripped HTML <p> tags, escaped percent signs, and collapsed double spaces in abstract. Middle-dot decimals (0·826) kept verbatim from the source -- requires UTF-8/biblatex; TODO confirm volume/number/pages/doi from the publisher record.},
}