% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Journal article record; the repository record is given in the url field and
% the publisher version in the doi field.
% Fields that standard styles do not know (reportid, cin, ddc, cid, pnm, pid,
% typ, pubmed, authormark) are repository bookkeeping and are ignored by
% BibTeX/Biber styles; they are kept verbatim from the export.
% Author names use the "Last, First" form so the surname split is unambiguous.
% The superscript asterisk that the export attached to two author names was
% moved into the ignored field "authormark" so that it no longer becomes part
% of the parsed surname (which would break sorting and labels).
@article{Spitzl:303113,
      author       = {Spitzl, D. and Mergen, M. and Braren, R. and Endrös, L.
                      and Eiber, M. and Steinhelfer, L.},
      authormark   = {R. Braren and M. Eiber were marked with a superscript
                      asterisk in the author list of the source export.},
      title        = {{LLM}-powered breast cancer staging from {PET}/{CT}
                      reports: a comparative performance study},
      journal      = {International journal of medical informatics},
      volume       = {204},
      issn         = {1386-5056},
      address      = {Amsterdam [u.a.]},
      publisher    = {Elsevier},
      reportid     = {DKFZ-2025-01532},
      pages        = {106053},
      year         = {2025},
      abstract     = {Imaging reports are crucial in breast cancer management,
                      with the tumor-node-metastasis (TNM) classification serving
                      as a widely used model for assessing disease severity,
                      guiding treatment decisions, and predicting patient
                      outcomes. Large language models (LLMs) offer a potential
                      solution by extracting standardized UICC TNM classifications
                      and the corresponding UICC stage directly from existing
                      PET/CT reports. This approach holds promise to enhance
                      staging accuracy, streamline multidisciplinary discussions,
                      and improve patient outcomes. Here, we evaluated four
                      LLMs---ChatGPT-4o, DeepSeek V3, Claude 3.5 Sonnet, and
                      Gemini 2.0 Flash---for their capacity to determine TNM
                      staging based on UICC/AJCC breast cancer guidelines. A total
                      of 111 fictitious PET/CT reports were analyzed, and each
                      model's outputs were measured against expert-generated TNM
                      classifications and stage categorizations. Among the tested
                      models, Claude 3.5 Sonnet demonstrated superior F1 scores of
                      0.95\%, 0.95\%, 1.00\% and 0.92\% for T, N, M
                      classification and UICC stage classification,
                      respectively. These findings underscore the ability of
                      advanced natural language processing (NLP) technologies to
                      support reliable cancer staging, potentially aiding
                      clinicians. Despite the encouraging performance, prospective
                      clinical trials and validation across diverse practice
                      settings remain critical to confirming these preliminary
                      outcomes. Nonetheless, this study highlights the promise of
                      LLM-based systems in reinforcing the accuracy of oncologic
                      workflows and lays the groundwork for broader adoption of
                      AI-driven tools in breast cancer management.},
      keywords     = {Artificial intelligence (Other) / Breast cancer (Other) /
                      Clinical decision support (Other) / Diagnostics (Other)},
      cin          = {MU01},
      ddc          = {004},
      cid          = {I:(DE-He78)MU01-20160331},
      pnm          = {899 - ohne Topic (POF4-899)},
      pid          = {G:(DE-HGF)POF4-899},
      typ          = {PUB:(DE-HGF)16},
      pubmed       = {pmid:40706196},
      doi          = {10.1016/j.ijmedinf.2025.106053},
      url          = {https://inrepo02.dkfz.de/record/303113},
}