% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
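%
% As a minimal usage sketch, the record below can be cited with biblatex and
% the biber backend, which handles UTF-8 natively. The filename
% “references.bib” is an assumption (this file can be saved under any name),
% not something fixed by the record itself:
%
%   \documentclass{article}
%   \usepackage[backend=biber]{biblatex}
%   \addbibresource{references.bib}  % assumed filename for this file
%   \begin{document}
%   LLMs versus sarcoma tumor boards \cite{Li:304480}.
%   \printbibliography
%   \end{document}
%
% Compile with pdflatex, then biber, then pdflatex twice to resolve the
% citation.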
@ARTICLE{Li:304480,
author = {C.-P. Li and A. T. Kalisa and S. Roohani and K.
Hummedah and F. Menge and C. Reißfelder and M. Albertsmeier
and B. Kasper and J. Jakob and C. Yang},
title = {{T}he imitation game: large language models versus
multidisciplinary tumor boards: benchmarking {AI} against 21
sarcoma centers from the ring trial.},
journal = {Journal of cancer research and clinical oncology},
volume = {151},
number = {9},
issn = {0301-1585},
address = {Heidelberg},
publisher = {Springer},
reportid = {DKFZ-2025-01872},
pages = {248},
year = {2025},
abstract = {The study aims to compare the treatment recommendations
            generated by four leading large language models (LLMs)
            with those from the multidisciplinary tumor boards (MTBs)
            of 21 sarcoma centers in the sarcoma ring trial in
            managing complex soft tissue sarcoma (STS) cases. We
            simulated STS-MTBs using four LLMs (Llama 3.2-Vision 90B,
            Claude 3.5 Sonnet, DeepSeek-R1, and OpenAI o1) across
            five anonymized STS cases from the sarcoma ring trial.
            Each model was queried 21 times per case using a
            standardized prompt, and the responses were compared with
            human MTBs in terms of intra-model consistency, treatment
            recommendation alignment, alternative recommendations,
            and source citation. LLMs demonstrated high inter-model
            and intra-model consistency in only $20\%$ of cases, and
            their recommendations aligned with human consensus in
            only $20\%$ to $60\%$ of cases. The model with the
            highest concordance with the most common MTB
            recommendation, Claude 3.5 Sonnet, aligned with experts
            in only $60\%$ of cases. Notably, the recommendations
            across MTBs were highly heterogeneous, which
            contextualizes the variable LLM performance.
            Discrepancies were particularly notable where common
            human recommendations were absent from the LLM outputs.
            Additionally, the rationale given by the LLMs was clearly
            derived from the German S3 sarcoma guidelines in only
            $24.8\%$ to $55.2\%$ of the responses. Potentially
            harmful suggestions were also occasionally observed among
            the LLMs' alternative recommendations. Despite the
            considerable heterogeneity observed in MTB
            recommendations, the significant discrepancies and
            potentially harmful recommendations highlight the
            limitations of current AI tools, underscoring that
            referral to high-volume sarcoma centers remains essential
            for optimal patient care. At the same time, LLMs could
            serve as an excellent tool to prepare for MTB
            discussions.},
keywords = {Humans / Sarcoma: therapy / Sarcoma: pathology /
Benchmarking: methods / Cancer Care Facilities / Language /
Large Language Models / Artificial intelligence (Other) /
Clinical decision (Other) / Large language model (Other) /
Multidisciplinary tumor board (Other) / Soft tissue sarcoma
(Other)},
cin = {BE01},
ddc = {610},
cid = {I:(DE-He78)BE01-20160331},
pnm = {899 - ohne Topic (POF4-899)},
pid = {G:(DE-HGF)POF4-899},
typ = {PUB:(DE-HGF)16},
pubmed = {pmid:40926110},
doi = {10.1007/s00432-025-06304-9},
url = {https://inrepo02.dkfz.de/record/304480},
}