% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@ARTICLE{Dorfner:300283,
author = {F. J. Dorfner and A. Dada and F. Busch and M. R. Makowski
and T. Han and D. Truhn and J. Kleesiek$^*$ and M. Sushil
and L. C. Adams and K. K. Bressem},
title = {{E}valuating the effectiveness of biomedical fine-tuning
         for large language models on clinical tasks},
journal = {Journal of the American Medical Informatics Association},
volume = {32},
number = {6},
issn = {1067-5027},
address = {Oxford},
publisher = {Oxford Univ. Press},
reportid = {DKFZ-2025-00736},
pages = {1015-1024},
year = {2025},
month = jun,
abstract = {Large language models (LLMs) have shown potential in
            biomedical applications, leading to efforts to fine-tune
            them on domain-specific data. However, the effectiveness
            of this approach remains unclear. This study aims to
            critically evaluate the performance of biomedically
            fine-tuned LLMs against their general-purpose counterparts
            across a range of clinical tasks. We compared the models
            on clinical case challenges from NEJM and JAMA, and on
            multiple clinical tasks, such as information extraction,
            document summarization, and clinical coding. We used a
            diverse set of benchmarks specifically chosen to lie
            outside the likely fine-tuning datasets of the biomedical
            models, ensuring a fair assessment of generalization
            capabilities. Biomedical LLMs generally underperformed
            their general-purpose counterparts, especially on tasks
            not focused on probing medical knowledge. While larger
            biomedical and general-purpose models showed similar
            performance on the case challenges (eg, OpenBioLLM-70B:
            $66.4\%$ vs Llama-3-70B-Instruct: $65\%$ on JAMA), smaller
            biomedical models underperformed more markedly
            (OpenBioLLM-8B: $30\%$ vs Llama-3-8B-Instruct: $64.3\%$ on
            NEJM). Similar trends appeared across the CLUE benchmarks,
            with general-purpose models often achieving higher scores
            in text generation, question answering, and coding.
            Notably, biomedical LLMs also showed a higher tendency to
            hallucinate. Our findings challenge the assumption that
            biomedical fine-tuning inherently improves LLM
            performance, as general-purpose models consistently
            performed better on unseen medical tasks.
            Retrieval-augmented generation may offer a more effective
            strategy for clinical adaptation. Fine-tuning LLMs on
            biomedical data may not yield the anticipated benefits;
            alternative approaches, such as retrieval augmentation,
            should be further explored for effective and reliable
            clinical integration of LLMs.},
keywords = {benchmarking / biomedical fine-tuning /
            domain-specific adaptation / hallucination in AI models /
            large language models (LLMs)},
cin = {ED01},
ddc = {610},
cid = {I:(DE-He78)ED01-20160331},
pnm = {899 - ohne Topic (POF4-899)},
pid = {G:(DE-HGF)POF4-899},
typ = {PUB:(DE-HGF)16},
pubmed = {pmid:40190132},
doi = {10.1093/jamia/ocaf045},
url = {https://inrepo02.dkfz.de/record/300283},
}