% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Journal article, report id DKFZ-2025-00253 (repository record 298349).
% Author names are stored without typographic markers; the asterisk flags
% from the original export are recorded in the non-printing `internal-note`
% field. Fields such as cin/cid/pnm/pid/typ are repository metadata and are
% ignored by standard bibliography styles.
@ARTICLE{Aghamaliyev:298349,
  author        = {Aghamaliyev, U. and Karimbayli, J. and Zamparas, A. and
                   Bösch, F. and Thomas, M. and Schmidt, T. and Krautz, C. and
                   Kahlert, C. and Schölch, S. and Angele, M. K. and Niess, H.
                   and Guba, M. O. and Werner, J. and Ilmer, M. and
                   Renz, B. W.},
  title         = {Bots in white coats: are large language models the future
                   of patient education? {A} multi-center cross-sectional
                   analysis},
  journal       = {International journal of surgery},
  volume        = {111},
  number        = {3},
  issn          = {1743-9191},
  address       = {Amsterdam [u.a.]},
  publisher     = {Elsevier Science},
  reportid      = {DKFZ-2025-00253},
  pages         = {2376--2384},
  year          = {2025},
  month         = mar,
  internal-note = {Authors flagged with an asterisk in the original export:
                   S. Schölch, M. Ilmer, B. W. Renz},
  abstract      = {Every year, around 300 million surgeries are conducted
                   worldwide, with an estimated 4.2 million deaths occurring
                   within 30 days after surgery. Adequate patient education is
                   crucial, but often falls short due to the stress patients
                   experience before surgery. Large language models (LLMs) can
                   significantly enhance this process by delivering thorough
                   information and addressing patient concerns that might
                   otherwise go unnoticed. This cross-sectional study evaluated
                   ChatGPT-4o's audio-based responses to frequently asked
                   questions (FAQs) regarding six general surgical procedures.
                   Three experienced surgeons and two senior residents
                   formulated seven general and three procedure-specific FAQs
                   for both preoperative and postoperative situations, covering
                   six surgical scenarios (major: pancreatic head resection,
                   rectal resection, total gastrectomy; minor: cholecystectomy,
                   Lichtenstein procedure, hemithyroidectomy). In total, 120
                   audio responses were generated, transcribed, and assessed by
                   11 surgeons from six different German university
                   hospitals. ChatGPT-4o demonstrated strong performance,
                   achieving an average score of 4.12/5 for accuracy, 4.46/5
                   for relevance, and 0.22/5 for potential harm across 120
                   questions. Postoperative responses surpassed preoperative
                   ones in both accuracy and relevance, while also exhibiting
                   lower potential for harm. Additionally, responses related to
                   minor surgeries were minimal, but significantly more
                   accurate compared to those for major surgeries. This study
                   underscores GPT-4o's potential to enhance patient education
                   both before and after surgery by delivering accurate and
                   relevant responses to FAQs about various surgical
                   procedures. Responses regarding the postoperative course
                   proved to be more accurate and less harmful than those
                   addressing preoperative ones. Although a few responses
                   carried moderate risks, the overall performance was robust,
                   indicating GPT-4o's value in patient education. The study
                   suggests the development of hospital-specific applications
                   or the integration of GPT-4o into interactive robotic
                   systems to provide patients with reliable, immediate
                   answers, thereby improving patient satisfaction and informed
                   decision-making.},
  cin           = {A430 / MU01},
  ddc           = {610},
  cid           = {I:(DE-He78)A430-20160331 / I:(DE-He78)MU01-20160331},
  pnm           = {311 - Zellbiologie und Tumorbiologie (POF4-311)},
  pid           = {G:(DE-HGF)POF4-311},
  typ           = {PUB:(DE-HGF)16},
  pubmed        = {pmid:39878073},
  doi           = {10.1097/JS9.0000000000002250},
  url           = {https://inrepo02.dkfz.de/record/298349},
}