% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
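%
% A minimal usage sketch (assuming biblatex with the biber backend; the
% file name "references.bib" stands in for this file):
%
%   \documentclass{article}
%   \usepackage[backend=biber]{biblatex}
%   \addbibresource{references.bib}
%   \begin{document}
%   Vision-language models have been piloted for surgical video
%   analysis~\cite{Stueker:302984}.
%   \printbibliography
%   \end{document}
%
% Compile with pdflatex, then biber, then pdflatex again.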
@ARTICLE{Stueker:302984,
author = {E. H. Stueker and F. R. Kolbinger and O. L. Saldanha and
          D. Digomann and S. Pistorius and F. Oehme and
          M. van Treeck and D. Ferber and C. M. L. Löffler and
          J. Weitz and M. Distler and J. N. Kather and H. S. Muti},
title = {{V}ision-language models for automated video analysis and
         documentation in laparoscopic surgery: a proof-of-concept
         study},
journal = {International Journal of Surgery},
volume = {nn},
issn = {1743-9191},
address = {Amsterdam [etc.]},
publisher = {Elsevier Science},
reportid = {DKFZ-2025-01431},
pages = {nn},
year = {2025},
note = {epub},
abstract = {The ongoing shortage of medical personnel highlights the
            urgent need to automate clinical documentation and reduce
            administrative burden. Large Vision-Language Models (VLMs)
            offer promise for supporting surgical documentation and
            intraoperative analysis. We conducted an observational,
            comparative performance study of two general-purpose VLMs,
            GPT-4o (OpenAI) and Gemini-1.5-pro (Google), from June to
            September 2024, using 15 cholecystectomy and 15
            appendectomy videos (1 fps) from the CholecT45 and LapApp
            datasets. Tasks included object detection (vessel clips,
            gauze, retrieval bags, bleeding), surgery type
            classification, appendicitis grading, and surgical report
            generation. In-context learning (ICL) was evaluated as an
            enhancement method. Performance was assessed using
            descriptive accuracy metrics. Both models identified
            vessel clips with 100\% accuracy. GPT-4o outperformed
            Gemini-1.5-pro in retrieval bag detection (100\% vs.
            93.3\%) and gauze detection (93.3\% vs. 60\%), while
            Gemini-1.5-pro performed better in bleeding detection
            (93.3\% vs. 86.7\%). In surgery classification,
            Gemini-1.5-pro was more accurate for cholecystectomies
            (93\% vs. 80\%), with both models achieving 60\% accuracy
            for appendectomies. Appendicitis grading showed limited
            performance (GPT-4o: 40\%, Gemini-1.5-pro: 26.7\%). For
            surgical reports, GPT-4o produced more complete outputs
            (cholecystectomy [CCE]: 90.4\%, appendectomy [APE]:
            80.1\%), while Gemini-1.5-pro achieved higher correctness
            overall (CCE: 71.1\%, APE: 69.6\%). ICL notably improved
            tool recognition (e.g., in APE step 4, GPT-4o improved
            from 69.2\% to 80\%), though its effect on organ removal
            step recognition was inconsistent. GPT-4o and
            Gemini-1.5-pro performed reliably in object detection and
            procedure classification but showed limitations in grading
            pathology and accurately describing procedural steps;
            these limitations could be mitigated through in-context
            learning. This shows that domain-agnostic VLMs can be
            applied to surgical video analysis. In the future, VLMs
            equipped with domain knowledge could serve as companions
            in the operating room.},
keywords = {appendectomy (Other) / cholecystectomy (Other) / minimally
invasive surgery (Other) / surgical video analysis (Other) /
vision-language models (Other)},
cin = {DD01},
ddc = {610},
cid = {I:(DE-He78)DD01-20160331},
pnm = {899 - ohne Topic (POF4-899)},
pid = {G:(DE-HGF)POF4-899},
typ = {PUB:(DE-HGF)16},
pubmed = {pmid:40679978},
doi = {10.1097/JS9.0000000000003069},
url = {https://inrepo02.dkfz.de/record/302984},
}