% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% DKFZ repository export, cleaned. The non-standard fields (reportid, cin,
% ddc, cid, pnm, pid, typ, pubmed) are repository metadata; standard styles
% silently ignore unknown fields, so they are retained verbatim.
@article{Yang:309648,
  author        = {Yang, S. and Zhou, F. and Mayer, L. and Huang, F. and
                   Chen, Y. and Wang, Y. and He, S. and Nie, Y. and
                   Wang, X. and Jin, Y. and Sun, H. and Xu, S. and
                   Liu, A. Q. and Li, Z. and Qin, J. and Teoh, J. Y. and
                   Maier-Hein, L. and Chen, H.},
  title         = {Large-Scale Self-Supervised Video Foundation Model for
                   Intelligent Surgery},
  journal       = {npj Digital Medicine},
  issn          = {2398-6352},
  address       = {Basingstoke},
  publisher     = {Macmillan Publishers Limited},
  reportid      = {DKFZ-2026-00280},
  year          = {2026},
  note          = {\#NCTZFB26\# / epub},
  abstract      = {Computer-Assisted Intervention has the potential to
                   revolutionize modern surgery, with surgical scene
                   understanding serving as a critical component in supporting
                   decision-making and improving procedural efficacy. While
                   existing AI-driven approaches alleviate annotation burdens
                   via self-supervised spatial representation learning, their
                   lack of explicit temporal modeling during pre-training
                   fundamentally restricts the capture of dynamic surgical
                   contexts, resulting in incomplete spatiotemporal
                   understanding. In this work, we introduce the first
                   video-level surgical pre-training framework that enables
                   joint spatiotemporal representation learning from
                   large-scale surgical video data. To achieve this, we
                   constructed a large-scale surgical video dataset comprising
                   3650 videos and 3.55 million frames, spanning more than 20
                   surgical procedures and over 10 anatomical structures.
                   Building upon this dataset, we propose SurgVISTA (Surgical
                   Video-level Spatial-Temporal Architecture), a
                   reconstruction-based pre-training method that jointly
                   captures intricate spatial structures and temporal dynamics.
                   Additionally, SurgVISTA incorporates image-level knowledge
                   distillation guided by a surgery-specific expert model to
                   enhance the learning of fine-grained anatomical and semantic
                   features. To validate its effectiveness, we established a
                   comprehensive benchmark comprising 13 video-level datasets
                   spanning six surgical procedures across four tasks.
                   Extensive experiments demonstrate that SurgVISTA
                   consistently outperforms both natural- and surgical-domain
                   pre-trained models, demonstrating strong potential to
                   advance intelligent surgical systems in clinically
                   meaningful scenarios.},
  cin           = {E130 / HD02},
  ddc           = {610},
  cid           = {I:(DE-He78)E130-20160331 / I:(DE-He78)HD02-20160331},
  pnm           = {315 - Bildgebung und Radioonkologie (POF4-315)},
  pid           = {G:(DE-HGF)POF4-315},
  typ           = {PUB:(DE-HGF)16},
  pubmed        = {pmid:41639385},
  doi           = {10.1038/s41746-026-02403-0},
  url           = {https://inrepo02.dkfz.de/record/309648},
  internal-note = {Cleanup of repository export: removed $^*$ markers from the
                   author field (presumably corresponding/equal-contribution
                   flags on Mayer and Maier-Hein -- verify against publisher);
                   dropped placeholder volume/pages values of "nn" pending
                   final pagination (epub ahead of print); escaped \# in note
                   for LaTeX safety; restored official journal capitalization.},
}