% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@ARTICLE{Yang:309648,
      author       = {S. Yang and F. Zhou and L. Mayer$^*$ and F. Huang and Y.
                      Chen and Y. Wang and S. He and Y. Nie and X. Wang and Y. Jin
                      and H. Sun and S. Xu and A. Q. Liu and Z. Li and J. Qin and
                      J. Y. Teoh and L. Maier-Hein$^*$ and H. Chen},
      title        = {{L}arge-scale self-supervised video foundation model for
                      intelligent surgery},
      journal      = {npj Digital Medicine},
      volume       = {nn},
      issn         = {2398-6352},
      address      = {[Basingstoke]},
      publisher    = {Macmillan Publishers Limited},
      reportid     = {DKFZ-2026-00280},
      pages        = {nn},
      year         = {2026},
      note         = {#NCTZFB26# / epub},
      abstract     = {Computer-Assisted Intervention has the potential to
                      revolutionize modern surgery, with surgical scene
                      understanding serving as a critical component in supporting
                      decision-making and improving procedural efficacy. While
                      existing AI-driven approaches alleviate annotation burdens
                      via self-supervised spatial representation learning, their
                      lack of explicit temporal modeling during pre-training
                      fundamentally restricts the capture of dynamic surgical
                      contexts, resulting in incomplete spatiotemporal
                      understanding. In this work, we introduce the first
                      video-level surgical pre-training framework that enables
                      joint spatiotemporal representation learning from
                      large-scale surgical video data. To achieve this, we
                      constructed a large-scale surgical video dataset comprising
                      3650 videos and 3.55 million frames, spanning more than 20
                      surgical procedures and over 10 anatomical structures.
                      Building upon this dataset, we propose SurgVISTA (Surgical
                      Video-level Spatial-Temporal Architecture), a
                      reconstruction-based pre-training method that jointly
                      captures intricate spatial structures and temporal dynamics.
                      Additionally, SurgVISTA incorporates image-level knowledge
                      distillation guided by a surgery-specific expert model to
                      enhance the learning of fine-grained anatomical and semantic
                      features. To validate its effectiveness, we established a
                      comprehensive benchmark comprising 13 video-level datasets
                      spanning six surgical procedures across four tasks.
                      Extensive experiments demonstrate that SurgVISTA
                      consistently outperforms both natural- and surgical-domain
                      pre-trained models, showing strong potential to advance
                      intelligent surgical systems in clinically meaningful
                      scenarios.},
      cin          = {E130 / HD02},
      ddc          = {610},
      cid          = {I:(DE-He78)E130-20160331 / I:(DE-He78)HD02-20160331},
      pnm          = {315 - Bildgebung und Radioonkologie (POF4-315)},
      pid          = {G:(DE-HGF)POF4-315},
      typ          = {PUB:(DE-HGF)16},
      pubmed       = {pmid:41639385},
      doi          = {10.1038/s41746-026-02403-0},
      url          = {https://inrepo02.dkfz.de/record/309648},
}