% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
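%
% A minimal usage sketch for this file (assuming a biblatex setup with the
% biber backend; the file name "references.bib" is a placeholder):
%
%   \documentclass{article}
%   \usepackage[backend=biber]{biblatex}
%   \addbibresource{references.bib}
%   \begin{document}
%   Workflow recognition in pituitary surgery is benchmarked by
%   PitVis-2023 \autocite{Das:303404}.
%   \printbibliography
%   \end{document}
%
% Compile with pdflatex, then biber, then pdflatex twice.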
@ARTICLE{Das:303404,
author = {A. Das and D. Z. Khan and D. Psychogyios and Y. Zhang and
J. G. Hanrahan and F. Vasconcelos and Y. Pang and Z. Chen
and J. Wu and X. Zou and G. Zheng and A. Qayyum and M.
Mazher and I. Razzak and T. Li and J. Ye and J. He and S.
Płotka and J. Kaleta and A. Yamlahi$^*$ and A. Jund$^*$ and
P. Godau$^*$ and S. Kondo and S. Kasai and K. Hirasawa and
D. Rivoir and S. Speidel and A. Pérez and S. Rodriguez and
P. Arbeláez and D. Stoyanov and H. J. Marcus and S. Bano},
title = {{P}it{V}is-2023 challenge: {W}orkflow recognition in videos
of endoscopic pituitary surgery},
journal = {Medical Image Analysis},
volume = {106},
issn = {1361-8415},
address = {Amsterdam [et al.]},
publisher = {Elsevier Science},
reportid = {DKFZ-2025-01640},
pages = {103716},
year = {2025},
abstract = {The field of computer vision applied to videos of minimally
invasive surgery is ever-growing. Workflow recognition
pertains to the automated recognition of various aspects of
a surgery, including which surgical steps are performed and
which surgical instruments are used. This information can
later be used to assist clinicians when learning the
surgery or during live surgery. The Pituitary Vision
(PitVis) 2023 Challenge tasks the community with step and
instrument recognition in videos of endoscopic pituitary
surgery. This is a particularly challenging task compared
to other minimally invasive surgeries due to the smaller
working space, which limits and distorts vision, and the
higher frequency of instrument and step switching, which
requires more precise model predictions. Participants were
provided with 25 videos, with results presented at the
MICCAI-2023 conference as part of the Endoscopic Vision
2023 Challenge in Vancouver, Canada, on 08-Oct-2023. There
were 18 submissions from 9 teams across 6 countries, using
a variety of deep learning models. The top-performing model
for step recognition utilised a transformer-based
architecture, uniquely using an autoregressive decoder with
a positional-encoding input. The top-performing model for
instrument recognition utilised a spatial encoder followed
by a temporal encoder, uniquely using a 2-layer temporal
architecture. In both cases, these models outperformed
purely spatial models, illustrating the importance of
sequential and temporal information. PitVis-2023 therefore
demonstrates that state-of-the-art computer vision models
for minimally invasive surgery are transferable to a new
dataset. Benchmark results are provided in the paper, and
the dataset is publicly available at:
https://doi.org/10.5522/04/26531686.},
keywords = {Endoscopic vision / Instrument recognition / Step
recognition / Surgical AI / Surgical vision / Workflow
analysis},
cin = {E130},
ddc = {610},
cid = {I:(DE-He78)E130-20160331},
pnm = {315 - Bildgebung und Radioonkologie (POF4-315)},
pid = {G:(DE-HGF)POF4-315},
typ = {PUB:(DE-HGF)16},
pubmed = {pmid:40769094},
doi = {10.1016/j.media.2025.103716},
url = {https://inrepo02.dkfz.de/record/303404},
}