% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Journal article record, citation key Zenk:294922 (report id DKFZ-2024-02629).
% Author names are stored in "Last, First" form; the math-mode asterisk
% superscripts that the export appended to every author name were removed so
% that surnames parse, sort and abbreviate correctly.
% The title no longer ends in a period (styles add their own punctuation) and
% protects whole words rather than single letters.
% The hash characters in the note field are escaped so the note can be typeset.
% reportid, cin, ddc, cid, pnm, pid, typ and pubmed are not standard BibTeX
% fields; standard styles ignore them, and their values are kept unchanged.
@article{Zenk:294922,
  author    = {Zenk, M. and Zimmerer, D. and Isensee, F. and Traub, J. and
               Norajitra, T. and Jäger, P. F. and Maier-Hein, K.},
  title     = {Comparative benchmarking of failure detection methods in
               medical image segmentation: {Unveiling} the role of
               confidence aggregation},
  journal   = {Medical Image Analysis},
  volume    = {101},
  issn      = {1361-8415},
  address   = {Amsterdam [u.a.]},
  publisher = {Elsevier Science},
  reportid  = {DKFZ-2024-02629},
  pages     = {103392},
  year      = {2025},
  note      = {\#EA:E230\#LA:E230\# / Available online 30 November 2024},
  abstract  = {Semantic segmentation is an essential component of medical
               image analysis research, with recent deep learning
               algorithms offering out-of-the-box applicability across
               diverse datasets. Despite these advancements, segmentation
               failures remain a significant concern for real-world
               clinical applications, necessitating reliable detection
               mechanisms. This paper introduces a comprehensive
               benchmarking framework aimed at evaluating failure detection
               methodologies within medical image segmentation. Through our
               analysis, we identify the strengths and limitations of
               current failure detection metrics, advocating for the
               risk-coverage analysis as a holistic evaluation approach.
               Utilizing a collective dataset comprising five public 3D
               medical image collections, we assess the efficacy of various
               failure detection strategies under realistic test-time
               distribution shifts. Our findings highlight the importance
               of pixel confidence aggregation and we observe superior
               performance of the pairwise Dice score (Roy et al., 2019)
               between ensemble predictions, positioning it as a simple and
               robust baseline for failure detection in medical image
               segmentation. To promote ongoing research, we make the
               benchmarking framework available to the community.},
  keywords  = {Distribution shift (Other) / Failure detection (Other) /
               Quality control (Other) / Semantic segmentation (Other) /
               Uncertainty estimation (Other)},
  cin       = {E230 / E290},
  ddc       = {610},
  cid       = {I:(DE-He78)E230-20160331 / I:(DE-He78)E290-20160331},
  pnm       = {315 - Bildgebung und Radioonkologie (POF4-315)},
  pid       = {G:(DE-HGF)POF4-315},
  typ       = {PUB:(DE-HGF)16},
  pubmed    = {pmid:39657400},
  doi       = {10.1016/j.media.2024.103392},
  url       = {https://inrepo02.dkfz.de/record/294922},
}