% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Journal article record (Medical Image Analysis, vol. 101, 2025).
% Author names are stored in "Last, First" form without typographic markers
% so that BibTeX can parse, sort and abbreviate them correctly.
% The fields internalnote, reportid, cin, ddc, cid, pnm, pid, typ and pubmed
% are not standard BibTeX fields; unknown fields are ignored by bibliography
% styles. internalnote holds marker codes that contain the hash character,
% which LaTeX cannot typeset unescaped; they presumably are repository
% bookkeeping codes -- TODO confirm their meaning with the data provider.
@ARTICLE{Zenk:294922,
      author       = {Zenk, M. and Zimmerer, D. and Isensee, F. and Traub, J.
                      and Norajitra, T. and J{\"a}ger, P. F. and Maier-Hein, K.},
      title        = {Comparative benchmarking of failure detection methods in
                      medical image segmentation: Unveiling the role of
                      confidence aggregation},
      journal      = {Medical Image Analysis},
      volume       = {101},
      issn         = {1361-8415},
      address      = {Amsterdam [u.a.]},
      publisher    = {Elsevier Science},
      reportid     = {DKFZ-2024-02629},
      pages        = {103392},
      year         = {2025},
      note         = {Available online 30 November 2024},
      internalnote = {#EA:E230#LA:E230#},
      abstract     = {Semantic segmentation is an essential component of medical
                      image analysis research, with recent deep learning
                      algorithms offering out-of-the-box applicability across
                      diverse datasets. Despite these advancements, segmentation
                      failures remain a significant concern for real-world
                      clinical applications, necessitating reliable detection
                      mechanisms. This paper introduces a comprehensive
                      benchmarking framework aimed at evaluating failure detection
                      methodologies within medical image segmentation. Through our
                      analysis, we identify the strengths and limitations of
                      current failure detection metrics, advocating for the
                      risk-coverage analysis as a holistic evaluation approach.
                      Utilizing a collective dataset comprising five public 3D
                      medical image collections, we assess the efficacy of various
                      failure detection strategies under realistic test-time
                      distribution shifts. Our findings highlight the importance
                      of pixel confidence aggregation and we observe superior
                      performance of the pairwise Dice score (Roy et al., 2019)
                      between ensemble predictions, positioning it as a simple and
                      robust baseline for failure detection in medical image
                      segmentation. To promote ongoing research, we make the
                      benchmarking framework available to the community.},
      keywords     = {Distribution shift (Other) / Failure detection (Other) /
                      Quality control (Other) / Semantic segmentation (Other) /
                      Uncertainty estimation (Other)},
      cin          = {E230 / E290},
      ddc          = {610},
      cid          = {I:(DE-He78)E230-20160331 / I:(DE-He78)E290-20160331},
      pnm          = {315 - Bildgebung und Radioonkologie (POF4-315)},
      pid          = {G:(DE-HGF)POF4-315},
      typ          = {PUB:(DE-HGF)16},
      pubmed       = {pmid:39657400},
      doi          = {10.1016/j.media.2024.103392},
      url          = {https://inrepo02.dkfz.de/record/294922},
}