% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Kahl et al. (2024), "ValUES": arXiv preprint record; the url field points at
% an institutional repository record (presumably the export source; confirm).
% - Typed as misc rather than article: no journal is given and the article type
%   requires one. The arXiv identity is carried by eprint/archiveprefix/
%   primaryclass (eprint number taken from the doi, class from the keywords).
% - Author names use the "Last, First" form with no markup. The export had a
%   superscript asterisk after every name (presumably an affiliation marker;
%   confirm against the repository record), which BibTeX parsed as part of
%   each surname.
% - Only the coined name ValUES is brace-protected in the title; the style
%   decides the casing of the remaining words.
% - reportid, cin, cid, pnm, pid and typ are repository-specific fields that
%   standard styles do not print.
@misc{Kahl:294883,
      author        = {Kahl, K.-C. and L{\"u}th, C. and Zenk, M. and
                       Maier-Hein, K. and Jaeger, P. F.},
      title         = {{ValUES}: A Framework for Systematic Validation of
                       Uncertainty Estimation in Semantic Segmentation},
      publisher     = {arXiv},
      reportid      = {DKFZ-2024-02593},
      year          = {2024},
      note          = {Published as a conference paper at ICLR 2024},
      abstract      = {Uncertainty estimation is an essential and heavily-studied
                       component for the reliable application of semantic
                       segmentation methods. While various studies exist claiming
                       methodological advances on the one hand, and successful
                       application on the other hand, the field is currently
                       hampered by a gap between theory and practice leaving
                       fundamental questions unanswered: Can data-related and
                       model-related uncertainty really be separated in practice?
                       Which components of an uncertainty method are essential for
                       real-world performance? Which uncertainty method works well
                       for which application? In this work, we link this research
                       gap to a lack of systematic and comprehensive evaluation of
                       uncertainty methods. Specifically, we identify three key
                       pitfalls in current literature and present an evaluation
                       framework that bridges the research gap by providing 1) a
                       controlled environment for studying data ambiguities as well
                       as distribution shifts, 2) systematic ablations of relevant
                       method components, and 3) test-beds for the five predominant
                       uncertainty applications: OoD-detection, active learning,
                       failure detection, calibration, and ambiguity modeling.
                       Empirical results on simulated as well as real-world data
                       demonstrate how the proposed framework is able to answer the
                       predominant questions in the field revealing for instance
                       that 1) separation of uncertainty types works on simulated
                       data but does not necessarily translate to real-world data,
                       2) aggregation of scores is a crucial but currently
                       neglected component of uncertainty methods, 3) While
                       ensembles are performing most robustly across the different
                       downstream tasks and settings, test-time augmentation often
                       constitutes a light-weight alternative. Code is at:
                       https://github.com/IML-DKFZ/values},
      keywords      = {Computer Vision and Pattern Recognition (cs.CV) (Other) /
                       FOS: Computer and information sciences (Other)},
      cin           = {E290 / E230},
      cid           = {I:(DE-He78)E290-20160331 / I:(DE-He78)E230-20160331},
      pnm           = {315 - Bildgebung und Radioonkologie (POF4-315)},
      pid           = {G:(DE-HGF)POF4-315},
      typ           = {PUB:(DE-HGF)25},
      eprint        = {2401.08501},
      archiveprefix = {arXiv},
      primaryclass  = {cs.CV},
      doi           = {10.48550/ARXIV.2401.08501},
      url           = {https://inrepo02.dkfz.de/record/294883},
}