% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Kahl et al. 2024, "ValUES" -- arXiv preprint 2401.08501 (cs.CV).
% Typed as a misc entry with eprint fields: the record has no journal, so an
% article entry would trigger a missing-field warning in standard styles.
% Author names are stored as plain "Last, First" with no markup; the
% superscript asterisks of the repository export are dropped because BibTeX
% would parse them as part of each surname.
% NOTE(review): the asterisks presumably flagged institute-affiliated
% authors -- confirm with the repository if that information is needed.
% reportid, cin, cid, pnm, pid and typ are repository-specific fields that
% bibliography styles ignore; they are kept verbatim.
@misc{Kahl:294883,
  author        = {Kahl, K.-C. and Lüth, C. and Zenk, M. and
                   Maier-Hein, K. and Jaeger, P. F.},
  title         = {{ValUES}: A Framework for Systematic Validation
                   of Uncertainty Estimation in Semantic Segmentation},
  publisher     = {arXiv},
  year          = {2024},
  eprint        = {2401.08501},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CV},
  doi           = {10.48550/ARXIV.2401.08501},
  url           = {https://inrepo02.dkfz.de/record/294883},
  note          = {Published as a conference paper at ICLR 2024},
  abstract      = {Uncertainty estimation is an essential and heavily-studied
                   component for the reliable application of semantic
                   segmentation methods. While various studies exist claiming
                   methodological advances on the one hand, and successful
                   application on the other hand, the field is currently
                   hampered by a gap between theory and practice leaving
                   fundamental questions unanswered: Can data-related and
                   model-related uncertainty really be separated in practice?
                   Which components of an uncertainty method are essential for
                   real-world performance? Which uncertainty method works well
                   for which application? In this work, we link this research
                   gap to a lack of systematic and comprehensive evaluation of
                   uncertainty methods. Specifically, we identify three key
                   pitfalls in current literature and present an evaluation
                   framework that bridges the research gap by providing 1) a
                   controlled environment for studying data ambiguities as well
                   as distribution shifts, 2) systematic ablations of relevant
                   method components, and 3) test-beds for the five predominant
                   uncertainty applications: OoD-detection, active learning,
                   failure detection, calibration, and ambiguity modeling.
                   Empirical results on simulated as well as real-world data
                   demonstrate how the proposed framework is able to answer the
                   predominant questions in the field revealing for instance
                   that 1) separation of uncertainty types works on simulated
                   data but does not necessarily translate to real-world data,
                   2) aggregation of scores is a crucial but currently
                   neglected component of uncertainty methods, 3) While
                   ensembles are performing most robustly across the different
                   downstream tasks and settings, test-time augmentation often
                   constitutes a light-weight alternative. Code is at:
                   https://github.com/IML-DKFZ/values},
  keywords      = {Computer Vision and Pattern Recognition (cs.CV) (Other) /
                   FOS: Computer and information sciences (Other)},
  reportid      = {DKFZ-2024-02593},
  cin           = {E290 / E230},
  cid           = {I:(DE-He78)E290-20160331 / I:(DE-He78)E230-20160331},
  pnm           = {315 - Bildgebung und Radioonkologie (POF4-315)},
  pid           = {G:(DE-HGF)POF4-315},
  typ           = {PUB:(DE-HGF)25},
}