% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@ARTICLE{Gruber:300314,
      author       = {S. Gruber and F. Bach},
      title        = {{Optimizing Estimators of Squared Calibration Errors in
                      Classification}},
      journal      = {Transactions on Machine Learning Research},
      volume       = {nn},
      issn         = {2835-8856},
      address      = {Amherst, Massachusetts},
      publisher    = {OpenReview.net},
      reportid     = {DKFZ-2025-00760},
      pages        = {nn},
      year         = {2025},
      note         = {Epub ahead of print},
      abstract     = {In this work, we propose a mean-squared error-based risk
                      that enables the comparison and optimization of estimators
                      of squared calibration errors in practical settings.
                      Improving the calibration of classifiers is crucial for
                      enhancing the trustworthiness and interpretability of
                      machine learning models, especially in sensitive
                      decision-making scenarios. Although various calibration
                      (error) estimators exist in the current literature, there is
                      a lack of guidance on selecting the appropriate estimator
                      and tuning its hyperparameters. By leveraging the bilinear
                      structure of squared calibration errors, we reformulate
                      calibration estimation as a regression problem with
                      independent and identically distributed (i.i.d.) input
                      pairs. This reformulation allows us to quantify the
                      performance of different estimators even for the most
                      challenging calibration criterion, known as canonical
                      calibration. Our approach advocates for a
                      training-validation-testing pipeline when estimating a
                      calibration error on an evaluation dataset. We demonstrate
                      the effectiveness of our pipeline by optimizing existing
                      calibration estimators and comparing them with novel kernel
                      ridge regression-based estimators on standard image
                      classification tasks.},
      keywords     = {Machine Learning (cs.LG) / Machine Learning (stat.ML) /
                      FOS: Computer and information sciences},
      cin          = {FM01},
      ddc          = {004},
      cid          = {I:(DE-He78)FM01-20160331},
      pnm          = {899 - ohne Topic (POF4-899)},
      pid          = {G:(DE-HGF)POF4-899},
      typ          = {PUB:(DE-HGF)16},
      doi          = {10.48550/arXiv.2410.07014},
      url          = {https://inrepo02.dkfz.de/record/300314},
}