% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@ARTICLE{Dexl:307499,
      author       = {J. Dexl and S. Gatidis and M. Früh and K. Jeblick and A.
                      Mittermeier and A. T. Stüber and B. Schachtner and J.
                      Topalis and M. P. Fabritius and S. Gu and G. K. Murugesan
                      and J. VanOss and J. Ye and J. He and A. Alloula and B. W.
                      Papież and Z. Mesbah and R. Modzelewski and M. Hadlich and
                      Z. Marinov and R. Stiefelhagen and F. Isensee and K.
                      H. Maier-Hein and A. Galdran and K. Nikolaou and C.
                      la Fougère and M. Kim and N. Kallenberg and J.
                      Kleesiek and K. Herrmann and R. Werner and M. Ingrisch
                      and C. C. Cyran and T. Küstner},
      title        = {{AutoPET Challenge on Fully Automated Lesion
                      Segmentation in Oncologic PET/CT Imaging, Part 2:
                      Domain Generalization}},
      journal      = {Journal of Nuclear Medicine},
      volume       = {nn},
      issn         = {0097-9058},
      address      = {New York, NY},
      publisher    = {Society of Nuclear Medicine},
      reportid     = {DKFZ-2026-00004},
      pages        = {nn},
      year         = {2025},
      note         = {Epub ahead of print},
      abstract     = {This article reports the results of the second iteration of
                      the autoPET challenge on automated lesion segmentation in
                      whole-body PET/CT, held in conjunction with the 26th
                      International Conference on Medical Image Computing and
                      Computer Assisted Intervention in 2023. In contrast to the
                      first autoPET challenge, which served as a proof of concept,
                      this study investigates whether machine learning-based
                      segmentation models trained on data from a single source can
                      maintain performance across clinically relevant variations
                      in PET/CT data, reflecting the demands of real-world
                      deployment. Methods: A comprehensive biomedical segmentation
                      challenge on PET/CT domain generalization was designed and
                      conducted. Participants were tasked with training
                      machine learning models on annotated whole-body
                      18F-FDG data (n = 1,014). These models were then
                      evaluated on a test set of 200 samples from 5
                      clinically relevant domains, covering variations in
                      institutions, pathologies, and populations, as well
                      as a different tracer. Performance was measured in
                      terms of average Dice similarity coefficient, average
                      false-positive volume, and average false-negative
                      volume. The best-performing teams received awards in
                      3 categories.
                      Furthermore, a detailed analysis was conducted after the
                      challenge, examining results across domains and unique
                      instances, along with a ranking analysis. Results:
                      Generalization from a single-source domain remains a
                      significant challenge. Seventeen international teams
                      successfully participated in the challenge. The
                      best-performing team reached an average Dice similarity
                      coefficient of 0.5038, a mean false-positive volume of
                      87.8388 mL, and a mean false-negative volume of 8.4154 mL on
                      the test set. nnU-Net was the most commonly used framework,
                      with most participants using a 3-dimensional U-Net. Despite
                      competitive in-domain results, out-of-domain performance
                      deteriorated substantially, particularly on pediatric and
                      prostate-specific membrane antigen data. Detailed error
                      analysis revealed frequent false positives due to
                      physiologic uptake and decreased sensitivity in
                      detecting small or low-uptake lesions. A
                      majority-vote ensemble offered minimal performance
                      gains, whereas an oracle ensemble indicated the
                      hypothetical gains achievable with ideal per-case
                      model selection. Ranking analysis
                      showed no single team consistently outperformed all others
                      across ranking schemes. Conclusion: The second autoPET
                      challenge provides a comprehensive evaluation of the current
                      state of automated PET/CT tumor segmentation,
                      highlighting both the progress made and the
                      persistent difficulty of single-source domain
                      generalization, as well as the need for diverse
                      public datasets to enhance algorithm robustness.},
      keywords     = {PET/CT, biomedical image analysis challenge, deep
                      learning, domain generalization, oncology,
                      segmentation},
      cin          = {E230 / TU01 / ED01},
      ddc          = {610},
      cid          = {I:(DE-He78)E230-20160331 / I:(DE-He78)TU01-20160331 /
                      I:(DE-He78)ED01-20160331},
      pnm          = {315 - Bildgebung und Radioonkologie (POF4-315)},
      pid          = {G:(DE-HGF)POF4-315},
      typ          = {PUB:(DE-HGF)16},
      pubmed       = {pmid:41469162},
      doi          = {10.2967/jnumed.125.270260},
      url          = {https://inrepo02.dkfz.de/record/307499},
}
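
% Note on the evaluation metrics named in the abstract above: as a minimal
% voxelwise sketch in standard notation (not the article's own statement of
% its definitions), for a binary predicted mask P, a reference mask G, and a
% per-voxel volume v in mL,
%
%   \mathrm{DSC}(P,G) = \frac{2\,|P \cap G|}{|P| + |G|}, \qquad
%   \mathrm{FPV}(P,G) = v\,|P \setminus G|, \qquad
%   \mathrm{FNV}(P,G) = v\,|G \setminus P|.
%
% Challenge implementations often restrict FPV and FNV to connected
% components that do not overlap the reference at all; the article's exact
% convention is not reproduced here.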