% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@ARTICLE{GhaffariLaleh:180070,
author = {N. Ghaffari Laleh and H. S. Muti and C. M. L. Loeffler and
A. Echle and O. L. Saldanha and F. Mahmood and M. Y. Lu and
C. Trautwein and R. Langer and B. Dislich and R. D. Buelow
and H. I. Grabsch and H. Brenner$^*$ and J. Chang-Claude$^*$
and E. Alwers$^*$ and T. J. Brinker$^*$ and F. Khader and D.
Truhn and N. T. Gaisa and P. Boor and M. Hoffmeister$^*$ and
V. Schulz and J. N. Kather},
title = {{B}enchmarking weakly-supervised deep learning pipelines
for whole slide classification in computational pathology.},
journal = {Medical image analysis},
volume = {79},
issn = {1361-8415},
address = {Amsterdam [u.a.]},
publisher = {Elsevier Science},
reportid = {DKFZ-2022-01069},
pages = {102474},
year = {2022},
abstract = {Artificial intelligence (AI) can extract visual information
from histopathological slides and yield biological insight
and clinical biomarkers. Whole slide images are cut into
thousands of tiles and classification problems are often
weakly-supervised: the ground truth is only known for the
slide, not for every single tile. In classical
weakly-supervised analysis pipelines, all tiles inherit the
slide label while in multiple-instance learning (MIL), only
bags of tiles inherit the label. However, it is still
unclear how these widely used but markedly different
approaches perform relative to each other. We implemented
and systematically compared six methods in six clinically
relevant end-to-end prediction tasks using data from N=2980
patients for training with rigorous external validation. We
tested three classical weakly-supervised approaches with
convolutional neural networks and vision transformers (ViT)
and three MIL-based approaches with and without an
additional attention module. Our results empirically
demonstrate that histological tumor subtyping of renal cell
carcinoma is an easy task in which all approaches achieve an
area under the receiver operating curve (AUROC) of above
0.9. In contrast, we report significant performance
differences for clinically relevant tasks of mutation
prediction in colorectal, gastric, and bladder cancer. In
these mutation prediction tasks, classical weakly-supervised
workflows outperformed MIL-based weakly-supervised methods
for mutation prediction, which is surprising given their
simplicity. This shows that new end-to-end image analysis
pipelines in computational pathology should be compared to
classical weakly-supervised methods. Also, these findings
motivate the development of new methods which combine the
elegant assumptions of MIL with the empirically observed
higher performance of classical weakly-supervised
approaches. We make all source codes publicly available at
https://github.com/KatherLab/HIA, allowing easy application
of all methods to any similar task.},
keywords = {Artificial intelligence (Other) / Computational pathology
(Other) / Convolutional neural networks (Other) /
Multiple-Instance Learning (Other) / Vision transformers
(Other) / Weakly-supervised deep learning (Other)},
cin = {C070 / C120 / HD01 / C020 / C140},
ddc = {610},
cid = {I:(DE-He78)C070-20160331 / I:(DE-He78)C120-20160331 /
I:(DE-He78)HD01-20160331 / I:(DE-He78)C020-20160331 /
I:(DE-He78)C140-20160331},
pnm = {313 - Krebsrisikofaktoren und Prävention (POF4-313)},
pid = {G:(DE-HGF)POF4-313},
typ = {PUB:(DE-HGF)16},
pubmed = {pmid:35588568},
doi = {10.1016/j.media.2022.102474},
url = {https://inrepo02.dkfz.de/record/180070},
}