% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@ARTICLE{Stoyan:132674,
author = {D. Stoyan and A. Pommerening and M. Hummel and A.
          Kopp-Schneider},
title = {{M}ultiple-rater kappas for binary data: {M}odels and
         interpretation},
journal = {Biometrical Journal},
volume = {60},
number = {2},
issn = {0323-3847},
address = {Berlin},
publisher = {Wiley-VCH},
reportid = {DKFZ-2018-00334},
pages = {381--394},
year = {2018},
abstract = {Interrater agreement on binary measurements with more
            than two raters is often assessed using Fleiss' κ, which
            is known to be difficult to interpret. In situations
            where the same raters rate all items, however, the far
            less well-known κ suggested by Conger, Hubert, and
            Schouten is more appropriate. We aim to support the
            interpretation of these statistics by investigating
            various models or scenarios of rating. Our analysis,
            which is restricted to binary data, shows that
            conclusions about interrater agreement drawn from κ
            depend heavily on the population of items or subjects
            considered, even if the raters behave identically. The
            standard scale proposed by Landis and Koch, which
            verbally interprets numerical values of κ, appears to be
            rather subjective. On the basis of one of the models for
            rater behavior, we suggest an alternative verbal
            interpretation of κ. Finally, we reconsider a classical
            example from pathology to illustrate the application of
            our methods and models. We also look for subgroups of
            raters with similar rating behavior using hierarchical
            clustering.},
cin = {C060},
ddc = {570},
cid = {I:(DE-He78)C060-20160331},
pnm = {313 - Cancer risk factors and prevention (POF3-313)},
pid = {G:(DE-HGF)POF3-313},
typ = {PUB:(DE-HGF)16},
pubmed = {pmid:29280179},
doi = {10.1002/bimj.201600267},
url = {https://inrepo02.dkfz.de/record/132674},
}
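
% Note on the statistics named in the abstract above (standard textbook
% definitions, not taken from this record): for N items each rated on a
% binary scale by the same m raters, with x_i the number of positive
% ratings for item i, Fleiss' κ is
%
%   \bar{p} = \frac{1}{Nm} \sum_{i=1}^{N} x_i, \qquad
%   \bar{P} = \frac{1}{N} \sum_{i=1}^{N}
%             \frac{x_i (x_i - 1) + (m - x_i)(m - x_i - 1)}{m (m - 1)},
%
%   \kappa = \frac{\bar{P} - P_e}{1 - P_e}, \qquad
%   P_e = \bar{p}^{2} + (1 - \bar{p})^{2}.
%
% The Conger/Hubert/Schouten κ mentioned in the abstract differs in the
% chance term: in a common formulation (see the paper for the exact
% definition used there), P_e is built from each rater's individual
% marginal rate p_r, averaged over all rater pairs,
%
%   P_e = \frac{2}{m(m-1)} \sum_{r<s}
%         \bigl( p_r p_s + (1 - p_r)(1 - p_s) \bigr).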