% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@article{Stoyan:132674,
      author       = {Stoyan, D. and Pommerening, A. and Hummel, M. and
                      Kopp-Schneider, A.},
      title        = {Multiple-Rater Kappas for Binary Data: Models and
                      Interpretation},
      journal      = {Biometrical Journal},
      volume       = {60},
      number       = {2},
      issn         = {0323-3847},
      address      = {Berlin},
      publisher    = {Wiley-VCH},
      reportid     = {DKFZ-2018-00334},
      pages        = {381--394},
      year         = {2018},
      abstract     = {Interrater agreement on binary measurements with more than
                      two raters is often assessed using Fleiss' κ, which is
                      known to be difficult to interpret. In situations where the
                      same raters rate all items, however, the far less known κ
                      suggested by Conger, Hubert, and Schouten is more
                      appropriate. We try to support the interpretation of these
                      characteristics by investigating various models or scenarios
                      of rating. Our analysis, which is restricted to binary data,
                      shows that conclusions concerning interrater agreement by κ
                      heavily depend on the population of items or subjects
                      considered, even if the raters have identical behavior. The
                      standard scale proposed by Landis and Koch, which verbally
                      interprets numerical values of κ, appears to be rather
                      subjective. On the basis of one of the models for rater
                      behavior, we suggest an alternative verbal interpretation
                      for kappa. Finally, we reconsider a classical example from
                      pathology to illustrate the application of our methods and
                      models. We also look for subgroups of raters with similar
                      rating behavior using hierarchical clustering.},
      cin          = {C060},
      ddc          = {570},
      cid          = {I:(DE-He78)C060-20160331},
      pnm          = {313 - Cancer risk factors and prevention (POF3-313)},
      pid          = {G:(DE-HGF)POF3-313},
      typ          = {PUB:(DE-HGF)16},
      pubmed       = {pmid:29280179},
      doi          = {10.1002/bimj.201600267},
      url          = {https://inrepo02.dkfz.de/record/132674},
}