% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Rohde et al. (2021), journal article on optimizing the Mainzelliste software
% for privacy-preserving record linkage (article number 33, volume 19).
% NOTE(review): reportid, cin, ddc, cid, pnm, pid, typ and pubmed are not
% standard BibTeX fields; they look like repository export metadata and are
% kept verbatim -- standard styles ignore unknown fields.
@ARTICLE{Rohde:166796,
  author    = {Rohde, F. and Franke, M. and Sehili, Z. and Lablans, M. and
               Rahm, E.},
  title     = {Optimization of the {Mainzelliste} software for fast
               privacy-preserving record linkage},
  journal   = {Journal of Translational Medicine},
  volume    = {19},
  number    = {1},
  issn      = {1479-5876},
  address   = {London},
  publisher = {BioMed Central},
  reportid  = {DKFZ-2021-00126},
  pages     = {33},
  year      = {2021},
  abstract  = {Data analysis for biomedical research often requires a
               record linkage step to identify records from multiple data
               sources referring to the same person. Due to the lack of
               unique personal identifiers across these sources, record
               linkage relies on the similarity of personal data such as
               first and last names or birth dates. However, the exchange
               of such identifying data with a third party, as is the case
               in record linkage, is generally subject to strict privacy
               requirements. This problem is addressed by
               privacy-preserving record linkage (PPRL) and
               pseudonymization services. Mainzelliste is an open-source
               record linkage and pseudonymization service used to carry
               out PPRL processes in real-world use cases. We evaluate the
               linkage quality and performance of the linkage process using
               several real and near-real datasets with different
               properties w.r.t. size and error-rate of matching records.
               We conduct a comparison between (plaintext) record linkage
               and PPRL based on encoded records (Bloom filters).
               Furthermore, since the Mainzelliste software offers no
               blocking mechanism, we extend it by phonetic blocking as
               well as novel blocking schemes based on locality-sensitive
               hashing (LSH) to improve runtime for both standard and
               privacy-preserving record linkage. The Mainzelliste achieves
               high linkage quality for PPRL using field-level Bloom
               filters due to the use of an error-tolerant matching
               algorithm that can handle variances in names, in particular
               missing or transposed name compounds. However, due to the
               absence of blocking, the runtimes are unacceptable for real
               use cases with larger datasets. The newly implemented
               blocking approaches improve runtimes by orders of magnitude
               while retaining high linkage quality. We conduct the first
               comprehensive evaluation of the record linkage facilities of
               the Mainzelliste software and extend it with blocking
               methods to improve its runtime. We observed a very high
               linkage quality for both plaintext as well as encoded data
               even in the presence of errors. The provided blocking
               methods provide order of magnitude improvements regarding
               runtime performance thus facilitating the use in research
               projects with large datasets and many participants.},
  keywords  = {Blocking (Other) / Locality-sensitive hashing (Other) /
               Mainzelliste (Other) / Privacy-preserving record linkage
               (Other)},
  cin       = {E260},
  ddc       = {610},
  cid       = {I:(DE-He78)E260-20160331},
  pnm       = {315 - Bildgebung und Radioonkologie (POF4-315)},
  pid       = {G:(DE-HGF)POF4-315},
  typ       = {PUB:(DE-HGF)16},
  pubmed    = {pmid:33451317},
  doi       = {10.1186/s12967-020-02678-1},
  url       = {https://inrepo02.dkfz.de/record/166796},
}