% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Journal article introducing S4VDPCA (sparse PCA combined with stability
% selection), Bioinformatics 31(16), 2015.
% Author names are stored in "Last, First" form without affiliation markers so
% that name parsing, sorting and label generation see clean surnames.
% The fields reportid, cin, ddc, cid, pnm, pid, typ, pubmed and pmc are not
% standard BibTeX fields; they appear to be metadata from the exporting
% repository (see url) and are ignored by styles that do not know them.
@article{Sill:127522,
  author    = {Sill, M. and Saadati, M. and Benner, A.},
  title     = {Applying stability selection to consistently estimate
               sparse principal components in high-dimensional molecular
               data},
  journal   = {Bioinformatics},
  volume    = {31},
  number    = {16},
  issn      = {1460-2059},
  address   = {Oxford},
  publisher = {Oxford Univ. Press},
  reportid  = {DKFZ-2017-03545},
  pages     = {2683--2690},
  year      = {2015},
  abstract  = {Principal component analysis (PCA) is a basic tool often
               used in bioinformatics for visualization and dimension
               reduction. However, it is known that PCA may not
               consistently estimate the true direction of maximal
               variability in high-dimensional, low sample size settings,
               which are typical for molecular data. Assuming that the
               underlying signal is sparse, i.e. that only a fraction of
               features contribute to a principal component (PC), this
               estimation consistency can be retained. Most existing sparse
               PCA methods use L1-penalization, i.e. the lasso, to perform
               feature selection. But, the lasso is known to lack variable
               selection consistency in high dimensions and therefore a
               subsequent interpretation of selected features can give
               misleading results. We present S4VDPCA, a sparse PCA method
               that incorporates a subsampling approach, namely stability
               selection. S4VDPCA can consistently select the truly
               relevant variables contributing to a sparse PC while also
               consistently estimate the direction of maximal variability.
               The performance of the S4VDPCA is assessed in a simulation
               study and compared to other PCA approaches, as well as to a
               hypothetical oracle PCA that knows the truly relevant
               features in advance and thus finds optimal, unbiased sparse
               PCs. S4VDPCA is computationally efficient and performs best
               in simulations regarding parameter estimation consistency
               and feature selection consistency. Furthermore, S4VDPCA is
               applied to a publicly available gene expression data set of
               medulloblastoma brain tumors. Features contributing to the
               first two estimated sparse PCs represent genes significantly
               over-represented in pathways typically deregulated between
               molecular subgroups of medulloblastoma. Software is available
               at https://github.com/mwsill/s4vdpca. Contact:
               m.sill@dkfz.de. Supplementary data are available at
               Bioinformatics online.},
  cin       = {C060},
  ddc       = {004},
  cid       = {I:(DE-He78)C060-20160331},
  pnm       = {313 - Cancer risk factors and prevention (POF3-313)},
  pid       = {G:(DE-HGF)POF3-313},
  typ       = {PUB:(DE-HGF)16},
  pubmed    = {pmid:25861969},
  pmc       = {pmc:PMC4528629},
  doi       = {10.1093/bioinformatics/btv197},
  url       = {https://inrepo02.dkfz.de/record/127522},
}