forked from gabraham/flashpca
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathflashpca.Rd
124 lines (101 loc) · 3.66 KB
/
flashpca.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/flashpca.R
\name{flashpca}
\alias{flashpca}
\title{Principal Component Analysis using FlashPCA}
\usage{
flashpca(
X,
ndim = 10,
stand = c("binom2", "binom", "sd", "center", "none"),
divisor = c("p", "n1", "none"),
maxiter = 100,
tol = 1e-04,
seed = 1,
block_size = 1000,
verbose = FALSE,
do_loadings = FALSE,
check_geno = TRUE,
return_scale = TRUE
)
}
\arguments{
\item{X}{A numeric matrix to perform PCA on, or a
character string pointing to a PLINK dataset.}
\item{ndim}{Integer. How many dimensions to return in results.}
\item{stand}{A character string indicating how to standardise X before PCA,
one of "binom" (old Eigenstrat-style), "binom2" (new Eigenstrat-style),
"sd" (zero-mean unit-variance), "center" (zero mean), or "none".}
\item{divisor}{A character string indicating whether to divide the
eigenvalues by number of columns of X ("p"), the number of
rows of X minus 1 ("n1") or none ("none").}
\item{maxiter}{Integer. How many iterations to use in PCA.}
\item{tol}{Numeric. Tolerance for determining PCA convergence.}
\item{seed}{Integer. Seed for random number generator.}
\item{block_size}{Integer. Block size for PCA on PLINK files.}
\item{verbose}{logical. More verbose output.}
\item{do_loadings}{Logical. Whether to compute loadings (matrix V from SVD).}
\item{check_geno}{Logical. Whether to explicitly check if the matrix X
contains values other than {0, 1, 2}, when stand="binom". This can be
be set to FALSE if you are sure your matrix only contains these values
(only matters when using stand="binom").}
\item{return_scale}{Logical. Whether to return the means and standard
deviations used in standardizing the matrix X.}
}
\value{
\code{flashpca} returns a list containing the following components:
\describe{
\item{values:}{a numeric vector. The eigenvalues of X X' / m.}
\item{vectors:}{a numeric matrix. The eigenvectors of X X' / m.}
\item{projection:}{a numeric matrix. Equivalent to X V.}
\item{loadings:}{a numeric matrix. The matrix of variable loadings, i.e., V
from SVD.}
\item{scale:}{a list of two elements, ``center'' and ''scale'', which was
used to standardise the input matrix X.}
}
}
\description{
Principal Component Analysis using FlashPCA
}
\details{
The default decomposition is of X X' / m, where m is the number of SNPs
(the denominator can be changed using the 'divisor' argument).
The data is standardised by default. \code{stand = "binom"} uses the old Eigensoft
(Price 2006) formulation of
\deqn{p_j = sum_{i=1}^n X_{i,j} / (2 * n)}
\deqn{mean_j = 2 * p}
\deqn{sd_j = sqrt(p * (1 - p)),}
where j is the index for the SNP and i is the index for the sample.
Alternatively, `stand = "binom2"' uses the newer formula, which is
similar to the above except for
\deqn{sd_j = sqrt(2 * p * (1 - p)).}
}
\examples{
set.seed(123)
#######################
## HapMap3 chr1 example
data(hm3.chr1)
ndim <- 10
X <- scale2(hm3.chr1$bed)
f1 <- flashpca(X, ndim = ndim, stand = "none")
# prcomp's is too slow for this example
r <- eigen(tcrossprod(X) / ncol(X), symmetric = TRUE)
## Compare eigenvalues
eval <- cbind(r$values[1:ndim], f1$values)
cor(eval)
mean((eval[, 1] - eval[, 2])^2)
## Compare eigenvectors
diag(cor(r$vectors[, 1:ndim], f1$vectors))
####################################
# HapMap3 chr1 example, PLINK format
bedf <- gsub(
"\\\\.bed", "",
system.file("extdata", "data_chr1.bed", package = "flashpcaR")
)
f2 <- flashpca(bedf, ndim = ndim)
eval <- cbind(r$values[1:ndim], f2$values)
cor(eval)
mean((eval[, 1] - eval[, 2])^2)
## Compare eigenvectors
diag(cor(r$vectors[, 1:ndim], f2$vectors))
}