update readme for v0.4.9

Zilong-Li · Dec 26, 2024 · e2ed3a0 · e2ed3a0
1 parent 9544799
commit e2ed3a0
Show file tree

Hide file tree

Showing 5 changed files with 99 additions and 101 deletions.
diff --git a/PCAone.pdf b/PCAone.pdf
diff --git a/README.org b/README.org
@@ -1,5 +1,5 @@
 #+TITLE: Principal Component Analysis All in One
-#+subtitle: Version 0.4.8
+#+subtitle: Version 0.4.9
 #+author: Zilong Li 
 #+email: [email protected]
 #+options: toc:2 num:nil email:t -:nil ^:nil
@@ -13,7 +13,6 @@
 #+latex_header: \hypersetup{colorlinks=true, linkcolor=blue}
 #+latex: \clearpage
 
-
 [[https://github.com/Zilong-Li/PCAone/actions/workflows/linux.yml/badge.svg]]
 [[https://github.com/Zilong-Li/PCAone/actions/workflows/mac.yml/badge.svg]]
 [[https://bioconda.github.io/recipes/pcaone/README.html][https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat]]
@@ -43,11 +42,9 @@ missingness and uncertainty. The PDF manual can be downloaded [[https://github.c
 * Table of Contents :toc:quote:noexport:
 #+BEGIN_QUOTE
 - [[#introduction][Introduction]]
-- [[#cite-the-work][Cite the work]]
 - [[#features][Features]]
+- [[#cite-the-work][Cite the work]]
 - [[#quick-start][Quick start]]
-  - [[#download-pcaone][Download PCAone]]
-  - [[#download-example-dataset][Download example dataset]]
 - [[#installation][Installation]]
   - [[#download-compiled-binary][Download compiled binary]]
   - [[#via-conda][Via Conda]]
@@ -72,16 +69,6 @@ missingness and uncertainty. The PDF manual can be downloaded [[https://github.c
 - [[#acknowledgements][Acknowledgements]]
 #+END_QUOTE
 
-* Cite the work
-
-- If you use PCAone, please first cite our paper on genome reseach [[https://genome.cshlp.org/content/early/2023/10/05/gr.277525.122][Fast and accurate out-of-core PCA framework for large scale biobank data]].
-
-- If using the EMU algorithm, please also cite [[https://academic.oup.com/bioinformatics/article/37/13/1868/6103565][Large-scale inference of population structure in presence of missingness using PCA]].
-
-- If using the PCAngsd algorithm, please also cite [[https://www.genetics.org/content/210/2/719][Inferring Population Structure and Admixture Proportions in Low-Depth NGS Data]].
-
-- If using the ancestry ajusted LD statistics for pruning and clumping, please also cite [[https://doi.org/10.1101/2024.05.02.592187][Measuring linkage disequilibrium and improvement of pruning and clumping in structured populations]].
-
 * Features
 
 See [[file:CHANGELOG.org][change log]] here.
@@ -93,11 +80,21 @@ See [[file:CHANGELOG.org][change log]] here.
 - Supports a general comma separated CSV format for single cell RNA-seq or bulk RNA-seq data compressed by [[https://github.com/facebook/zstd][zstd]].
 - Supports [[https://github.com/Rosemeis/emu][EMU]] algorithm for scenario with large proportion of missingness.
 - Supports [[https://github.com/Rosemeis/pcangsd][PCAngsd]] algorithm for low coverage sequencing scenario with genotype likelihood as input.
-- Novel [[https://www.biorxiv.org/content/10.1101/2024.05.02.592187v1][LD]] prunning and clumping method for admixed population.
+- Novel [[https://www.biorxiv.org/content/10.1101/2024.05.02.592187v1][LD]] prunning and clumping method that accounts for population structure in the data.
+- Projection support for data with missingness.
+- HWE test taking population structure into account.
 
-* Quick start
+* Cite the work
 
-We can run the following on Linux to have a quick start.
+- If you use PCAone, please first cite our paper on genome reseach [[https://genome.cshlp.org/content/early/2023/10/05/gr.277525.122][Fast and accurate out-of-core PCA framework for large scale biobank data]].
+
+- If using the EMU algorithm, please also cite [[https://academic.oup.com/bioinformatics/article/37/13/1868/6103565][Large-scale inference of population structure in presence of missingness using PCA]].
+
+- If using the PCAngsd algorithm, please also cite [[https://www.genetics.org/content/210/2/719][Inferring Population Structure and Admixture Proportions in Low-Depth NGS Data]].
+
+- If using the ancestry ajusted LD statistics for pruning and clumping, please also cite [[https://doi.org/10.1101/2024.05.02.592187][Measuring linkage disequilibrium and improvement of pruning and clumping in structured populations]].
+
+* Quick start
 
 #+begin_src shell
 pkg=https://github.com/Zilong-Li/PCAone/releases/latest/download/PCAone-avx2-Linux.zip
@@ -112,7 +109,7 @@ plot(d[,1:2+2], col=factor(d[,1]), xlab="PC1", ylab="PC2");
 legend("topright", legend=unique(d[,1]), col=1:4, pch = 21, cex=1.2);'
 #+end_src
 
-We will find those files in current directory.
+You will find these files in current directory.
 
 #+begin_src shell
 .
@@ -125,9 +122,17 @@ We will find those files in current directory.
 └── pcaone.log        # log file
 #+end_src
 
-** Download PCAone
+\newpage
 
-For most modern CPUs and Linux systems, download the one named with =avx2=.
+* Installation
+
+There are 3 ways to install PCAone.
+
+** Download compiled binary
+
+There are compiled binaries provided for both Linux and Mac platform. Check
+[[https://github.com/Zilong-Li/PCAone/releases][the releases page]] to download one. For most modern CPUs and Linux systems,
+download the one named with =avx2=.
 
 #+begin_src shell
 pkg=https://github.com/Zilong-Li/PCAone/releases/latest/download/PCAone-avx2-Linux.zip
@@ -143,25 +148,6 @@ wget $pkg || curl -LO $pkg
 unzip -o PCAone-x64-Linux.zip
 #+end_src
 
-** Download example dataset
-
-#+begin_src shell
-pkg=http://popgen.dk/zilong/datahub/pca/example.tar.gz
-wget $pkg || curl -LO $pkg
-tar -xzf example.tar.gz && rm -f example.tar.gz
-#+end_src
-
-You should find a fold named =example= with some example data.
-
-* Installation
-
-There are 3 ways to install PCAone.
-
-** Download compiled binary
-
-There are compiled binaries provided for both Linux and Mac platform. Check
-[[https://github.com/Zilong-Li/PCAone/releases][the releases page]] to download one.
-
 ** Via Conda
 
 PCAone is also available from [[https://anaconda.org/bioconda/pcaone][bioconda]].
@@ -235,62 +221,71 @@ If this doesn't work because the server is too outdated, run =make clean && make
 * Documentation
 ** Options
 
-Run =./PCAone --help= to see all options including hidden advanced options. The below are common useful options.
+Run =PCAone --groff > pcaone.1 && man ./pcaone.1= or =PCAone --help= to read the manual. Here are common options.
 
 #+begin_src example
-Main options:
+General options:
   -h, --help                     print all options including hidden advanced options
-  -d, --svd arg (=2)             SVD method to be applied. default 2 is recommended for big data.
-                                 0: the Implicitly Restarted Arnoldi Method (IRAM)
-                                 1: the Yu's single-pass Randomized SVD with power iterations
-                                 2: the accurate window-based Randomized SVD method (PCAone)
-                                 3: the full Singular Value Decomposition.
-  -b, --bfile arg                prefix to PLINK .bed/.bim/.fam files
-  -B, --binary arg               path of binary file
-  -c, --csv arg                  path of comma seperated CSV file compressed by zstd
-  -g, --bgen arg                 path of BGEN file compressed by gzip/zstd
-  -G, --beagle arg               path of BEAGLE file compressed by gzip
-  --USV arg                      prefix to PCAone .eigvecs/.eigvals/.loadings/.mbim
-  --read-U arg                   path of file with left singular vectors (.eigvecs)
-  --read-V arg                   path of file with right singular vectors (.loadings)
-  --read-S arg                   path of file with eigen values (.eigvals)
-  -k, --pc arg (=10)             top k principal components (PCs) to be calculated
   -m, --memory arg (=0)          RAM usage in GB unit for out-of-core mode. default is in-core mode
   -n, --threads arg (=12)        the number of threads to be used
-  -o, --out arg (=pcaone)        prefix to output files. default [pcaone]
-  -p, --maxp arg (=40)           maximum number of power iterations for RSVD algorithm
-  -S, --no-shuffle               do not shuffle columns of data for --svd 2 (if not locally correlated)
-  -v, --verbose                  verbose message output
-  -V, --printv                   output the right eigenvectors with suffix .loadings
-  -C, --scale arg (=0)           do scaling for input file.
-                                 0: do just centering
-                                 1: do log transformation eg. log(x+0.01) for RNA-seq data
-                                 2: do count per median log transformation (CPMED) for scRNAs
-  --emu                          use EMU algorithm for genotype input with missingness
-  --pcangsd                      use PCAngsd algorithm for genotype likelihood input
+  -v, --verbose arg (=1)         verbosity level for logs. any level x includes messages for all levels (1...x). Options are
+                                 0: silent, no messages on screen;
+                                 1: concise messages to screen;
+                                 2: more verbose information;
+                                 3: enable debug information.
+
+PCA algorithms:
+  -d, --svd arg (=2)             SVD method to be applied. default 2 is recommended for big data. Options are
+                                 0: the Implicitly Restarted Arnoldi Method (IRAM);
+                                 1: the Yu's single-pass Randomized SVD with power iterations;
+                                 2: the accurate window-based Randomized SVD method (PCAone);
+                                 3: the full Singular Value Decomposition.
+  -k, --pc arg (=10)             top k principal components (PCs) to be calculated
+  -C, --scale arg (=0)           do scaling for input file. Options are
+                                 0: do just centering;
+                                 1: do log transformation eg. log(x+0.01) for RNA-seq data;
+                                 2: do count per median log transformation (CPMED) for scRNAs.
+  -p, --maxp arg (=40)           maximum number of power iterations for RSVD algorithm.
+  -S, --no-shuffle               do not shuffle columns of data for --svd 2 (if not locally correlated).
+  --emu                          use EMU algorithm for genotype input with missingness.
+  --pcangsd                      use PCAngsd algorithm for genotype likelihood input.
+
+Input options:
+  -b, --bfile arg                prefix of PLINK .bed/.bim/.fam files.
+  -B, --binary arg               path of binary file.
+  -c, --csv arg                  path of comma seperated CSV file compressed by zstd.
+  -g, --bgen arg                 path of BGEN file compressed by gzip/zstd.
+  -G, --beagle arg               path of BEAGLE file compressed by gzip.
+  -f, --match-bim arg            the .mbim file to be matched, where the 7th column is allele frequency.
+  --USV arg                      prefix of PCAone .eigvecs/.eigvals/.loadings/.mbim.
+
+Output options:
+  -o, --out arg (=pcaone)        prefix of output files. default [pcaone].
+  -V, --printv                   output the right eigenvectors with suffix .loadings.
+  -D, --ld                       output a binary matrix for downstream LD related analysis.
+  -R, --print-r2                 print LD R2 to *.ld.gz file for pairwise SNPs within a window controlled by --ld-bp.
+
+Misc options:
   --maf arg (=0)                 exclude variants with MAF lower than this value
-  --match-bim arg                the .mbim file to be matched, where the 7th column is allele frequency
-  --project arg (=0)             project the new samples onto the existing PCs.
-                                 0: disabled
-                                 1: by multiplying the loadings with mean imputation for missing genotypes
-                                 2: by solving the least squares system Vx=g. skip sites with missingness
-                                 3: by Augmentation, Decomposition and Procrusters transformation
-  --inbreed arg (=0)             compute the inbreeding coefficient accounting for population structure.
-                                 0: disabled
-                                 1: compute per-site inbreeding coefficient and HWE test
-  --ld                           output a binary matrix for downstream LD related analysis
-  --ld-r2 arg (=0)               r2 cutoff for LD-based pruning. (usually 0.2)
-  --ld-bp arg (=0)               physical distance threshold in bases for LD. (usually 1000000)
-  --ld-stats arg (=0)            statistics to calculate LD r2 for pairwise SNPs.
-                                 0: the ancestry adjusted, i.e. correlation between residuals
-                                 1: the standard, i.e. correlation between two alleles
-  --print-r2                     print LD r2 to *.ld.gz file for pairwise SNPs within a window
-  --clump arg                    assoc-like file with target variants and pvalues for clumping
-  --clump-names arg (=CHR,BP,P)  column names in assoc-like file for locating chr, pos and pvalue
-  --clump-p1 arg (=0.0001)       significance threshold for index SNPs
-  --clump-p2 arg (=0.01)         secondary significance threshold for clumped SNPs
-  --clump-r2 arg (=0.5)          r2 cutoff for LD-based clumping
-  --clump-bp arg (=250000)       physical distance threshold in bases for clumping
+  --project arg (=0)             project the new samples onto the existing PCs. Options are
+                                 0: disabled;
+                                 1: by multiplying the loadings with mean imputation for missing genotypes;
+                                 2: by solving the least squares system Vx=g. skip sites with missingness;
+                                 3: by Augmentation, Decomposition and Procrusters transformation.
+  --inbreed arg (=0)             compute the inbreeding coefficient accounting for population structure. Options are
+                                 0: disabled;
+                                 1: compute per-site inbreeding coefficient and HWE test.
+  --ld-r2 arg (=0)               R2 cutoff for LD-based pruning (usually 0.2).
+  --ld-bp arg (=0)               physical distance threshold in bases for LD window (usually 1000000).
+  --ld-stats arg (=0)            statistics to compute LD R2 for pairwise SNPs. Options are
+                                 0: the ancestry adjusted, i.e. correlation between residuals;
+                                 1: the standard, i.e. correlation between two alleles.
+  --clump arg                    assoc-like file with target variants and pvalues for clumping.
+  --clump-names arg (=CHR,BP,P)  column names in assoc-like file for locating chr, pos and pvalue.
+  --clump-p1 arg (=0.0001)       significance threshold for index SNPs.
+  --clump-p2 arg (=0.01)         secondary significance threshold for clumped SNPs.
+  --clump-r2 arg (=0.5)          r2 cutoff for LD-based clumping.
+  --clump-bp arg (=250000)       physical distance threshold in bases for clumping.
 #+end_src
 
 \newpage

diff --git a/misc/architecture.org b/misc/architecture.org
@@ -13,6 +13,8 @@ listfonts
 
 * UML 
 
+generated by plantuml-1.2024.4.jar
+
 #+begin_src plantuml :file architecture.png
 skinparam dpi 300
 skinparam defaultFontName FiraCode Nerd Font Med
@@ -64,10 +66,11 @@ package "PCA methods" as PCA {
 }
 
 class "PCA Related" as PCAem {
-  +EMU()
-  +PCAngsd()
-  +LD_pruning()
-  +LD_clumping()
+  +EMU
+  +PCAngsd
+  +Projection
+  +LD
+  +HWE
 }
 FullSVD <-- Data: <:astonished:>
 IRAM <-- Data: <:smile:>

diff --git a/misc/architecture.png b/misc/architecture.png
diff --git a/src/Cmd.cpp b/src/Cmd.cpp
@@ -16,13 +16,13 @@ Param::Param(int argc, char **argv) {
   std::string copyr{"PCA All In One (v" + (std::string)VERSION + ")        https://github.com/Zilong-Li/PCAone\n" +
                     "(C) 2021-2024 Zilong Li        GNU General Public License v3\n" +
   "\n" +
-                    "Usage: use PLINK files as input and apply default window-based RSVD method:\n" +
+                    "Usage: 1) use PLINK files as input and apply default window-based RSVD method\n" +
                     "       $ PCAone -b plink \n\n" +
-                    "       use CSV file as input and apply the Implicitly Restarted Arnoldi Method:\n" +
+                    "       2) use CSV file as input and apply the Implicitly Restarted Arnoldi Method\n" +
                     "       $ PCAone -c csv.zst -d 0 \n\n" +
-                    "       compute ancestry adjusted LD matrix:\n" +
+                    "       3) compute ancestry adjusted LD matrix and R2\n" +
                     "       $ PCAone -b plink -k 2 -D -o adj \n" +
-                    "       $ PCAone -B adj -f adj.mbim --print-r2 -o adj \n" +
+                    "       $ PCAone -B adj -f adj.mbim -R" +
   "\n"};
   OptionParser opts(copyr);
   opts.add<Value<std::string>, Attribute::headline>("","PCAone","General options:");
@@ -81,7 +81,7 @@ Param::Param(int argc, char **argv) {
   opts.add<Value<std::string>>("o", "out", "prefix of output files. default [pcaone].", fileout, &fileout);
   opts.add<Switch>("V", "printv", "output the right eigenvectors with suffix .loadings.", &printv);
   opts.add<Switch>("D", "ld", "output a binary matrix for downstream LD related analysis.", &ld);
-  opts.add<Switch>("R", "print-r2", "print LD r2 to *.ld.gz file for pairwise SNPs within a window.", &print_r2);
+  opts.add<Switch>("R", "print-r2", "print LD R2 to *.ld.gz file for pairwise SNPs within a window controlled by --ld-bp.", &print_r2);
 
   opts.add<Value<std::string>, Attribute::headline>("","MISC","Misc options:");
   opts.add<Value<double>>("", "maf", "exclude variants with MAF lower than this value", maf, &maf);
@@ -93,8 +93,8 @@ Param::Param(int argc, char **argv) {
   opts.add<Value<int>>("", "inbreed", "compute the inbreeding coefficient accounting for population structure. Options are\n"
                                       "0: disabled;\n"
                                       "1: compute per-site inbreeding coefficient and HWE test.\n", inbreed, &inbreed);
-  opts.add<Value<double>>("", "ld-r2", "r2 cutoff for LD-based pruning (usually 0.2).", ld_r2, &ld_r2);
-  opts.add<Value<uint>>("", "ld-bp", "physical distance threshold in bases for LD (usually 1000000).", ld_bp, &ld_bp);
+  opts.add<Value<double>>("", "ld-r2", "R2 cutoff for LD-based pruning (usually 0.2).", ld_r2, &ld_r2);
+  opts.add<Value<uint>>("", "ld-bp", "physical distance threshold in bases for LD window (usually 1000000).", ld_bp, &ld_bp);
   opts.add<Value<int>>("", "ld-stats", "statistics to compute LD R2 for pairwise SNPs. Options are\n"
                                        "0: the ancestry adjusted, i.e. correlation between residuals;\n"
                                        "1: the standard, i.e. correlation between two alleles.\n", ld_stats, &ld_stats);