From 4050a4d39fbaa1578288bf57ea063a0a349e3080 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Fri, 1 Sep 2023 14:58:11 +0200 Subject: [PATCH] update docs --- Rcpp/readme.org | 31 +++++----------------- Rcpp/vcfpp-read.cpp | 64 ++++++++++++++++++++++++++++++++++++++++----- doc/paper.org | 6 ++--- news.org | 4 +++ readme.org | 25 ++++++++++-------- test/bcf-reader.cpp | 49 +++++++++++++++++++++++++++++++--- 6 files changed, 132 insertions(+), 47 deletions(-) diff --git a/Rcpp/readme.org b/Rcpp/readme.org index 2a505d9..5495c6d 100644 --- a/Rcpp/readme.org +++ b/Rcpp/readme.org @@ -4,36 +4,19 @@ * Build R package -If you want to build a R package using vcfpp, here are the steps: +There is a R package https://github.com/Zilong-Li/vcfppR demonstrates +how to build your own R package with vcfpp. Here are the important steps: - download [[https://github.com/Zilong-Li/vcfpp/releases/latest][vcfpp.h]] and [[https://github.com/samtools/htslib][htslib]] in =src= folder. -- create an =Makevars= file in =src= folder with - #+begin_src R - PKG_CPPFLAGS = -I. -Ihtslib/ - PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) htslib/libhts.a -fPIC -lz -lbz2 -llzma -lcurl - - .PHONY: all - all : $(SHLIB) - $(SHLIB) : HTSLIB - - CC=$(shell "R CMD config CC") - CXX=$(shell "R CMD config CXX") - CPPFLAGS=$(shell "R CMD config CPPFLAGS") - LDFLAGS=$(shell "R CMD config LDFLAGS") - - HTSLIB: - cd htslib && autoreconf -i && ./configure --with-libdeflate=no && $(MAKE) libhts.a CXX="$(CXX)" CC="$(CC)" CPPFLAGS="$(CPPFLAGS) -fPIC " && cd .. - #+end_src - -Check https://github.com/rwdavies/STITCH for an example. +- copy the [[https://github.com/Zilong-Li/vcfppR/blob/main/src/Makevars][Makevars]] file in =src= folder * Write R scripts If you don't want to build a R package, then you just need to install -[[https://github.com/Zilong-Li/vcfpp/releases/latest][vcfpp.h]] and [[https://github.com/samtools/htslib][htslib]] in your R system enviroment. For example, assume -you have a conda enviroment =~/mambaforge/envs/R=, the system -envrionment are =~/mambaforge/envs/R/include= for header files and -=~/mambaforge/envs/R/lib= for library file. Then you can compile and run +[[https://github.com/Zilong-Li/vcfpp/releases/latest][vcfpp.h]] and [[https://github.com/samtools/htslib][htslib]] in your system environment. For example, assume +you have a conda environment =~/mambaforge/envs/R=, the system +environment are =~/mambaforge/envs/R/include= for header files and +=~/mambaforge/envs/R/lib= for library files. Then you can compile and run functions from the =vcfpp-read.cpp= file in R by #+begin_src R diff --git a/Rcpp/vcfpp-read.cpp b/Rcpp/vcfpp-read.cpp index bfb4c74..8593a34 100644 --- a/Rcpp/vcfpp-read.cpp +++ b/Rcpp/vcfpp-read.cpp @@ -1,8 +1,16 @@ #include -#include +#include "vcfpp.h" + using namespace Rcpp; using namespace std; +//' parse GT of a VCF file into tables in R +//' +//' @param vcffile path to the VCF file with index +//' @param region region to extract +//' @param samples samples to extract +//' @return A list of genotypes and other fixed fields in VCF +//' @export // [[Rcpp::export]] List tableGT(std::string vcffile, std::string region, std::string samples="-") { @@ -17,8 +25,6 @@ List tableGT(std::string vcffile, std::string region, std::string samples="-") for(int i = 0; i < nsnps; i++) { vcf.getNextVariant(var); - var.getGenotypes(gt); - GT[i] = gt; pos(i) = var.POS(); qual(i) = var.QUAL(); chr(i) = var.CHROM(); @@ -27,12 +33,21 @@ List tableGT(std::string vcffile, std::string region, std::string samples="-") alt(i) = var.ALT(); filter(i) = var.FILTER(); info(i) = var.INFO(); + var.getGenotypes(gt); + GT[i] = gt; } return List::create(Named("chr") = chr, Named("pos") = pos, Named("id") = id, Named("ref") = ref, Named("alt") = alt, Named("qual") = qual, Named("filter") = filter, Named("info") = info, Named("gt") = GT); } +//' parse GL of a VCF file into tables in R +//' +//' @param vcffile path to the VCF file with index +//' @param region region to extract +//' @param samples samples to extract +//' @return A list of genotypes and other fixed fields in VCF +//' @export // [[Rcpp::export]] List tableGL(std::string vcffile, std::string region, std::string samples = "-") { @@ -42,13 +57,11 @@ List tableGL(std::string vcffile, std::string region, std::string samples = "-") CharacterVector chr(nsnps), ref(nsnps), alt(nsnps), id(nsnps), filter(nsnps), info(nsnps); IntegerVector pos(nsnps); NumericVector qual(nsnps); - vector> GL(nsnps); // R only have double + vector> GL(nsnps); vector gl; for(int i = 0; i < nsnps; i++) { vcf.getNextVariant(var); - var.getFORMAT("GL",gl); - GL[i] = gl; pos(i) = var.POS(); qual(i) = var.QUAL(); chr(i) = var.CHROM(); @@ -57,9 +70,48 @@ List tableGL(std::string vcffile, std::string region, std::string samples = "-") alt(i) = var.ALT(); filter(i) = var.FILTER(); info(i) = var.INFO(); + var.getFORMAT("GL",gl); + GL[i] = gl; } return List::create(Named("chr") = chr, Named("pos") = pos, Named("id") = id, Named("ref") = ref, Named("alt") = alt, Named("qual") = qual, Named("filter") = filter, Named("info") = info, Named("gl") = GL); } + +//' parse PL of a VCF file into tables in R +//' +//' @param vcffile path to the VCF file with index +//' @param region region to extract +//' @param samples samples to extract +//' @return A list of genotypes and other fixed fields in VCF +//' @export +// [[Rcpp::export]] +List tablePL(std::string vcffile, std::string region, std::string samples = "-") +{ + vcfpp::BcfReader vcf(vcffile, samples); + vcfpp::BcfRecord var(vcf.header); + int nsnps = vcf.getRegionIndex(region); + CharacterVector chr(nsnps), ref(nsnps), alt(nsnps), id(nsnps), filter(nsnps), info(nsnps); + IntegerVector pos(nsnps); + NumericVector qual(nsnps); + vector> PL(nsnps); + vector pl; + for(int i = 0; i < nsnps; i++) + { + vcf.getNextVariant(var); + pos(i) = var.POS(); + qual(i) = var.QUAL(); + chr(i) = var.CHROM(); + id(i) = var.ID(); + ref(i) = var.REF(); + alt(i) = var.ALT(); + filter(i) = var.FILTER(); + info(i) = var.INFO(); + var.getFORMAT("PL",pl); + PL[i] = pl; + } + return List::create(Named("chr") = chr, Named("pos") = pos, Named("id") = id, Named("ref") = ref, + Named("alt") = alt, Named("qual") = qual, Named("filter") = filter, + Named("info") = info, Named("pl") = PL); +} diff --git a/doc/paper.org b/doc/paper.org index 1cf06e4..cd6944d 100644 --- a/doc/paper.org +++ b/doc/paper.org @@ -229,9 +229,9 @@ https://github.com/Zilong-Li/vcfpp/tree/main/tools. * Acknowledgments -I would like to thank Anders Albrechtsen and Robert W Davies for -helpful comments. Also, this project is largely inspired by open -source projects SeqLib and cyvcf2. +I would like to thank Anders Albrechtsen at Copenhagen University and +Robert W Davies at Oxford University for helpful comments. Also, this +project is inspired by open source projects SeqLib and cyvcf2. #+print_bibliography: diff --git a/news.org b/news.org index f1635d1..a7b5362 100644 --- a/news.org +++ b/news.org @@ -1,5 +1,9 @@ #+title: News and Changes +* v0.2.0 +- major release +* v0.1.8 +- fix BcfWriter * v0.1.7 - add split_string - add INFO() diff --git a/readme.org b/readme.org index ca14089..d930d65 100644 --- a/readme.org +++ b/readme.org @@ -21,11 +21,11 @@ Features: - [[#installation][Installation]] - [[#usage][Usage]] - [[#reading-vcf][Reading VCF]] - - [[#genotypes][Genotypes]] + - [[#genotypes-coding][Genotypes Coding]] - [[#writing-vcf][Writing VCF]] - [[#variants-operation][Variants Operation]] - [[#header-operation][Header Operation]] -- [[#rcpp-examples][Rcpp Examples]] +- [[#working-with-r][Working with R]] - [[#command-line-tools][Command Line Tools]] #+END_QUOTE @@ -69,23 +69,26 @@ int main(int argc, char* argv[]) } #+end_src -** Genotypes -There are 3 types used for genotypes, ie vector, vector -and vector. One can use vector and vector for +** Genotypes Coding + +There are 3 types used for genotypes, ie. =vector=, =vector= and =vector=. One can use =vector= and =vector= for memory-effcient goal. The downside is that it only stores 0 and 1. And -vector can store missing values and multialleles. +=vector= can store missing values and multialleles. *** Code genotypes with missing allele as heterozygous. If you use =vector= and =vector= to store the genotypes, then there is no way to represent missing values. Hence the returned genotypes always have 0s and 1s. And genotypes with missing allele -(eg. =0/.=, =./0=, =1/.=, =./1=, =./.=) are codes as =1/0=. +(eg. =0/.=, =./0=, =1/.=, =./1=, =./.=) are codes as =1/0=. It's recommended to +use =var.isNoneMissing()= to check if there is missing value. *** Code missing allele as -9 -If you use =vector= to store the genotypes, then any missing allele -will be coded as =-9=, following the design of PLINK. +If this default behavior for =vector= and =vector= is not what +you want, you should use =vector= to store the genotypes, then any +missing allele will be coded as =-9=. *Note* you should take the missing +value =-9= into account for downstream analysis. ** Writing VCF @@ -156,9 +159,9 @@ var.POS(), var.setPOS(); // get POS or modify POS All variants related API can be found in [[https://zilongli.org/proj/vcfpp/classvcfpp_1_1_bcf_header][BcfHeader]]. -* Rcpp Examples +* Working with R -Example functions using Rcpp are in folder [[Rcpp]]. +Examples of vcfpp working with R are in folder [[Rcpp]] and https://github.com/Zilong-Li/vcfppR. * Command Line Tools diff --git a/test/bcf-reader.cpp b/test/bcf-reader.cpp index 0b06636..e43c581 100644 --- a/test/bcf-reader.cpp +++ b/test/bcf-reader.cpp @@ -66,14 +66,15 @@ TEST_CASE("parse vcf with missing genotypes diploid - vector", "[bcf-reader TEST_CASE("parse vcf with multialleles - vector", "[bcf-reader]") { - BcfWriter bw("test-multialleles.vcf", "VCF4.3"); + string vcffile{"test-multialleles.vcf.gz"}; + BcfWriter bw(vcffile, "VCF4.2"); bw.header.addContig("chr20"); - bw.header.addFORMAT("GT", "2", "String", "Genotype"); bw.header.addINFO("AF", "A", "Float", "Estimated allele frequency in the range (0,1)"); + bw.header.addFORMAT("GT", "1", "String", "Genotype"); for(auto & s : {"id01", "id02", "id03"}) bw.header.addSample(s); // add 3 samples bw.writeLine("chr20\t2006060\trs146931526\tG\tA,T\t100\tPASS\tAF=0.02\tGT\t1|2\t1|1\t0|2"); bw.close(); - BcfReader br("test-multialleles.vcf"); + BcfReader br("test-multialleles.vcf.gz"); BcfRecord var(br.header); vector gt; while(br.getNextVariant(var)) @@ -82,3 +83,45 @@ TEST_CASE("parse vcf with multialleles - vector", "[bcf-reader]") for(auto g : gt) cout << g << endl; } } + +TEST_CASE("parse PL in vcf - vector", "[bcf-reader]") +{ + string vcffile{"test-GL.vcf.gz"}; + BcfWriter bw(vcffile, "VCF4.2"); + bw.header.addContig("chr20"); + bw.header.addINFO("AF", "A", "Float", "Estimated allele frequency in the range (0,1)"); + bw.header.addFORMAT("GT", "1", "String", "Genotype"); + bw.header.addFORMAT("PL", "G", "Integer", "List of Phred-scaled genotype likelihoods"); + for(auto & s : {"id01", "id02"}) bw.header.addSample(s); // add 3 samples + bw.writeLine("chr20\t2006060\trs146931526\tG\tA\t100\tPASS\tAF=0.02\tGT:PL\t0/0:0,9,75\t0/0:0,3,30"); + bw.close(); + BcfReader br(vcffile); + BcfRecord var(br.header); + vector pl; + while(br.getNextVariant(var)) + { + var.getFORMAT("PL",pl); + for(auto g : pl) cout << g << endl; + } +} + +TEST_CASE("parse GL in vcf - vector", "[bcf-reader]") +{ + string vcffile{"test-GL.vcf.gz"}; + BcfWriter bw(vcffile, "VCF4.2"); + bw.header.addContig("chr20"); + bw.header.addINFO("AF", "A", "Float", "Estimated allele frequency in the range (0,1)"); + bw.header.addFORMAT("GT", "1", "String", "Genotype"); + bw.header.addFORMAT("GL", "G", "Float", "List of log scale genotype likelihoods"); + for(auto & s : {"id01", "id02"}) bw.header.addSample(s); // add 3 samples + bw.writeLine("chr20\t2006060\trs146931526\tG\tA\t100\tPASS\tAF=0.02\tGT:GL\t0/1:-323.03,-99.29,-802.53\t1/1:-133.03,-299.29,-902.53"); + bw.close(); + BcfReader br(vcffile); + BcfRecord var(br.header); + vector gl; + while(br.getNextVariant(var)) + { + var.getFORMAT("GL",gl); + for(auto g : gl) cout << g << endl; + } +}