Skip to content

Commit

Permalink
update docs
Browse files Browse the repository at this point in the history
  • Loading branch information
Zilong-Li committed Sep 1, 2023
1 parent 8c21403 commit 4050a4d
Show file tree
Hide file tree
Showing 6 changed files with 132 additions and 47 deletions.
31 changes: 7 additions & 24 deletions Rcpp/readme.org
Original file line number Diff line number Diff line change
Expand Up @@ -4,36 +4,19 @@

* Build R package

If you want to build a R package using vcfpp, here are the steps:
There is a R package https://github.com/Zilong-Li/vcfppR demonstrates
how to build your own R package with vcfpp. Here are the important steps:

- download [[https://github.com/Zilong-Li/vcfpp/releases/latest][vcfpp.h]] and [[https://github.com/samtools/htslib][htslib]] in =src= folder.
- create an =Makevars= file in =src= folder with
#+begin_src R
PKG_CPPFLAGS = -I. -Ihtslib/
PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) htslib/libhts.a -fPIC -lz -lbz2 -llzma -lcurl

.PHONY: all
all : $(SHLIB)
$(SHLIB) : HTSLIB

CC=$(shell "R CMD config CC")
CXX=$(shell "R CMD config CXX")
CPPFLAGS=$(shell "R CMD config CPPFLAGS")
LDFLAGS=$(shell "R CMD config LDFLAGS")

HTSLIB:
cd htslib && autoreconf -i && ./configure --with-libdeflate=no && $(MAKE) libhts.a CXX="$(CXX)" CC="$(CC)" CPPFLAGS="$(CPPFLAGS) -fPIC " && cd ..
#+end_src

Check https://github.com/rwdavies/STITCH for an example.
- copy the [[https://github.com/Zilong-Li/vcfppR/blob/main/src/Makevars][Makevars]] file in =src= folder

* Write R scripts

If you don't want to build a R package, then you just need to install
[[https://github.com/Zilong-Li/vcfpp/releases/latest][vcfpp.h]] and [[https://github.com/samtools/htslib][htslib]] in your R system enviroment. For example, assume
you have a conda enviroment =~/mambaforge/envs/R=, the system
envrionment are =~/mambaforge/envs/R/include= for header files and
=~/mambaforge/envs/R/lib= for library file. Then you can compile and run
[[https://github.com/Zilong-Li/vcfpp/releases/latest][vcfpp.h]] and [[https://github.com/samtools/htslib][htslib]] in your system environment. For example, assume
you have a conda environment =~/mambaforge/envs/R=, the system
environment are =~/mambaforge/envs/R/include= for header files and
=~/mambaforge/envs/R/lib= for library files. Then you can compile and run
functions from the =vcfpp-read.cpp= file in R by

#+begin_src R
Expand Down
64 changes: 58 additions & 6 deletions Rcpp/vcfpp-read.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
#include <Rcpp.h>
#include <vcfpp.h>
#include "vcfpp.h"

using namespace Rcpp;
using namespace std;

//' parse GT of a VCF file into tables in R
//'
//' @param vcffile path to the VCF file with index
//' @param region region to extract
//' @param samples samples to extract
//' @return A list of genotypes and other fixed fields in VCF
//' @export
// [[Rcpp::export]]
List tableGT(std::string vcffile, std::string region, std::string samples="-")
{
Expand All @@ -17,8 +25,6 @@ List tableGT(std::string vcffile, std::string region, std::string samples="-")
for(int i = 0; i < nsnps; i++)
{
vcf.getNextVariant(var);
var.getGenotypes(gt);
GT[i] = gt;
pos(i) = var.POS();
qual(i) = var.QUAL();
chr(i) = var.CHROM();
Expand All @@ -27,12 +33,21 @@ List tableGT(std::string vcffile, std::string region, std::string samples="-")
alt(i) = var.ALT();
filter(i) = var.FILTER();
info(i) = var.INFO();
var.getGenotypes(gt);
GT[i] = gt;
}
return List::create(Named("chr") = chr, Named("pos") = pos, Named("id") = id, Named("ref") = ref,
Named("alt") = alt, Named("qual") = qual, Named("filter") = filter,
Named("info") = info, Named("gt") = GT);
}

//' parse GL of a VCF file into tables in R
//'
//' @param vcffile path to the VCF file with index
//' @param region region to extract
//' @param samples samples to extract
//' @return A list of genotypes and other fixed fields in VCF
//' @export
// [[Rcpp::export]]
List tableGL(std::string vcffile, std::string region, std::string samples = "-")
{
Expand All @@ -42,13 +57,11 @@ List tableGL(std::string vcffile, std::string region, std::string samples = "-")
CharacterVector chr(nsnps), ref(nsnps), alt(nsnps), id(nsnps), filter(nsnps), info(nsnps);
IntegerVector pos(nsnps);
NumericVector qual(nsnps);
vector<vector<double>> GL(nsnps); // R only have double
vector<vector<float>> GL(nsnps);
vector<float> gl;
for(int i = 0; i < nsnps; i++)
{
vcf.getNextVariant(var);
var.getFORMAT("GL",gl);
GL[i] = gl;
pos(i) = var.POS();
qual(i) = var.QUAL();
chr(i) = var.CHROM();
Expand All @@ -57,9 +70,48 @@ List tableGL(std::string vcffile, std::string region, std::string samples = "-")
alt(i) = var.ALT();
filter(i) = var.FILTER();
info(i) = var.INFO();
var.getFORMAT("GL",gl);
GL[i] = gl;
}
return List::create(Named("chr") = chr, Named("pos") = pos, Named("id") = id, Named("ref") = ref,
Named("alt") = alt, Named("qual") = qual, Named("filter") = filter,
Named("info") = info, Named("gl") = GL);
}


//' parse PL of a VCF file into tables in R
//'
//' @param vcffile path to the VCF file with index
//' @param region region to extract
//' @param samples samples to extract
//' @return A list of genotypes and other fixed fields in VCF
//' @export
// [[Rcpp::export]]
List tablePL(std::string vcffile, std::string region, std::string samples = "-")
{
vcfpp::BcfReader vcf(vcffile, samples);
vcfpp::BcfRecord var(vcf.header);
int nsnps = vcf.getRegionIndex(region);
CharacterVector chr(nsnps), ref(nsnps), alt(nsnps), id(nsnps), filter(nsnps), info(nsnps);
IntegerVector pos(nsnps);
NumericVector qual(nsnps);
vector<vector<int>> PL(nsnps);
vector<int> pl;
for(int i = 0; i < nsnps; i++)
{
vcf.getNextVariant(var);
pos(i) = var.POS();
qual(i) = var.QUAL();
chr(i) = var.CHROM();
id(i) = var.ID();
ref(i) = var.REF();
alt(i) = var.ALT();
filter(i) = var.FILTER();
info(i) = var.INFO();
var.getFORMAT("PL",pl);
PL[i] = pl;
}
return List::create(Named("chr") = chr, Named("pos") = pos, Named("id") = id, Named("ref") = ref,
Named("alt") = alt, Named("qual") = qual, Named("filter") = filter,
Named("info") = info, Named("pl") = PL);
}
6 changes: 3 additions & 3 deletions doc/paper.org
Original file line number Diff line number Diff line change
Expand Up @@ -229,9 +229,9 @@ https://github.com/Zilong-Li/vcfpp/tree/main/tools.

* Acknowledgments

I would like to thank Anders Albrechtsen and Robert W Davies for
helpful comments. Also, this project is largely inspired by open
source projects SeqLib and cyvcf2.
I would like to thank Anders Albrechtsen at Copenhagen University and
Robert W Davies at Oxford University for helpful comments. Also, this
project is inspired by open source projects SeqLib and cyvcf2.

#+print_bibliography:

Expand Down
4 changes: 4 additions & 0 deletions news.org
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
#+title: News and Changes

* v0.2.0
- major release
* v0.1.8
- fix BcfWriter
* v0.1.7
- add split_string
- add INFO()
Expand Down
25 changes: 14 additions & 11 deletions readme.org
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ Features:
- [[#installation][Installation]]
- [[#usage][Usage]]
- [[#reading-vcf][Reading VCF]]
- [[#genotypes][Genotypes]]
- [[#genotypes-coding][Genotypes Coding]]
- [[#writing-vcf][Writing VCF]]
- [[#variants-operation][Variants Operation]]
- [[#header-operation][Header Operation]]
- [[#rcpp-examples][Rcpp Examples]]
- [[#working-with-r][Working with R]]
- [[#command-line-tools][Command Line Tools]]
#+END_QUOTE

Expand Down Expand Up @@ -69,23 +69,26 @@ int main(int argc, char* argv[])
}
#+end_src

** Genotypes
There are 3 types used for genotypes, ie vector<bool>, vector<char>
and vector<int>. One can use vector<bool> and vector<char> for
** Genotypes Coding

There are 3 types used for genotypes, ie. =vector<bool>=, =vector<char>= and =vector<int>=. One can use =vector<bool>= and =vector<char>= for
memory-effcient goal. The downside is that it only stores 0 and 1. And
vector<int> can store missing values and multialleles.
=vector<int>= can store missing values and multialleles.

*** Code genotypes with missing allele as heterozygous.

If you use =vector<bool>= and =vector<char>= to store the genotypes, then
there is no way to represent missing values. Hence the returned
genotypes always have 0s and 1s. And genotypes with missing allele
(eg. =0/.=, =./0=, =1/.=, =./1=, =./.=) are codes as =1/0=.
(eg. =0/.=, =./0=, =1/.=, =./1=, =./.=) are codes as =1/0=. It's recommended to
use =var.isNoneMissing()= to check if there is missing value.

*** Code missing allele as -9

If you use =vector<int>= to store the genotypes, then any missing allele
will be coded as =-9=, following the design of PLINK.
If this default behavior for =vector<bool>= and =vector<char>= is not what
you want, you should use =vector<int>= to store the genotypes, then any
missing allele will be coded as =-9=. *Note* you should take the missing
value =-9= into account for downstream analysis.

** Writing VCF

Expand Down Expand Up @@ -156,9 +159,9 @@ var.POS(), var.setPOS(); // get POS or modify POS

All variants related API can be found in [[https://zilongli.org/proj/vcfpp/classvcfpp_1_1_bcf_header][BcfHeader]].

* Rcpp Examples
* Working with R

Example functions using Rcpp are in folder [[Rcpp]].
Examples of vcfpp working with R are in folder [[Rcpp]] and https://github.com/Zilong-Li/vcfppR.

* Command Line Tools

Expand Down
49 changes: 46 additions & 3 deletions test/bcf-reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,15 @@ TEST_CASE("parse vcf with missing genotypes diploid - vector<int>", "[bcf-reader

TEST_CASE("parse vcf with multialleles - vector<int>", "[bcf-reader]")
{
BcfWriter bw("test-multialleles.vcf", "VCF4.3");
string vcffile{"test-multialleles.vcf.gz"};
BcfWriter bw(vcffile, "VCF4.2");
bw.header.addContig("chr20");
bw.header.addFORMAT("GT", "2", "String", "Genotype");
bw.header.addINFO("AF", "A", "Float", "Estimated allele frequency in the range (0,1)");
bw.header.addFORMAT("GT", "1", "String", "Genotype");
for(auto & s : {"id01", "id02", "id03"}) bw.header.addSample(s); // add 3 samples
bw.writeLine("chr20\t2006060\trs146931526\tG\tA,T\t100\tPASS\tAF=0.02\tGT\t1|2\t1|1\t0|2");
bw.close();
BcfReader br("test-multialleles.vcf");
BcfReader br("test-multialleles.vcf.gz");
BcfRecord var(br.header);
vector<int> gt;
while(br.getNextVariant(var))
Expand All @@ -82,3 +83,45 @@ TEST_CASE("parse vcf with multialleles - vector<int>", "[bcf-reader]")
for(auto g : gt) cout << g << endl;
}
}

TEST_CASE("parse PL in vcf - vector<int>", "[bcf-reader]")
{
string vcffile{"test-GL.vcf.gz"};
BcfWriter bw(vcffile, "VCF4.2");
bw.header.addContig("chr20");
bw.header.addINFO("AF", "A", "Float", "Estimated allele frequency in the range (0,1)");
bw.header.addFORMAT("GT", "1", "String", "Genotype");
bw.header.addFORMAT("PL", "G", "Integer", "List of Phred-scaled genotype likelihoods");
for(auto & s : {"id01", "id02"}) bw.header.addSample(s); // add 3 samples
bw.writeLine("chr20\t2006060\trs146931526\tG\tA\t100\tPASS\tAF=0.02\tGT:PL\t0/0:0,9,75\t0/0:0,3,30");
bw.close();
BcfReader br(vcffile);
BcfRecord var(br.header);
vector<int> pl;
while(br.getNextVariant(var))
{
var.getFORMAT("PL",pl);
for(auto g : pl) cout << g << endl;
}
}

TEST_CASE("parse GL in vcf - vector<float>", "[bcf-reader]")
{
string vcffile{"test-GL.vcf.gz"};
BcfWriter bw(vcffile, "VCF4.2");
bw.header.addContig("chr20");
bw.header.addINFO("AF", "A", "Float", "Estimated allele frequency in the range (0,1)");
bw.header.addFORMAT("GT", "1", "String", "Genotype");
bw.header.addFORMAT("GL", "G", "Float", "List of log scale genotype likelihoods");
for(auto & s : {"id01", "id02"}) bw.header.addSample(s); // add 3 samples
bw.writeLine("chr20\t2006060\trs146931526\tG\tA\t100\tPASS\tAF=0.02\tGT:GL\t0/1:-323.03,-99.29,-802.53\t1/1:-133.03,-299.29,-902.53");
bw.close();
BcfReader br(vcffile);
BcfRecord var(br.header);
vector<float> gl;
while(br.getNextVariant(var))
{
var.getFORMAT("GL",gl);
for(auto g : gl) cout << g << endl;
}
}

0 comments on commit 4050a4d

Please sign in to comment.