diff --git a/doc/latex.org b/doc/latex.org index d8d062a..ffa1293 100644 --- a/doc/latex.org +++ b/doc/latex.org @@ -8,6 +8,6 @@ #+latex_header: \DeclareCaptionFormat{listing}{\parbox{\textwidth}{\colorbox{gray}{\parbox{\textwidth}{#1#2#3}}\vskip-4pt}} #+latex_header: \captionsetup[lstlisting]{format=listing,labelfont=white,textfont=white} #+latex_header: \newlength\tdima \newlength\tdimb \setlength\tdima{ \fboxsep+\fboxrule} \setlength\tdimb{-\fboxsep+\fboxrule} -#+latex_header: \lstset{frame=tlrb,xleftmargin=\tdima,xrightmargin=\tdimb, rulecolor=\color{gray}} +#+latex_header: \lstset{frame=tlrb,xleftmargin=\tdima,xrightmargin=\tdimb, rulecolor=\color{gray}, keywordstyle=\ttfamily} #+latex_header: \usepackage[backend=biber,style=authoryear-comp,date=year,sorting=nyt,sortlocale=auto,maxcitenames=1,maxbibnames=10,uniquename=init,maxitems=1,giveninits=true,terseinits=true,dashed=false,doi=true,isbn=false,url=false] {biblatex} #+latex_header: \addbibresource{ref.bib} diff --git a/doc/paper.org b/doc/paper.org index a259b51..da8a80e 100644 --- a/doc/paper.org +++ b/doc/paper.org @@ -1,6 +1,7 @@ #+title: vcfpp: a C++ API for scripting rapid variant analysis #+setupfile: latex.org #+language: en +#+startup: show2levels indent hidestars hideblocks #+options: title:nil toc:nil H:4 author:nil #+bibliography: "ref.bib" @@ -12,8 +13,8 @@ Email: zilong.dk@gamil.com.}} Given the complexity and popularity of the VCF/BCF format as well as ever-growing data size, there is always a need for fast and flexible methods to manipulate it in different programming languages. Static - languages like C++ has the superiority of performance as C, and its - modern standards strive to provide enlarged standard library for + languages like C++ has the superiority of performance as C, and + modern C++ standards strive to provide enlarged standard library for developing program quickly and easily. In this work, we present vcfpp, a C++ API of htslib, providing a rapid and intuitive interface to manipulate VCF/BCF files safely and as easily as @@ -30,22 +31,35 @@ Email: zilong.dk@gamil.com.}} * Introduction -The VCF format [cite:@danecek2011] is the standard for representing -genetic variation observed in DNA sequencing studies. The strength -of the VCF format is its ability to represent the location of a -variant, the genotype of the sequenced individual at each locus, as -well as extensive variant metadata. However, the consequence of this -flexibility and the rather complicated specification of the VCF -format, is that researchers require powerful software libraries to -access, query and manipulate variants from VCF files. Many efforts from computaional biologists - -\newpage +Many efforts have been contributed by computational biologists, aim +for making analyses of genetic variants no pain. The genetic +variants are typically stored in the VCF format, which becomes the +standard for representing genetic variation information with +complicated specification [cite:@danecek2011]. Later on, as the +ever-growing data size, BCF format is designed to query and store +big data efficiently. The C API of htslib provide the full +functionalities to manipulate VCF and BCF format for both compressed +and uncompressed files. As the C API is hard to use for +in-proficient programmers, API of other languages are +invented. Existing popular libraries include vcfR[cite:@brian2017] +for R, cyvcf2 for python [cite:@pedersen2017a], hts-nim +[cite:@pedersen2018] for nim and vcflib for C++ +[cite:@garrison2022]. All are useful for corresponding language +community and have advantages and disadvantages. In particular, +vcflib is both an API and a large collection of command line tools +to manipulate VCF file, with the main disadvantage being not +supporting BCF file. Many methods designed for large sample size can +not even take the BCF file that is tailed for big data as input like +Syllbable-PBWT [cite:@wang2023]. The motivation of vcfpp is to offer +full functionalities as htslib and provide easy-to-use API in a +single header file that can be easily integrated for writing scripts +quickly in C++, python and R. * Features vcfpp is implemented as a single header file for being easily intergrated and compiled. There are four core classes for -manipulating VCF/BCF as showed in [[tb:class]]. +manipulating VCF/BCF as summarized in Table [[tb:class]]. #+caption: vcfpp capabilities and implemented C++ class #+name: tb:class @@ -62,23 +76,39 @@ manipulating VCF/BCF as showed in [[tb:class]]. In an effort to demonstrate the power and performance of vcfpp, the following sections highlight typical VCF analyses and illustrate -commonly used features in cyvcf2. Other examples and further details +commonly used features in vcfpp. Other examples and further details of the vcfpp can be found at https://github.com/Zilong-Li/vcfpp. ** Python-like API In this example, we count the number of heterozygous sites for -each sample in all records. The core is *just 10 lines*. +each sample in all records. The following code first includes a +single vcfpp.h file (line 1), opens a compressed BCF file with subset +samples and a constrained region (line 2), and creates a +variant record object associated with the header information in +the BCF (line 3). Then it defines several types of objects to +collect the results we want (line 4-6). Taking advantage of +generic templates in C++, type of genotype can be very +flexible. Then it starts iterating over the BCF file and process +each variant record in the loop (line 7). We ignore variant of +other types (INDEL, SV), with missing genotypes and with QUAL +value smaller than 9, while we can also retrieve more information +of the variant and do filtering (line 9-10). Finally, we count the +heterozygous sites for each diploid sample (line 11-13). The core +is just 13 lines. #+caption: Counting heterozygous genotypes per sample on chr21 #+attr_latex: :options captionpos=t #+begin_src C++ -n -vcfpp::BcfReader vcf("bcf.gz", "id1,id2,id3", "chr21"); -vcfpp::BcfRecord var(vcf.header); +#include +vcfpp::BcfReader vcf("bcf.gz", "chr21", "id1,id2,id3"); +vcfpp::BcfRecord var(vcf.header); // create a variant object vector gt; // genotype can be of bool, char or int type vector hetsum(vcf.nsamples, 0); // store the het counts +float hwe, exchet; // capture more INFO while(vcf.getNextVariant(var)){ var.getGenotypes(gt); + var.getINFO("HWE",hwe), var.getINFO("ExcHet",exchet);//more if(!var.isSNP()||!var.isNoneMissing()||var.QUAL()<9) continue; assert(var.ploidy()==2); // make sure it is diploidy for(int i=0; i gt; // genotype can be of bool, char or int type vector hetsum(vcf.nsamples, 0); @@ -18,10 +18,12 @@ TEST_CASE("Calculate the heterozygosity rate", "[bcf-reader]") if(!var.isSNP()) continue; // skip other type of variants var.getGenotypes(gt); // analyze SNP variant with no genotype missingness - if (!var.isSNP() || !var.isNoneMissing()) continue; - assert(var.ploidy()==2); // make sure it is diploidy - for(int i = 0; i < gt.size() / 2; i++) - hetsum[i] += abs(gt[2 * i + 0] - gt[2 * i + 1]); + if(!var.isSNP() || !var.isNoneMissing()) continue; + assert(var.ploidy() == 2); // make sure it is diploidy + for(int i = 0; i < gt.size() / 2; i++) hetsum[i] += abs(gt[2 * i + 0] - gt[2 * i + 1]); + } + for(auto i : hetsum) + { + cout << i << endl; } - for (auto i : hetsum) { cout << i << endl; } } diff --git a/tools/dosage.cpp b/tools/dosage.cpp index 128b2c8..8dc9c63 100644 --- a/tools/dosage.cpp +++ b/tools/dosage.cpp @@ -38,7 +38,7 @@ int main(int argc, char* argv[]) region = args[++i]; } // ========= core calculation part =========================================== - BcfReader vcf(invcf, samples, region); + BcfReader vcf(invcf, region, samples); BcfRecord var(vcf.header); BcfWriter bw(outvcf, vcf.header); bw.header.addFORMAT("DS", "1", "Float", "Diploid Genotype Dosage"); // add DS tag into the header diff --git a/tools/flipGT.cpp b/tools/flipGT.cpp index 375c90c..2136fe6 100644 --- a/tools/flipGT.cpp +++ b/tools/flipGT.cpp @@ -38,7 +38,7 @@ int main(int argc, char* argv[]) region = args[++i]; } // ========= core calculation part =========================================== - BcfReader vcf(invcf, samples, region); + BcfReader vcf(invcf, region, samples); BcfRecord var(vcf.header); BcfWriter bw(outvcf, vcf.header); std::vector gts; diff --git a/tools/increaseDupPos.cpp b/tools/increaseDupPos.cpp index 5b2aa6e..b473ef2 100644 --- a/tools/increaseDupPos.cpp +++ b/tools/increaseDupPos.cpp @@ -39,7 +39,7 @@ int main(int argc, char* argv[]) region = args[++i]; } // ========= core calculation part =========================================== - BcfReader vcf(invcf, samples, region); + BcfReader vcf(invcf, region, samples); BcfRecord var(vcf.header); BcfWriter bw(outvcf, vcf.header); int64_t pos{-1}, count{0}; diff --git a/tools/multiallelics.cpp b/tools/multiallelics.cpp index 46d6493..0f04cf4 100644 --- a/tools/multiallelics.cpp +++ b/tools/multiallelics.cpp @@ -39,7 +39,7 @@ int main(int argc, char* argv[]) region = args[++i]; } // ========= core calculation part =========================================== - BcfReader vcf(invcf, samples, region); + BcfReader vcf(invcf, region, samples); BcfRecord var(vcf.header); BcfWriter bw(outvcf, vcf.header); while (vcf.getNextVariant(var)) diff --git a/vcfpp.h b/vcfpp.h index 4dda816..7720440 100644 --- a/vcfpp.h +++ b/vcfpp.h @@ -2,7 +2,7 @@ * @file https://github.com/Zilong-Li/vcfpp/vcfpp.h * @author Zilong Li * @email zilong.dk@gmail.com - * @version v0.1.5 + * @version v0.1.6 * @breif a single C++ file for manipulating VCF * Copyright (C) 2022-2023.The use of this code is governed by the LICENSE file. ******************************************************************************/ @@ -1091,31 +1091,26 @@ class BcfReader /** * @brief construct a vcf/bcf reader with subset samples * @param file the input vcf/bcf with suffix vcf(.gz)/bcf(.gz) or stdin "-" - * @param samples LIST samples to include or exclude as a comma-separated string. \n - * LIST : select samples in list \n - * ^LIST : exclude samples from list \n - * "-" : include all samples \n - * "" : exclude all samples + * @param region samtools-like region "chr:start-end", skip if empty */ - BcfReader(const std::string & file, const std::string & samples) : fname(file) + BcfReader(const std::string & file, const std::string & region) : fname(file) { open(file); - header.setSamples(samples); - nsamples = bcf_hdr_nsamples(header.hdr); + if(!region.empty()) setRegion(region); SamplesName = header.getSamples(); } /** * @brief construct a vcf/bcf reader with subset samples in target region * @param file the input vcf/bcf with suffix vcf(.gz) or bcf(.gz) + * @param region samtools-like region "chr:start-end", skip if empty * @param samples LIST samples to include or exclude as a comma-separated string. \n * LIST : select samples in list \n * ^LIST : exclude samples from list \n * "-" : include all samples \n * "" : exclude all samples - * @param region samtools-like region "chr:start-end", skip if empty */ - BcfReader(const std::string & file, const std::string & samples, const std::string & region) : fname(file) + BcfReader(const std::string & file, const std::string & region, const std::string & samples) : fname(file) { open(file); header.setSamples(samples); @@ -1168,8 +1163,8 @@ class BcfReader return hts_set_threads(fp, n); } - /** @brief get records of current contig to use */ - uint64_t get_region_records(const std::string & region) + /** @brief get the number of records of given region from index file */ + uint64_t getRegionIndex(const std::string & region) { setRegion(region); // only one chromosome int tid = 0, ret = 0, nseq = 0;