Skip to content

Commit

Permalink
setGenotypes works for missing values
Browse files Browse the repository at this point in the history
  • Loading branch information
Zilong-Li committed Dec 14, 2023
1 parent c9f75f2 commit 03f2c77
Show file tree
Hide file tree
Showing 9 changed files with 66 additions and 57 deletions.
2 changes: 1 addition & 1 deletion Doxyfile
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ PROJECT_LOGO =
# entered, it will be relative to the location where doxygen was started. If
# left blank the current directory will be used.

OUTPUT_DIRECTORY = /Users/zilong/Project/mysite/content/proj
OUTPUT_DIRECTORY = /home/zilong/Projects/mysite/content/proj

# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
# directories (in 2 levels) under the output directory of each output format and
Expand Down
4 changes: 2 additions & 2 deletions Pybind11/readme.org
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ conda install -c bioconda htslib
* Compiling the py_example.cpp

#+begin_src shell
g++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) example.cpp -o example$(python3-config --extension-suffix) -I${CONDA_PREFIX}/include -L${CONDA_PREFIX}/lib -lhts
g++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) py_example.cpp -o py_example$(python3-config --extension-suffix) -I${CONDA_PREFIX}/include -L${CONDA_PREFIX}/lib -lhts
#+end_src

* Call the C++ function in ~Python~

#+begin_src python
import py_example
ret = py_example.heterzygosity(vcffile, region, samples)
ret = py_example.heterozygosity(vcffile, region, samples)
#+end_src
11 changes: 7 additions & 4 deletions test/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
CXX = g++
# CXXFLAGS = -std=c++11 -g -Wall -fsanitize=address
CXXFLAGS = -std=c++11 -Wall
LDFLAGS += -lhts
LIBS = -lz -lm -lbz2 -llzma -lcurl -lpthread
HTSINC = /usr/local/include
HTSLIB = /usr/local/lib
LDFLAGS = -L$(HTSLIB) -Wl,-rpath,$(HTSLIB)
LIBS = -lhts -lz -lm -lbz2 -llzma -lcurl -lpthread
INC = -I. -I..

OBJS = test-main.o bcf-reader.o bcf-writer.o example.o
OBJS = test-main.o bcf-variant.o bcf-reader.o bcf-writer.o example.o

BINS = bcf-reader.bin bcf-writer.bin example.bin
BINS = bcf-variant.bin bcf-reader.bin bcf-writer.bin example.bin

.PHONY: all test clean

Expand All @@ -28,4 +30,5 @@ clean:
test:
./bcf-reader.bin --success
./bcf-writer.bin --success
./bcf-variant.bin --success
./example.bin --success
42 changes: 42 additions & 0 deletions test/bcf-variant.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#include "../vcfpp.h"
#include "catch.hh"

using namespace vcfpp;
using namespace std;

TEST_CASE("set genotypes without missing ", "[bcf-variant]")
{
BcfReader br("test-vcf-read.vcf.gz");
BcfRecord v(br.header);
vector<int> gt;
vector<int> gi{1, 1};
if(br.getNextVariant(v))
{
v.getGenotypes(gt);
for(auto g : gt) cerr << g << endl;
v.setGenotypes(gi);
v.getGenotypes(gt);
for(auto g : gt) cerr << g << endl;
}
}

TEST_CASE("set genotypes with missing ", "[bcf-variant]")
{
BcfReader br("test-vcf-read.vcf.gz");
BcfRecord v(br.header);
vector<int> gt;
vector<int> gi{1, -9};
vector<int> g2{-2147483648, -9};
br.getNextVariant(v);
if(br.getNextVariant(v))
{
v.getGenotypes(gt);
for(auto g : gt) cerr << g << endl;
v.setGenotypes(gi);
v.getGenotypes(gt);
for(auto g : gt) cerr << g << endl;
v.setGenotypes(g2);
v.getGenotypes(gt);
for(auto g : gt) cerr << g << endl;
}
}
2 changes: 1 addition & 1 deletion test/example.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ TEST_CASE("Calculate the heterozygosity rate", "[bcf-reader]")
// analyze SNP variant with no genotype missingness
if(!var.isSNP() || !var.isNoneMissing()) continue;
assert(var.ploidy() == 2); // make sure it is diploidy
for(int i = 0; i < gt.size() / 2; i++) hetsum[i] += abs(gt[2 * i + 0] - gt[2 * i + 1]);
for(size_t i = 0; i < gt.size() / 2; i++) hetsum[i] += abs(gt[2 * i + 0] - gt[2 * i + 1]);
}
for(auto i : hetsum)
{
Expand Down
Binary file modified test/test-vcf-read.vcf.gz
Binary file not shown.
5 changes: 3 additions & 2 deletions tools/flipGT.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ int main(int argc, char* argv[])
return 1;
}
std::string invcf, outvcf = "-", samples = "-", region = "";
for (int i = 0; i < args.size(); i++)
for (size_t i = 0; i < args.size(); i++)
{
if (args[i] == "-i")
invcf = args[++i];
Expand All @@ -41,13 +41,14 @@ int main(int argc, char* argv[])
BcfReader vcf(invcf, region, samples);
BcfRecord var(vcf.header);
BcfWriter bw(outvcf, vcf.header);
std::vector<char> gts;
std::vector<int> gts;
while (vcf.getNextVariant(var))
{
var.getGenotypes(gts);
var.swap_REF_ALT();
for (auto& g : gts)
{
if(g == -9) continue;
g = !g;
}
var.setGenotypes(gts);
Expand Down
2 changes: 1 addition & 1 deletion tools/multiallelics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ int main(int argc, char* argv[])
BcfWriter bw(outvcf, vcf.header);
while (vcf.getNextVariant(var))
{
if (var.isMultiAllelic())
if (var.isMultiAllelics())
bw.writeRecord(var);
}

Expand Down
55 changes: 9 additions & 46 deletions vcfpp.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,12 +86,6 @@ using isValidGT = typename std::enable_if<std::is_same<T, std::vector<bool>>::va
|| std::is_same<T, std::vector<char>>::value,
bool>::type;

template<typename T>
using isGtVector = typename std::enable_if<
std::is_same<T, std::vector<bool>>::value || std::is_same<T, std::vector<char>>::value
|| std::is_same<T, std::string>::value || std::is_same<T, std::vector<int>>::value,
bool>::type;

template<typename T>
using isFormatVector = typename std::enable_if<std::is_same<T, std::vector<float>>::value
|| std::is_same<T, std::vector<char>>::value
Expand Down Expand Up @@ -497,7 +491,7 @@ type as noted in the other overloading function.
{
ndst = 0;
ret = bcf_get_genotypes(header.hdr, line, &gts, &ndst);
if(ret <= 0) throw std::runtime_error("genotypes not present");
if(ret <= 0) throw std::runtime_error("genotypes not present. make sure you initilized the variant object first\n");
v.resize(ret);
isGenoMissing.assign(nsamples, 0);
nploidy = ret / nsamples;
Expand Down Expand Up @@ -752,23 +746,24 @@ type as noted in the other overloading function.
}

/**
* @brief set genotypes from scratch assume genotype not present
* @param v valid input includevector<bool>, vector<char>, vector<int>, std::string
* @brief set genotypes from scratch even if genotypes not present
* @param v the genotypes of vector<int> type
* @return bool
* */
template<class T>
isGtVector<T> setGenotypes(const T & v)
bool setGenotypes(const std::vector<int> & v)
{
// bcf_gt_type
int i, j, k;
nploidy = v.size() / nsamples;
gts = (int *)malloc(nsamples * nploidy * sizeof(int));
gts = (int32_t *)malloc(v.size() * sizeof(int32_t));
for(i = 0; i < nsamples; i++)
{
for(j = 0; j < nploidy; j++)
{
k = i * nploidy + j;
if(gtPhase[i])
if(v[k] == -9 || v[k] == bcf_int32_missing)
gts[k] = bcf_gt_missing;
else if(gtPhase[i])
gts[k] = bcf_gt_phased(v[k]);
else
gts[k] = bcf_gt_unphased(v[k]);
Expand All @@ -780,38 +775,6 @@ type as noted in the other overloading function.
return true;
}

/**
* @brief update genotypes for current record, assume genotypes present
* @param v valid input includevector<bool>, vector<char>, vector<int>, std::string
* @return bool
* */
template<class T>
isGtVector<T> updateGenotypes(const T & v)
{
// bcf_gt_type
ndst = 0;
ret = bcf_get_genotypes(header.hdr, line, &gts, &ndst);
if(ret <= 0) throw std::runtime_error("genotypes not present for current record.\n");
assert(ret == v.size());
nploidy = ret / nsamples;
int i, j, k;
for(i = 0; i < nsamples; i++)
{
for(j = 0; j < nploidy; j++)
{
k = i * nploidy + j;
if(gtPhase[i])
gts[k] = bcf_gt_phased(v[k]);
else
gts[k] = bcf_gt_unphased(v[k]);
}
}
if(bcf_update_genotypes(header.hdr, line, gts, ret) < 0)
throw std::runtime_error("couldn't set genotypes correctly.\n");
else
return true;
}

/**
* @brief set phasing status for all diploid samples using given vector
* @param v valid input includes vector<char>
Expand Down Expand Up @@ -1059,7 +1022,7 @@ type as noted in the other overloading function.
{
bcf_update_id(header.hdr, line, s);
}

/** @brief set REF and ALT alleles given a string seperated by comma */
inline void setRefAlt(const char * alleles_string)
{
Expand Down

0 comments on commit 03f2c77

Please sign in to comment.