From a1c6276b122939f9cac5ea889bac27ffb54a3225 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Thu, 24 Nov 2022 10:27:57 +0100 Subject: [PATCH] dosage support gt and fix getFORMAT --- tools/dosage.cpp | 22 ++++++++++++++++------ vcfpp.h | 12 +++++++++++- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/tools/dosage.cpp b/tools/dosage.cpp index c8407e7..128b2c8 100644 --- a/tools/dosage.cpp +++ b/tools/dosage.cpp @@ -12,7 +12,7 @@ int main(int argc, char* argv[]) { std::cout << "Author: Zilong-Li (zilong.dk@gmail.com)\n" << "Description:\n" - << " create DS tag for diploid samples given GP tag from input vcf file\n\n" + << " create DS tag for diploid samples given GP or GT tag from input vcf file\n\n" << "Usage example:\n" << " " + (std::string)argv[0] + " -i in.bcf \n" << " " + (std::string)argv[0] + " -i in.bcf -o out.bcf -s ^S1,S2 -r chr1:1-1000 \n" @@ -25,7 +25,7 @@ int main(int argc, char* argv[]) << std::endl; return 1; } - std::string invcf, outvcf="-", samples = "-", region = ""; + std::string invcf, outvcf = "-", samples = "-", region = ""; for (int i = 0; i < args.size(); i++) { if (args[i] == "-i") @@ -43,13 +43,23 @@ int main(int argc, char* argv[]) BcfWriter bw(outvcf, vcf.header); bw.header.addFORMAT("DS", "1", "Float", "Diploid Genotype Dosage"); // add DS tag into the header int nsamples = vcf.nsamples; - std::vector gp, ds(nsamples); + vector gps, ds(nsamples); + vector gts; while (vcf.getNextVariant(var)) { - var.getFORMAT("GP", gp); // get GP values - for (int i = 0; i < nsamples; i++) + try { - ds[i] = gp[i * 3 + 1] + gp[i * 3 + 2] * 2; + var.getFORMAT("GP", gps); // try get GP values + for (int i = 0; i < nsamples; i++) + { + ds[i] = gps[i * 3 + 1] + gps[i * 3 + 2] * 2; + } + } + catch (const std::runtime_error& e) + { + var.getGenotypes(gts); + for (int i = 0; i < nsamples; i++) + ds[i] = gts[i * 2] + gts[i * 2 + 1]; } var.setFORMAT("DS", ds); bw.writeRecord(var); diff --git a/vcfpp.h b/vcfpp.h index 833254e..7ecca47 100644 --- a/vcfpp.h +++ b/vcfpp.h @@ -403,6 +403,8 @@ namespace vcfpp isFormatVector getFORMAT(std::string tag, T& v) { fmt = bcf_get_fmt(header.hdr, line, tag.c_str()); + if (!fmt) + throw std::runtime_error("there is no " + tag + " in FORMAT of this variant.\n"); nvalues = fmt->n; ndst = 0; S* dst = NULL; @@ -421,7 +423,7 @@ namespace vcfpp } else { - throw std::runtime_error("couldn't parse the " + tag + " format of this variant.\n"); + throw std::runtime_error("failed to parse the " + tag + " format of this variant.\n"); } } @@ -434,6 +436,8 @@ namespace vcfpp bool getFORMAT(std::string tag, std::vector& v) { fmt = bcf_get_fmt(header.hdr, line, tag.c_str()); + if (!fmt) + throw std::runtime_error("there is no " + tag + " in FORMAT of this variant.\n"); nvalues = fmt->n; // if ndst < (fmt->n+1)*nsmpl; then realloc is involved ret = -1, ndst = 0; @@ -467,6 +471,8 @@ namespace vcfpp isInfoVector getINFO(std::string tag, T& v) { info = bcf_get_info(header.hdr, line, tag.c_str()); + if (!info) + throw std::runtime_error("there is no " + tag + " tag in INFO of this variant.\n"); S* dst = NULL; ndst = 0; ret = -1; @@ -495,6 +501,8 @@ namespace vcfpp isScalar getINFO(std::string tag, T& v) { info = bcf_get_info(header.hdr, line, tag.c_str()); + if (!info) + throw std::runtime_error("there is no " + tag + " tag in INFO of this variant.\n"); // scalar value if (info->len == 1) { @@ -524,6 +532,8 @@ namespace vcfpp isString getINFO(std::string tag, T& v) { info = bcf_get_info(header.hdr, line, tag.c_str()); + if (!info) + throw std::runtime_error("there is no " + tag + " tag in INFO of this variant.\n"); if (info->type == BCF_BT_CHAR) v = std::string(reinterpret_cast(info->vptr), info->vptr_len); else