Merge pull request #163 from apriha/develop

v2.7.0
apriha · Nov 4, 2022 · 8abca57 · 8abca57
2 parents a20170f + d8a1d58
commit 8abca57
Show file tree

Hide file tree

Showing 20 changed files with 1,069 additions and 153 deletions.
diff --git a/.gitignore b/.gitignore
@@ -109,7 +109,7 @@ output*/
 resources/*
 
 # snps tests
-tests/output/
+tests/output/*
 tests/resources/*
 !tests/resources/gsa_chrpos_map.txt
 !tests/resources/gsa_rsid_map.txt
@@ -119,3 +119,7 @@ tests/input/discrepant_snps[12].csv
 tests/input/ftdna.csv.gz
 tests/input/generic.fa.gz
 tests/input/testvcf.vcf.gz
+!tests/output/
+!tests/output/vcf_qc/
+!tests/output/vcf_chrom_prefix_chr.vcf
+!tests/output/vcf_generic.vcf
diff --git a/README.rst b/README.rst
@@ -29,6 +29,7 @@ Build / Assembly Detection and Remapping
 
 Data Cleaning
 `````````````
+- Perform quality control (QC) / filter low quality SNPs based on `chip clusters <https://doi.org/10.1016/j.csbj.2021.06.040>`_
 - Fix several common issues when loading SNPs
 - Sort SNPs based on chromosome and position
 - Deduplicate RSIDs
@@ -39,6 +40,7 @@ Data Cleaning
 Analysis
 ````````
 - Derive sex from SNPs
+- Detect deduced genotype / chip array and chip version based on `chip clusters <https://doi.org/10.1016/j.csbj.2021.06.040>`_
 - Predict ancestry from SNPs (when installed with `ezancestry <https://github.com/arvkevi/ezancestry>`_)
 
 Supported Genotype Files
@@ -213,14 +215,14 @@ Ok, so far we've merged the SNPs from two files (ensuring the same build in the
 identifying discrepancies along the way). Then, we remapped the SNPs to Build 38. Now, let's save
 the merged and remapped dataset consisting of 1M+ SNPs to a tab-separated values (TSV) file:
 
->>> saved_snps = s.save("out.txt")
+>>> saved_snps = s.to_tsv("out.txt")
 Saving output/out.txt
 >>> print(saved_snps)
 output/out.txt
 
 Moreover, let's get the reference sequences for this assembly and save the SNPs as a VCF file:
 
->>> saved_snps = s.save("out.vcf", vcf=True)
+>>> saved_snps = s.to_vcf("out.vcf")
 Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz
 Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.2.fa.gz
 Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.3.fa.gz

diff --git a/docs/snps.rst b/docs/snps.rst
@@ -11,7 +11,7 @@ SNPs
     :exclude-members: sort_snps, remap_snps, save_snps, snp_count, get_snp_count, not_null_snps,
        get_summary, get_assembly, get_chromosomes, get_chromosomes_summary, duplicate_snps,
        discrepant_XY_snps, heterozygous_MT_snps, heterozygous_snps, homozygous_snps,
-       discrepant_positions, discrepant_genotypes, discrepant_snps, is_valid
+       discrepant_positions, discrepant_genotypes, discrepant_snps, is_valid, save
 
 snps.ensembl
 ------------
@@ -36,6 +36,7 @@ snps.io.reader
     :members:
     :undoc-members:
     :show-inheritance:
+    :exclude-members: read_file
 
 snps.io.writer
 ~~~~~~~~~~~~~~
@@ -44,6 +45,7 @@ snps.io.writer
     :members:
     :undoc-members:
     :show-inheritance:
+    :exclude-members: write_file
 
 snps.resources
 --------------

diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py
@@ -42,6 +42,7 @@
 import logging
 import os
 import re
+import warnings
 import zipfile
 import zlib
 
@@ -211,31 +212,9 @@ def read(self):
 
     @classmethod
     def read_file(cls, file, only_detect_source, resources, rsids):
-        """Read `file`.
-
-        Parameters
-        ----------
-        file : str or bytes
-            path to file to load or bytes to load
-        only_detect_source : bool
-            only detect the source of the data
-        resources : Resources
-            instance of Resources
-        rsids : tuple
-            rsids to extract if loading a VCF file
-
-        Returns
-        -------
-        dict
-            dict with the following items:
-
-            snps (pandas.DataFrame)
-                dataframe of parsed SNPs
-            source (str)
-                detected source of SNPs
-            phased (bool)
-                flag indicating if SNPs are phased
-        """
+        warnings.warn(
+            "This method will be removed in a future release.", DeprecationWarning
+        )
         r = cls(file, only_detect_source, resources, rsids)
         return r.read()
 

diff --git a/src/snps/io/writer.py b/src/snps/io/writer.py
@@ -37,11 +37,13 @@
 
 import datetime
 import logging
+import warnings
 
 import numpy as np
 import pandas as pd
 
 import snps
+from snps.io import get_empty_snps_dataframe
 from snps.utils import save_df_as_csv, clean_str
 
 logger = logging.getLogger(__name__)
@@ -57,6 +59,9 @@ def __init__(
         vcf=False,
         atomic=True,
         vcf_alt_unavailable=".",
+        vcf_chrom_prefix="",
+        vcf_qc_only=False,
+        vcf_qc_filter=False,
         **kwargs,
     ):
         """Initialize a `Writer`.
@@ -73,6 +78,12 @@ def __init__(
             atomically write output to a file on local filesystem
         vcf_alt_unavailable : str
             representation of VCF ALT allele when ALT is not able to be determined
+        vcf_chrom_prefix : str
+            prefix for chromosomes in VCF CHROM column
+        vcf_qc_only : bool
+            for VCF, output only SNPs that pass quality control
+        vcf_qc_filter : bool
+            for VCF, populate VCF FILTER column based on quality control results
         **kwargs
             additional parameters to `pandas.DataFrame.to_csv`
         """
@@ -81,9 +92,21 @@ def __init__(
         self._vcf = vcf
         self._atomic = atomic
         self._vcf_alt_unavailable = vcf_alt_unavailable
+        self._vcf_chrom_prefix = vcf_chrom_prefix
+        self._vcf_qc_only = vcf_qc_only
+        self._vcf_qc_filter = vcf_qc_filter
         self._kwargs = kwargs
 
     def write(self):
+        """Write SNPs to file or buffer.
+
+        Returns
+        -------
+        str
+            path to file in output directory if SNPs were saved, else empty str
+        discrepant_vcf_position : pd.DataFrame
+            SNPs with discrepant positions discovered while saving VCF
+        """
         if self._vcf:
             return self._write_vcf()
         else:
@@ -97,38 +120,21 @@ def write_file(
         vcf=False,
         atomic=True,
         vcf_alt_unavailable=".",
+        vcf_qc_only=False,
+        vcf_qc_filter=False,
         **kwargs,
     ):
-        """Save SNPs to file.
-
-        Parameters
-        ----------
-        snps : SNPs
-            SNPs to save to file or write to buffer
-        filename : str or buffer
-            filename for file to save or buffer to write to
-        vcf : bool
-            flag to save file as VCF
-        atomic : bool
-            atomically write output to a file on local filesystem
-        vcf_alt_unavailable : str
-            representation of VCF ALT allele when ALT is not able to be determined
-        **kwargs
-            additional parameters to `pandas.DataFrame.to_csv`
-
-        Returns
-        -------
-        str
-            path to file in output directory if SNPs were saved, else empty str
-        discrepant_vcf_position : pd.DataFrame
-            SNPs with discrepant positions discovered while saving VCF
-        """
+        warnings.warn(
+            "This method will be removed in a future release.", DeprecationWarning
+        )
         w = cls(
             snps=snps,
             filename=filename,
             vcf=vcf,
             atomic=atomic,
             vcf_alt_unavailable=vcf_alt_unavailable,
+            vcf_qc_only=vcf_qc_only,
+            vcf_qc_filter=vcf_qc_filter,
             **kwargs,
         )
         return w.write()
@@ -257,6 +263,12 @@ def _write_vcf(self):
                     "assembly": self._snps.assembly,
                     "chrom": chrom,
                     "snps": pd.DataFrame(df.loc[(df["chrom"] == chrom)]),
+                    "cluster": self._snps.cluster
+                    if self._vcf_qc_only or self._vcf_qc_filter
+                    else "",
+                    "low_quality_snps": self._snps.low_quality
+                    if self._vcf_qc_only or self._vcf_qc_filter
+                    else get_empty_snps_dataframe(),
                 }
             )
 
@@ -279,6 +291,10 @@ def _write_vcf(self):
         discrepant_vcf_position = pd.concat(discrepant_vcf_position)
 
         comment += "".join(contigs)
+
+        if self._vcf_qc_filter and self._snps.cluster:
+            comment += '##FILTER=<ID=lq,Description="Low quality SNP per Lu et al.: https://doi.org/10.1016/j.csbj.2021.06.040">\n'
+
         comment += '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n'
         comment += "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n"
 
@@ -302,6 +318,8 @@ def _create_vcf_representation(self, task):
         assembly = task["assembly"]
         chrom = task["chrom"]
         snps = task["snps"]
+        cluster = task["cluster"]
+        low_quality_snps = task["low_quality_snps"]
 
         if len(snps.loc[snps["genotype"].notnull()]) == 0:
             return {
@@ -315,6 +333,16 @@ def _create_vcf_representation(self, task):
 
         contig = f'##contig=<ID={seq.ID},URL={seq.url},length={seq.length},assembly={seq.build},md5={seq.md5},species="{seq.species}">\n'
 
+        if self._vcf_qc_only and cluster:
+            # drop low quality SNPs if SNPs object maps to a cluster
+            snps = snps.drop(snps.index.intersection(low_quality_snps.index))
+
+        if self._vcf_qc_filter and cluster:
+            # initialize filter for  all SNPs if SNPs object maps to a cluster,
+            snps["filter"] = "PASS"
+            # then indicate SNPs that were identified as low quality
+            snps.loc[snps.index.intersection(low_quality_snps.index), "filter"] = "lq"
+
         snps = snps.reset_index()
 
         df = pd.DataFrame(
@@ -346,10 +374,13 @@ def _create_vcf_representation(self, task):
             }
         )
 
-        df["CHROM"] = snps["chrom"]
+        df["CHROM"] = self._vcf_chrom_prefix + snps["chrom"]
         df["POS"] = snps["pos"]
         df["ID"] = snps["rsid"]
 
+        if self._vcf_qc_filter and cluster:
+            df["FILTER"] = snps["filter"]
+
         # drop SNPs with discrepant positions (outside reference sequence)
         discrepant_vcf_position = snps.loc[
             (snps.pos - seq.start < 0) | (snps.pos - seq.start > seq.length - 1)

diff --git a/src/snps/resources.py b/src/snps/resources.py
@@ -90,6 +90,8 @@ def _init_resource_attributes(self):
         self._gsa_chrpos_map = None
         self._dbsnp_151_37_reverse = None
         self._opensnp_datadump_filenames = []
+        self._chip_clusters = None
+        self._low_quality_snps = None
 
     def get_reference_sequences(
         self,
@@ -235,6 +237,8 @@ def get_all_resources(self):
                 source, target
             )
         resources["gsa_resources"] = self.get_gsa_resources()
+        resources["chip_clusters"] = self.get_chip_clusters()
+        resources["low_quality_snps"] = self.get_low_quality_snps()
         return resources
 
     def get_all_reference_sequences(self, **kwargs):
@@ -269,6 +273,90 @@ def get_gsa_resources(self):
             "dbsnp_151_37_reverse": self.get_dbsnp_151_37_reverse(),
         }
 
+    def get_chip_clusters(self):
+        """Get resource for identifying deduced genotype / chip array based on chip clusters.
+
+        Returns
+        -------
+        pandas.DataFrame
+
+        References
+        ----------
+        1. Chang Lu, Bastian Greshake Tzovaras, Julian Gough, A survey of
+           direct-to-consumer genotype data, and quality control tool
+           (GenomePrep) for research, Computational and Structural
+           Biotechnology Journal, Volume 19, 2021, Pages 3747-3754, ISSN
+           2001-0370, https://doi.org/10.1016/j.csbj.2021.06.040.
+        """
+        if self._chip_clusters is None:
+            chip_clusters_path = self._download_file(
+                "https://supfam.mrc-lmb.cam.ac.uk/GenomePrep/datadir/the_list.tsv.gz",
+                "chip_clusters.tsv.gz",
+            )
+
+            df = pd.read_csv(
+                chip_clusters_path,
+                sep="\t",
+                names=["locus", "clusters"],
+                dtype={"locus": object, "clusters": pd.CategoricalDtype(ordered=False)},
+            )
+            clusters = df.clusters
+            df = df.locus.str.split(":", expand=True)
+            df.rename({0: "chrom", 1: "pos"}, axis=1, inplace=True)
+            df.pos = df.pos.astype(np.uint32)
+            df.chrom = df.chrom.astype(pd.CategoricalDtype(ordered=False))
+            df["clusters"] = clusters
+
+            self._chip_clusters = df
+
+        return self._chip_clusters
+
+    def get_low_quality_snps(self):
+        """Get listing of low quality SNPs for quality control based on chip clusters.
+
+        Returns
+        -------
+        pandas.DataFrame
+
+        References
+        ----------
+        1. Chang Lu, Bastian Greshake Tzovaras, Julian Gough, A survey of
+           direct-to-consumer genotype data, and quality control tool
+           (GenomePrep) for research, Computational and Structural
+           Biotechnology Journal, Volume 19, 2021, Pages 3747-3754, ISSN
+           2001-0370, https://doi.org/10.1016/j.csbj.2021.06.040.
+        """
+        if self._low_quality_snps is None:
+            low_quality_snps_path = self._download_file(
+                "https://supfam.mrc-lmb.cam.ac.uk/GenomePrep/datadir/badalleles.tsv.gz",
+                "low_quality_snps.tsv.gz",
+            )
+
+            df = pd.read_csv(
+                low_quality_snps_path,
+                sep="\t",
+                names=["cluster", "loci"],
+            )
+
+            cluster_dfs = []
+            for row in df.itertuples():
+                loci = row.loci.split(",")
+                cluster_dfs.append(
+                    pd.DataFrame({"cluster": [row.cluster] * len(loci), "locus": loci})
+                )
+
+            df = pd.concat(cluster_dfs)
+            df.reset_index(inplace=True, drop=True)
+            cluster = df.cluster.astype(pd.CategoricalDtype(ordered=False))
+            df = df.locus.str.split(":", expand=True)
+            df.rename({0: "chrom", 1: "pos"}, axis=1, inplace=True)
+            df.pos = df.pos.astype(np.uint32)
+            df.chrom = df.chrom.astype(pd.CategoricalDtype(ordered=False))
+            df["cluster"] = cluster
+            self._low_quality_snps = df
+
+        return self._low_quality_snps
+
     def get_dbsnp_151_37_reverse(self):
         """Get and load RSIDs that are on the reference reverse (-) strand in dbSNP 151 and lower.