Skip to content

Commit

Permalink
Merge pull request #163 from apriha/develop
Browse files Browse the repository at this point in the history
v2.7.0
  • Loading branch information
apriha authored Nov 4, 2022
2 parents a20170f + d8a1d58 commit 8abca57
Show file tree
Hide file tree
Showing 20 changed files with 1,069 additions and 153 deletions.
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ output*/
resources/*

# snps tests
tests/output/
tests/output/*
tests/resources/*
!tests/resources/gsa_chrpos_map.txt
!tests/resources/gsa_rsid_map.txt
Expand All @@ -119,3 +119,7 @@ tests/input/discrepant_snps[12].csv
tests/input/ftdna.csv.gz
tests/input/generic.fa.gz
tests/input/testvcf.vcf.gz
!tests/output/
!tests/output/vcf_qc/
!tests/output/vcf_chrom_prefix_chr.vcf
!tests/output/vcf_generic.vcf
6 changes: 4 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ Build / Assembly Detection and Remapping

Data Cleaning
`````````````
- Perform quality control (QC) / filter low quality SNPs based on `chip clusters <https://doi.org/10.1016/j.csbj.2021.06.040>`_
- Fix several common issues when loading SNPs
- Sort SNPs based on chromosome and position
- Deduplicate RSIDs
Expand All @@ -39,6 +40,7 @@ Data Cleaning
Analysis
````````
- Derive sex from SNPs
- Detect deduced genotype / chip array and chip version based on `chip clusters <https://doi.org/10.1016/j.csbj.2021.06.040>`_
- Predict ancestry from SNPs (when installed with `ezancestry <https://github.com/arvkevi/ezancestry>`_)

Supported Genotype Files
Expand Down Expand Up @@ -213,14 +215,14 @@ Ok, so far we've merged the SNPs from two files (ensuring the same build in the
identifying discrepancies along the way). Then, we remapped the SNPs to Build 38. Now, let's save
the merged and remapped dataset consisting of 1M+ SNPs to a tab-separated values (TSV) file:

>>> saved_snps = s.save("out.txt")
>>> saved_snps = s.to_tsv("out.txt")
Saving output/out.txt
>>> print(saved_snps)
output/out.txt

Moreover, let's get the reference sequences for this assembly and save the SNPs as a VCF file:

>>> saved_snps = s.save("out.vcf", vcf=True)
>>> saved_snps = s.to_vcf("out.vcf")
Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz
Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.2.fa.gz
Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.3.fa.gz
Expand Down
4 changes: 3 additions & 1 deletion docs/snps.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ SNPs
:exclude-members: sort_snps, remap_snps, save_snps, snp_count, get_snp_count, not_null_snps,
get_summary, get_assembly, get_chromosomes, get_chromosomes_summary, duplicate_snps,
discrepant_XY_snps, heterozygous_MT_snps, heterozygous_snps, homozygous_snps,
discrepant_positions, discrepant_genotypes, discrepant_snps, is_valid
discrepant_positions, discrepant_genotypes, discrepant_snps, is_valid, save

snps.ensembl
------------
Expand All @@ -36,6 +36,7 @@ snps.io.reader
:members:
:undoc-members:
:show-inheritance:
:exclude-members: read_file

snps.io.writer
~~~~~~~~~~~~~~
Expand All @@ -44,6 +45,7 @@ snps.io.writer
:members:
:undoc-members:
:show-inheritance:
:exclude-members: write_file

snps.resources
--------------
Expand Down
29 changes: 4 additions & 25 deletions src/snps/io/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import logging
import os
import re
import warnings
import zipfile
import zlib

Expand Down Expand Up @@ -211,31 +212,9 @@ def read(self):

@classmethod
def read_file(cls, file, only_detect_source, resources, rsids):
"""Read `file`.
Parameters
----------
file : str or bytes
path to file to load or bytes to load
only_detect_source : bool
only detect the source of the data
resources : Resources
instance of Resources
rsids : tuple
rsids to extract if loading a VCF file
Returns
-------
dict
dict with the following items:
snps (pandas.DataFrame)
dataframe of parsed SNPs
source (str)
detected source of SNPs
phased (bool)
flag indicating if SNPs are phased
"""
warnings.warn(
"This method will be removed in a future release.", DeprecationWarning
)
r = cls(file, only_detect_source, resources, rsids)
return r.read()

Expand Down
81 changes: 56 additions & 25 deletions src/snps/io/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,13 @@

import datetime
import logging
import warnings

import numpy as np
import pandas as pd

import snps
from snps.io import get_empty_snps_dataframe
from snps.utils import save_df_as_csv, clean_str

logger = logging.getLogger(__name__)
Expand All @@ -57,6 +59,9 @@ def __init__(
vcf=False,
atomic=True,
vcf_alt_unavailable=".",
vcf_chrom_prefix="",
vcf_qc_only=False,
vcf_qc_filter=False,
**kwargs,
):
"""Initialize a `Writer`.
Expand All @@ -73,6 +78,12 @@ def __init__(
atomically write output to a file on local filesystem
vcf_alt_unavailable : str
representation of VCF ALT allele when ALT is not able to be determined
vcf_chrom_prefix : str
prefix for chromosomes in VCF CHROM column
vcf_qc_only : bool
for VCF, output only SNPs that pass quality control
vcf_qc_filter : bool
for VCF, populate VCF FILTER column based on quality control results
**kwargs
additional parameters to `pandas.DataFrame.to_csv`
"""
Expand All @@ -81,9 +92,21 @@ def __init__(
self._vcf = vcf
self._atomic = atomic
self._vcf_alt_unavailable = vcf_alt_unavailable
self._vcf_chrom_prefix = vcf_chrom_prefix
self._vcf_qc_only = vcf_qc_only
self._vcf_qc_filter = vcf_qc_filter
self._kwargs = kwargs

def write(self):
"""Write SNPs to file or buffer.
Returns
-------
str
path to file in output directory if SNPs were saved, else empty str
discrepant_vcf_position : pd.DataFrame
SNPs with discrepant positions discovered while saving VCF
"""
if self._vcf:
return self._write_vcf()
else:
Expand All @@ -97,38 +120,21 @@ def write_file(
vcf=False,
atomic=True,
vcf_alt_unavailable=".",
vcf_qc_only=False,
vcf_qc_filter=False,
**kwargs,
):
"""Save SNPs to file.
Parameters
----------
snps : SNPs
SNPs to save to file or write to buffer
filename : str or buffer
filename for file to save or buffer to write to
vcf : bool
flag to save file as VCF
atomic : bool
atomically write output to a file on local filesystem
vcf_alt_unavailable : str
representation of VCF ALT allele when ALT is not able to be determined
**kwargs
additional parameters to `pandas.DataFrame.to_csv`
Returns
-------
str
path to file in output directory if SNPs were saved, else empty str
discrepant_vcf_position : pd.DataFrame
SNPs with discrepant positions discovered while saving VCF
"""
warnings.warn(
"This method will be removed in a future release.", DeprecationWarning
)
w = cls(
snps=snps,
filename=filename,
vcf=vcf,
atomic=atomic,
vcf_alt_unavailable=vcf_alt_unavailable,
vcf_qc_only=vcf_qc_only,
vcf_qc_filter=vcf_qc_filter,
**kwargs,
)
return w.write()
Expand Down Expand Up @@ -257,6 +263,12 @@ def _write_vcf(self):
"assembly": self._snps.assembly,
"chrom": chrom,
"snps": pd.DataFrame(df.loc[(df["chrom"] == chrom)]),
"cluster": self._snps.cluster
if self._vcf_qc_only or self._vcf_qc_filter
else "",
"low_quality_snps": self._snps.low_quality
if self._vcf_qc_only or self._vcf_qc_filter
else get_empty_snps_dataframe(),
}
)

Expand All @@ -279,6 +291,10 @@ def _write_vcf(self):
discrepant_vcf_position = pd.concat(discrepant_vcf_position)

comment += "".join(contigs)

if self._vcf_qc_filter and self._snps.cluster:
comment += '##FILTER=<ID=lq,Description="Low quality SNP per Lu et al.: https://doi.org/10.1016/j.csbj.2021.06.040">\n'

comment += '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n'
comment += "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n"

Expand All @@ -302,6 +318,8 @@ def _create_vcf_representation(self, task):
assembly = task["assembly"]
chrom = task["chrom"]
snps = task["snps"]
cluster = task["cluster"]
low_quality_snps = task["low_quality_snps"]

if len(snps.loc[snps["genotype"].notnull()]) == 0:
return {
Expand All @@ -315,6 +333,16 @@ def _create_vcf_representation(self, task):

contig = f'##contig=<ID={seq.ID},URL={seq.url},length={seq.length},assembly={seq.build},md5={seq.md5},species="{seq.species}">\n'

if self._vcf_qc_only and cluster:
# drop low quality SNPs if SNPs object maps to a cluster
snps = snps.drop(snps.index.intersection(low_quality_snps.index))

if self._vcf_qc_filter and cluster:
# initialize filter for all SNPs if SNPs object maps to a cluster,
snps["filter"] = "PASS"
# then indicate SNPs that were identified as low quality
snps.loc[snps.index.intersection(low_quality_snps.index), "filter"] = "lq"

snps = snps.reset_index()

df = pd.DataFrame(
Expand Down Expand Up @@ -346,10 +374,13 @@ def _create_vcf_representation(self, task):
}
)

df["CHROM"] = snps["chrom"]
df["CHROM"] = self._vcf_chrom_prefix + snps["chrom"]
df["POS"] = snps["pos"]
df["ID"] = snps["rsid"]

if self._vcf_qc_filter and cluster:
df["FILTER"] = snps["filter"]

# drop SNPs with discrepant positions (outside reference sequence)
discrepant_vcf_position = snps.loc[
(snps.pos - seq.start < 0) | (snps.pos - seq.start > seq.length - 1)
Expand Down
88 changes: 88 additions & 0 deletions src/snps/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ def _init_resource_attributes(self):
self._gsa_chrpos_map = None
self._dbsnp_151_37_reverse = None
self._opensnp_datadump_filenames = []
self._chip_clusters = None
self._low_quality_snps = None

def get_reference_sequences(
self,
Expand Down Expand Up @@ -235,6 +237,8 @@ def get_all_resources(self):
source, target
)
resources["gsa_resources"] = self.get_gsa_resources()
resources["chip_clusters"] = self.get_chip_clusters()
resources["low_quality_snps"] = self.get_low_quality_snps()
return resources

def get_all_reference_sequences(self, **kwargs):
Expand Down Expand Up @@ -269,6 +273,90 @@ def get_gsa_resources(self):
"dbsnp_151_37_reverse": self.get_dbsnp_151_37_reverse(),
}

def get_chip_clusters(self):
"""Get resource for identifying deduced genotype / chip array based on chip clusters.
Returns
-------
pandas.DataFrame
References
----------
1. Chang Lu, Bastian Greshake Tzovaras, Julian Gough, A survey of
direct-to-consumer genotype data, and quality control tool
(GenomePrep) for research, Computational and Structural
Biotechnology Journal, Volume 19, 2021, Pages 3747-3754, ISSN
2001-0370, https://doi.org/10.1016/j.csbj.2021.06.040.
"""
if self._chip_clusters is None:
chip_clusters_path = self._download_file(
"https://supfam.mrc-lmb.cam.ac.uk/GenomePrep/datadir/the_list.tsv.gz",
"chip_clusters.tsv.gz",
)

df = pd.read_csv(
chip_clusters_path,
sep="\t",
names=["locus", "clusters"],
dtype={"locus": object, "clusters": pd.CategoricalDtype(ordered=False)},
)
clusters = df.clusters
df = df.locus.str.split(":", expand=True)
df.rename({0: "chrom", 1: "pos"}, axis=1, inplace=True)
df.pos = df.pos.astype(np.uint32)
df.chrom = df.chrom.astype(pd.CategoricalDtype(ordered=False))
df["clusters"] = clusters

self._chip_clusters = df

return self._chip_clusters

def get_low_quality_snps(self):
"""Get listing of low quality SNPs for quality control based on chip clusters.
Returns
-------
pandas.DataFrame
References
----------
1. Chang Lu, Bastian Greshake Tzovaras, Julian Gough, A survey of
direct-to-consumer genotype data, and quality control tool
(GenomePrep) for research, Computational and Structural
Biotechnology Journal, Volume 19, 2021, Pages 3747-3754, ISSN
2001-0370, https://doi.org/10.1016/j.csbj.2021.06.040.
"""
if self._low_quality_snps is None:
low_quality_snps_path = self._download_file(
"https://supfam.mrc-lmb.cam.ac.uk/GenomePrep/datadir/badalleles.tsv.gz",
"low_quality_snps.tsv.gz",
)

df = pd.read_csv(
low_quality_snps_path,
sep="\t",
names=["cluster", "loci"],
)

cluster_dfs = []
for row in df.itertuples():
loci = row.loci.split(",")
cluster_dfs.append(
pd.DataFrame({"cluster": [row.cluster] * len(loci), "locus": loci})
)

df = pd.concat(cluster_dfs)
df.reset_index(inplace=True, drop=True)
cluster = df.cluster.astype(pd.CategoricalDtype(ordered=False))
df = df.locus.str.split(":", expand=True)
df.rename({0: "chrom", 1: "pos"}, axis=1, inplace=True)
df.pos = df.pos.astype(np.uint32)
df.chrom = df.chrom.astype(pd.CategoricalDtype(ordered=False))
df["cluster"] = cluster
self._low_quality_snps = df

return self._low_quality_snps

def get_dbsnp_151_37_reverse(self):
"""Get and load RSIDs that are on the reference reverse (-) strand in dbSNP 151 and lower.
Expand Down
Loading

0 comments on commit 8abca57

Please sign in to comment.