Skip to content

Commit

Permalink
fix assert crashes
Browse files Browse the repository at this point in the history
  • Loading branch information
Anton Kulaga committed Sep 5, 2023
1 parent 9176610 commit 314f0d8
Show file tree
Hide file tree
Showing 6 changed files with 290 additions and 22 deletions.
3 changes: 2 additions & 1 deletion environment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ dependencies:
- ucsc-gtftogenepred
- pip
- pip:
- pycomfort>=0.0.12
- pycomfort>=0.0.14
- polars
- genomepy
- loguru
- git+https://github.com/MrTomRod/DnaFeaturesViewer #to fix bug with biopython 1.8
219 changes: 219 additions & 0 deletions examples/explore_mitochondria.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "initial_id",
"metadata": {
"collapsed": true,
"is_executing": true,
"ExecuteTime": {
"start_time": "2023-09-05T23:25:16.251706600Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading the genome with annotations from Ensembl, this may take a while. The results are cached\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"02:25:18 | INFO | Downloading genome from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-110/fasta/acanthochromis_polyacanthus/dna/Acanthochromis_polyacanthus.ASM210954v1.dna_sm.toplevel.fa.gz...\n",
"Download: 100%|██████████| 265M/265M [00:15<00:00, 18.0MB/s] \n",
"02:25:34 | INFO | Genome download successful, starting post processing...\n",
"02:25:39 | INFO | name: ASM210954v1\n",
"02:25:39 | INFO | local name: ASM210954v1\n",
"02:25:39 | INFO | fasta: /home/antonkulaga/.local/share/genomes/ASM210954v1/ASM210954v1.fa\n",
"Filtering Fasta: 16.6M lines [00:05, 3.09M lines/s]\n",
"02:26:06 | INFO | Downloading annotation from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-110/gtf/acanthochromis_polyacanthus/Acanthochromis_polyacanthus.ASM210954v1.110.gtf.gz...\n",
"Download: 100%|██████████| 11.7M/11.7M [00:01<00:00, 11.0MB/s]\n",
"02:26:11 | INFO | Annotation download successful\n"
]
}
],
"source": [
"from pathlib import *\n",
"from pycomfort.files import *"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"import sys\n",
"\n",
"base = Path(\"..\")\n",
"local = (base / \"genotations\").resolve()\n",
"if local.exists():\n",
" sys.path.insert(0, Path(\"..\").absolute().as_posix())\n",
" sys.path.insert(0, local)\n",
" print(sys.path)\n",
"else:\n",
" base = Path(\".\")\n",
"%load_ext autoreload\n",
"%autoreload 2"
],
"metadata": {
"collapsed": false,
"is_executing": true
},
"id": "1f3e9d797e482da8"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"import polars as pl\n",
"from genotations import genomes\n",
"from genotations.genomes import Annotations\n",
"from genotations import *\n",
"from genotations import ensembl\n",
"from genotations.quantification import *\n",
"from genotations.genomes import *"
],
"metadata": {
"collapsed": false,
"is_executing": true
},
"id": "1dac2ca0035ac603"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"pl.Config.set_tbl_width_chars(10000)\n",
"pl.Config.set_fmt_str_lengths(1000)\n",
"pl.Config.set_tbl_rows(20)"
],
"metadata": {
"collapsed": false,
"is_executing": true
},
"id": "bd54e6bb80abcde4"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"human = ensembl.human"
],
"metadata": {
"collapsed": false,
"is_executing": true
},
"id": "87b39d6be6dc123e"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"chimpanzee = ensembl.chimpanzee"
],
"metadata": {
"collapsed": false,
"is_executing": true
},
"id": "47f84c5329edbc04"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"chimpanzee"
],
"metadata": {
"collapsed": false,
"is_executing": true
},
"id": "4dc51d36a272cd5c"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"chimpanzee.annotations"
],
"metadata": {
"collapsed": false,
"is_executing": true
},
"id": "aeae01cc8f522bd7"
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"01:59:22 | ERROR | assembly GCA_944319715.1 should be in assembly genomes!\n",
"01:59:22 | ERROR | assembly GCA_944319725.1 should be in assembly genomes!\n"
]
},
{
"data": {
"text/plain": "'/home/antonkulaga/.local/share/genomes/Pan_tro_3.0/Pan_tro_3.0.annotation.gtf'"
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chimpanzee.genome.annotation_gtf_file"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-09-05T22:59:22.122219627Z",
"start_time": "2023-09-05T22:59:18.611651653Z"
}
},
"id": "d8c2d705f97d9607"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
},
"id": "499ee96ebc107cd5"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
79 changes: 63 additions & 16 deletions genotations/ensembl.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
from functools import cached_property
from typing import Optional

from pathlib import Path
from typing import Optional, Union
from pathlib import Path
import shutil
import gzip
from typing import Union
import genomepy
import loguru

from genotations.genomes import Annotations

Expand All @@ -26,20 +31,60 @@ class SpeciesInfo:
common_name: str
species_name: str
genomes_dir: Optional[str]
broken: bool = False
gtf_path: str



@staticmethod
def download_and_ungzip_if_needed(annotation_gtf_file: str, gtf_path: str) -> Union[str, None]:
# Convert the string paths to Path objects
annotation_gtf_path = Path(annotation_gtf_file)
gtf_file_path = Path(gtf_path)

# Get the parent folder from the annotation_gtf_file
parent_folder = annotation_gtf_path.parent

# Create the full path where the gtf_path file will be saved
destination_path = parent_folder / gtf_file_path.name

# Download the file (simulating the download with a file copy for now)
shutil.copy(gtf_file_path, destination_path)

# If the file ends with .gz, ungzip it
if destination_path.suffix == '.gz':
with gzip.open(destination_path, 'rb') as f_in:
with open(destination_path.with_suffix(''), 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
destination_path.unlink()
return str(destination_path.with_suffix(''))

return str(destination_path)



def __init__(self, common_name: str, assembly_name: str, genomes_dir: Optional[str] = None):
def __init__(self, common_name: str, assembly_name: str, genomes_dir: Optional[str] = None, alternative_gtf: Optional[Union[Path, str]] = None):
"""
Loads genome and annotations from genomepy in a more organized way
:param common_name: common name of the species
:param assembly_name: name of the genome assembly
"""
assert assembly_name in ens.genomes, "assembly should be in assembly genomes!"
if assembly_name not in ens.genomes:
loguru.logger.error(f"assembly {assembly_name} should be in assembly genomes!")
self.broken = True
self.assembly_name = assembly_name
self.common_name = common_name
self.assembly = ens.genomes[assembly_name]
self.species_name = self.assembly["name"]
self.genomes_dir = genomes_dir
if not self.broken:
self.assembly = ens.genomes[assembly_name]
self.species_name = self.assembly["name"]
self.genomes_dir = genomes_dir
if alternative_gtf is not None:
if "http" in alternative_gtf or "ftp" in alternative_gtf:
self.gtf_path = self.download_and_ungzip_if_needed(alternative_gtf)
else:
self.gtf_path = alternative_gtf
else:
self.gtf_path = self.genome.annotation_gtf_file

@cached_property
def genome(self):
Expand All @@ -59,7 +104,7 @@ def annotations(self) -> Annotations:
NOTE: if the genome is not downloaded, also starts the download
:return: Annotation class instance for chained calls
"""
return Annotations(self.genome.annotation_gtf_file)
return Annotations(self.gtf_path)

species: dict[str, SpeciesInfo] = {
'Acanthochromis_polyacanthus': SpeciesInfo("Spiny chromis", "ASM210954v1"),
Expand Down Expand Up @@ -137,7 +182,7 @@ def annotations(self) -> Annotations:
'Coturnix_japonica': SpeciesInfo("Japanese quail", "Coturnix_japonica_2.0"),
'Cricetulus_griseus_chok1gshd': SpeciesInfo("Chinese hamster CHOK1GS", "CHOK1GS_HDv1"),
'Cricetulus_griseus_crigri': SpeciesInfo("Chinese hamster CriGri", "CriGri_1.0"),
'Cricetulus_griseus_picr': SpeciesInfo("Chinese hamster PICR", "CriGri-PICR"),
'Cricetulus_griseus_picr': SpeciesInfo("Chinese hamster PICR", "CriGri-PICRH-1.0"),
'Crocodylus_porosus': SpeciesInfo("Australian saltwater crocodile", "CroPor_comp1"),
'Cyanistes_caeruleus': SpeciesInfo("Blue tit", "cyaCae2"),
'Cyclopterus_lumpus': SpeciesInfo("Lumpfish", "fCycLum1.pri"),
Expand All @@ -154,7 +199,7 @@ def annotations(self) -> Annotations:
'Dicentrarchus_labrax': SpeciesInfo("European seabass", "dlabrax2021"),
'Dipodomys_ordii': SpeciesInfo("Kangaroo rat", "Dord_2.0"),
'Dromaius_novaehollandiae': SpeciesInfo("Emu", "droNov1"),
'Drosophila_melanogaster': SpeciesInfo("Drosophila melanogaster", "BDGP6.32"),
'Drosophila_melanogaster': SpeciesInfo("Drosophila melanogaster", "BDGP6.46"),
'Echeneis_naucrates': SpeciesInfo("Live sharksucker", "fEcheNa1.1"),
'Echinops_telfairi': SpeciesInfo("Lesser hedgehog tenrec", "TENREC"),
'Electrophorus_electricus': SpeciesInfo("Electric eel", "Ee_SOAP_WITH_SSPACE"),
Expand Down Expand Up @@ -182,10 +227,10 @@ def annotations(self) -> Annotations:
'Gorilla_gorilla': SpeciesInfo("Gorilla", "gorGor4"),
'Gouania_willdenowi': SpeciesInfo("Blunt-snouted clingfish", "fGouWil2.1"),
'Haplochromis_burtoni': SpeciesInfo("Burton's mouthbrooder", "AstBur1.0"),
'Heterocephalus_glaber_female': SpeciesInfo("Naked mole-rat female", "HetGla_female_1.0"),
'Heterocephalus_glaber_male': SpeciesInfo("Naked mole-rat male", "HetGla_1.0"),
'Heterocephalus_glaber_female': SpeciesInfo("Naked mole-rat female", "GCA_944319715.1"),
'Heterocephalus_glaber_male': SpeciesInfo("Naked mole-rat male", "GCA_944319725.1"),
'Hippocampus_comes': SpeciesInfo("Tiger tail seahorse", "H_comes_QL1_v1"),
'Homo_sapiens': SpeciesInfo("Human", "GRCh38.p13"),
'Homo_sapiens': SpeciesInfo("Human", "GRCh38.p14"),
'Hucho_hucho': SpeciesInfo("Huchen", "ASM331708v1"),
'Ictalurus_punctatus': SpeciesInfo("Channel catfish", "IpCoco_1.2"),
'Ictidomys_tridecemlineatus': SpeciesInfo("Squirrel", "SpeTri2.0"),
Expand Down Expand Up @@ -277,7 +322,7 @@ def annotations(self) -> Annotations:
'Ovis_aries': SpeciesInfo("Sheep (texel)", "Oar_v3.1"),
'Ovis_aries_rambouillet': SpeciesInfo("Sheep", "Oar_rambouillet_v1.0"),
'Pan_paniscus': SpeciesInfo("Bonobo", "panpan1.1"),
'Pan_troglodytes': SpeciesInfo("Chimpanzee", "Pan_tro_3.0"),
'Pan_troglodytes': SpeciesInfo("Chimpanzee", "Pan_tro_3.0", alternative_gtf="http://ftp.ensembl.org/pub/release-110/gtf/pan_troglodytes/Pan_troglodytes.Pan_tro_3.0.110.chr.gtf.gz"),
'Panthera_leo': SpeciesInfo("Lion", "PanLeo1.0"),
'Panthera_pardus': SpeciesInfo("Leopard", "PanPar1.0"),
'Panthera_tigris_altaica': SpeciesInfo("Tiger", "PanTig1.0"),
Expand Down Expand Up @@ -381,8 +426,10 @@ def annotations(self) -> Annotations:

}

human = species["Homo_sapiens"] # used for faster access to common human genome
mouse = species["Mus_musculus"] # used for faster access to common mouse genome
human = species["Homo_sapiens"] # used for faster access to common human genome
mouse = species["Mus_musculus"] # used for faster access to common mouse genome
gorilla = species["Gorilla_gorilla"] # used for faster access to common gorilla genome
chimpanzee = species["Pan_troglodytes"] # used for faster access to common chimpanzee genome
rat = species["Rattus_norvegicus"] # rats
cow = species["Bos_taurus"]
chlamy = species['Chlamydomonas reinhardtii']
Expand Down
3 changes: 2 additions & 1 deletion genotations/genomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ def __init__(self, gtf: Union[Path, str, pl.DataFrame]):
# if we already have the dataframe (for example in chained calls) than just assign it
self.annotations_df = gtf


def transform(self, fun: Callable[[pl.DataFrame], pl.DataFrame]):
return Annotations(fun(self.annotations_df))

Expand All @@ -121,7 +122,7 @@ def read_GTF(self, path: Union[str, Path]) -> pl.DataFrame:
:return: polards datafra
"""
att = pl.col("attribute")
loaded = pl.read_csv(str(path), has_header=False, comment_char="#", sep="\t",
loaded = pl.read_csv(str(path), has_header=False, comment_char="#", separator="\t",
new_columns=["seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute"],
dtypes={
#sometimes polars makes mistakes in automatic type derivation with some fields
Expand Down
2 changes: 1 addition & 1 deletion genotations/quantification.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def read_quant(path: Path, transcripts: bool) -> pl.DataFrame:
alias = "transcript" if transcripts else "gene"
gene = pl.col("Name").str.split(".").apply(lambda s: s[0]).alias(alias)
dtypes={"TPM": pl.datatypes.Float64, "EffectiveLength": pl.datatypes.Float64, "NumReads": pl.datatypes.Float64}
return pl.read_csv(path, sep="\t", dtypes = dtypes).with_columns([gene]).select([gene, pl.col("TPM"), pl.col("EffectiveLength"), pl.col("NumReads")])
return pl.read_csv(path, separator="\t", dtypes = dtypes).with_columns([gene]).select([gene, pl.col("TPM"), pl.col("EffectiveLength"), pl.col("NumReads")])


def quant_from_run(run: Path, name_part: str = "quant.sf", dir_part: str = "quant_") -> Optional[pl.DataFrame]:
Expand Down
Loading

0 comments on commit 314f0d8

Please sign in to comment.