From 314f0d8e3192c569d5308d464309c03b6245774d Mon Sep 17 00:00:00 2001 From: Anton Kulaga Date: Wed, 6 Sep 2023 02:26:30 +0300 Subject: [PATCH] fix assert crashes --- environment.yaml | 3 +- examples/explore_mitochondria.ipynb | 219 ++++++++++++++++++++++++++++ genotations/ensembl.py | 79 ++++++++-- genotations/genomes.py | 3 +- genotations/quantification.py | 2 +- setup.py | 6 +- 6 files changed, 290 insertions(+), 22 deletions(-) create mode 100644 examples/explore_mitochondria.ipynb diff --git a/environment.yaml b/environment.yaml index 2a0f0da..fc6347d 100644 --- a/environment.yaml +++ b/environment.yaml @@ -19,7 +19,8 @@ dependencies: - ucsc-gtftogenepred - pip - pip: - - pycomfort>=0.0.12 + - pycomfort>=0.0.14 - polars - genomepy + - loguru - git+https://github.com/MrTomRod/DnaFeaturesViewer #to fix bug with biopython 1.8 diff --git a/examples/explore_mitochondria.ipynb b/examples/explore_mitochondria.ipynb new file mode 100644 index 0000000..8364a4c --- /dev/null +++ b/examples/explore_mitochondria.ipynb @@ -0,0 +1,219 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "initial_id", + "metadata": { + "collapsed": true, + "is_executing": true, + "ExecuteTime": { + "start_time": "2023-09-05T23:25:16.251706600Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading the genome with annotations from Ensembl, this may take a while. The results are cached\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "02:25:18 | INFO | Downloading genome from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-110/fasta/acanthochromis_polyacanthus/dna/Acanthochromis_polyacanthus.ASM210954v1.dna_sm.toplevel.fa.gz...\n", + "Download: 100%|██████████| 265M/265M [00:15<00:00, 18.0MB/s] \n", + "02:25:34 | INFO | Genome download successful, starting post processing...\n", + "02:25:39 | INFO | name: ASM210954v1\n", + "02:25:39 | INFO | local name: ASM210954v1\n", + "02:25:39 | INFO | fasta: /home/antonkulaga/.local/share/genomes/ASM210954v1/ASM210954v1.fa\n", + "Filtering Fasta: 16.6M lines [00:05, 3.09M lines/s]\n", + "02:26:06 | INFO | Downloading annotation from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-110/gtf/acanthochromis_polyacanthus/Acanthochromis_polyacanthus.ASM210954v1.110.gtf.gz...\n", + "Download: 100%|██████████| 11.7M/11.7M [00:01<00:00, 11.0MB/s]\n", + "02:26:11 | INFO | Annotation download successful\n" + ] + } + ], + "source": [ + "from pathlib import *\n", + "from pycomfort.files import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "base = Path(\"..\")\n", + "local = (base / \"genotations\").resolve()\n", + "if local.exists():\n", + " sys.path.insert(0, Path(\"..\").absolute().as_posix())\n", + " sys.path.insert(0, local)\n", + " print(sys.path)\n", + "else:\n", + " base = Path(\".\")\n", + "%load_ext autoreload\n", + "%autoreload 2" + ], + "metadata": { + "collapsed": false, + "is_executing": true + }, + "id": "1f3e9d797e482da8" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "import polars as pl\n", + "from genotations import genomes\n", + "from genotations.genomes import Annotations\n", + "from genotations import *\n", + "from genotations import ensembl\n", + "from genotations.quantification import *\n", + "from genotations.genomes import *" + ], + "metadata": { + "collapsed": false, + "is_executing": true + }, + "id": "1dac2ca0035ac603" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "pl.Config.set_tbl_width_chars(10000)\n", + "pl.Config.set_fmt_str_lengths(1000)\n", + "pl.Config.set_tbl_rows(20)" + ], + "metadata": { + "collapsed": false, + "is_executing": true + }, + "id": "bd54e6bb80abcde4" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "human = ensembl.human" + ], + "metadata": { + "collapsed": false, + "is_executing": true + }, + "id": "87b39d6be6dc123e" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "chimpanzee = ensembl.chimpanzee" + ], + "metadata": { + "collapsed": false, + "is_executing": true + }, + "id": "47f84c5329edbc04" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "chimpanzee" + ], + "metadata": { + "collapsed": false, + "is_executing": true + }, + "id": "4dc51d36a272cd5c" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "chimpanzee.annotations" + ], + "metadata": { + "collapsed": false, + "is_executing": true + }, + "id": "aeae01cc8f522bd7" + }, + { + "cell_type": "code", + "execution_count": 13, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "01:59:22 | ERROR | assembly GCA_944319715.1 should be in assembly genomes!\n", + "01:59:22 | ERROR | assembly GCA_944319725.1 should be in assembly genomes!\n" + ] + }, + { + "data": { + "text/plain": "'/home/antonkulaga/.local/share/genomes/Pan_tro_3.0/Pan_tro_3.0.annotation.gtf'" + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chimpanzee.genome.annotation_gtf_file" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-09-05T22:59:22.122219627Z", + "start_time": "2023-09-05T22:59:18.611651653Z" + } + }, + "id": "d8c2d705f97d9607" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + }, + "id": "499ee96ebc107cd5" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/genotations/ensembl.py b/genotations/ensembl.py index 512da11..acfa896 100644 --- a/genotations/ensembl.py +++ b/genotations/ensembl.py @@ -1,7 +1,12 @@ from functools import cached_property -from typing import Optional - +from pathlib import Path +from typing import Optional, Union +from pathlib import Path +import shutil +import gzip +from typing import Union import genomepy +import loguru from genotations.genomes import Annotations @@ -26,20 +31,60 @@ class SpeciesInfo: common_name: str species_name: str genomes_dir: Optional[str] + broken: bool = False + gtf_path: str + + + + @staticmethod + def download_and_ungzip_if_needed(annotation_gtf_file: str, gtf_path: str) -> Union[str, None]: + # Convert the string paths to Path objects + annotation_gtf_path = Path(annotation_gtf_file) + gtf_file_path = Path(gtf_path) + + # Get the parent folder from the annotation_gtf_file + parent_folder = annotation_gtf_path.parent + + # Create the full path where the gtf_path file will be saved + destination_path = parent_folder / gtf_file_path.name + + # Download the file (simulating the download with a file copy for now) + shutil.copy(gtf_file_path, destination_path) + + # If the file ends with .gz, ungzip it + if destination_path.suffix == '.gz': + with gzip.open(destination_path, 'rb') as f_in: + with open(destination_path.with_suffix(''), 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + destination_path.unlink() + return str(destination_path.with_suffix('')) + + return str(destination_path) + - def __init__(self, common_name: str, assembly_name: str, genomes_dir: Optional[str] = None): + def __init__(self, common_name: str, assembly_name: str, genomes_dir: Optional[str] = None, alternative_gtf: Optional[Union[Path, str]] = None): """ Loads genome and annotations from genomepy in a more organized way :param common_name: common name of the species :param assembly_name: name of the genome assembly """ - assert assembly_name in ens.genomes, "assembly should be in assembly genomes!" + if assembly_name not in ens.genomes: + loguru.logger.error(f"assembly {assembly_name} should be in assembly genomes!") + self.broken = True self.assembly_name = assembly_name self.common_name = common_name - self.assembly = ens.genomes[assembly_name] - self.species_name = self.assembly["name"] - self.genomes_dir = genomes_dir + if not self.broken: + self.assembly = ens.genomes[assembly_name] + self.species_name = self.assembly["name"] + self.genomes_dir = genomes_dir + if alternative_gtf is not None: + if "http" in alternative_gtf or "ftp" in alternative_gtf: + self.gtf_path = self.download_and_ungzip_if_needed(alternative_gtf) + else: + self.gtf_path = alternative_gtf + else: + self.gtf_path = self.genome.annotation_gtf_file @cached_property def genome(self): @@ -59,7 +104,7 @@ def annotations(self) -> Annotations: NOTE: if the genome is not downloaded, also starts the download :return: Annotation class instance for chained calls """ - return Annotations(self.genome.annotation_gtf_file) + return Annotations(self.gtf_path) species: dict[str, SpeciesInfo] = { 'Acanthochromis_polyacanthus': SpeciesInfo("Spiny chromis", "ASM210954v1"), @@ -137,7 +182,7 @@ def annotations(self) -> Annotations: 'Coturnix_japonica': SpeciesInfo("Japanese quail", "Coturnix_japonica_2.0"), 'Cricetulus_griseus_chok1gshd': SpeciesInfo("Chinese hamster CHOK1GS", "CHOK1GS_HDv1"), 'Cricetulus_griseus_crigri': SpeciesInfo("Chinese hamster CriGri", "CriGri_1.0"), - 'Cricetulus_griseus_picr': SpeciesInfo("Chinese hamster PICR", "CriGri-PICR"), + 'Cricetulus_griseus_picr': SpeciesInfo("Chinese hamster PICR", "CriGri-PICRH-1.0"), 'Crocodylus_porosus': SpeciesInfo("Australian saltwater crocodile", "CroPor_comp1"), 'Cyanistes_caeruleus': SpeciesInfo("Blue tit", "cyaCae2"), 'Cyclopterus_lumpus': SpeciesInfo("Lumpfish", "fCycLum1.pri"), @@ -154,7 +199,7 @@ def annotations(self) -> Annotations: 'Dicentrarchus_labrax': SpeciesInfo("European seabass", "dlabrax2021"), 'Dipodomys_ordii': SpeciesInfo("Kangaroo rat", "Dord_2.0"), 'Dromaius_novaehollandiae': SpeciesInfo("Emu", "droNov1"), - 'Drosophila_melanogaster': SpeciesInfo("Drosophila melanogaster", "BDGP6.32"), + 'Drosophila_melanogaster': SpeciesInfo("Drosophila melanogaster", "BDGP6.46"), 'Echeneis_naucrates': SpeciesInfo("Live sharksucker", "fEcheNa1.1"), 'Echinops_telfairi': SpeciesInfo("Lesser hedgehog tenrec", "TENREC"), 'Electrophorus_electricus': SpeciesInfo("Electric eel", "Ee_SOAP_WITH_SSPACE"), @@ -182,10 +227,10 @@ def annotations(self) -> Annotations: 'Gorilla_gorilla': SpeciesInfo("Gorilla", "gorGor4"), 'Gouania_willdenowi': SpeciesInfo("Blunt-snouted clingfish", "fGouWil2.1"), 'Haplochromis_burtoni': SpeciesInfo("Burton's mouthbrooder", "AstBur1.0"), - 'Heterocephalus_glaber_female': SpeciesInfo("Naked mole-rat female", "HetGla_female_1.0"), - 'Heterocephalus_glaber_male': SpeciesInfo("Naked mole-rat male", "HetGla_1.0"), + 'Heterocephalus_glaber_female': SpeciesInfo("Naked mole-rat female", "GCA_944319715.1"), + 'Heterocephalus_glaber_male': SpeciesInfo("Naked mole-rat male", "GCA_944319725.1"), 'Hippocampus_comes': SpeciesInfo("Tiger tail seahorse", "H_comes_QL1_v1"), - 'Homo_sapiens': SpeciesInfo("Human", "GRCh38.p13"), + 'Homo_sapiens': SpeciesInfo("Human", "GRCh38.p14"), 'Hucho_hucho': SpeciesInfo("Huchen", "ASM331708v1"), 'Ictalurus_punctatus': SpeciesInfo("Channel catfish", "IpCoco_1.2"), 'Ictidomys_tridecemlineatus': SpeciesInfo("Squirrel", "SpeTri2.0"), @@ -277,7 +322,7 @@ def annotations(self) -> Annotations: 'Ovis_aries': SpeciesInfo("Sheep (texel)", "Oar_v3.1"), 'Ovis_aries_rambouillet': SpeciesInfo("Sheep", "Oar_rambouillet_v1.0"), 'Pan_paniscus': SpeciesInfo("Bonobo", "panpan1.1"), - 'Pan_troglodytes': SpeciesInfo("Chimpanzee", "Pan_tro_3.0"), + 'Pan_troglodytes': SpeciesInfo("Chimpanzee", "Pan_tro_3.0", alternative_gtf="http://ftp.ensembl.org/pub/release-110/gtf/pan_troglodytes/Pan_troglodytes.Pan_tro_3.0.110.chr.gtf.gz"), 'Panthera_leo': SpeciesInfo("Lion", "PanLeo1.0"), 'Panthera_pardus': SpeciesInfo("Leopard", "PanPar1.0"), 'Panthera_tigris_altaica': SpeciesInfo("Tiger", "PanTig1.0"), @@ -381,8 +426,10 @@ def annotations(self) -> Annotations: } -human = species["Homo_sapiens"] # used for faster access to common human genome -mouse = species["Mus_musculus"] # used for faster access to common mouse genome +human = species["Homo_sapiens"] # used for faster access to common human genome +mouse = species["Mus_musculus"] # used for faster access to common mouse genome +gorilla = species["Gorilla_gorilla"] # used for faster access to common gorilla genome +chimpanzee = species["Pan_troglodytes"] # used for faster access to common chimpanzee genome rat = species["Rattus_norvegicus"] # rats cow = species["Bos_taurus"] chlamy = species['Chlamydomonas reinhardtii'] diff --git a/genotations/genomes.py b/genotations/genomes.py index 26dc88d..81b912d 100644 --- a/genotations/genomes.py +++ b/genotations/genomes.py @@ -106,6 +106,7 @@ def __init__(self, gtf: Union[Path, str, pl.DataFrame]): # if we already have the dataframe (for example in chained calls) than just assign it self.annotations_df = gtf + def transform(self, fun: Callable[[pl.DataFrame], pl.DataFrame]): return Annotations(fun(self.annotations_df)) @@ -121,7 +122,7 @@ def read_GTF(self, path: Union[str, Path]) -> pl.DataFrame: :return: polards datafra """ att = pl.col("attribute") - loaded = pl.read_csv(str(path), has_header=False, comment_char="#", sep="\t", + loaded = pl.read_csv(str(path), has_header=False, comment_char="#", separator="\t", new_columns=["seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute"], dtypes={ #sometimes polars makes mistakes in automatic type derivation with some fields diff --git a/genotations/quantification.py b/genotations/quantification.py index 6adae3e..84a170d 100644 --- a/genotations/quantification.py +++ b/genotations/quantification.py @@ -23,7 +23,7 @@ def read_quant(path: Path, transcripts: bool) -> pl.DataFrame: alias = "transcript" if transcripts else "gene" gene = pl.col("Name").str.split(".").apply(lambda s: s[0]).alias(alias) dtypes={"TPM": pl.datatypes.Float64, "EffectiveLength": pl.datatypes.Float64, "NumReads": pl.datatypes.Float64} - return pl.read_csv(path, sep="\t", dtypes = dtypes).with_columns([gene]).select([gene, pl.col("TPM"), pl.col("EffectiveLength"), pl.col("NumReads")]) + return pl.read_csv(path, separator="\t", dtypes = dtypes).with_columns([gene]).select([gene, pl.col("TPM"), pl.col("EffectiveLength"), pl.col("NumReads")]) def quant_from_run(run: Path, name_part: str = "quant.sf", dir_part: str = "quant_") -> Optional[pl.DataFrame]: diff --git a/setup.py b/setup.py index a97c9c5..930d2d5 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ with codecs.open(os.path.join(here, "README.md"), encoding="utf-8") as fh: long_description = "\n" + fh.read() -VERSION = '0.1.6' +VERSION = '0.1.7' DESCRIPTION = 'Genotations - python library to work with genomes and primers' LONG_DESCRIPTION = 'Genotations - python library to work with genomes and primers' @@ -21,8 +21,8 @@ long_description_content_type="text/markdown", long_description=long_description, packages=find_packages(), - install_requires=['pyfunctional', 'more-itertools', 'click', 'pycomfort', 'polars', 'genomepy', 'primer3-py', "pyarrow"], - keywords=['python', 'utils', 'files'], + install_requires=['pyfunctional', 'more-itertools', 'click', 'pycomfort', 'polars', 'genomepy', 'primer3-py', "pyarrow", "loguru"], + keywords=['python', 'utils', 'files', "genetics", "ensembl", "genomes", "annotations"], classifiers=[ "Development Status :: 2 - Pre-Alpha", "Intended Audience :: Developers",