fix assert crashes

antonkulaga · Sep 5, 2023 · 314f0d8 · 314f0d8
1 parent 9176610
commit 314f0d8
Show file tree

Hide file tree

Showing 6 changed files with 290 additions and 22 deletions.
diff --git a/environment.yaml b/environment.yaml
@@ -19,7 +19,8 @@ dependencies:
     - ucsc-gtftogenepred
     - pip
     - pip:
-        - pycomfort>=0.0.12
+        - pycomfort>=0.0.14
         - polars
         - genomepy
+        - loguru
         - git+https://github.com/MrTomRod/DnaFeaturesViewer #to fix bug with biopython 1.8
diff --git a/examples/explore_mitochondria.ipynb b/examples/explore_mitochondria.ipynb
@@ -0,0 +1,219 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true,
+    "is_executing": true,
+    "ExecuteTime": {
+     "start_time": "2023-09-05T23:25:16.251706600Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading the genome with annotations from Ensembl, this may take a while. The results are cached\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "02:25:18 | INFO | Downloading genome from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-110/fasta/acanthochromis_polyacanthus/dna/Acanthochromis_polyacanthus.ASM210954v1.dna_sm.toplevel.fa.gz...\n",
+      "Download: 100%|██████████| 265M/265M [00:15<00:00, 18.0MB/s] \n",
+      "02:25:34 | INFO | Genome download successful, starting post processing...\n",
+      "02:25:39 | INFO | name: ASM210954v1\n",
+      "02:25:39 | INFO | local name: ASM210954v1\n",
+      "02:25:39 | INFO | fasta: /home/antonkulaga/.local/share/genomes/ASM210954v1/ASM210954v1.fa\n",
+      "Filtering Fasta: 16.6M lines [00:05, 3.09M lines/s]\n",
+      "02:26:06 | INFO | Downloading annotation from Ensembl. Target URL: http://ftp.ensembl.org/pub/release-110/gtf/acanthochromis_polyacanthus/Acanthochromis_polyacanthus.ASM210954v1.110.gtf.gz...\n",
+      "Download: 100%|██████████| 11.7M/11.7M [00:01<00:00, 11.0MB/s]\n",
+      "02:26:11 | INFO | Annotation download successful\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pathlib import *\n",
+    "from pycomfort.files import *"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "\n",
+    "base = Path(\"..\")\n",
+    "local = (base / \"genotations\").resolve()\n",
+    "if local.exists():\n",
+    "    sys.path.insert(0, Path(\"..\").absolute().as_posix())\n",
+    "    sys.path.insert(0, local)\n",
+    "    print(sys.path)\n",
+    "else:\n",
+    "    base = Path(\".\")\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "is_executing": true
+   },
+   "id": "1f3e9d797e482da8"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "import polars as pl\n",
+    "from genotations import genomes\n",
+    "from genotations.genomes import Annotations\n",
+    "from genotations import *\n",
+    "from genotations import ensembl\n",
+    "from genotations.quantification import *\n",
+    "from genotations.genomes import *"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "is_executing": true
+   },
+   "id": "1dac2ca0035ac603"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "pl.Config.set_tbl_width_chars(10000)\n",
+    "pl.Config.set_fmt_str_lengths(1000)\n",
+    "pl.Config.set_tbl_rows(20)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "is_executing": true
+   },
+   "id": "bd54e6bb80abcde4"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "human = ensembl.human"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "is_executing": true
+   },
+   "id": "87b39d6be6dc123e"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "chimpanzee = ensembl.chimpanzee"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "is_executing": true
+   },
+   "id": "47f84c5329edbc04"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "chimpanzee"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "is_executing": true
+   },
+   "id": "4dc51d36a272cd5c"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "chimpanzee.annotations"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "is_executing": true
+   },
+   "id": "aeae01cc8f522bd7"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "01:59:22 | ERROR | assembly GCA_944319715.1 should be in assembly genomes!\n",
+      "01:59:22 | ERROR | assembly GCA_944319725.1 should be in assembly genomes!\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": "'/home/antonkulaga/.local/share/genomes/Pan_tro_3.0/Pan_tro_3.0.annotation.gtf'"
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "chimpanzee.genome.annotation_gtf_file"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-09-05T22:59:22.122219627Z",
+     "start_time": "2023-09-05T22:59:18.611651653Z"
+    }
+   },
+   "id": "d8c2d705f97d9607"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "499ee96ebc107cd5"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/genotations/ensembl.py b/genotations/ensembl.py
@@ -1,7 +1,12 @@
 from functools import cached_property
-from typing import Optional
-
+from pathlib import Path
+from typing import Optional, Union
+from pathlib import Path
+import shutil
+import gzip
+from typing import Union
 import genomepy
+import loguru
 
 from genotations.genomes import Annotations
 
@@ -26,20 +31,60 @@ class SpeciesInfo:
     common_name: str
     species_name: str
     genomes_dir: Optional[str]
+    broken: bool = False
+    gtf_path: str
+
+
+
+    @staticmethod
+    def download_and_ungzip_if_needed(annotation_gtf_file: str, gtf_path: str) -> Union[str, None]:
+        # Convert the string paths to Path objects
+        annotation_gtf_path = Path(annotation_gtf_file)
+        gtf_file_path = Path(gtf_path)
+
+        # Get the parent folder from the annotation_gtf_file
+        parent_folder = annotation_gtf_path.parent
+
+        # Create the full path where the gtf_path file will be saved
+        destination_path = parent_folder / gtf_file_path.name
+
+        # Download the file (simulating the download with a file copy for now)
+        shutil.copy(gtf_file_path, destination_path)
+
+        # If the file ends with .gz, ungzip it
+        if destination_path.suffix == '.gz':
+            with gzip.open(destination_path, 'rb') as f_in:
+                with open(destination_path.with_suffix(''), 'wb') as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+            destination_path.unlink()
+            return str(destination_path.with_suffix(''))
+
+        return str(destination_path)
+
 
 
-    def __init__(self, common_name: str,  assembly_name: str, genomes_dir: Optional[str] = None):
+    def __init__(self, common_name: str,  assembly_name: str, genomes_dir: Optional[str] = None, alternative_gtf: Optional[Union[Path, str]] =  None):
         """
         Loads genome and annotations from genomepy in a more organized way
         :param common_name: common name of the species
         :param assembly_name: name of the genome assembly
         """
-        assert assembly_name in ens.genomes, "assembly should be in assembly genomes!"
+        if assembly_name not in ens.genomes:
+            loguru.logger.error(f"assembly {assembly_name} should be in assembly genomes!")
+            self.broken = True
         self.assembly_name = assembly_name
         self.common_name = common_name
-        self.assembly = ens.genomes[assembly_name]
-        self.species_name = self.assembly["name"]
-        self.genomes_dir = genomes_dir
+        if not self.broken:
+            self.assembly = ens.genomes[assembly_name]
+            self.species_name = self.assembly["name"]
+            self.genomes_dir = genomes_dir
+            if alternative_gtf is not None:
+                if "http" in alternative_gtf or "ftp" in alternative_gtf:
+                    self.gtf_path = self.download_and_ungzip_if_needed(alternative_gtf)
+                else:
+                    self.gtf_path = alternative_gtf
+            else:
+                self.gtf_path = self.genome.annotation_gtf_file
 
     @cached_property
     def genome(self):
@@ -59,7 +104,7 @@ def annotations(self) -> Annotations:
         NOTE: if the genome is not downloaded, also starts the download
         :return: Annotation class instance for chained calls
         """
-        return Annotations(self.genome.annotation_gtf_file)
+        return Annotations(self.gtf_path)
 
 species: dict[str, SpeciesInfo] = {
     'Acanthochromis_polyacanthus': SpeciesInfo("Spiny chromis", "ASM210954v1"),
@@ -137,7 +182,7 @@ def annotations(self) -> Annotations:
     'Coturnix_japonica': SpeciesInfo("Japanese quail", "Coturnix_japonica_2.0"),
     'Cricetulus_griseus_chok1gshd': SpeciesInfo("Chinese hamster CHOK1GS", "CHOK1GS_HDv1"),
     'Cricetulus_griseus_crigri': SpeciesInfo("Chinese hamster CriGri", "CriGri_1.0"),
-    'Cricetulus_griseus_picr': SpeciesInfo("Chinese hamster PICR", "CriGri-PICR"),
+    'Cricetulus_griseus_picr': SpeciesInfo("Chinese hamster PICR", "CriGri-PICRH-1.0"),
     'Crocodylus_porosus': SpeciesInfo("Australian saltwater crocodile", "CroPor_comp1"),
     'Cyanistes_caeruleus': SpeciesInfo("Blue tit", "cyaCae2"),
     'Cyclopterus_lumpus': SpeciesInfo("Lumpfish", "fCycLum1.pri"),
@@ -154,7 +199,7 @@ def annotations(self) -> Annotations:
     'Dicentrarchus_labrax': SpeciesInfo("European seabass", "dlabrax2021"),
     'Dipodomys_ordii': SpeciesInfo("Kangaroo rat", "Dord_2.0"),
     'Dromaius_novaehollandiae': SpeciesInfo("Emu", "droNov1"),
-    'Drosophila_melanogaster': SpeciesInfo("Drosophila melanogaster", "BDGP6.32"),
+    'Drosophila_melanogaster': SpeciesInfo("Drosophila melanogaster", "BDGP6.46"),
     'Echeneis_naucrates': SpeciesInfo("Live sharksucker", "fEcheNa1.1"),
     'Echinops_telfairi': SpeciesInfo("Lesser hedgehog tenrec", "TENREC"),
     'Electrophorus_electricus': SpeciesInfo("Electric eel", "Ee_SOAP_WITH_SSPACE"),
@@ -182,10 +227,10 @@ def annotations(self) -> Annotations:
     'Gorilla_gorilla': SpeciesInfo("Gorilla", "gorGor4"),
     'Gouania_willdenowi': SpeciesInfo("Blunt-snouted clingfish", "fGouWil2.1"),
     'Haplochromis_burtoni': SpeciesInfo("Burton's mouthbrooder", "AstBur1.0"),
-    'Heterocephalus_glaber_female': SpeciesInfo("Naked mole-rat female", "HetGla_female_1.0"),
-    'Heterocephalus_glaber_male': SpeciesInfo("Naked mole-rat male", "HetGla_1.0"),
+    'Heterocephalus_glaber_female': SpeciesInfo("Naked mole-rat female", "GCA_944319715.1"),
+    'Heterocephalus_glaber_male': SpeciesInfo("Naked mole-rat male", "GCA_944319725.1"),
     'Hippocampus_comes': SpeciesInfo("Tiger tail seahorse", "H_comes_QL1_v1"),
-    'Homo_sapiens': SpeciesInfo("Human", "GRCh38.p13"),
+    'Homo_sapiens': SpeciesInfo("Human", "GRCh38.p14"),
     'Hucho_hucho': SpeciesInfo("Huchen", "ASM331708v1"),
     'Ictalurus_punctatus': SpeciesInfo("Channel catfish", "IpCoco_1.2"),
     'Ictidomys_tridecemlineatus': SpeciesInfo("Squirrel", "SpeTri2.0"),
@@ -277,7 +322,7 @@ def annotations(self) -> Annotations:
     'Ovis_aries': SpeciesInfo("Sheep (texel)", "Oar_v3.1"),
     'Ovis_aries_rambouillet': SpeciesInfo("Sheep", "Oar_rambouillet_v1.0"),
     'Pan_paniscus': SpeciesInfo("Bonobo", "panpan1.1"),
-    'Pan_troglodytes': SpeciesInfo("Chimpanzee", "Pan_tro_3.0"),
+    'Pan_troglodytes': SpeciesInfo("Chimpanzee", "Pan_tro_3.0", alternative_gtf="http://ftp.ensembl.org/pub/release-110/gtf/pan_troglodytes/Pan_troglodytes.Pan_tro_3.0.110.chr.gtf.gz"),
     'Panthera_leo': SpeciesInfo("Lion", "PanLeo1.0"),
     'Panthera_pardus': SpeciesInfo("Leopard", "PanPar1.0"),
     'Panthera_tigris_altaica': SpeciesInfo("Tiger", "PanTig1.0"),
@@ -381,8 +426,10 @@ def annotations(self) -> Annotations:
 
 }
 
-human = species["Homo_sapiens"] # used for faster access to common human genome
-mouse = species["Mus_musculus"] # used for faster access to common mouse genome
+human = species["Homo_sapiens"]  # used for faster access to common human genome
+mouse = species["Mus_musculus"]  # used for faster access to common mouse genome
+gorilla = species["Gorilla_gorilla"]  # used for faster access to common gorilla genome
+chimpanzee = species["Pan_troglodytes"] # used for faster access to common chimpanzee genome
 rat = species["Rattus_norvegicus"] # rats
 cow = species["Bos_taurus"]
 chlamy = species['Chlamydomonas reinhardtii']

diff --git a/genotations/genomes.py b/genotations/genomes.py
@@ -106,6 +106,7 @@ def __init__(self, gtf: Union[Path, str, pl.DataFrame]):
             # if we already have the dataframe (for example in chained calls) than just assign it
             self.annotations_df = gtf
 
+
     def transform(self, fun: Callable[[pl.DataFrame], pl.DataFrame]):
         return Annotations(fun(self.annotations_df))
 
@@ -121,7 +122,7 @@ def read_GTF(self, path: Union[str, Path]) -> pl.DataFrame:
         :return: polards datafra
         """
         att = pl.col("attribute")
-        loaded = pl.read_csv(str(path), has_header=False, comment_char="#", sep="\t",
+        loaded = pl.read_csv(str(path), has_header=False, comment_char="#", separator="\t",
                              new_columns=["seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute"],
                              dtypes={
                                  #sometimes polars makes mistakes in automatic type derivation with some fields

diff --git a/genotations/quantification.py b/genotations/quantification.py
@@ -23,7 +23,7 @@ def read_quant(path: Path, transcripts: bool) -> pl.DataFrame:
     alias = "transcript" if transcripts else "gene"
     gene = pl.col("Name").str.split(".").apply(lambda s: s[0]).alias(alias)
     dtypes={"TPM": pl.datatypes.Float64, "EffectiveLength": pl.datatypes.Float64, "NumReads": pl.datatypes.Float64}
-    return pl.read_csv(path, sep="\t", dtypes = dtypes).with_columns([gene]).select([gene, pl.col("TPM"), pl.col("EffectiveLength"), pl.col("NumReads")])
+    return pl.read_csv(path, separator="\t", dtypes = dtypes).with_columns([gene]).select([gene, pl.col("TPM"), pl.col("EffectiveLength"), pl.col("NumReads")])
 
 
 def quant_from_run(run: Path, name_part: str = "quant.sf", dir_part: str = "quant_") -> Optional[pl.DataFrame]: