Substrate citation dm (#29)

* added citation and substrate to ProteinInfo * removed unused imports * removed parsing of citation * fixed UUID database issue * updated example * update
PyEED · Nov 27, 2023 · fe094b9 · fe094b9
1 parent a45badc
commit fe094b9
Show file tree

Hide file tree

Showing 13 changed files with 913 additions and 591 deletions.
diff --git a/examples/basics/query_and_blast.ipynb b/examples/basics/query_and_blast.ipynb
diff --git a/examples/test.ipynb b/examples/test.ipynb
@@ -13,61 +13,63 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "aldolase = ProteinInfo.from_ncbi(\"UCS38941.1\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "JAGTRX010000119.1:1..1200\n",
-      "['JAGTRX010000119.1:1..1200']\n",
-      "\u001b[4mProteinInfo\u001b[0m\n",
-      "├── \u001b[94mid\u001b[0m = proteininfo0\n",
-      "├── \u001b[94msource_id\u001b[0m = MBS1198664.1\n",
-      "├── \u001b[94mname\u001b[0m = depolymerase family esterase\n",
-      "├── \u001b[94msequence\u001b[0m = LQDAYVYVPKNAAPAVLGGKRALMLTMHGCGQTASGNVIGTKFNWETTAEQYGMVVVAPTVPSGTTSTRSVSGCWDWFGSAHTRTGRDAVPLKKLLDSVKARTNLDIDPNQIYVTGLSSGGGETIVMGCSFPEYFAGVGINAGPALGSASGDISVEPKVTAAQVASYCKAAATSTYTPYFATQITNAVYGTSDYLVKPNHNVRNIQGMAVVYGMTMGTPVTSSVAGGGTAKVYKDANGKERLSDLAVTGMGHAWPAGGGAGAAYVDTSHVNFPVYITKWFFDNNLRVSGGSTTTTTAATTTTTGGATTTTGAATTTTAAATTTTTRASTTTTTTTTTTTAGACYKTSNYAHVTAGRAYNSLGTAKAKGSNQSMGLNNTFYITKLRMTGTAYYVIDATCP\n",
-      "├── \u001b[94morganism\u001b[0m\n",
-      "│   └── \u001b[4mOrganism\u001b[0m\n",
-      "│       ├── \u001b[94mid\u001b[0m = organism3\n",
-      "│       ├── \u001b[94mname\u001b[0m = Pseudomonadota bacterium\n",
-      "│       ├── \u001b[94mtaxonomy_id\u001b[0m = taxon:1977087\n",
-      "│       ├── \u001b[94mdomain\u001b[0m = Bacteria\n",
-      "│       ├── \u001b[94mkingdom\u001b[0m = Pseudomonadota\n",
-      "│       └── \u001b[94mspecies\u001b[0m = bacterium\n",
-      "└── \u001b[94mcoding_sequence_ref\u001b[0m\n",
-      "    └── \u001b[4mDNARegion\u001b[0m\n",
-      "        ├── \u001b[94mid\u001b[0m = JAGTRX010000119.1\n",
-      "        ├── \u001b[94mspans\u001b[0m\n",
-      "        │   └── 0\n",
-      "        │       └── \u001b[4mSpan\u001b[0m\n",
-      "        │           ├── \u001b[94mid\u001b[0m = span0\n",
-      "        │           ├── \u001b[94mstart\u001b[0m = 1\n",
-      "        │           └── \u001b[94mend\u001b[0m = 1200\n",
-      "        └── \u001b[94mtype\u001b[0m = coding sequence\n",
-      "\n"
-     ]
+     "data": {
+      "text/plain": [
+       "Substrate(id='substrate0', name='GAPDH', inchi=None, smiles=None, chebi_id=None)"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
-    "aldolase = ProteinInfo.from_ncbi(\"MBS1198664.1\")\n",
-    "print(aldolase)"
+    "aldolase.add_to_substrates(\"GAPDH\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
-     "ename": "AttributeError",
-     "evalue": "'NoneType' object has no attribute 'pblast'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
-      "\u001b[1;32m/Users/max/Documents/GitHub/pyeed/examples/test.ipynb Cell 3\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/max/Documents/GitHub/pyeed/examples/test.ipynb#W3sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m blast_results \u001b[39m=\u001b[39m aldolase\u001b[39m.\u001b[39;49mpblast(n_hits\u001b[39m=\u001b[39m\u001b[39m50\u001b[39m, e_value\u001b[39m=\u001b[39m\u001b[39m1e-50\u001b[39m)\n",
-      "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'pblast'"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "🏃🏼‍♀️ Running PBLAST\n",
+      "╭── protein name: TEM1\n",
+      "├── accession: UCS38941.1\n",
+      "├── organism: Nakaseomyces glabratus\n",
+      "├── e-value: 1e-50\n",
+      "╰── max hits: 50\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "⬇️ Fetching protein sequences: 100%|██████████| 50/50 [00:04<00:00, 11.25it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "🎉 Done\n",
+      "\n"
      ]
     }
    ],

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyEED/core/__init__.py b/pyEED/core/__init__.py
@@ -1,6 +1,9 @@
 from .proteininfo import ProteinInfo
 from .dnainfo import DNAInfo
 from .abstractregion import AbstractRegion
+from .citation import Citation
+from .author import Author
+from .substrate import Substrate
 from .dnaregion import DNARegion
 from .proteinregion import ProteinRegion
 from .span import Span
@@ -16,6 +19,9 @@
     "ProteinInfo",
     "DNAInfo",
     "AbstractRegion",
+    "Citation",
+    "Author",
+    "Substrate",
     "DNARegion",
     "ProteinRegion",
     "Span",

diff --git a/pyEED/core/author.py b/pyEED/core/author.py
@@ -0,0 +1,26 @@
+import sdRDM
+
+from typing import Optional
+from pydantic import Field
+from sdRDM.base.utils import forge_signature, IDGenerator
+
+
+@forge_signature
+class Author(sdRDM.DataModel):
+    """"""
+
+    id: Optional[str] = Field(
+        description="Unique identifier of the given object.",
+        default_factory=IDGenerator("authorINDEX"),
+        xml="@id",
+    )
+
+    given_name: Optional[str] = Field(
+        default=None,
+        description="Given name of the author",
+    )
+
+    family_name: Optional[str] = Field(
+        default=None,
+        description="Family name of the author",
+    )
diff --git a/pyEED/core/citation.py b/pyEED/core/citation.py
@@ -0,0 +1,64 @@
+import sdRDM
+
+from typing import List, Optional
+from pydantic import Field
+from sdRDM.base.listplus import ListPlus
+from sdRDM.base.utils import forge_signature, IDGenerator
+from .author import Author
+
+
+@forge_signature
+class Citation(sdRDM.DataModel):
+    """Information on publication of the entry 📖"""
+
+    id: Optional[str] = Field(
+        description="Unique identifier of the given object.",
+        default_factory=IDGenerator("citationINDEX"),
+        xml="@id",
+    )
+
+    doi: Optional[str] = Field(
+        default=None,
+        description="DOI of the publication",
+    )
+
+    pubmed_id: Optional[str] = Field(
+        default=None,
+        description="PubMed ID of the publication",
+    )
+
+    medline_id: Optional[str] = Field(
+        default=None,
+        description="Medline ID of the publication",
+    )
+
+    year: Optional[int] = Field(
+        default=None,
+        description="Year of publication",
+    )
+
+    authors: List[Author] = Field(
+        description="Authors of the publication",
+        default_factory=ListPlus,
+        multiple=True,
+    )
+
+    def add_to_authors(
+        self,
+        given_name: Optional[str] = None,
+        family_name: Optional[str] = None,
+        id: Optional[str] = None,
+    ) -> None:
+        """
+        This method adds an object of type 'Author' to attribute authors
+
+        Args:
+            id (str): Unique identifier of the 'Author' object. Defaults to 'None'.
+            given_name (): Given name of the author. Defaults to None
+            family_name (): Family name of the author. Defaults to None
+        """
+        params = {"given_name": given_name, "family_name": family_name}
+        if id is not None:
+            params["id"] = id
+        self.authors.append(Author(**params))
+        return self.authors[-1]
diff --git a/pyEED/core/dnainfo.py b/pyEED/core/dnainfo.py
@@ -4,10 +4,10 @@
 from pydantic import Field
 from sdRDM.base.listplus import ListPlus
 from sdRDM.base.utils import forge_signature, IDGenerator
-from .organism import Organism
-from .dnaregiontype import DNARegionType
 from .dnaregion import DNARegion
 from .span import Span
+from .organism import Organism
+from .dnaregiontype import DNARegionType
 from ..ncbi.seq_io import get_ncbi_entry, _seqio_to_dna_info
 
 

diff --git a/pyEED/core/dnaregion.py b/pyEED/core/dnaregion.py
@@ -2,8 +2,8 @@
 from typing import Optional
 from pydantic import Field
 from sdRDM.base.utils import forge_signature, IDGenerator
-from .dnaregiontype import DNARegionType
 from .abstractregion import AbstractRegion
+from .dnaregiontype import DNARegionType
 
 
 @forge_signature

diff --git a/pyEED/core/proteininfo.py b/pyEED/core/proteininfo.py
@@ -6,13 +6,15 @@
 from sdRDM.base.utils import forge_signature, IDGenerator
 from Bio.Blast import NCBIWWW, NCBIXML
 from pyEED.core.dnainfo import DNAInfo
-from .site import Site
+from .dnaregion import DNARegion
 from .proteinsitetype import ProteinSiteType
-from .organism import Organism
 from .proteinregion import ProteinRegion
-from .dnaregion import DNARegion
 from .span import Span
+from .citation import Citation
+from .substrate import Substrate
+from .organism import Organism
 from .proteinregiontype import ProteinRegionType
+from .site import Site
 from ..ncbi.seq_io import _seqio_to_nucleotide_info, get_ncbi_entry, get_ncbi_entrys
 
 
@@ -59,8 +61,8 @@ class ProteinInfo(sdRDM.DataModel):
     )
 
     coding_sequence_ref: Optional[DNARegion] = Field(
-        default=DNARegion(),
         description="Defines the coding sequence of the protein",
+        default_factory=DNARegion,
     )
 
     ec_number: Optional[str] = Field(
@@ -73,6 +75,17 @@ class ProteinInfo(sdRDM.DataModel):
         description="Calculated molecular weight of the protein",
     )
 
+    substrates: List[Substrate] = Field(
+        description="Promiscuous substrates of the protein",
+        default_factory=ListPlus,
+        multiple=True,
+    )
+
+    citation: Optional[Citation] = Field(
+        description="Publication on the protein",
+        default_factory=Citation,
+    )
+
     def add_to_regions(
         self,
         type: Optional[ProteinRegionType] = None,
@@ -134,6 +147,30 @@ def add_to_sites(
         self.sites.append(Site(**params))
         return self.sites[-1]
 
+    def add_to_substrates(
+        self,
+        name: Optional[str] = None,
+        inchi: Optional[str] = None,
+        smiles: Optional[str] = None,
+        chebi_id: Optional[str] = None,
+        id: Optional[str] = None,
+    ) -> None:
+        """
+        This method adds an object of type 'Substrate' to attribute substrates
+
+        Args:
+            id (str): Unique identifier of the 'Substrate' object. Defaults to 'None'.
+            name (): Name of the substrate. Defaults to None
+            inchi (): InChI code of the substrate. Defaults to None
+            smiles (): SMILES code of the substrate. Defaults to None
+            chebi_id (): ChEBI ID of the substrate. Defaults to None
+        """
+        params = {"name": name, "inchi": inchi, "smiles": smiles, "chebi_id": chebi_id}
+        if id is not None:
+            params["id"] = id
+        self.substrates.append(Substrate(**params))
+        return self.substrates[-1]
+
     @classmethod
     def from_ncbi(cls, accession_id: str) -> "ProteinInfo":
         """

diff --git a/pyEED/core/substrate.py b/pyEED/core/substrate.py
@@ -0,0 +1,36 @@
+import sdRDM
+
+from typing import Optional
+from pydantic import Field
+from sdRDM.base.utils import forge_signature, IDGenerator
+
+
+@forge_signature
+class Substrate(sdRDM.DataModel):
+    """Promiscuous substrate of an enzyme 🧪"""
+
+    # Identifier of the object; when the caller supplies none, the default is
+    # produced by IDGenerator("substrateINDEX").
+    # NOTE(review): xml="@id" presumably maps this field to an XML attribute
+    # in sdRDM's serialization; confirm against the sdRDM documentation.
+    id: Optional[str] = Field(
+        description="Unique identifier of the given object.",
+        default_factory=IDGenerator("substrateINDEX"),
+        xml="@id",
+    )
+
+    # All descriptive fields below are optional strings that default to None.
+    name: Optional[str] = Field(
+        default=None,
+        description="Name of the substrate",
+    )
+
+    inchi: Optional[str] = Field(
+        default=None,
+        description="InChI code of the substrate",
+    )
+
+    smiles: Optional[str] = Field(
+        default=None,
+        description="SMILES code of the substrate",
+    )
+
+    chebi_id: Optional[str] = Field(
+        default=None,
+        description="ChEBI ID of the substrate",
+    )
diff --git a/pyEED/ncbi/seq_io.py b/pyEED/ncbi/seq_io.py
@@ -1,10 +1,10 @@
 import re
 import secrets
-import time
+from datetime import datetime
 from tqdm import tqdm
 from typing import List
 from Bio import SeqIO, Entrez
-from Bio.SeqFeature import FeatureLocation, CompoundLocation
+from pyEED.core.citation import Citation
 from pyEED.core.dnaregion import DNARegion
 from pyEED.core.dnaregiontype import DNARegionType
 from pyEED.core.proteinregion import ProteinRegion