Skip to content

Commit

Permalink
Substrate citation dm (#29)
Browse files Browse the repository at this point in the history
* added citation and substrate to ProteinInfo

* removed unused imports

* removed parsing of citation

* fixed UUID database issue

* updated example

* update
  • Loading branch information
haeussma authored Nov 27, 2023
1 parent a45badc commit fe094b9
Show file tree
Hide file tree
Showing 13 changed files with 913 additions and 591 deletions.
149 changes: 89 additions & 60 deletions examples/basics/query_and_blast.ipynb

Large diffs are not rendered by default.

86 changes: 44 additions & 42 deletions examples/test.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -13,61 +13,63 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"aldolase = ProteinInfo.from_ncbi(\"UCS38941.1\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"JAGTRX010000119.1:1..1200\n",
"['JAGTRX010000119.1:1..1200']\n",
"\u001b[4mProteinInfo\u001b[0m\n",
"├── \u001b[94mid\u001b[0m = proteininfo0\n",
"├── \u001b[94msource_id\u001b[0m = MBS1198664.1\n",
"├── \u001b[94mname\u001b[0m = depolymerase family esterase\n",
"├── \u001b[94msequence\u001b[0m = LQDAYVYVPKNAAPAVLGGKRALMLTMHGCGQTASGNVIGTKFNWETTAEQYGMVVVAPTVPSGTTSTRSVSGCWDWFGSAHTRTGRDAVPLKKLLDSVKARTNLDIDPNQIYVTGLSSGGGETIVMGCSFPEYFAGVGINAGPALGSASGDISVEPKVTAAQVASYCKAAATSTYTPYFATQITNAVYGTSDYLVKPNHNVRNIQGMAVVYGMTMGTPVTSSVAGGGTAKVYKDANGKERLSDLAVTGMGHAWPAGGGAGAAYVDTSHVNFPVYITKWFFDNNLRVSGGSTTTTTAATTTTTGGATTTTGAATTTTAAATTTTTRASTTTTTTTTTTTAGACYKTSNYAHVTAGRAYNSLGTAKAKGSNQSMGLNNTFYITKLRMTGTAYYVIDATCP\n",
"├── \u001b[94morganism\u001b[0m\n",
"│ └── \u001b[4mOrganism\u001b[0m\n",
"│ ├── \u001b[94mid\u001b[0m = organism3\n",
"│ ├── \u001b[94mname\u001b[0m = Pseudomonadota bacterium\n",
"│ ├── \u001b[94mtaxonomy_id\u001b[0m = taxon:1977087\n",
"│ ├── \u001b[94mdomain\u001b[0m = Bacteria\n",
"│ ├── \u001b[94mkingdom\u001b[0m = Pseudomonadota\n",
"│ └── \u001b[94mspecies\u001b[0m = bacterium\n",
"└── \u001b[94mcoding_sequence_ref\u001b[0m\n",
" └── \u001b[4mDNARegion\u001b[0m\n",
" ├── \u001b[94mid\u001b[0m = JAGTRX010000119.1\n",
" ├── \u001b[94mspans\u001b[0m\n",
" │ └── 0\n",
" │ └── \u001b[4mSpan\u001b[0m\n",
" │ ├── \u001b[94mid\u001b[0m = span0\n",
" │ ├── \u001b[94mstart\u001b[0m = 1\n",
" │ └── \u001b[94mend\u001b[0m = 1200\n",
" └── \u001b[94mtype\u001b[0m = coding sequence\n",
"\n"
]
"data": {
"text/plain": [
"Substrate(id='substrate0', name='GAPDH', inchi=None, smiles=None, chebi_id=None)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"aldolase = ProteinInfo.from_ncbi(\"MBS1198664.1\")\n",
"print(aldolase)"
"aldolase.add_to_substrates(\"GAPDH\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "'NoneType' object has no attribute 'pblast'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/Users/max/Documents/GitHub/pyeed/examples/test.ipynb Cell 3\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/max/Documents/GitHub/pyeed/examples/test.ipynb#W3sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m blast_results \u001b[39m=\u001b[39m aldolase\u001b[39m.\u001b[39;49mpblast(n_hits\u001b[39m=\u001b[39m\u001b[39m50\u001b[39m, e_value\u001b[39m=\u001b[39m\u001b[39m1e-50\u001b[39m)\n",
"\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'pblast'"
"name": "stdout",
"output_type": "stream",
"text": [
"🏃🏼‍♀️ Running PBLAST\n",
"╭── protein name: TEM1\n",
"├── accession: UCS38941.1\n",
"├── organism: Nakaseomyces glabratus\n",
"├── e-value: 1e-50\n",
"╰── max hits: 50\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"⬇️ Fetching protein sequences: 100%|██████████| 50/50 [00:04<00:00, 11.25it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"🎉 Done\n",
"\n"
]
}
],
Expand Down
987 changes: 507 additions & 480 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions pyEED/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from .proteininfo import ProteinInfo
from .dnainfo import DNAInfo
from .abstractregion import AbstractRegion
from .citation import Citation
from .author import Author
from .substrate import Substrate
from .dnaregion import DNARegion
from .proteinregion import ProteinRegion
from .span import Span
Expand All @@ -16,6 +19,9 @@
"ProteinInfo",
"DNAInfo",
"AbstractRegion",
"Citation",
"Author",
"Substrate",
"DNARegion",
"ProteinRegion",
"Span",
Expand Down
26 changes: 26 additions & 0 deletions pyEED/core/author.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import sdRDM

from typing import Optional
from pydantic import Field
from sdRDM.base.utils import forge_signature, IDGenerator


@forge_signature
class Author(sdRDM.DataModel):
    """Author of a publication, described by given and family name."""

    # When no id is supplied, the default comes from
    # IDGenerator("authorINDEX").
    id: Optional[str] = Field(
        default_factory=IDGenerator("authorINDEX"),
        description="Unique identifier of the given object.",
        xml="@id",
    )

    # Both name parts are optional and default to None.
    given_name: Optional[str] = Field(
        description="Given name of the author",
        default=None,
    )

    family_name: Optional[str] = Field(
        description="Family name of the author",
        default=None,
    )
64 changes: 64 additions & 0 deletions pyEED/core/citation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import sdRDM

from typing import List, Optional
from pydantic import Field
from sdRDM.base.listplus import ListPlus
from sdRDM.base.utils import forge_signature, IDGenerator
from .author import Author


@forge_signature
class Citation(sdRDM.DataModel):
    """Information on publication of the entry 📖"""

    # When no id is supplied, the default comes from
    # IDGenerator("citationINDEX").
    id: Optional[str] = Field(
        description="Unique identifier of the given object.",
        default_factory=IDGenerator("citationINDEX"),
        xml="@id",
    )

    doc_note = None  # placeholder removed below
    del doc_note

    doi: Optional[str] = Field(
        default=None,
        description="DOI of the publication",
    )

    pubmed_id: Optional[str] = Field(
        default=None,
        description="PubMed ID of the publication",
    )

    medline_id: Optional[str] = Field(
        default=None,
        description="Medline ID of the publication",
    )

    year: Optional[int] = Field(
        default=None,
        description="Year of publication",
    )

    # Starts out as an empty ListPlus; populate it via add_to_authors().
    authors: List[Author] = Field(
        description="Authors of the publication",
        default_factory=ListPlus,
        multiple=True,
    )

    def add_to_authors(
        self,
        given_name: Optional[str] = None,
        family_name: Optional[str] = None,
        id: Optional[str] = None,
    ) -> Author:
        """
        Create an 'Author' object and append it to the attribute authors.

        Args:
            given_name (Optional[str]): Given name of the author. Defaults to None.
            family_name (Optional[str]): Family name of the author. Defaults to None.
            id (Optional[str]): Unique identifier of the 'Author' object. When
                None, the id is left to the default factory of 'Author'.

        Returns:
            Author: The last element of ``self.authors`` after the append,
            i.e. the entry that was just added.
        """
        params = {"given_name": given_name, "family_name": family_name}
        # Only forward an explicit id; passing id=None would override the
        # default factory declared on Author.id.
        if id is not None:
            params["id"] = id
        self.authors.append(Author(**params))
        return self.authors[-1]
4 changes: 2 additions & 2 deletions pyEED/core/dnainfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
from pydantic import Field
from sdRDM.base.listplus import ListPlus
from sdRDM.base.utils import forge_signature, IDGenerator
from .organism import Organism
from .dnaregiontype import DNARegionType
from .dnaregion import DNARegion
from .span import Span
from .organism import Organism
from .dnaregiontype import DNARegionType
from ..ncbi.seq_io import get_ncbi_entry, _seqio_to_dna_info


Expand Down
2 changes: 1 addition & 1 deletion pyEED/core/dnaregion.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
from typing import Optional
from pydantic import Field
from sdRDM.base.utils import forge_signature, IDGenerator
from .dnaregiontype import DNARegionType
from .abstractregion import AbstractRegion
from .dnaregiontype import DNARegionType


@forge_signature
Expand Down
45 changes: 41 additions & 4 deletions pyEED/core/proteininfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@
from sdRDM.base.utils import forge_signature, IDGenerator
from Bio.Blast import NCBIWWW, NCBIXML
from pyEED.core.dnainfo import DNAInfo
from .site import Site
from .dnaregion import DNARegion
from .proteinsitetype import ProteinSiteType
from .organism import Organism
from .proteinregion import ProteinRegion
from .dnaregion import DNARegion
from .span import Span
from .citation import Citation
from .substrate import Substrate
from .organism import Organism
from .proteinregiontype import ProteinRegionType
from .site import Site
from ..ncbi.seq_io import _seqio_to_nucleotide_info, get_ncbi_entry, get_ncbi_entrys


Expand Down Expand Up @@ -59,8 +61,8 @@ class ProteinInfo(sdRDM.DataModel):
)

# Use only default_factory: pydantic's Field() rejects a call that passes both
# `default` and `default_factory`, and a `default=DNARegion()` instance would be
# created once at class-definition time instead of once per ProteinInfo.
coding_sequence_ref: Optional[DNARegion] = Field(
    description="Defines the coding sequence of the protein",
    default_factory=DNARegion,
)

ec_number: Optional[str] = Field(
Expand All @@ -73,6 +75,17 @@ class ProteinInfo(sdRDM.DataModel):
description="Calculated molecular weight of the protein",
)

# Substrate entries attached to this protein. Starts out as an empty ListPlus;
# add_to_substrates() appends new 'Substrate' objects to it.
substrates: List[Substrate] = Field(
description="Promiscuous substrates of the protein",
default_factory=ListPlus,
multiple=True,
)

# Publication record for the protein. A new Citation is built through
# default_factory when no value is supplied.
citation: Optional[Citation] = Field(
description="Publication on the protein",
default_factory=Citation,
)

def add_to_regions(
self,
type: Optional[ProteinRegionType] = None,
Expand Down Expand Up @@ -134,6 +147,30 @@ def add_to_sites(
self.sites.append(Site(**params))
return self.sites[-1]

def add_to_substrates(
    self,
    name: Optional[str] = None,
    inchi: Optional[str] = None,
    smiles: Optional[str] = None,
    chebi_id: Optional[str] = None,
    id: Optional[str] = None,
) -> Substrate:
    """
    Create a 'Substrate' object and append it to the attribute substrates.

    Args:
        name (Optional[str]): Name of the substrate. Defaults to None.
        inchi (Optional[str]): InChI code of the substrate. Defaults to None.
        smiles (Optional[str]): SMILES code of the substrate. Defaults to None.
        chebi_id (Optional[str]): ChEBI ID of the substrate. Defaults to None.
        id (Optional[str]): Unique identifier of the 'Substrate' object. When
            None, the id is left to the default factory of 'Substrate'.

    Returns:
        Substrate: The last element of ``self.substrates`` after the append,
        i.e. the entry that was just added.
    """
    params = {"name": name, "inchi": inchi, "smiles": smiles, "chebi_id": chebi_id}
    # Only forward an explicit id; passing id=None would override the
    # default factory declared on Substrate.id.
    if id is not None:
        params["id"] = id
    self.substrates.append(Substrate(**params))
    return self.substrates[-1]

@classmethod
def from_ncbi(cls, accession_id: str) -> "ProteinInfo":
"""
Expand Down
36 changes: 36 additions & 0 deletions pyEED/core/substrate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import sdRDM

from typing import Optional
from pydantic import Field
from sdRDM.base.utils import forge_signature, IDGenerator


@forge_signature
class Substrate(sdRDM.DataModel):
    """Promiscuous substrate of an enzyme 🧪"""

    # When no id is supplied, the default comes from
    # IDGenerator("substrateINDEX").
    id: Optional[str] = Field(
        default_factory=IDGenerator("substrateINDEX"),
        description="Unique identifier of the given object.",
        xml="@id",
    )

    name: Optional[str] = Field(
        description="Name of the substrate",
        default=None,
    )

    # Chemical identifiers; each one is optional and defaults to None.
    inchi: Optional[str] = Field(
        description="InChI code of the substrate",
        default=None,
    )

    smiles: Optional[str] = Field(
        description="SMILES code of the substrate",
        default=None,
    )

    chebi_id: Optional[str] = Field(
        description="ChEBI ID of the substrate",
        default=None,
    )
4 changes: 2 additions & 2 deletions pyEED/ncbi/seq_io.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import re
import secrets
import time
from datetime import datetime
from tqdm import tqdm
from typing import List
from Bio import SeqIO, Entrez
from Bio.SeqFeature import FeatureLocation, CompoundLocation
from pyEED.core.citation import Citation
from pyEED.core.dnaregion import DNARegion
from pyEED.core.dnaregiontype import DNARegionType
from pyEED.core.proteinregion import ProteinRegion
Expand Down
Loading

0 comments on commit fe094b9

Please sign in to comment.