Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Substrate citation dm #29

Merged
merged 6 commits into from
Nov 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 89 additions & 60 deletions examples/basics/query_and_blast.ipynb

Large diffs are not rendered by default.

86 changes: 44 additions & 42 deletions examples/test.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -13,61 +13,63 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"aldolase = ProteinInfo.from_ncbi(\"UCS38941.1\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"JAGTRX010000119.1:1..1200\n",
"['JAGTRX010000119.1:1..1200']\n",
"\u001b[4mProteinInfo\u001b[0m\n",
"├── \u001b[94mid\u001b[0m = proteininfo0\n",
"├── \u001b[94msource_id\u001b[0m = MBS1198664.1\n",
"├── \u001b[94mname\u001b[0m = depolymerase family esterase\n",
"├── \u001b[94msequence\u001b[0m = LQDAYVYVPKNAAPAVLGGKRALMLTMHGCGQTASGNVIGTKFNWETTAEQYGMVVVAPTVPSGTTSTRSVSGCWDWFGSAHTRTGRDAVPLKKLLDSVKARTNLDIDPNQIYVTGLSSGGGETIVMGCSFPEYFAGVGINAGPALGSASGDISVEPKVTAAQVASYCKAAATSTYTPYFATQITNAVYGTSDYLVKPNHNVRNIQGMAVVYGMTMGTPVTSSVAGGGTAKVYKDANGKERLSDLAVTGMGHAWPAGGGAGAAYVDTSHVNFPVYITKWFFDNNLRVSGGSTTTTTAATTTTTGGATTTTGAATTTTAAATTTTTRASTTTTTTTTTTTAGACYKTSNYAHVTAGRAYNSLGTAKAKGSNQSMGLNNTFYITKLRMTGTAYYVIDATCP\n",
"├── \u001b[94morganism\u001b[0m\n",
"│ └── \u001b[4mOrganism\u001b[0m\n",
"│ ├── \u001b[94mid\u001b[0m = organism3\n",
"│ ├── \u001b[94mname\u001b[0m = Pseudomonadota bacterium\n",
"│ ├── \u001b[94mtaxonomy_id\u001b[0m = taxon:1977087\n",
"│ ├── \u001b[94mdomain\u001b[0m = Bacteria\n",
"│ ├── \u001b[94mkingdom\u001b[0m = Pseudomonadota\n",
"│ └── \u001b[94mspecies\u001b[0m = bacterium\n",
"└── \u001b[94mcoding_sequence_ref\u001b[0m\n",
" └── \u001b[4mDNARegion\u001b[0m\n",
" ├── \u001b[94mid\u001b[0m = JAGTRX010000119.1\n",
" ├── \u001b[94mspans\u001b[0m\n",
" │ └── 0\n",
" │ └── \u001b[4mSpan\u001b[0m\n",
" │ ├── \u001b[94mid\u001b[0m = span0\n",
" │ ├── \u001b[94mstart\u001b[0m = 1\n",
" │ └── \u001b[94mend\u001b[0m = 1200\n",
" └── \u001b[94mtype\u001b[0m = coding sequence\n",
"\n"
]
"data": {
"text/plain": [
"Substrate(id='substrate0', name='GAPDH', inchi=None, smiles=None, chebi_id=None)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"aldolase = ProteinInfo.from_ncbi(\"MBS1198664.1\")\n",
"print(aldolase)"
"aldolase.add_to_substrates(\"GAPDH\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "'NoneType' object has no attribute 'pblast'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/Users/max/Documents/GitHub/pyeed/examples/test.ipynb Cell 3\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/max/Documents/GitHub/pyeed/examples/test.ipynb#W3sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m blast_results \u001b[39m=\u001b[39m aldolase\u001b[39m.\u001b[39;49mpblast(n_hits\u001b[39m=\u001b[39m\u001b[39m50\u001b[39m, e_value\u001b[39m=\u001b[39m\u001b[39m1e-50\u001b[39m)\n",
"\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'pblast'"
"name": "stdout",
"output_type": "stream",
"text": [
"🏃🏼‍♀️ Running PBLAST\n",
"╭── protein name: TEM1\n",
"├── accession: UCS38941.1\n",
"├── organism: Nakaseomyces glabratus\n",
"├── e-value: 1e-50\n",
"╰── max hits: 50\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"⬇️ Fetching protein sequences: 100%|██████████| 50/50 [00:04<00:00, 11.25it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"🎉 Done\n",
"\n"
]
}
],
Expand Down
987 changes: 507 additions & 480 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions pyEED/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from .proteininfo import ProteinInfo
from .dnainfo import DNAInfo
from .abstractregion import AbstractRegion
from .citation import Citation
from .author import Author
from .substrate import Substrate
from .dnaregion import DNARegion
from .proteinregion import ProteinRegion
from .span import Span
Expand All @@ -16,6 +19,9 @@
"ProteinInfo",
"DNAInfo",
"AbstractRegion",
"Citation",
"Author",
"Substrate",
"DNARegion",
"ProteinRegion",
"Span",
Expand Down
26 changes: 26 additions & 0 deletions pyEED/core/author.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import sdRDM

from typing import Optional
from pydantic import Field
from sdRDM.base.utils import forge_signature, IDGenerator


@forge_signature
class Author(sdRDM.DataModel):
    """Author of a publication, described by given and family name."""

    # Filled by the "authorINDEX" generator when no id is supplied.
    # NOTE(review): xml="@id" presumably maps this field to an XML attribute
    # on export — confirm against sdRDM.
    id: Optional[str] = Field(
        xml="@id",
        default_factory=IDGenerator("authorINDEX"),
        description="Unique identifier of the given object.",
    )

    # Both name parts are optional and default to None.
    given_name: Optional[str] = Field(
        description="Given name of the author",
        default=None,
    )

    family_name: Optional[str] = Field(
        description="Family name of the author",
        default=None,
    )
64 changes: 64 additions & 0 deletions pyEED/core/citation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import sdRDM

from typing import List, Optional
from pydantic import Field
from sdRDM.base.listplus import ListPlus
from sdRDM.base.utils import forge_signature, IDGenerator
from .author import Author


@forge_signature
class Citation(sdRDM.DataModel):
    """Information on publication of the entry 📖

    Holds the publication identifiers (DOI, PubMed ID, Medline ID), the year
    of publication and the list of authors.
    """

    id: Optional[str] = Field(
        description="Unique identifier of the given object.",
        default_factory=IDGenerator("citationINDEX"),
        xml="@id",
    )

    doi: Optional[str] = Field(
        default=None,
        description="DOI of the publication",
    )

    pubmed_id: Optional[str] = Field(
        default=None,
        description="PubMed ID of the publication",
    )

    medline_id: Optional[str] = Field(
        default=None,
        description="Medline ID of the publication",
    )

    year: Optional[int] = Field(
        default=None,
        description="Year of publication",
    )

    # Starts out as an empty ListPlus; populated via add_to_authors().
    authors: List[Author] = Field(
        description="Authors of the publication",
        default_factory=ListPlus,
        multiple=True,
    )

    def add_to_authors(
        self,
        given_name: Optional[str] = None,
        family_name: Optional[str] = None,
        id: Optional[str] = None,
    ) -> Author:
        """Create an 'Author' object and append it to the ``authors`` attribute.

        Args:
            given_name (Optional[str]): Given name of the author. Defaults to None.
            family_name (Optional[str]): Family name of the author. Defaults to None.
            id (Optional[str]): Unique identifier of the 'Author' object. When
                None, no id is forwarded and the default of 'Author' applies.

        Returns:
            Author: The author object that was just appended to ``authors``.
        """
        params = {"given_name": given_name, "family_name": family_name}
        # Forward the id only when explicitly given, so that an omitted id
        # falls back to the default defined on Author instead of being None.
        if id is not None:
            params["id"] = id
        self.authors.append(Author(**params))
        return self.authors[-1]
4 changes: 2 additions & 2 deletions pyEED/core/dnainfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
from pydantic import Field
from sdRDM.base.listplus import ListPlus
from sdRDM.base.utils import forge_signature, IDGenerator
from .organism import Organism
from .dnaregiontype import DNARegionType
from .dnaregion import DNARegion
from .span import Span
from .organism import Organism
from .dnaregiontype import DNARegionType
from ..ncbi.seq_io import get_ncbi_entry, _seqio_to_dna_info


Expand Down
2 changes: 1 addition & 1 deletion pyEED/core/dnaregion.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
from typing import Optional
from pydantic import Field
from sdRDM.base.utils import forge_signature, IDGenerator
from .dnaregiontype import DNARegionType
from .abstractregion import AbstractRegion
from .dnaregiontype import DNARegionType


@forge_signature
Expand Down
45 changes: 41 additions & 4 deletions pyEED/core/proteininfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@
from sdRDM.base.utils import forge_signature, IDGenerator
from Bio.Blast import NCBIWWW, NCBIXML
from pyEED.core.dnainfo import DNAInfo
from .site import Site
from .dnaregion import DNARegion
from .proteinsitetype import ProteinSiteType
from .organism import Organism
from .proteinregion import ProteinRegion
from .dnaregion import DNARegion
from .span import Span
from .citation import Citation
from .substrate import Substrate
from .organism import Organism
from .proteinregiontype import ProteinRegionType
from .site import Site
from ..ncbi.seq_io import _seqio_to_nucleotide_info, get_ncbi_entry, get_ncbi_entrys


Expand Down Expand Up @@ -59,8 +61,8 @@ class ProteinInfo(sdRDM.DataModel):
)

coding_sequence_ref: Optional[DNARegion] = Field(
default=DNARegion(),
description="Defines the coding sequence of the protein",
default_factory=DNARegion,
)

ec_number: Optional[str] = Field(
Expand All @@ -73,6 +75,17 @@ class ProteinInfo(sdRDM.DataModel):
description="Calculated molecular weight of the protein",
)

substrates: List[Substrate] = Field(
description="Promiscuous substrates of the protein",
default_factory=ListPlus,
multiple=True,
)

citation: Optional[Citation] = Field(
description="Publication on the protein",
default_factory=Citation,
)

def add_to_regions(
self,
type: Optional[ProteinRegionType] = None,
Expand Down Expand Up @@ -134,6 +147,30 @@ def add_to_sites(
self.sites.append(Site(**params))
return self.sites[-1]

def add_to_substrates(
    self,
    name: Optional[str] = None,
    inchi: Optional[str] = None,
    smiles: Optional[str] = None,
    chebi_id: Optional[str] = None,
    id: Optional[str] = None,
) -> Substrate:
    """Create a 'Substrate' object and append it to the ``substrates`` attribute.

    Args:
        name (Optional[str]): Name of the substrate. Defaults to None.
        inchi (Optional[str]): InChI code of the substrate. Defaults to None.
        smiles (Optional[str]): SMILES code of the substrate. Defaults to None.
        chebi_id (Optional[str]): ChEBI ID of the substrate. Defaults to None.
        id (Optional[str]): Unique identifier of the 'Substrate' object. When
            None, no id is forwarded and the default of 'Substrate' applies.

    Returns:
        Substrate: The substrate object that was just appended to
        ``substrates``.
    """
    params = {"name": name, "inchi": inchi, "smiles": smiles, "chebi_id": chebi_id}
    # Forward the id only when explicitly given, so that an omitted id
    # falls back to the default defined on Substrate instead of being None.
    if id is not None:
        params["id"] = id
    self.substrates.append(Substrate(**params))
    return self.substrates[-1]

@classmethod
def from_ncbi(cls, accession_id: str) -> "ProteinInfo":
"""
Expand Down
36 changes: 36 additions & 0 deletions pyEED/core/substrate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import sdRDM

from typing import Optional
from pydantic import Field
from sdRDM.base.utils import forge_signature, IDGenerator


@forge_signature
class Substrate(sdRDM.DataModel):
    """Promiscuous substrate of an enzyme 🧪

    Identified by a name and, optionally, by InChI, SMILES and ChEBI ID.
    """

    # Filled by the "substrateINDEX" generator when no id is supplied.
    # NOTE(review): xml="@id" presumably maps this field to an XML attribute
    # on export — confirm against sdRDM.
    id: Optional[str] = Field(
        xml="@id",
        default_factory=IDGenerator("substrateINDEX"),
        description="Unique identifier of the given object.",
    )

    name: Optional[str] = Field(
        description="Name of the substrate",
        default=None,
    )

    # Chemical identifiers; all optional and defaulting to None.
    inchi: Optional[str] = Field(
        description="InChI code of the substrate",
        default=None,
    )

    smiles: Optional[str] = Field(
        description="SMILES code of the substrate",
        default=None,
    )

    chebi_id: Optional[str] = Field(
        description="ChEBI ID of the substrate",
        default=None,
    )
4 changes: 2 additions & 2 deletions pyEED/ncbi/seq_io.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import re
import secrets
import time
from datetime import datetime
from tqdm import tqdm
from typing import List
from Bio import SeqIO, Entrez
from Bio.SeqFeature import FeatureLocation, CompoundLocation
from pyEED.core.citation import Citation
from pyEED.core.dnaregion import DNARegion
from pyEED.core.dnaregiontype import DNARegionType
from pyEED.core.proteinregion import ProteinRegion
Expand Down
Loading
Loading