Skip to content

Commit

Permalink
fixed alignment and blast search (#80)
Browse files (browse the repository at this point in the history)
* fixed alignment and blast search

* API update

---------

Co-authored-by: sdRDM Bot <[email protected]>
  • Loading branch information
haeussma and sdRDM Bot authored May 25, 2024
1 parent 3ee56e9 commit 42838d1
Show file tree
Hide file tree
Showing 17 changed files with 50 additions and 40 deletions.
2 changes: 1 addition & 1 deletion pyeed/core/abstractannotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class AbstractAnnotation(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/alignmentresult.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class AlignmentResult(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/blastdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ class BlastData(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/clustalomegaresult.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class ClustalOmegaResult(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/dnarecord.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class DNARecord(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/organism.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ class Organism(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/pairwisealignmentresult.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ class PairwiseAlignmentResult(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
20 changes: 10 additions & 10 deletions pyeed/core/proteinrecord.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
from lxml.etree import _Element
from pydantic import PrivateAttr, model_validator
from pydantic_xml import attr, element
from rich.status import Console, Status
from rich.console import Console
from rich.status import Status
from sdRDM.base.listplus import ListPlus
from sdRDM.tools.utils import elem2dict

Expand Down Expand Up @@ -74,7 +75,7 @@ class ProteinRecord(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down Expand Up @@ -144,7 +145,6 @@ def get_id(cls, protein_id: str) -> "ProteinRecord":

import nest_asyncio

from pyeed.fetch.proteinfetcher import ProteinFetcher

nest_asyncio.apply()

Expand All @@ -166,7 +166,6 @@ def get_ids(cls, accession_ids: List[str]) -> List["ProteinRecord"]:

import nest_asyncio

from pyeed.fetch.proteinfetcher import ProteinFetcher

nest_asyncio.apply()

Expand Down Expand Up @@ -203,7 +202,6 @@ def from_sequence(
"""

from pyeed.fetch.blast import BlastProgram
from pyeed.fetch.proteinfetcher import ProteinFetcher

nest_asyncio.apply()

Expand Down Expand Up @@ -244,7 +242,7 @@ def ncbi_blast(
self,
n_hits: int,
e_value: float = 10.0,
database: str = "nr",
db: str = "swissprot",
matrix: str = "BLOSUM62",
identity: float = 0.0,
**kwargs,
Expand All @@ -255,7 +253,7 @@ def ncbi_blast(
Args:
n_hits (int): The number of hits to retrieve.
e_value (float, optional): The maximum E-value threshold for reporting hits. Defaults to 10.0.
database (str, optional): The database to search against. Defaults to "nr".
db (str, optional): The database to search against. Defaults to "swissprot".
matrix (str, optional): The substitution matrix to use. Defaults to "BLOSUM62".
identity (float, optional): The minimum sequence identity threshold for reporting hits. Defaults to 0.0.
**kwargs: Additional keyword arguments.
Expand All @@ -277,7 +275,9 @@ def ncbi_blast(

nest_asyncio.apply()

assert database in NCBIDataBase
assert (
db in NCBIDataBase
), f"Database needs to be one of {NCBIDataBase.__members__.keys()}"

program = BlastProgram.BLASTP.value
executor = ThreadPoolExecutor(max_workers=1)
Expand All @@ -292,12 +292,12 @@ def ncbi_blast(
with Status(
"Running BLAST", console=Console(force_terminal=False, force_jupyter=True)
):
result = asyncio.run(blaster.async_run(database, program, executor))
result = asyncio.run(blaster.async_run(db, program, executor))
clear_output()

accessions = blaster.extract_accession(result)

return asyncio.run(ProteinFetcher(ids=accessions).fetch(force_terminal=False))
return self.get_ids(accessions)

# def blastp(
# self,
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/region.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class Region(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/regionset.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class RegionSet(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_object_terms: Set[str] = PrivateAttr(
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class Sequence(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/sequencerecord.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ class SequenceRecord(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/site.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class Site(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/standardnumbering.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class StandardNumbering(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
8 changes: 3 additions & 5 deletions pyeed/fetch/blast.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging
from concurrent.futures import ThreadPoolExecutor
from enum import Enum, EnumMeta
from typing import List
from typing import List, Optional

from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Blast.Record import Blast as BlastRecord
Expand Down Expand Up @@ -96,11 +96,9 @@ def run(self, program: str, ncbi_db: str) -> io.StringIO:
async def async_run(
self,
ncbi_db: str,
program: str = BlastProgram.BLASTP.value,
foreign_executor: ThreadPoolExecutor = None,
program: str,
foreign_executor: Optional[ThreadPoolExecutor] = None,
) -> io.StringIO:
assert program in BlastProgram
assert ncbi_db in NCBIDataBase

if not foreign_executor:
executor = ThreadPoolExecutor()
Expand Down
34 changes: 23 additions & 11 deletions pyeed/fetch/ncbiproteinmapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
import re
from typing import TYPE_CHECKING, List

from Bio import SeqFeature, SeqIO
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature
from Bio.SeqRecord import SeqRecord
from pydantic import ValidationError

from pyeed.core.annotation import Annotation
from pyeed.core.dnarecord import DNARecord
Expand Down Expand Up @@ -44,9 +46,16 @@ def map(self, responses: List[str]) -> List[ProteinRecord]:

protein_infos = []
for record in seq_records:

protein_info = ProteinRecord(id=record.id, sequence=str(record.seq))

protein_info.organism = Organism(**self.map_organism(record))
try:
protein_info.organism = Organism(**self.map_organism(record))
except ValidationError as e:
LOGGER.error(
f"Error mapping organism for {record.id}: {e.errors()} {e.json()}"
)
continue

protein_info = self.map_protein(record, protein_info)

Expand All @@ -67,7 +76,7 @@ def map_organism(self, seq_record: SeqRecord) -> dict:
"""

feature = self.get_feature(seq_record, "source")
if len(feature) != 1:
if len(feature) < 1:
LOGGER.debug(
f"Multiple features ({len(feature)}) of type `source` found for {seq_record.id}: {feature}"
)
Expand All @@ -78,26 +87,29 @@ def map_organism(self, seq_record: SeqRecord) -> dict:
LOGGER.info(
f"For {seq_record.id} {feature.qualifiers['db_xref']} taxonomy ID(s) were found, using the first one. Skipping organism assignment"
)
return None
return {}

taxonomy_id = feature.qualifiers["db_xref"][0]
try:
taxonomy_id = next(feature for feature in feature.qualifiers["db_xref"] if "taxon" in feature)
if ":" in taxonomy_id:
taxonomy_id = taxonomy_id.split(":")[1]
except StopIteration:
taxonomy_id = None

if ":" in taxonomy_id:
taxonomy_id = int(taxonomy_id.split(":")[1])

except KeyError:
LOGGER.debug(f"No taxonomy ID found for {seq_record.id}: {feature}")
return None
return {}

try:
organism_name = feature.qualifiers["organism"]
except KeyError:
LOGGER.debug(
f"No organism name found for {seq_record.id}: {feature[0].qualifiers}"
)
organism_name = None
organism_name = ""

return {"name": organism_name[0], "taxonomy_id": taxonomy_id}
return {"id": taxonomy_id, "name": organism_name[0], "taxonomy_id": taxonomy_id}

def map_protein(self, seq_record: SeqRecord, protein_info: ProteinRecord):
"""Maps protein data from a `Bio.SeqRecord` to a `ProteinInfo` object."""
Expand Down Expand Up @@ -259,7 +271,7 @@ def get_cds_regions(coded_by: dict) -> List[DNARecord]:

return regions

def get_feature(self, seq_record: SeqRecord, feature_type: str) -> SeqFeature:
def get_feature(self, seq_record: SeqRecord, feature_type: str) -> List[SeqFeature]:
"""Returns a list of features of a given type from a `Bio.SeqRecord` object."""
return [
feature
Expand Down
2 changes: 1 addition & 1 deletion pyeed/fetch/requester.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ async def send_request(self, args: RequestArgs) -> str:
url = args.url

LOGGER.debug(f"Sending request to {url}")
response = await client.get(url, timeout=30)
response = await client.get(url, timeout=120)

LOGGER.debug(f"Received response from {url}. Code: {response.status_code}")

Expand Down

0 comments on commit 42838d1

Please sign in to comment.