Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dna fetcher #90

Merged
merged 10 commits into from
Aug 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/quick_start/basics.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,14 @@ Besides adding sequence information manually, PyEED also allows searching for se
=== "DNA"

``` py
# Not implemented
dna_record = DNARecord.get_id('AF188200.1')
```

## ⬇️ Save a sequence

### To file

The sequence can be stored in a `FASTA`, `JSON`, `YAML`, or `XML` file format. Therefore, the respective method can be used.
The sequence can be stored in a `FASTA`, `JSON`, `YAML`, or `XML` file format. Therefore, the respective method can be used. The methods below are written for protein, but can be used the same for `dna_record`
The file path is passed as an argument to the method.

=== "JSON"
Expand Down
39 changes: 30 additions & 9 deletions docs/quick_start/blast.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,36 @@ NCBI offers a web interface for blasting. With PyEED this can be programmaticall
- matrix (str): The matrix to use for the search. The default is `BLOSUM62`.
- identity (float): The minimum identity percentage for the search. The default is `0.0`.

``` py
from pyeed.core import ProteinRecord

# Create a ProteinInfo object
protein = ProteinRecord.get_id("UCS38941.1")

# Perform a BLAST search
blast_results = protein.ncbi_blast()
```
!!! info "NCBI BLAST performance"
=== "Protein"

``` py
from pyeed.core import ProteinRecord

# Create a ProteinInfo object
protein = ProteinRecord.get_id("UCS38941.1")

# Perform a BLAST search
blast_results = protein.ncbi_blast()
```
!!! info "NCBI BLAST performance"

Due to server-side limitations of NCBI, BLAST searches might be slowed down or even be blocked, if multiple searches are performed in a short time.

=== "DNA"

``` py
from pyeed.core import DNARecord

# Create a dna record object
dna_record = DNARecord.get_id('AF188200.1')

# Perform the blast search
similar_dna_records = dna_record.ncbi_blast(n_hits=100, db="nt")

```
!!! info "NCBI BLAST performance"

Due to server-side limitations of NCBI, BLAST searches might be slowed down or even be blocked, if multiple searches are performed in a short time.

Due to server-side limitations of NCBI, BLAST searches might be slowed down or even be blocked, if multiple searches are performed in a short time.
129 changes: 127 additions & 2 deletions pyeed/core/dnarecord.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,23 @@
from typing import Dict, Optional
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, Optional, List
from uuid import uuid4

from lxml.etree import _Element
from pydantic import PrivateAttr, model_validator
from pydantic_xml import attr, element
from rich.status import Status
from rich.console import Console
from sdRDM.base.listplus import ListPlus
from IPython.display import clear_output
from sdRDM.tools.utils import elem2dict
import asyncio
import warnings

from .sequencerecord import SequenceRecord
from pyeed.fetch.blast import Blast, BlastProgram
from pyeed.fetch.dnafetcher import DNAFetcher


from pyeed.core.sequencerecord import SequenceRecord


class DNARecord(
Expand Down Expand Up @@ -48,3 +58,118 @@ def _parse_raw_xml_data(self):
self._raw_xml_data[attr] = elem2dict(value)

return self


@classmethod
def get_id(cls, dna_id: str) -> "DNARecord":
"""
This method creates a 'DNARecord' object from a given DNA accession ID.

Args:
dna_id (str): ID of the DNA in NCBI database.

Returns:
DNARecord: 'DNARecord' with information of the corresponding dna_id.
"""

import nest_asyncio

nest_asyncio.apply()

if isinstance(dna_id, list) and all(isinstance(x, str) for x in dna_id):
warnings.warn("For getting multiple sequences by ID use `get_ids` instead.")
return cls.get_ids(dna_id)

sequences = asyncio.run(DNAFetcher(ids=[dna_id]).fetch(quiet=False))[0]
clear_output()
return sequences

@classmethod
def get_ids(cls, accession_ids: List[str]) -> List["DNARecord"]:
"""Creates a list of 'DNARecord' objects from a list of dna accession IDs.

Returns:
List[DNARecord]: A list of 'DNARecord' objects representing the dna sequences found.
"""

import nest_asyncio

nest_asyncio.apply()

return asyncio.run(
DNAFetcher(ids=accession_ids).fetch(force_terminal=False)
)

def ncbi_blast(
self,
n_hits: int,
e_value: float = 10.0,
db: str = "nt",
identity: float = 0.0,
**kwargs,
) -> List["DNARecord"]:
"""
Runs a BLAST search using the NCBI BLAST service to find similar dna sequences.

Args:
n_hits (int): The number of hits to retrieve.
e_value (float, optional): The maximum E-value threshold for reporting hits. Defaults to 10.0.
db (str, optional): The database to search against. Defaults to "nt".
match/mismatch (int, optional): Match/mismatch score. Defaults to 1/-2.
gap_cost (int, optional): Cost to open a gap. Default is 0 and means linear
identity (float, optional): The minimum sequence identity threshold for reporting hits. Defaults to 0.0.
**kwargs: Additional keyword arguments.

Returns:
List[DNARecord]: A list of DNARecord objects representing the similar dna sequences found.

Raises:
AssertionError: If the specified database is not supported.

Example:
dna_info = DNARecord()
similar_proteins = dna_info.ncbi_blast(n_hits=10, e_value=0.001, database="nt")
"""

import nest_asyncio

from pyeed.fetch.blast import NCBIDataBase

nest_asyncio.apply()

assert (
db in NCBIDataBase
), f"Database needs to be one of {NCBIDataBase.__members__.keys()}"

program = BlastProgram.BLASTN.value
executor = ThreadPoolExecutor(max_workers=1)
blaster = Blast(
query=self.sequence,
n_hits=n_hits,
evalue=e_value,
identity=identity,
)


with Status(
"Running BLAST", console=Console(force_terminal=False, force_jupyter=False)
):
result = asyncio.run(blaster.async_run(db, program, executor))
clear_output()

# result = blaster.run(program, db)


accessions = blaster.extract_accession(result)

return self.get_ids(accessions)


if __name__ == "__main__":
dna_record = DNARecord.get_id('AF188200.1')

print(f"DNA Record: {dna_record}")

print('Running blast...')
similar_dna_records = dna_record.ncbi_blast(n_hits=100, db="nt")
print(similar_dna_records)
11 changes: 8 additions & 3 deletions pyeed/core/proteinrecord.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,10 +335,15 @@ def ncbi_blast(
# return protein_infos

def get_dna(self):
if not self.coding_sequence_ref:
return
try:
if not self.coding_sequence:
return

return DNARecord.get_id(self.coding_sequence[0].id)

return DNARecord.from_ncbi(self.coding_sequence_ref.id)
except Exception as e:
print('The DNA sequence could not be retrieved. The error is: ', e)
return

def _nblast(sequence: str, n_hits: int = None) -> List["ProteinRecord"]:
# blast_record = NCBIXML.read(result_handle)
Expand Down
34 changes: 22 additions & 12 deletions pyeed/fetch/blast.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class NCBIDataBase(BaseEnum):
UNIPROTKB = "swissprot"
PDB = "pdb"
REFSEQ = "refseq_protein"
NT = "nt"


class SubstitutionMatrix(BaseEnum):
Expand Down Expand Up @@ -84,14 +85,26 @@ def run(self, program: str, ncbi_db: str) -> io.StringIO:
ncbi_db in NCBIDataBase
), f"Invalid database: {ncbi_db}, valid databases: {NCBIDataBase}"

return NCBIWWW.qblast(
program,
ncbi_db,
self.query,
expect=self.evalue,
matrix_name=self.matrix,
hitlist_size=self.n_hits,
)
if program == BlastProgram.BLASTP.value:

return NCBIWWW.qblast(
program,
ncbi_db,
self.query,
expect=self.evalue,
matrix_name=self.matrix,
hitlist_size=self.n_hits,
)

elif program == BlastProgram.BLASTN.value:

return NCBIWWW.qblast(
program=program,
database=ncbi_db,
sequence=self.query,
expect=self.evalue,
hitlist_size=self.n_hits,
)

async def async_run(
self,
Expand Down Expand Up @@ -146,12 +159,9 @@ async def main():
with Status("Running BLAST"):
result = await blast_task

parsed_result = blast.parse(result)
print(parsed_result)

executor.shutdown()

return parsed_result
return result

results = asyncio.run(main())

Expand Down
92 changes: 92 additions & 0 deletions pyeed/fetch/dnafetcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import asyncio
import json
import logging
from typing import List

import nest_asyncio
from rich.console import Console
from rich.progress import Progress

from pyeed.fetch.requester import AsyncRequester, AsyncParamRequester

from .ncbidnamapper import NCBIDNAMapper

LOGGER = logging.getLogger(__name__)


class DNAFetcher:
def __init__(self, ids: List[str]):
self.ids = ids
nest_asyncio.apply()

async def fetch(self, **console_kwargs):
"""
Fetches DNA data from various databases based on the provided IDs.

Parameters:
force_terminal (bool): Whether to force the use of a terminal
for progress tracking.

Returns:
List[dnarecord]: A list of dnaRecord objects containing the fetched dna data.

Raises:
Exception: If there is an error during the fetching process.

"""
# right now in the first batch version we just fetch from NCBI
param_requester = None

with Progress(
console=Console(**console_kwargs),
) as progress:
requesters: List[AsyncRequester] = []

#
task_id = progress.add_task(
f"Requesting sequences from NCBI...", total=len(self.ids)
)
requesters.append(
AsyncRequester(
ids=self.ids,
url="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&retmode=text&rettype=genbank&id=",
task_id=task_id,
progress=progress,
batch_size=10,
rate_limit=2,
n_concurrent=5,
)
)


responses = await asyncio.gather(
*[requester.make_request() for requester in requesters]
)


# in case of multiple databases, identify the source of the data
ncbi_responses, uniprot_response = self.identify_data_source(responses)

# map data to objects
ncbi_entries = NCBIDNAMapper().map(responses=ncbi_responses)

return ncbi_entries


def identify_data_source(self, responses: List[str]) -> tuple:
"""
Identifies the source of the data based on the response content.
"""
ncbi_responses = []
uniprot_response = []

for response in responses:
if response[0].startswith("LOCUS"):
ncbi_responses.append(response)
else:
uniprot_response.append(response)

return ncbi_responses, uniprot_response



Loading
Loading