Skip to content

Commit

Permalink
Fix basics (#73)
Browse files Browse the repository at this point in the history
* refactored pairwise aligner

* API update

* renaimed pairwise_aligner in pairwise

* tested multipairwise aligner

* added docstring

* API update

---------

Co-authored-by: sdRDM Bot <[email protected]>
  • Loading branch information
haeussma and sdRDM Bot authored May 10, 2024
1 parent db538e9 commit ab557ad
Show file tree
Hide file tree
Showing 22 changed files with 13,091 additions and 409 deletions.
12,853 changes: 12,663 additions & 190 deletions examples/basics.ipynb

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyeed/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@

from .core.abstractannotation import AbstractAnnotation
from .core.alignmentdata import AlignmentData
from .core.alignmentresult import AlignmentResult
from .core.annotation import Annotation
from .core.blastdata import BlastData
from .core.clustalomegadata import ClustalOmegaData
from .core.clustalomegaresult import ClustalOmegaResult
from .core.cluster import Cluster
from .core.dnarecord import DNARecord
from .core.ontology import Ontology
from .core.organism import Organism
from .core.pairwisealignment import PairwiseAlignment
from .core.pairwisealignmentresult import PairwiseAlignmentResult
from .core.proteinrecord import ProteinRecord
from .core.region import Region
from .core.sequence import Sequence
Expand Down
196 changes: 126 additions & 70 deletions pyeed/align/pairwise.py
Original file line number Diff line number Diff line change
@@ -1,84 +1,99 @@
from typing import TYPE_CHECKING
from itertools import combinations
from typing import Dict, List

from Bio.Align import Alignment as Alignment
from Bio.Align import PairwiseAligner as BioPairwiseAligner
from pydantic import Field

from pyeed.align import AbstractAligner

if TYPE_CHECKING:
from Bio.Align import Alignment as BioAlignment
from Bio.Align.substitution_matrices import Array as BioSubstitutionMatrix

# ----------------------------------------------------------------------------------------------------------------------
# THIS IS THE OLD ALINGER CLASS - SHOULD BE REMOVED AT SOME POINT
# ----------------------------------------------------------------------------------------------------------------------

class PairwiseAligner(AbstractAligner):

mode: str = Field(
description="Alignment mode",
default="global",
)

match: int = Field(
description="Score of a match",
default=1,
)
mismatch: int = Field(
description="Score of a mismatch",
default=-1,
)
gap_open: int = Field(
description="Gap open cost",
default=-1,
)
gap_extend: int = Field(
description="Gap extend cost",
default=0,
)
substitution_matrix: str = Field(
description="Substitution matrix name",
default="None",
)

"""
@validator("mode")
def mode_validator(cls, mode):
modes = ["global", "local"]
if mode not in modes:
raise ValueError(
f"Invalid alignment mode: {mode}. Valid modes are: {modes}"
)
from Bio.Align.substitution_matrices import Array as BioSubstitutionMatrix
from joblib import Parallel, cpu_count, delayed
from rich.progress import Progress


class PairwiseAligner:
def __init__(
self,
mode: str = "global",
match: int = 1,
mismatch: int = -1,
gap_open: int = -1,
gap_exted: int = 0,
substitution_matrix: str = "None",
) -> None:
self.mode = mode
self.match = match
self.mismatch = mismatch
self.gap_open = gap_open
self.gap_extend = gap_exted
self.substitution_matrix = substitution_matrix

def _align(
self,
seq1: Dict[str, str],
seq2: Dict[str, str],
) -> Alignment:
"""Aligns two sequences and returns the alignment results.
Args:
seq1 (Dict[str, str]): Sequence 1 to align. Key is the sequence ID.
seq2 (Dict[str, str]): Sequence 2 to align. Key is the sequence ID.
progress (Progress, optional): `Rich` progress. Defaults to None.
task_id (TaskID, optional): `Rich` task_id. Defaults to None.
return mode
Returns:
Alignment: The alignment results.
"""

@validator("substitution_matrix")
def substitution_matrix_validator(cls, substitution_matrix):
matrices = ["BLOSUM62", "BLOSUM45", "BLOSUM80", "PAM250", "PAM30", "PAM70"]
aligner = self._get_aligner()

if substitution_matrix not in matrices:
raise ValueError(
f"Invalid substitution matrix: {substitution_matrix}. Available matrices are: {matrices}"
)
results = aligner.align(list(seq1.values())[0], list(seq2.values())[0])

return substitution_matrix
return results[0]

"""
def align_pairwise(
self,
seq1: Dict[str, str],
seq2: Dict[str, str],
) -> dict:
"""Aligns two sequences and returns the alignment results.
def align(self) -> "BioAlignment":
"""
Aligns two sequences using the specified alignment parameters of the `PairwiseAligner` class.
Args:
seq1 (Dict[str, str]): Sequence 1 to align. Key is the sequence ID.
seq2 (Dict[str, str]): Sequence 2 to align. Key is the sequence ID.
Returns:
PairwiseAlignment: The aligned sequences along with alignment statistics.
dict: Has the same signature as a `PairwiseAlignmentResult` object.
"""

alignment_result = self._align(seq1, seq2)

Raises:
ValueError: If the sequences are not of type AbstractSequence or Sequence.
ValueError: If the alignment mode is invalid.
ValueError: If the substitution matrix name is invalid.
return self._map_alignment_results(alignment_result, seq1, seq2)

def align_multipairwise(self, sequences: Dict[str, str], **kwargs) -> List[dict]:
"""Creates all possible pairwise alignments from a dictionary of sequences.
Args:
sequences (Dict[str, str]): A dictionary of sequences to align. The key is the sequence ID.
Returns:
List[dict]: A list of dictionaries containing the alignment results.
"""

pairs = list(combinations(sequences.keys(), 2))

with Progress() as progress:
alignments = Parallel(n_jobs=cpu_count(), prefer="processes")(
delayed(self.align_pairwise)(
{pair[0]: sequences[pair[0]]}, {pair[1]: sequences[pair[1]]}
)
for pair in progress.track(
pairs, description=f"⛓️ Aligning {len(pairs)} sequence pairs..."
)
)

return alignments

def _get_aligner(self) -> BioPairwiseAligner:
"""Creates a BioPython pairwise aligner object with the specified parameters
from the class instance."""
aligner = BioPairwiseAligner()
aligner.mode = self.mode
aligner.match_score = self.match
Expand All @@ -89,6 +104,47 @@ def align(self) -> "BioAlignment":
if self.substitution_matrix != "None":
aligner.substitution_matrix = self._load_substitution_matrix()

alignment_result = aligner.align(self.sequences[0], self.sequences[1])[0]
return aligner

def _map_alignment_results(
self, alignment: Alignment, seq1: Dict[str, str], seq2: Dict[str, str]
) -> dict:
"""Maps the alignment results to a dictionary.
The dictionaly has the same signature as a `PairwiseAlignmentResult` object.
Returns:
dict: A dictionary containing the alignment results.
"""

shorter_seq = min(alignment[0], alignment[1], key=lambda x: len(x))

identities = alignment.counts().identities
identity = identities / len(shorter_seq)
gaps = alignment.counts().gaps
mismatches = alignment.counts().mismatches

sequences = [
{"id": list(seq1.keys())[0], "sequence": list(seq1.values())[0]},
{"id": list(seq2.keys())[0], "sequence": list(seq2.values())[0]},
]

aligned_sequences = [
{"id": list(seq1.keys())[0], "sequence": alignment[0]},
{"id": list(seq2.keys())[0], "sequence": alignment[1]},
]

result_dict = {
"score": alignment.score,
"identity": identity,
"gaps": gaps,
"mismatches": mismatches,
"sequences": sequences,
"aligned_sequences": aligned_sequences,
}

return result_dict

def _load_substitution_matrix(self) -> "BioSubstitutionMatrix":
from Bio.Align import substitution_matrices

return alignment_result
return substitution_matrices.load(self.substitution_matrix)
117 changes: 0 additions & 117 deletions pyeed/align/pairwise_aligner.py

This file was deleted.

3 changes: 3 additions & 0 deletions pyeed/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
from .abstractannotation import AbstractAnnotation
from .alignmentdata import AlignmentData
from .alignmentresult import AlignmentResult
from .annotation import Annotation
from .blastdata import BlastData
from .clustalomegadata import ClustalOmegaData
from .clustalomegaresult import ClustalOmegaResult
from .cluster import Cluster
from .dnarecord import DNARecord
from .ontology import Ontology
from .organism import Organism
from .pairwisealignment import PairwiseAlignment
from .pairwisealignmentresult import PairwiseAlignmentResult
from .proteinrecord import ProteinRecord
from .region import Region
from .sequence import Sequence
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/abstractannotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ class AbstractAnnotation(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="a7defc5c87a2296a2e4b522b07236b2aef6413ac"
default="d9135dbd1cbd205a7d0ddcbabc5d9a192e10a20b"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
Loading

0 comments on commit ab557ad

Please sign in to comment.