Fix basics (#73)

* refactored pairwise aligner * API update * renaimed pairwise_aligner in pairwise * tested multipairwise aligner * added docstring * API update --------- Co-authored-by: sdRDM Bot <[email protected]>
PyEED · May 10, 2024 · ab557ad · ab557ad
1 parent db538e9
commit ab557ad
Show file tree

Hide file tree

Showing 22 changed files with 13,091 additions and 409 deletions.
diff --git a/examples/basics.ipynb b/examples/basics.ipynb
diff --git a/pyeed/__init__.py b/pyeed/__init__.py
@@ -2,14 +2,17 @@
 
 from .core.abstractannotation import AbstractAnnotation
 from .core.alignmentdata import AlignmentData
+from .core.alignmentresult import AlignmentResult
 from .core.annotation import Annotation
 from .core.blastdata import BlastData
 from .core.clustalomegadata import ClustalOmegaData
+from .core.clustalomegaresult import ClustalOmegaResult
 from .core.cluster import Cluster
 from .core.dnarecord import DNARecord
 from .core.ontology import Ontology
 from .core.organism import Organism
 from .core.pairwisealignment import PairwiseAlignment
+from .core.pairwisealignmentresult import PairwiseAlignmentResult
 from .core.proteinrecord import ProteinRecord
 from .core.region import Region
 from .core.sequence import Sequence

diff --git a/pyeed/align/pairwise.py b/pyeed/align/pairwise.py
@@ -1,84 +1,99 @@
-from typing import TYPE_CHECKING
+from itertools import combinations
+from typing import Dict, List
 
+from Bio.Align import Alignment as Alignment
 from Bio.Align import PairwiseAligner as BioPairwiseAligner
-from pydantic import Field
-
-from pyeed.align import AbstractAligner
-
-if TYPE_CHECKING:
-    from Bio.Align import Alignment as BioAlignment
-    from Bio.Align.substitution_matrices import Array as BioSubstitutionMatrix
-
-# ----------------------------------------------------------------------------------------------------------------------
-# THIS IS THE OLD ALINGER CLASS - SHOULD BE REMOVED AT SOME POINT
-# ----------------------------------------------------------------------------------------------------------------------
-
-class PairwiseAligner(AbstractAligner):
-
-    mode: str = Field(
-        description="Alignment mode",
-        default="global",
-    )
-
-    match: int = Field(
-        description="Score of a match",
-        default=1,
-    )
-    mismatch: int = Field(
-        description="Score of a mismatch",
-        default=-1,
-    )
-    gap_open: int = Field(
-        description="Gap open cost",
-        default=-1,
-    )
-    gap_extend: int = Field(
-        description="Gap extend cost",
-        default=0,
-    )
-    substitution_matrix: str = Field(
-        description="Substitution matrix name",
-        default="None",
-    )
-
-    """
-    @validator("mode")
-    def mode_validator(cls, mode):
-        modes = ["global", "local"]
-
-        if mode not in modes:
-            raise ValueError(
-                f"Invalid alignment mode: {mode}. Valid modes are: {modes}"
-            )
+from Bio.Align.substitution_matrices import Array as BioSubstitutionMatrix
+from joblib import Parallel, cpu_count, delayed
+from rich.progress import Progress
+
+
+class PairwiseAligner:
+    def __init__(
+        self,
+        mode: str = "global",
+        match: int = 1,
+        mismatch: int = -1,
+        gap_open: int = -1,
+        gap_exted: int = 0,
+        substitution_matrix: str = "None",
+    ) -> None:
+        self.mode = mode
+        self.match = match
+        self.mismatch = mismatch
+        self.gap_open = gap_open
+        self.gap_extend = gap_exted
+        self.substitution_matrix = substitution_matrix
+
+    def _align(
+        self,
+        seq1: Dict[str, str],
+        seq2: Dict[str, str],
+    ) -> Alignment:
+        """Aligns two sequences and returns the alignment results.
+
+        Args:
+            seq1 (Dict[str, str]): Sequence 1 to align. Key is the sequence ID.
+            seq2 (Dict[str, str]): Sequence 2 to align. Key is the sequence ID.
+            progress (Progress, optional): `Rich` progress. Defaults to None.
+            task_id (TaskID, optional): `Rich` task_id. Defaults to None.
 
-        return mode
+        Returns:
+            Alignment: The alignment results.
+        """
 
-    @validator("substitution_matrix")
-    def substitution_matrix_validator(cls, substitution_matrix):
-        matrices = ["BLOSUM62", "BLOSUM45", "BLOSUM80", "PAM250", "PAM30", "PAM70"]
+        aligner = self._get_aligner()
 
-        if substitution_matrix not in matrices:
-            raise ValueError(
-                f"Invalid substitution matrix: {substitution_matrix}. Available matrices are: {matrices}"
-            )
+        results = aligner.align(list(seq1.values())[0], list(seq2.values())[0])
 
-        return substitution_matrix
+        return results[0]
 
-    """
+    def align_pairwise(
+        self,
+        seq1: Dict[str, str],
+        seq2: Dict[str, str],
+    ) -> dict:
+        """Aligns two sequences and returns the alignment results.
 
-    def align(self) -> "BioAlignment":
-        """
-        Aligns two sequences using the specified alignment parameters of the `PairwiseAligner` class.
+        Args:
+            seq1 (Dict[str, str]): Sequence 1 to align. Key is the sequence ID.
+            seq2 (Dict[str, str]): Sequence 2 to align. Key is the sequence ID.
 
         Returns:
-            PairwiseAlignment: The aligned sequences along with alignment statistics.
+            dict: Has the same signature as a `PairwiseAlignmentResult` object.
+        """
+
+        alignment_result = self._align(seq1, seq2)
 
-        Raises:
-            ValueError: If the sequences are not of type AbstractSequence or Sequence.
-            ValueError: If the alignment mode is invalid.
-            ValueError: If the substitution matrix name is invalid.
+        return self._map_alignment_results(alignment_result, seq1, seq2)
+
+    def align_multipairwise(self, sequences: Dict[str, str], **kwargs) -> List[dict]:
+        """Creates all possible pairwise alignments from a dictionary of sequences.
+
+        Args:
+            sequences (Dict[str, str]): A dictionary of sequences to align. The key is the sequence ID.
+
+        Returns:
+            List[dict]: A list of dictionaries containing the alignment results.
         """
 
+        pairs = list(combinations(sequences.keys(), 2))
+
+        with Progress() as progress:
+            alignments = Parallel(n_jobs=cpu_count(), prefer="processes")(
+                delayed(self.align_pairwise)(
+                    {pair[0]: sequences[pair[0]]}, {pair[1]: sequences[pair[1]]}
+                )
+                for pair in progress.track(
+                    pairs, description=f"⛓️ Aligning {len(pairs)} sequence pairs..."
+                )
+            )
+
+        return alignments
+
+    def _get_aligner(self) -> BioPairwiseAligner:
+        """Creates a BioPython pairwise aligner object with the specified parameters
+        from the class instance."""
         aligner = BioPairwiseAligner()
         aligner.mode = self.mode
         aligner.match_score = self.match
@@ -89,6 +104,47 @@ def align(self) -> "BioAlignment":
         if self.substitution_matrix != "None":
             aligner.substitution_matrix = self._load_substitution_matrix()
 
-        alignment_result = aligner.align(self.sequences[0], self.sequences[1])[0]
+        return aligner
+
+    def _map_alignment_results(
+        self, alignment: Alignment, seq1: Dict[str, str], seq2: Dict[str, str]
+    ) -> dict:
+        """Maps the alignment results to a dictionary.
+        The dictionaly has the same signature as a `PairwiseAlignmentResult` object.
+
+        Returns:
+            dict: A dictionary containing the alignment results.
+        """
+
+        shorter_seq = min(alignment[0], alignment[1], key=lambda x: len(x))
+
+        identities = alignment.counts().identities
+        identity = identities / len(shorter_seq)
+        gaps = alignment.counts().gaps
+        mismatches = alignment.counts().mismatches
+
+        sequences = [
+            {"id": list(seq1.keys())[0], "sequence": list(seq1.values())[0]},
+            {"id": list(seq2.keys())[0], "sequence": list(seq2.values())[0]},
+        ]
+
+        aligned_sequences = [
+            {"id": list(seq1.keys())[0], "sequence": alignment[0]},
+            {"id": list(seq2.keys())[0], "sequence": alignment[1]},
+        ]
+
+        result_dict = {
+            "score": alignment.score,
+            "identity": identity,
+            "gaps": gaps,
+            "mismatches": mismatches,
+            "sequences": sequences,
+            "aligned_sequences": aligned_sequences,
+        }
+
+        return result_dict
+
+    def _load_substitution_matrix(self) -> "BioSubstitutionMatrix":
+        from Bio.Align import substitution_matrices
 
-        return alignment_result
+        return substitution_matrices.load(self.substitution_matrix)
diff --git a/pyeed/align/pairwise_aligner.py b/pyeed/align/pairwise_aligner.py
diff --git a/pyeed/core/__init__.py b/pyeed/core/__init__.py
@@ -1,13 +1,16 @@
 from .abstractannotation import AbstractAnnotation
 from .alignmentdata import AlignmentData
+from .alignmentresult import AlignmentResult
 from .annotation import Annotation
 from .blastdata import BlastData
 from .clustalomegadata import ClustalOmegaData
+from .clustalomegaresult import ClustalOmegaResult
 from .cluster import Cluster
 from .dnarecord import DNARecord
 from .ontology import Ontology
 from .organism import Organism
 from .pairwisealignment import PairwiseAlignment
+from .pairwisealignmentresult import PairwiseAlignmentResult
 from .proteinrecord import ProteinRecord
 from .region import Region
 from .sequence import Sequence

diff --git a/pyeed/core/abstractannotation.py b/pyeed/core/abstractannotation.py
@@ -51,7 +51,7 @@ class AbstractAnnotation(
 
     _repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
     _commit: Optional[str] = PrivateAttr(
-        default="a7defc5c87a2296a2e4b522b07236b2aef6413ac"
+        default="d9135dbd1cbd205a7d0ddcbabc5d9a192e10a20b"
     )
 
     _raw_xml_data: Dict = PrivateAttr(default_factory=dict)