diff --git a/.github/workflows/dev-build.yml b/.github/workflows/dev-build.yml
index 474559d..7335863 100644
--- a/.github/workflows/dev-build.yml
+++ b/.github/workflows/dev-build.yml
@@ -36,7 +36,7 @@ jobs:
           password: ${{ env.DOCKER_HUB_PASSWORD }}
 
       - name: Build and push Docker image
-        uses: docker/build-push-action@v4
+        uses: docker/build-push-action@v5.3.0
         with:
           context: .
           file: ./Dockerfile
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 91c1f4e..3d41c52 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -10,7 +10,7 @@ jobs:
       matrix:
         python-version: ["3.10"]
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4.1.4
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v3
         with:
diff --git a/Dockerfile b/Dockerfile
index 6b2c151..609e726 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -45,4 +45,4 @@ RUN pip3 install --no-cache-dir chembl_structure_pipeline --no-deps
 
 COPY ./app /code/app
 
-CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "8"]
diff --git a/app/modules/coconut/preprocess.py b/app/modules/coconut/preprocess.py
index 8f1dd24..6850ca8 100644
--- a/app/modules/coconut/preprocess.py
+++ b/app/modules/coconut/preprocess.py
@@ -58,6 +58,7 @@ def get_parent_smiles(molecule: Chem.Mol) -> str:
         parent_mol = Chem.MolFromMolBlock(parent)
 
         if parent_mol:
+            [a.SetAtomMapNum(0) for a in parent_mol.GetAtoms()]
             parent_smiles = Chem.MolToSmiles(
                 parent_mol, isomericSmiles=False, kekuleSmiles=True
             )
@@ -72,6 +73,62 @@ def get_parent_smiles(molecule: Chem.Mol) -> str:
     return "Error Check input SMILES"
 
 
+def get_smiles(molecule: Chem.Mol, isomeric: bool = True) -> str:
+    """
+    Retrieves the SMILES string (Isomeric or Canonical) for a given RDKit molecule object.
+
+    Args:
+        molecule (Chem.Mol): RDKit molecule; its atom-map numbers are reset to 0 in place.
+        isomeric (bool, optional): Whether to retrieve the Isomeric SMILES (True) or the Canonical SMILES (False).
+            Defaults to True.
+
+    Returns:
+        str: The requested SMILES, or "Error Check input SMILES" if it cannot be generated.
+    """
+    if molecule:
+        # Reset every atom-map number to 0 before writing SMILES.
+        for atom in molecule.GetAtoms():
+            atom.SetAtomMapNum(0)
+        initial_smiles = Chem.MolToSmiles(
+            molecule, isomericSmiles=isomeric, kekuleSmiles=True
+        )
+        canonical_mol = Chem.MolFromSmiles(Chem.CanonSmiles(initial_smiles))
+        if canonical_mol:
+            return Chem.MolToSmiles(
+                canonical_mol, isomericSmiles=isomeric, kekuleSmiles=True
+            )
+
+    return "Error Check input SMILES"
+
+
+def get_standardized_smiles(standardized_mol_block: str) -> str:
+    """
+    Get the standardized SMILES representation of a molecule.
+
+    This function takes a standardized molecular structure represented as a MolBlock and generates the corresponding
+    standardized SMILES representation.
+
+    Args:
+        standardized_mol_block (str): The standardized molecular structure in MolBlock format.
+
+    Returns:
+        str: The standardized SMILES, or "Error Check input SMILES" on parse failure.
+    """
+    mol = Chem.MolFromMolBlock(standardized_mol_block)
+    if mol is None:  # MolFromMolBlock returns None for an unparsable block
+        return "Error Check input SMILES"
+    # Reset every atom-map number to 0 before writing SMILES.
+    for atom in mol.GetAtoms():
+        atom.SetAtomMapNum(0)
+    standardized_smiles = Chem.MolToSmiles(mol, kekuleSmiles=True)
+    # Re-parse the canonical SMILES and write it out again as isomeric SMILES.
+    canonical_mol = Chem.MolFromSmiles(Chem.CanonSmiles(standardized_smiles))
+    if canonical_mol:
+        return Chem.MolToSmiles(canonical_mol, isomericSmiles=True, kekuleSmiles=True)
+
+    return "Error Check input SMILES"
+
+
 def get_molecule_hash(molecule: Chem.Mol) -> dict:
     """Return various molecule hashes for the provided SMILES.
 
@@ -83,12 +140,8 @@ def get_molecule_hash(molecule: Chem.Mol) -> dict:
     """
     if molecule:
         Formula = Chem.rdMolDescriptors.CalcMolFormula(molecule)
-        Isomeric_SMILES = Chem.MolToSmiles(molecule, kekuleSmiles=True)
-        Canonical_SMILES = Chem.MolToSmiles(
-            molecule,
-            kekuleSmiles=True,
-            isomericSmiles=False,
-        )
+        Isomeric_SMILES = get_smiles(molecule, isomeric=True)
+        Canonical_SMILES = get_smiles(molecule, isomeric=False)
         Parent_SMILES = get_parent_smiles(molecule)
         return {
             "Formula": Formula,
@@ -152,9 +205,7 @@ def get_COCONUT_preprocessing(
 
         # Standardized molecule
         standardized_mol_block = standardizer.standardize_molblock(original_mol_block)
-        standardized_SMILES = Chem.MolToSmiles(
-            Chem.MolFromMolBlock(standardized_mol_block), kekuleSmiles=True
-        )
+        standardized_SMILES = get_standardized_smiles(standardized_mol_block)
         standardized_mol = parse_input(standardized_SMILES, "rdkit", False)
         standardized_representations = get_representations(standardized_mol)
 
diff --git a/tests/test_classyfire.py b/tests/test_classyfire.py
index 114441a..0c6894a 100644
--- a/tests/test_classyfire.py
+++ b/tests/test_classyfire.py
@@ -19,7 +19,8 @@ def test_valid_classyfire(valid_smiles):
     assert result_["query_type"] == "STRUCTURE"
     id_ = result_["id"]
     classified = loop.run_until_complete(result(id_))
-    assert classified["classification_status"] == "In Queue"
+    assert classified["classification_status"] == "Done"
+    assert classified["entities"][0]["class"]["name"] == "Imidazopyrimidines"
 
 
 def test_invalid_classyfire(invalid_smiles):
@@ -28,4 +29,8 @@ def test_invalid_classyfire(invalid_smiles):
     assert result_["query_input"] == "invalid_smiles"
     id_ = result_["id"]
     classified = loop.run_until_complete(result(id_))
-    assert classified["classification_status"] == "In Queue"
+    assert classified["classification_status"] == "Done"
+    assert (
+        classified["invalid_entities"][0]["report"][0]
+        == "Cannot process the input SMILES string, please check again"
+    )