def get_COCONUT_preprocessing(input_text: str) -> dict:
    """Preprocess user input text suitable for the COCONUT database submission.

    The input is processed in three forms, each reported under its own
    top-level key of the result:

    * ``original``: the molecule as supplied in ``input_text``.
    * ``standardized``: the molecule after
      ``standardizer.standardize_molblock``.
    * ``parent``: the molecule rebuilt from the ``Canonical_SMILES`` entry of
      the original molecule's hash.

    Every section carries ``representations``, ``has_stereo`` and
    ``descriptors``; ``original`` and ``standardized`` additionally carry
    ``errors`` as returned by ``checker.check_molblock``.

    Args:
        input_text (str): Input text (Mol/str).

    Returns:
        dict: COCONUT preprocessed data, or ``{"Error": ...}`` when an
        ``InvalidInputException`` is raised while processing the input.
    """
    try:
        # --- Original molecule, as supplied by the caller -----------------
        original_mol = parse_input(input_text, "rdkit", False)
        original_mol_block = get_mol_block(input_text)
        original_mol_hash = get_molecule_hash(original_mol)
        original_representations = get_representations(original_mol)
        original_descriptors = get_COCONUT_descriptors(input_text, "rdkit")

        # --- Standardized molecule ----------------------------------------
        standarised_mol_block = standardizer.standardize_molblock(original_mol_block)

        standardized_SMILES = Chem.MolToSmiles(
            Chem.MolFromMolBlock(standarised_mol_block),
            kekuleSmiles=True,
        )
        standardized_mol = parse_input(standardized_SMILES, "rdkit", False)
        standardized_representations = get_representations(standardized_mol)
        standardized_descriptors = get_COCONUT_descriptors(standardized_SMILES, "rdkit")

        # --- Parent molecule, from the original's canonical SMILES --------
        parent_canonical_smiles = original_mol_hash["Canonical_SMILES"]
        rdkitParentMol = parse_input(parent_canonical_smiles, "rdkit", False)
        parent_3D_molblock = rdkitmodules.get_3d_conformers(rdkitParentMol)

        parent_representations = get_representations(rdkitParentMol)
        parent_descriptors = get_COCONUT_descriptors(parent_canonical_smiles, "rdkit")

        # NOTE: the key "cannonical_smiles" is misspelled, but it is part of
        # the returned payload and is kept as-is so existing consumers of
        # this dictionary keep working.
        return {
            "original": {
                "representations": {
                    "2D_MOL": original_mol_block,
                    "3D_MOL": rdkitmodules.get_3d_conformers(original_mol),
                    "cannonical_smiles": original_mol_hash["Isomeric_SMILES"],
                    **original_representations,
                },
                "has_stereo": rdkitmodules.has_stereochemistry(original_mol),
                "descriptors": original_descriptors,
                "errors": checker.check_molblock(original_mol_block),
            },
            "standardized": {
                "representations": {
                    # Report the standardized mol block here (previously the
                    # original mol block was returned by mistake), so that
                    # "2D_MOL" matches the structure checked in "errors".
                    "2D_MOL": standarised_mol_block,
                    "3D_MOL": rdkitmodules.get_3d_conformers(standardized_mol),
                    "cannonical_smiles": standardized_SMILES,
                    **standardized_representations,
                },
                "has_stereo": rdkitmodules.has_stereochemistry(standardized_mol),
                "descriptors": standardized_descriptors,
                "errors": checker.check_molblock(standarised_mol_block),
            },
            "parent": {
                "representations": {
                    "3D_MOL": parent_3D_molblock,
                    "cannonical_smiles": parent_canonical_smiles,
                    **parent_representations,
                },
                "has_stereo": rdkitmodules.has_stereochemistry(rdkitParentMol),
                "descriptors": parent_descriptors,
            },
        }
    except InvalidInputException as e:
        return {"Error": f"Invalid input SMILES {e}"}