Skip to content

Commit

Permalink
Merge pull request #458 from Steinbeck-Lab/dev-kohulan
Browse files Browse the repository at this point in the history
Implement issues #457
  • Loading branch information
CS76 authored Feb 20, 2024
2 parents 0b4c3ee + e48b015 commit 50f34a6
Showing 1 changed file with 59 additions and 80 deletions.
139 changes: 59 additions & 80 deletions app/modules/coconut/preprocess.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from __future__ import annotations

from chembl_structure_pipeline import checker
from chembl_structure_pipeline import standardizer
from rdkit import Chem

import app.modules.toolkits.cdk_wrapper as cdk
import app.modules.toolkits.rdkit_wrapper as rdkitmodules
from app.modules.coconut.descriptors import get_COCONUT_descriptors
from app.modules.toolkits.helpers import InvalidInputException
from app.modules.toolkits.helpers import parse_input


Expand Down Expand Up @@ -76,103 +78,80 @@ def get_representations(molecule: any) -> dict:
InChI_Key = Chem.inchi.MolToInchiKey(molecule)
cdkMolecule = parse_input(Chem.MolToSmiles(molecule), "cdk", False)
Murko = cdk.get_murko_framework(cdkMolecule)
return {"InChI": InChI, "InChI_Key": InChI_Key, "Murko": Murko}
return {
"standard_inchi": InChI,
"standard_inchikey": InChI_Key,
"murko_framework": Murko,
}
else:
return {"Error": "Check input SMILES"}


def get_COCONUT_preprocessing(input_text: str) -> dict:
    """Preprocess user input into data suitable for COCONUT database submission.

    The input is parsed, standardized with the ChEMBL structure pipeline, and
    reduced to its parent (canonical SMILES) form. Representations and
    descriptors are collected for each of the three forms.

    Args:
        input_text (str): Input text (Mol/str).

    Returns:
        dict: COCONUT preprocessed data with the keys ``"original"``,
            ``"standardized"`` and ``"parent"``. Each entry holds
            ``"representations"``, ``"has_stereo"`` and ``"descriptors"``;
            the original and standardized entries additionally hold
            ``"errors"`` as reported by the ChEMBL checker. If the input
            cannot be parsed, ``{"Error": "Invalid input SMILES ..."}`` is
            returned instead.
    """
    try:
        # Original structure, exactly as supplied by the user.
        original_mol = parse_input(input_text, "rdkit", False)
        original_mol_block = get_mol_block(input_text)
        original_mol_hash = get_molecule_hash(original_mol)
        original_representations = get_representations(original_mol)
        original_descriptors = get_COCONUT_descriptors(input_text, "rdkit")

        # Standardized structure (ChEMBL structure pipeline).
        standarised_mol_block = standardizer.standardize_molblock(original_mol_block)
        standardized_SMILES = Chem.MolToSmiles(
            Chem.MolFromMolBlock(standarised_mol_block),
            kekuleSmiles=True,
        )
        standardized_mol = parse_input(standardized_SMILES, "rdkit", False)
        standardized_representations = get_representations(standardized_mol)
        standardized_descriptors = get_COCONUT_descriptors(standardized_SMILES, "rdkit")

        # Parent structure, built from the canonical SMILES of the original.
        parent_canonical_smiles = original_mol_hash["Canonical_SMILES"]
        rdkitParentMol = parse_input(parent_canonical_smiles, "rdkit", False)
        parent_3D_molblock = rdkitmodules.get_3d_conformers(rdkitParentMol)
        parent_representations = get_representations(rdkitParentMol)
        parent_descriptors = get_COCONUT_descriptors(parent_canonical_smiles, "rdkit")

        # NOTE: the key "cannonical_smiles" is misspelled but is kept as-is,
        # since consumers of this dictionary may rely on the exact key name.
        return {
            "original": {
                "representations": {
                    "2D_MOL": original_mol_block,
                    "3D_MOL": rdkitmodules.get_3d_conformers(original_mol),
                    "cannonical_smiles": original_mol_hash["Isomeric_SMILES"],
                    **original_representations,
                },
                "has_stereo": rdkitmodules.has_stereochemistry(original_mol),
                "descriptors": original_descriptors,
                "errors": checker.check_molblock(original_mol_block),
            },
            "standardized": {
                "representations": {
                    # Report the standardized mol block here; previously the
                    # original mol block was returned by mistake.
                    "2D_MOL": standarised_mol_block,
                    "3D_MOL": rdkitmodules.get_3d_conformers(standardized_mol),
                    "cannonical_smiles": standardized_SMILES,
                    **standardized_representations,
                },
                "has_stereo": rdkitmodules.has_stereochemistry(standardized_mol),
                "descriptors": standardized_descriptors,
                "errors": checker.check_molblock(standarised_mol_block),
            },
            "parent": {
                "representations": {
                    "3D_MOL": parent_3D_molblock,
                    "cannonical_smiles": parent_canonical_smiles,
                    **parent_representations,
                },
                "has_stereo": rdkitmodules.has_stereochemistry(rdkitParentMol),
                "descriptors": parent_descriptors,
            },
        }
    except InvalidInputException as e:
        return {"Error": f"Invalid input SMILES {e}"}

0 comments on commit 50f34a6

Please sign in to comment.