from Bio.Align import Alignment as Alignment
from pyeed.dbconnect import DatabaseConnector
from pyeed.main import Pyeed
from rdflib import Graph, RDF, RDFS, OWL, Namespace


class OntologyAdapter():
    """
    Adapter class to load ontology files into the database.
    """

    def import_ontology_file_in_db(self, file_path: str, db: DatabaseConnector):
        """
        Imports an ontology file into the database.

        Every ``owl:Class`` in the file becomes an ``OntologyObject`` node
        keyed by its IRI. ``rdfs:subClassOf`` triples become ``SUBCLASS_OF``
        relationships, and ``owl:Restriction`` superclasses that carry both
        ``owl:onProperty`` and ``owl:someValuesFrom`` become
        ``CUSTOM_RELATIONSHIP`` relationships.

        :param file_path: The path to the ontology file.
        :param db: The database connector

        :return: None
        """
        # Load the OWL file
        g = Graph()
        g.parse(file_path)

        # Map every labelled resource (classes and properties) to its rdfs:label
        labels = {
            str(subject): str(label)
            for subject, _, label in g.triples((None, RDFS.label, None))
        }

        self._import_classes(g, db, labels)
        self._import_relationships(g, db, labels)

    def _import_classes(self, g, db, labels):
        """Write one OntologyObject node per owl:Class found in the graph."""
        IAO_NS = Namespace("http://purl.obolibrary.org/obo/IAO_")
        OBOINOWL_NS = Namespace("http://www.geneontology.org/formats/oboInOwl#")

        for s, _, _ in g.triples((None, RDF.type, OWL.Class)):
            class_name = str(s)

            # Only properties that are actually present are collected, so a
            # class without a label or definition no longer raises KeyError.
            properties = {}

            label = labels.get(class_name)
            if label is not None:
                properties["label"] = label

            # IAO:0000115 is the predicate the original code reads the
            # description from (e.g. in CARD: "eccC5 is a ...").
            for _, _, desc in g.triples((s, IAO_NS["0000115"], None)):
                properties["description"] = str(desc)

            # Keep every exact synonym (e.g. "Mtub_eccC5_FLO") instead of
            # letting each one overwrite the previous; sorted for stable output.
            synonyms = sorted(
                str(syn)
                for _, _, syn in g.triples((s, OBOINOWL_NS.hasExactSynonym, None))
            )
            if synonyms:
                properties["synonyms"] = synonyms

            # MERGE keeps the import re-runnable; a single write per class
            # replaces the separate create/label/description/synonym writes.
            db.execute_write(
                """
                MERGE (c:OntologyObject {name: $name})
                SET c += $properties
                """,
                parameters={"name": class_name, "properties": properties},
            )

    def _import_relationships(self, g, db, labels):
        """Write SUBCLASS_OF and restriction-derived relationships between classes."""
        for s, _, o in g.triples((None, RDFS.subClassOf, None)):
            # Must be set for both branches below: the restriction branch
            # previously reused a stale (or undefined) value.
            subclass = str(s)

            if (o, RDF.type, OWL.Class) in g:
                db.execute_write(
                    """
                    MATCH (sub:OntologyObject {name: $subclass}), (super:OntologyObject {name: $superclass})
                    MERGE (sub)-[:SUBCLASS_OF]->(super)
                    """,
                    parameters={"subclass": subclass, "superclass": str(o)},
                )

            # Handles the case where the superclass is a restriction (RO_ in CARD)
            elif (o, RDF.type, OWL.Restriction) in g:
                on_property = None
                some_values_from = None

                # Extract onProperty
                for _, _, prop in g.triples((o, OWL.onProperty, None)):
                    on_property = str(prop)

                # Extract someValuesFrom
                for _, _, value in g.triples((o, OWL.someValuesFrom, None)):
                    some_values_from = str(value)

                if on_property and some_values_from:
                    # The edge links the subclass to the someValuesFrom target.
                    # Its name is the property IRI; its description is that
                    # property's label, when one exists.
                    db.execute_write(
                        """
                        MATCH (sub:OntologyObject {name: $subclass}), (super:OntologyObject {name: $some_values_from})
                        MERGE (sub)-[r:CUSTOM_RELATIONSHIP {name: $on_property}]->(super)
                        SET r.description = $description
                        """,
                        parameters={
                            "subclass": subclass,
                            "some_values_from": some_values_from,
                            "on_property": on_property,
                            "description": labels.get(on_property),
                        },
                    )


if __name__ == "__main__":
    # NOTE(review): development script with hard-coded credentials and a
    # machine-specific path; it WIPES the target database before importing.
    uri = "bolt://localhost:7687"
    username = "neo4j"
    password = "12345678"

    file_path = "/home/nab/Niklas/TEM-lactamase/CARD_Data_Ontologies/aro.owl"

    eedb = Pyeed(uri, user=username, password=password)
    eedb.db.wipe_database()
    eedb.db.remove_db_constraints(user=username, password=password)

    eedb.db.initialize_db_constraints(user=username, password=password)

    db = eedb.db
    ontology_adapter = OntologyAdapter()

    ontology_adapter.import_ontology_file_in_db(file_path, db)
class CustomRealationship(StructuredRel):
    """Relationship model carrying a name and an optional description."""

    name = StringProperty(required=True)
    description = StringProperty()

    @classmethod
    def validate_and_connect(
        cls,
        molecule1: StrictStructuredNode,
        molecule2: StrictStructuredNode,
        name: str,
        description: str,
    ):
        """Connect ``molecule1`` to ``molecule2`` and return a relationship instance.

        The connection is made through ``molecule1.custom_relationships`` with
        ``name`` and ``description`` as relationship properties. The returned
        object is a freshly constructed instance of this class holding the
        same two values.
        """
        rel_properties = {"name": name, "description": description}
        molecule1.custom_relationships.connect(molecule2, rel_properties)

        return cls(name=name, description=description)

    @property
    def label(self):
        """Alias for ``name``."""
        return self.name


class OntologyObject(StrictStructuredNode):
    """A node representing an ontology object in the database."""

    # Scalar and array properties
    name = StringProperty(required=True, unique_index=True)
    description = StringProperty()
    label = StringProperty()
    synonyms = ArrayProperty(StringProperty())

    # Relationships
    # NOTE(review): SUBCLASS_OF is declared as an outgoing relationship, so
    # this presumably traverses toward parent classes - confirm the naming.
    subclasses = RelationshipTo("OntologyObject", "SUBCLASS_OF")
    custom_relationships = RelationshipTo(
        "OntologyObject",
        "CUSTOM_RELATIONSHIP",
        model=CustomRealationship,
    )