diff --git a/alzkb/data/alzkb_v2.rdf b/alzkb/data/alzkb_v2.rdf new file mode 100644 index 0000000..9fdc155 --- /dev/null +++ b/alzkb/data/alzkb_v2.rdf @@ -0,0 +1,2463 @@ + + + + + English + A note on classes vs. individuals: + +In this ontology, individuals are modeled as the idealised entities corresponding to examples of a certain class. For example, 'paroxetine' is an individual of the class Chemical, and 'hsdl1' is an individual of the class Gene. Other ontologies may choose to model these instead as subclasses (e.g., Paroxetine is a subclass of Chemical), and individuals are physical realizations of those classes (e.g., a specific molecule of Paroxetine in the real world). + +The decision to model idealised entities as individuals rather than classes allows us stricter control over the logical assumptions we apply to all entities of a related type. For many use cases, it is often appropriate to take the alternative approach. + +Furthermore, this is beneficial when the ontology is used to populate a graph database. Each individual in the ontology corresponds to a node in the graph database, data properties on those individuals correspond to node attributes, and object properties correspond to edges in the graph. + An ontology describing entities relevant to Alzheimer's disease etiology and entities relevant to drug discovery for Alzheimer's disease. + 0.1.0a + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + String prefix that precedes the unique label on a named individual of the corresponding class. This is necessary to avoid duplicate labels, which result in invalid RDF/XML. Note that the data property "commonName" should also be set, preserving punctuation and whitespace, and omitting the individualLabelPrefix + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The entity that is being altered via the KE. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true + + + + + + + + + Two proteins are isozymes if they are comprised of different amino acid sequences but catalyze the same enzymatic reaction. Isozymes often have different reaction rates and respond differently in various environmental settings. They may also be regulated through different regulatory mechanisms. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + If True, chemical is a known pharmaceutical drug. Effectively, this means that it is present in DrugBank, but this de facto definition may be modified in the future. + + + + + + + + + + + If True, chemical is present in the Comparative Toxicogenomics Database. + + + + + + + + + + + + If True, chemical is considered foreign to the human body. + + + + + + + + + + + + + + + + + + + + + + + + + + + A string used to name the entity. This provides a more useful way to label nodes without the prefixes needed to prevent conflicts when nodes in different classes have the same common name. This common name should also be safe to use with punctuation and whitespace characters (which should be removed from node names). 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + MACCS fingerprint is stored as a string where each character is a bit representing an individual feature. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Term from the Cell Ontology (CL) + + + + + + + + + + + + + + + + + + + + Term from Chemical Entities of Biological Interest (ChEBI) + + + + + + + + + + + + DSSTox substance identifier. This is the main 'unit of ground truth' for chemicals in ComptoxAI. + + + + + + + + + + + Note: Due to inconsistencies in granularity, some diseases may have multiple DOIDs. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Term from the Foundational Model of Anatomy (FMA) + + + + + + + + + + + EPA GSID. This has been largely superceded by DSSTOX IDs, but web services and Invitrodb still identify chemicals using GSID. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Any medical subject heading (does not include supplemental terms, such as the UIs that point to specific compounds). + + + + + + + + + + + + + + + + + + + + + A MeSH "Unique ID". This is different from a true subject heading, and is usually a controlled term for a chemical substance. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Pathway ID from an unknown (or deprecated) pathway database. 
+ + + + + + + + + + + + + + + + + + + + Detrimental phenotypic effect resulting from exposure to a chemical + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + se_ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + chem_ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Chemical Lists defined by the US EPA and used in the EPA's Comptox Dashboard web application. These lists are scraped from the Dashboard's public-facing API. + +A chemical list is loosely defined, but they can be thought of in broad terms as functional classes of chemicals. They range in size from a few chemicals to tens of thousands of chemicals. + + + + + + + + + + + + 0 + + + + + + + + + + phen_ + + + + + + + + + + + + + + + + + + + + + + + Any database that is relevant to computational toxicology (not necessarily intended to be used primarily for toxicology - e.g., PubChem). + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + dis_ + A disease is defined as a medical condition with a deleterious effect. + + + + + + + + + + + + + + 1 + + + + + + + + + + + + + + + + + + + + + + 1 + + + + A chemical substance that causes a change in an organism's physiology or psychology when consumed or administered. This ontology primarily considers the effects of drugs when administered on humans. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + gene_ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Pathways are taken from AOP-DB's pathway_gene table, which includes pathways from many taxa taken from a large number of source databases. We only include pathways with taxid 9606 (humans), but we don't filter based on source database. Therefore, we can't be 100% sure that there isn't duplication, and we haven't created individual node properties for each of the source database xrefs. Generally, the 'sourceDatabase' property can be used along with the format of the pathwayId to determine where the pathway originally came from. + +Note that each gene can be in potentially many pathways, and each pathway can contain potentially many genes. + + + + + + + + + + + + + + 1 + + + + + + + A drug with an intended beneficial effect, intended to 'treat, cure, prevent, or diagnose a disease or promote well-being.' 
"""Attach edge properties to the populated AlzKB node/edge table.

Reads ``./data/alzkb_v2-populated.csv``, fills the edge-property columns
(``sourceDB``, ``unbiased``, ``affinity_nM``, ``p_fisher``, ``z_score``,
``correlation``, ``score``, ``confidence``) from the raw Hetionet, DisGeNET
and dorothea files under ``./AlzKB_Raw_Data``, and writes
``./data/alzkb_v2.0.0_with_edge_properties.csv``.

Run it as a script; importing the module only defines the helpers.
"""
import operator
import os

import pandas as pd

INPUT_CSV = './data/alzkb_v2-populated.csv'
OUTPUT_CSV = './data/alzkb_v2.0.0_with_edge_properties.csv'
DATA_DIR = "./AlzKB_Raw_Data"
DISEASE_MAPPING_TSV = './AlzKB_Raw_Data/disgenet/CUSTOM/disease_mappings_alzheimer.tsv'
DISGENET_ASSOCIATIONS_TSV = './AlzKB_Raw_Data/disgenet/curated_gene_disease_associations.tsv'
DOROTHEA_TSV = './AlzKB_Raw_Data/dorothea/tf.tsv'
# Source: https://github.com/dhimmel/integrate/blob/master/integrate.ipynb
HETNET_URL = 'https://raw.githubusercontent.com/dhimmel/integrate/master/data/hetnet.json.bz2'

# Columns that together identify one edge row of the AlzKB table.
EDGE_KEY = ('_start', '_end', '_type')

# Edge-property columns appended (empty) to the table before they are filled.
EDGE_PROPERTY_COLUMNS = [
    'sourceDB', 'unbiased', 'affinity_nM', 'p_fisher',
    'z_score', 'correlation', 'score', 'confidence',
]

# Metaedge abbreviation -> AlzKB relationship type (hetionet-custom-edges.tsv).
HETIO_CUSTOM = {
    'CbG': 'CHEMICALBINDSGENE',
    'DrD': 'DISEASEASSOCIATESWITHDISEASE',  # no results
    'DlA': 'DISEASELOCALIZESTOANATOMY',
    'DpS': 'SYMPTOMMANIFESTATIONOFDISEASE',
}

# Metaedge abbreviation -> AlzKB relationship type (Hetionet v1.0 graph).
HETIO = {
    'CuG': 'CHEMICALINCREASESEXPRESSION',
    'CdG': 'CHEMICALDECREASESEXPRESSION',
    'GcG': 'GENECOVARIESWITHGENE',
    'Gr>G': 'GENEREGULATESGENE',
}


# Adapted from https://github.com/hetio/hetnetpy/blob/main/hetnetpy/readwrite.py
def write_nodetable(graph):
    """Return a tabular encoding of the graph nodes.

    Args:
        graph: object whose ``node_dict`` maps to nodes exposing
            ``metanode.identifier``, ``name`` and a dict-like ``data``.

    Returns:
        DataFrame with columns ``id``, ``name``, ``kind``, ``source``, sorted
        by ``kind`` then ``id``. ``source`` is ``None`` for a node whose data
        has no ``'source'`` entry (same tolerance as ``write_edgetable``).
    """
    rows = [
        {
            "kind": node.metanode.identifier,
            "id": str(node),
            "name": node.name,
            "source": node.data.get('source'),
        }
        for node in graph.node_dict.values()
    ]
    rows.sort(key=operator.itemgetter("kind", "id"))
    df_nodes_tsv = pd.DataFrame(rows, columns=["id", "name", "kind", "source"])
    print(df_nodes_tsv.shape)
    return df_nodes_tsv


def write_edgetable(graph):
    """Return a tabular encoding of the graph edges.

    Args:
        graph: object providing ``get_metaedge_to_edges(exclude_inverts=True)``
            whose edges expose ``source``, ``target``, ``metaedge.abbrev`` and
            a dict-like ``data``.

    Returns:
        DataFrame with columns ``source``, ``metaedge``, ``target`` followed by
        the edge properties. ``sourceDB`` is read from the edge's ``'source'``
        data entry; properties absent from an edge are ``None``.
    """
    edge_properties = ["sourceDB", "unbiased", "affinity_nM", "z_score", "p_fisher", "correlation"]
    fieldnames = ["source", "metaedge", "target"] + edge_properties
    rows = []
    metaedge_to_edges = graph.get_metaedge_to_edges(exclude_inverts=True)
    for edges in metaedge_to_edges.values():
        for edge in edges:
            row = {
                "source": edge.source,
                "metaedge": edge.metaedge.abbrev,
                "target": edge.target,
            }
            for prop in edge_properties:
                # The graph stores the provenance under 'source'.
                data_key = 'source' if prop == 'sourceDB' else prop
                row[prop] = edge.data.get(data_key)
            rows.append(row)
    df_edges_tsv = pd.DataFrame(rows, columns=fieldnames)
    print(df_edges_tsv.shape)
    return df_edges_tsv


def select_metaedge(edges, abbrev):
    """Return an independent copy of the rows of *edges* with the given metaedge.

    The copy lets callers add columns without touching (or warning about) the
    frame the rows were filtered from.
    """
    return edges[edges['metaedge'] == abbrev].copy()


def last_token(series, sep='::'):
    """Return the part after the last *sep* of every string in *series*."""
    return series.str.split(sep).str[-1]


def node_lookup(df, property_name, id_column):
    """Return ``_id``/*property_name* pairs of *df* with ``_id`` renamed to *id_column*."""
    return df[['_id', property_name]].rename(columns={'_id': id_column})


def chemical_gene_edges(edges, df, abbrev, edge_type):
    """Resolve the *abbrev* edges of *edges* to AlzKB ``_start``/``_end`` ids.

    The source id (after ``::``) is matched against ``xrefDrugbank`` and the
    target id (after ``::``, cast to int) against ``xrefNcbiGene`` of *df*.
    Unmatched edges keep a missing ``_start``/``_end``.
    """
    sub = select_metaedge(edges, abbrev)
    sub['xrefDrugbank'] = last_token(sub['source'])
    sub['xrefNcbiGene'] = last_token(sub['target']).astype(int)
    sub = sub.merge(node_lookup(df, 'xrefDrugbank', '_start'), on='xrefDrugbank', how='left')
    sub = sub.merge(node_lookup(df, 'xrefNcbiGene', '_end'), on='xrefNcbiGene', how='left')
    sub['_type'] = edge_type
    return sub


def apply_edge_properties(df, edges, columns):
    """Copy *columns* from *edges* onto the matching edge rows of *df*.

    Rows are matched on ``(_start, _end, _type)``. A value found in *edges*
    wins; otherwise the value already in *df* is kept.

    Args:
        df: AlzKB table; must contain the key columns and *columns*.
            Modified in place.
        edges: frame containing the key columns and *columns*.
        columns: names of the property columns to transfer.

    Returns:
        *df*, for convenience.
    """
    key = list(EDGE_KEY)
    columns = list(columns)
    # Edges that could not be resolved to node ids can never match an edge row,
    # and a repeated key would add rows to the left merge below, shifting every
    # later value onto the wrong row of df. Keep the first value per key.
    updates = edges[key + columns].dropna(subset=key).drop_duplicates(subset=key)
    merged = df[key].merge(updates, on=key, how='left')
    # The merge keeps df's row order but renumbers the index; restore it so the
    # assignment below aligns whatever index df carries.
    merged.index = df.index
    for column in columns:
        df[column] = merged[column].combine_first(df[column])
    return df


def main():
    """Run the whole pipeline: load, fill every edge property, save."""
    df = pd.read_csv(INPUT_CSV)
    df = pd.concat([df, pd.DataFrame(columns=EDGE_PROPERTY_COLUMNS)])

    # --- hetionet-custom-edges.tsv -------------------------------------
    hetionet_custom = pd.read_table(os.path.join(DATA_DIR, 'hetionet/hetionet-custom-edges.tsv'))

    affinity_nM = chemical_gene_edges(hetionet_custom, df, 'CbG', HETIO_CUSTOM['CbG'])
    apply_edge_properties(df, affinity_nM, ['sourceDB', 'unbiased', 'affinity_nM'])

    # Disease Ontology code -> DisGeNET diseaseId mapping.
    disgenet = pd.read_table(DISEASE_MAPPING_TSV)
    disgenet = disgenet[disgenet['vocabulary'] == 'DO']

    p_fisher_DlA = select_metaedge(hetionet_custom, 'DlA')
    p_fisher_DlA['do_id'] = last_token(last_token(p_fisher_DlA['source']), ':')
    p_fisher_DlA['xrefUberon'] = last_token(p_fisher_DlA['target'])
    p_fisher_DlA = p_fisher_DlA.merge(disgenet, left_on='do_id', right_on='code')
    p_fisher_DlA['_start'] = 'disease_' + p_fisher_DlA['diseaseId'].str.lower()
    p_fisher_DlA = p_fisher_DlA.merge(node_lookup(df, 'xrefUberon', '_end'), on='xrefUberon', how='left')
    p_fisher_DlA['_type'] = HETIO_CUSTOM['DlA']

    p_fisher_DpS = select_metaedge(hetionet_custom, 'DpS')
    p_fisher_DpS['xrefMeSH'] = last_token(p_fisher_DpS['target'])
    p_fisher_DpS['do_id'] = last_token(last_token(p_fisher_DpS['source']), ':')
    p_fisher_DpS = p_fisher_DpS.merge(node_lookup(df, 'xrefMeSH', '_start'), on='xrefMeSH', how='left')
    p_fisher_DpS = p_fisher_DpS.merge(disgenet, left_on='do_id', right_on='code')
    p_fisher_DpS['_end'] = 'disease_' + p_fisher_DpS['diseaseId'].str.lower()
    p_fisher_DpS['_type'] = HETIO_CUSTOM['DpS']

    p_fisher = pd.concat([p_fisher_DlA, p_fisher_DpS])
    apply_edge_properties(df, p_fisher, ['sourceDB', 'unbiased', 'p_fisher'])

    # --- Hetionet v1.0 graph -------------------------------------------
    # Imported here so the helpers above stay usable without hetio installed.
    import hetio.hetnet
    import hetio.readwrite
    import hetio.stats

    graph = hetio.readwrite.read_graph(HETNET_URL, formatting=None)
    hetionet = write_edgetable(graph)
    hetionet['source'] = hetionet['source'].astype(str)
    hetionet['target'] = hetionet['target'].astype(str)

    z_score_all = pd.concat([
        chemical_gene_edges(hetionet, df, abbrev, HETIO[abbrev])
        for abbrev in ('CuG', 'CdG')
    ])
    apply_edge_properties(df, z_score_all, ['sourceDB', 'unbiased', 'z_score'])

    correlation = pd.read_table(os.path.join(DATA_DIR, 'hetionet/geneCovariesWithGene_correlation.tsv'))
    correlation = correlation.merge(
        node_lookup(df, 'xrefNcbiGene', '_start'),
        left_on='source_entrez', right_on='xrefNcbiGene', how='left')
    correlation = correlation.merge(
        node_lookup(df, 'xrefNcbiGene', '_end'),
        left_on='target_entrez', right_on='xrefNcbiGene', how='left')
    correlation['_type'] = HETIO['GcG']
    correlation['sourceDB'] = 'Hetionet - ERC'
    correlation['unbiased'] = True
    apply_edge_properties(df, correlation, ['sourceDB', 'unbiased', 'correlation'])

    # --- DisGeNET -------------------------------------------------------
    score = pd.read_table(DISGENET_ASSOCIATIONS_TSV)
    score['sourceDB'] = 'DisGeNET - ' + score['source']
    score = score.merge(
        node_lookup(df, 'xrefNcbiGene', '_start'),
        left_on='geneId', right_on='xrefNcbiGene', how='left')
    score['_end'] = 'disease_' + score['diseaseId'].str.lower()
    score['_type'] = 'GENEASSOCIATESWITHDISEASE'
    apply_edge_properties(df, score, ['sourceDB', 'score'])

    # --- TF ---------------------------------------------------------------
    confidence = pd.read_table(DOROTHEA_TSV)
    confidence = confidence.merge(node_lookup(df, 'TF', '_start'), on='TF', how='left')
    confidence = confidence.merge(
        node_lookup(df, 'geneSymbol', '_end'),
        left_on='Gene', right_on='geneSymbol', how='left')
    confidence['_type'] = 'TRANSCRIPTIONFACTORINTERACTSWITHGENE'
    apply_edge_properties(df, confidence, ['sourceDB', 'confidence'])

    # save data file
    df.to_csv(OUTPUT_CSV)


if __name__ == "__main__":
    main()
FlatFileDatabaseParser("ncbigene", onto, data_dir) drugbank = FlatFileDatabaseParser("drugbank", onto, data_dir) hetionet = FlatFileDatabaseParser("hetionet", onto, data_dir) aopdb = MySQLDatabaseParser("aopdb", onto, mysql_config) -aopwiki = FlatFileDatabaseParser("aopwiki", onto, data_dir) -tox21 = FlatFileDatabaseParser("tox21", onto, data_dir) disgenet = FlatFileDatabaseParser("disgenet", onto, data_dir) - +dorothea = FlatFileDatabaseParser("dorothea", onto, data_dir) drugbank.parse_node_type( node_type="Drug", # Switch from "Chemical" in ComptoxAI to "Drug" in AlzKB - source_filename="drug_links.csv", - fmt="csv", + source_filename="CUSTOM/drug_links.tsv", + fmt="tsv", parse_config={ "iri_column_name": "DrugBank ID", "headers": True, @@ -37,6 +34,7 @@ "DrugBank ID": onto.xrefDrugbank, "CAS Number": onto.xrefCasRN, "Name": onto.commonName, + "data_resource": onto.sourceDatabase, }, "merge_column": { "source_column_name": "CAS Number", @@ -49,7 +47,7 @@ ncbigene.parse_node_type( node_type="Gene", - source_filename="Homo_sapiens.gene_info", + source_filename="CUSTOM/output.tsv", fmt="tsv-pandas", parse_config={ "compound_fields": { @@ -65,6 +63,8 @@ "MIM": onto.xrefOMIM, "HGNC": onto.xrefHGNC, "Ensembl": onto.xrefEnsembl, + "chromosome": onto.chromosome, + "data_resource": onto.sourceDatabase, # TODO: Parse Feature_type and other columns }, }, @@ -74,7 +74,7 @@ hetionet.parse_node_type( node_type="DrugClass", - source_filename="hetionet-v1.0-nodes.tsv", + source_filename="hetionet-custom-nodes.tsv", #use customized hetionet fmt="tsv", parse_config={ "iri_column_name": "name", @@ -86,35 +86,17 @@ }, "data_property_map": { "id": onto.xrefNciThesaurus, - "name": onto.commonName + "name": onto.commonName, + "sourceDB": onto.sourceDatabase, } }, merge=False, skip=False ) -# hetionet.parse_node_type( -# node_type="ChemicalEffect", -# source_filename="hetionet-v1.0-nodes.tsv", -# fmt="tsv", -# parse_config={ -# "iri_column_name": "name", -# "headers": True, -# 
"filter_column": "kind", -# "filter_value": "Side Effect", -# "data_transforms": { -# "id": lambda x: x.split("::")[-1] -# }, -# "data_property_map": { -# "id": onto.xrefUmlsCUI, -# "name": onto.commonName -# } -# }, -# merge=False, -# skip=False -# ) + hetionet.parse_node_type( node_type="Symptom", - source_filename="hetionet-v1.0-nodes.tsv", + source_filename="hetionet-custom-nodes.tsv", #use customized hetionet fmt="tsv", parse_config={ "iri_column_name": "name", @@ -126,7 +108,8 @@ }, "data_property_map": { "id": onto.xrefMeSH, - "name": onto.commonName + "name": onto.commonName, + "sourceDB": onto.sourceDatabase, } }, merge=False, @@ -134,7 +117,7 @@ ) hetionet.parse_node_type( # ANATOMY RESOLUTION NEEDS TO BE REFINED! node_type="BodyPart", - source_filename="hetionet-v1.0-nodes.tsv", + source_filename="hetionet-custom-nodes.tsv", #use customized hetionet fmt="tsv", parse_config={ "iri_column_name": "name", @@ -146,7 +129,8 @@ }, "data_property_map": { "id": onto.xrefUberon, - "name": onto.commonName + "name": onto.commonName, + "sourceDB": onto.sourceDatabase, } }, merge=False, @@ -154,7 +138,7 @@ ) hetionet.parse_node_type( node_type="BiologicalProcess", - source_filename="hetionet-v1.0-nodes.tsv", + source_filename="hetionet-custom-nodes.tsv", #use customized hetionet fmt="tsv", parse_config={ "iri_column_name": "name", @@ -166,7 +150,8 @@ }, "data_property_map": { "id": onto.xrefGeneOntology, - "name": onto.commonName + "name": onto.commonName, + "sourceDB": onto.sourceDatabase, } }, merge=False, @@ -174,7 +159,7 @@ ) hetionet.parse_node_type( node_type="MolecularFunction", - source_filename="hetionet-v1.0-nodes.tsv", + source_filename="hetionet-custom-nodes.tsv", #use customized hetionet fmt="tsv", parse_config={ "iri_column_name": "name", @@ -186,7 +171,8 @@ }, "data_property_map": { "id": onto.xrefGeneOntology, - "name": onto.commonName + "name": onto.commonName, + "source": onto.sourceDatabase, } }, merge=False, @@ -194,7 +180,7 @@ ) 
hetionet.parse_node_type( node_type="CellularComponent", - source_filename="hetionet-v1.0-nodes.tsv", + source_filename="hetionet-custom-nodes.tsv", #use customized hetionet fmt="tsv", parse_config={ "iri_column_name": "name", @@ -206,13 +192,15 @@ }, "data_property_map": { "id": onto.xrefGeneOntology, - "name": onto.commonName + "name": onto.commonName, + "source": onto.sourceDatabase, } }, merge=False, skip=False ) +""" aopdb.parse_node_type( node_type="Drug", source_table="chemical_info", @@ -227,17 +215,26 @@ merge=True, skip=False ) +""" + aopdb.parse_node_type( node_type="Pathway", source_table="stressor_info", parse_config={ - "iri_column_name": "path_id", + "iri_column_name": "path_name", "data_property_map": { "path_id": onto.pathwayId, - "path_name": onto.commonName, + #"path_name": onto.commonName, + "path_name": onto.pathwayName, "ext_source": onto.sourceDatabase, }, - "custom_sql_query": "SELECT DISTINCT path_id, path_name, ext_source FROM aopdb.pathway_gene WHERE tax_id = 9606;" + "custom_sql_query": """SELECT path_name, GROUP_CONCAT(DISTINCT path_id) as path_id, CONCAT('AOPDB - ', GROUP_CONCAT(DISTINCT ext_source)) as ext_source + FROM( + SELECT DISTINCT path_id, TRIM(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(path_name, '', ''), '', ''), '', ''), '', ''), ' - Homo sapiens (human)', '')) as path_name, ext_source + FROM aopdb.pathway_gene + WHERE tax_id = 9606 + )data + GROUP BY path_name;""" #clean duplicated pathway }, merge=False, skip=False @@ -253,6 +250,7 @@ "data_property_map": { "diseaseId": onto.xrefUmlsCUI, "name": onto.commonName, + "data_source": onto.sourceDatabase, } }, merge=False, @@ -269,7 +267,8 @@ "filter_value": "DO", "merge_column": { "source_column_name": "diseaseId", - "data_property": onto.xrefUmlsCUI + "data_property": onto.xrefUmlsCUI, + "data_source": onto.sourceDatabase, }, "data_property_map": { "code": onto.xrefDiseaseOntology @@ -344,7 +343,7 @@ ), hetionet.parse_relationship_type( relationship_type=onto.chemicalBindsGene, 
- source_filename="hetionet-v1.0-edges.sif", + source_filename="hetionet-custom-edges.tsv", #use customized hetionet fmt="tsv", parse_config={ "subject_node_type": onto.Drug, @@ -388,7 +387,7 @@ ) hetionet.parse_relationship_type( relationship_type=onto.drugInClass, - source_filename="hetionet-v1.0-edges.sif", + source_filename="hetionet-custom-edges.tsv", #use customized hetionet fmt="tsv", parse_config={ "subject_node_type": onto.Drug, @@ -432,7 +431,7 @@ ) hetionet.parse_relationship_type( relationship_type=onto.symptomManifestationOfDisease, - source_filename="hetionet-v1.0-edges.sif", + source_filename="hetionet-custom-edges.tsv", #use customized hetionet fmt="tsv", parse_config={ "subject_node_type": onto.Symptom, @@ -498,7 +497,7 @@ ) hetionet.parse_relationship_type( relationship_type=onto.diseaseLocalizesToAnatomy, - source_filename="hetionet-v1.0-edges.sif", + source_filename="hetionet-custom-edges.tsv", #use customized hetionet fmt="tsv", parse_config={ "subject_node_type": onto.Disease, @@ -520,7 +519,7 @@ ) hetionet.parse_relationship_type( relationship_type=onto.diseaseAssociatesWithDisease, - source_filename="hetionet-v1.0-edges.sif", + source_filename="hetionet-custom-edges.tsv", #use customized hetionet fmt="tsv", parse_config={ "subject_node_type": onto.Disease, @@ -542,7 +541,7 @@ ) hetionet.parse_relationship_type( relationship_type=onto.geneParticipatesInBiologicalProcess, - source_filename="hetionet-v1.0-edges.sif", + source_filename="hetionet-custom-edges.tsv", #use customized hetionet fmt="tsv", parse_config={ "subject_node_type": onto.Gene, @@ -563,8 +562,8 @@ skip=False ) hetionet.parse_relationship_type( - relationship_type=onto.geneAssociatedWithCellularComponent, - source_filename="hetionet-v1.0-edges.sif", + relationship_type=onto.geneAssociatedWithCellularComponent, + source_filename="hetionet-custom-edges.tsv", #use customized hetionet fmt="tsv", parse_config={ "subject_node_type": onto.Gene, @@ -586,7 +585,7 @@ ) 
hetionet.parse_relationship_type( relationship_type=onto.geneHasMolecularFunction, - source_filename="hetionet-v1.0-edges.sif", + source_filename="hetionet-custom-edges.tsv", #use customized hetionet fmt="tsv", parse_config={ "subject_node_type": onto.Gene, @@ -615,9 +614,11 @@ "subject_column_name": "entrez", "subject_match_property": onto.xrefNcbiGene, "object_node_type": onto.Pathway, - "object_column_name": "path_id", - "object_match_property": onto.pathwayId, - "custom_sql_query": "SELECT * FROM aopdb.pathway_gene WHERE tax_id = 9606;", + "object_column_name": "path_name", + "object_match_property": onto.pathwayName, + "custom_sql_query": """SELECT DISTINCT entrez, path_id, TRIM(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(path_name, '', ''), '', ''), '', ''), '', ''), ' - Homo sapiens (human)', '')) as path_name + FROM aopdb.pathway_gene + WHERE tax_id = 9606;""", "source_table_type": "foreignKey", "source_table": "pathway_gene", }, @@ -626,7 +627,7 @@ ) hetionet.parse_relationship_type( relationship_type=onto.bodyPartOverexpressesGene, - source_filename="hetionet-v1.0-edges.sif", + source_filename="hetionet-v1.0-edges.sif", fmt="tsv", parse_config={ "subject_node_type": onto.BodyPart, @@ -648,7 +649,7 @@ ) hetionet.parse_relationship_type( relationship_type=onto.bodyPartUnderexpressesGene, - source_filename="hetionet-v1.0-edges.sif", + source_filename="hetionet-v1.0-edges.sif", fmt="tsv", parse_config={ "subject_node_type": onto.BodyPart, @@ -672,7 +673,88 @@ # POSSIBLE ISSUE: Normalize Drug > Chemical or vice versa? Gonna have to look for 'gaps' # in Neo4j database stemming from inconsistency in node type. 
+hetionet.parse_relationship_type( + relationship_type=onto.geneCovariesWithGene, + source_filename="hetionet-v1.0-edges.sif", + fmt="tsv", + parse_config={ + "subject_node_type": onto.Gene, + "subject_column_name": "source", + "subject_match_property": onto.xrefNcbiGene, + "object_node_type": onto.Gene, + "object_column_name": "target", + "object_match_property": onto.xrefNcbiGene, + "filter_column": "metaedge", + "filter_value": "GcG", + "headers": True, + "data_transforms": { + "source": lambda x: int(x.split("::")[-1]), + "target": lambda x: int(x.split("::")[-1]) # I foresee this causing problems in the future - should all IDs be cast to str? + }, + }, + merge=False, + skip=False +) + +hetionet.parse_relationship_type( + relationship_type=onto.geneRegulatesGene, + source_filename="hetionet-v1.0-edges.sif", + fmt="tsv", + parse_config={ + "subject_node_type": onto.Gene, + "subject_column_name": "source", + "subject_match_property": onto.xrefNcbiGene, + "object_node_type": onto.Gene, + "object_column_name": "target", + "object_match_property": onto.xrefNcbiGene, + "filter_column": "metaedge", + "filter_value": "Gr>G", + "headers": True, + "data_transforms": { + "source": lambda x: int(x.split("::")[-1]), + "target": lambda x: int(x.split("::")[-1]) # I foresee this causing problems in the future - should all IDs be cast to str? 
# Data properties exported for every node, in output-column order.
NODE_PROPERTIES = (
    'commonName', 'geneSymbol', 'pathwayId', 'pathwayName', 'sourceDatabase',
    'typeOfGene', 'chromosome', 'TF', 'xrefCasRN', 'xrefDiseaseOntology',
    'xrefDrugbank', 'xrefEnsembl', 'xrefGeneOntology', 'xrefHGNC', 'xrefMeSH',
    'xrefNcbiGene', 'xrefNciThesaurus', 'xrefOMIM', 'xrefUberon', 'xrefUmlsCUI',
)


def _last_value_as_str(value):
    """Collapse a list value to the string form of its last element.

    Non-list values are returned unchanged; an empty list, or an element whose
    string conversion raises ``ValueError``, becomes NaN.
    """
    if not isinstance(value, list):
        return value
    if not value:
        return np.nan
    try:
        return str(value[-1])
    except ValueError:
        return np.nan


#Load node and property
def extract_node_details(label, node):
    """Build one flat row describing *node*.

    Args:
        label: value stored under ``'_labels'`` (callers pass e.g. ``':Gene'``).
        node: object exposing ``name`` and one attribute per entry of
            ``NODE_PROPERTIES`` (callers pass ontology individuals).

    Returns:
        dict with ``'_id'`` (``node.name``), ``'_labels'`` and one key per
        property. A falsy property value (``None``, empty list, ``0``, ``''``)
        is stored as NaN; a non-empty list is reduced to ``str()`` of its last
        element; any other value is stored as is.
    """
    details = {
        '_id': _last_value_as_str(node.name),
        '_labels': _last_value_as_str(label),
    }
    for prop in NODE_PROPERTIES:
        value = getattr(node, prop)
        details[prop] = _last_value_as_str(value if value else np.nan)
    return details
+disease_details_df = pd.DataFrame(disease_details_list) + + +#DrugClass +drugclass_details_list = [] +for drugclass in onto.individuals(): + if onto.DrugClass in drugclass.is_a: + drugclass_details_list.append(extract_node_details(':DrugClass', drugclass)) +drugclass_details_df = pd.DataFrame(drugclass_details_list) + + +#CellularComponent +cellular_details_list = [] +for cellular in onto.individuals(): + if onto.CellularComponent in cellular.is_a: + cellular_details_list.append(extract_node_details(':CellularComponent', cellular)) +cellular_details_df = pd.DataFrame(cellular_details_list) + + +#MolecularFunction +molecular_details_list = [] +for molecular in onto.individuals(): + if onto.MolecularFunction in molecular.is_a: + molecular_details_list.append(extract_node_details(':MolecularFunction', molecular)) +molecular_details_df = pd.DataFrame(molecular_details_list) + + +#Pathway +pathway_details_list = [] +for pathway in onto.individuals(): + if onto.Pathway in pathway.is_a: + pathway_details_list.append(extract_node_details(':Pathway', pathway)) +pathway_details_df = pd.DataFrame(pathway_details_list) + + +#BiologicalProcess +biological_details_list = [] +for biological in onto.individuals(): + if onto.BiologicalProcess in biological.is_a: + biological_details_list.append(extract_node_details(':BiologicalProcess', biological)) +biological_details_df = pd.DataFrame(biological_details_list) + + +#Symptom +symptom_details_list = [] +for symptom in onto.individuals(): + if onto.Symptom in symptom.is_a: + symptom_details_list.append(extract_node_details(':Symptom', symptom)) +symptom_details_df = pd.DataFrame(symptom_details_list) + + +# TranscriptionFactor +transcription_details_list = [] +for transcriptionfactor in onto.individuals(): + if onto.TranscriptionFactor in transcriptionfactor.is_a: + transcription_details_list.append(extract_node_details(':TranscriptionFactor', transcriptionfactor)) +transcription_details_df = pd.DataFrame(transcription_details_list) 
# Merge all node frames into a single table (one row per node)
merged_node_df = pd.concat(
    [
        drug_details_df, gene_details_df, bodypart_details_df, disease_details_df,
        drugclass_details_df, cellular_details_df, molecular_details_df, pathway_details_df,
        biological_details_df, symptom_details_df, transcription_details_df,
    ],
    ignore_index=True,
)
merged_node_df.reset_index(drop=True, inplace=True)


# Load relationships

# Column layout shared by every relationship frame.  It is passed explicitly
# so that a node type with no edges still yields a frame with these columns;
# otherwise an empty gene frame would strip all columns from the raw-RDF gene
# edges when they are aligned to it below.
_REL_COLUMNS = ['_start', '_end', '_type']

# Edge dicts gathered by the extract_rel_details_from_* functions.  The list is
# rebound to a fresh one before each node type is processed.
relations = []


def _append_relations(node, property_names):
    """Append one edge dict to ``relations`` per target of each property.

    The edge ``_type`` is the upper-cased property name, e.g.
    ``drugInClass`` -> ``DRUGINCLASS``.
    """
    for property_name in property_names:
        for target in getattr(node, property_name):
            relations.append({
                '_start': node.name,
                '_end': target.name,
                '_type': property_name.upper()})


def extract_rel_details_from_drug(node):
    """Collect the Drug-originated edges of *node* into ``relations``."""
    _append_relations(node, (
        'chemicalBindsGene',
        'chemicalDecreasesExpression',
        'chemicalIncreasesExpression',
        'drugCausesEffect',
        'drugTreatsDisease',
        'drugInClass',
    ))


def extract_rel_details_from_gene(node):
    """Collect the Gene-originated edges of *node* into ``relations``."""
    _append_relations(node, (
        'geneAssociatedWithCellularComponent',
        'geneAssociatesWithDisease',
        'geneHasMolecularFunction',
        'geneParticipatesInBiologicalProcess',
    ))


def extract_rel_details_from_bodypart(node):
    """Collect the BodyPart-originated edges of *node* into ``relations``."""
    _append_relations(node, (
        'bodyPartOverexpressesGene',
        'bodyPartUnderexpressesGene',
    ))


def extract_rel_details_from_disease(node):
    """Collect the Disease-originated edges of *node* into ``relations``."""
    _append_relations(node, (
        'diseaseAssociatesWithDisease',
        'diseaseLocalizesToAnatomy',
    ))


def extract_rel_details_from_symptom(node):
    """Collect the Symptom-originated edges of *node* into ``relations``."""
    _append_relations(node, ('symptomManifestationOfDisease',))


def extract_rel_details_from_transcriptionfactor(node):
    """Collect the TranscriptionFactor-originated edges of *node* into ``relations``."""
    _append_relations(node, ('transcriptionFactorInteractsWithGene',))


def _relations_frame(owl_class, extractor):
    """Run *extractor* on every individual of *owl_class*; return the edge frame.

    ``relations`` is reset first, so after the call it holds exactly the edges
    of this node type.
    """
    global relations
    relations = []
    for individual in onto.individuals():
        if owl_class in individual.is_a:
            extractor(individual)
    return pd.DataFrame(relations, columns=_REL_COLUMNS)


# Drug
drug_rel = _relations_frame(onto.Drug, extract_rel_details_from_drug)

# Gene
gene_rel = _relations_frame(onto.Gene, extract_rel_details_from_gene)


# #### geneInteractsWithGene and the other gene edges below are read straight
# from the raw RDF triples (original note: "to avoid inverse property problem").
from rdflib import Graph, URIRef

g = Graph()

rdf_file = path
g.parse(rdf_file, format='xml')

pred_uri_1 = URIRef('http://jdr.bio/ontologies/alzkb.owl#geneCovariesWithGene')
pred_uri_2 = URIRef('http://jdr.bio/ontologies/alzkb.owl#geneInteractsWithGene')
pred_uri_3 = URIRef('http://jdr.bio/ontologies/alzkb.owl#geneRegulatesGene')
pred_uri_4 = URIRef('http://jdr.bio/ontologies/alzkb.owl#geneInPathway')

# Predicate URI -> exported edge type
_EDGE_TYPE_BY_PREDICATE = {
    pred_uri_1: 'GENECOVARIESWITHGENE',
    pred_uri_2: 'GENEINTERACTSWITHGENE',
    pred_uri_3: 'GENEREGULATESGENE',
    pred_uri_4: 'GENEINPATHWAY',
}


def extract_last_part(uri):
    """Return the part of *uri* after its last ``#``."""
    return uri.split('#')[-1]


triples = []
for subj, pred, obj in g:
    edge_type = _EDGE_TYPE_BY_PREDICATE.get(pred)
    if edge_type is not None:
        triples.append([extract_last_part(subj), edge_type, extract_last_part(obj)])

gene_rel2 = pd.DataFrame(triples, columns=['_start', '_type', '_end'])

# Merge gene rel and rel2 (reorder rel2 to the shared column layout first)
gene_rel2 = gene_rel2[_REL_COLUMNS]
gene_rel = pd.concat([gene_rel, gene_rel2], ignore_index=True)


# Body Part
bodypart_rel = _relations_frame(onto.BodyPart, extract_rel_details_from_bodypart)

# Disease
disease_rel = _relations_frame(onto.Disease, extract_rel_details_from_disease)

# Symptom
symptom_rel = _relations_frame(onto.Symptom, extract_rel_details_from_symptom)

# Transcription Factor
transcriptionfactor_rel = _relations_frame(
    onto.TranscriptionFactor, extract_rel_details_from_transcriptionfactor)


# Merge all relationship frames
merged_rel_df = pd.concat(
    [drug_rel, gene_rel, bodypart_rel, disease_rel, symptom_rel, transcriptionfactor_rel],
    ignore_index=True,
)
merged_rel_df.reset_index(drop=True, inplace=True)


# Stack node rows and relationship rows into one CSV
df_all = pd.concat([merged_node_df, merged_rel_df], axis=0, ignore_index=True)
df_all.to_csv('./data/alzkb_v2-populated.csv', index=False)


# ---- scripts/alzkb_parse_disgenet.py (patch hunk at line 12, state after the patch) ----
# The hunk follows the statement that loads disgenet_df from
# "./disease_mappings_to_attributes.tsv".
disgenet_do_df = pd.read_csv("./disease_mappings.tsv", sep="\t", header=0)

# case insensitive match
disgenet_ad_df = disgenet_df.loc[disgenet_df["name"].str.contains("Alzheimer", case=False), :]
cuis = list(disgenet_ad_df.diseaseId.unique())

# For adding disease ontology identifiers
disgenet_ad_do_df = disgenet_do_df.loc[disgenet_do_df.diseaseId.isin(cuis), :]

# clean data
# Creutzfeldt-Jakob disease (CJD) and Familial Alzheimer Disease (FAD) are
# different diseases but got merged into the same node in AlzKB because of the
# disease mappings in the DisGeNET file "UMLS CUI to several disease
# vocabularies", in which the DO of Creutzfeldt-Jakob disease is mapped to FAD.
# ---- scripts/alzkb_parse_disgenet.py (continued) ----
# Drop the rows where the FAD entry carries the Creutzfeldt-Jakob disease
# vocabulary name.
_is_cjd_mapped_to_fad = (
    (disgenet_ad_do_df['name'] == 'Familial Alzheimer Disease (FAD)')
    & (disgenet_ad_do_df['vocabularyName'] == 'Creutzfeldt-Jakob disease')
)
disgenet_ad_do_df = disgenet_ad_do_df[~_is_cjd_mapped_to_fad]

# Add the "data_source" column.  Both frames are boolean-mask selections of
# other frames, so the column is added with assign() (which returns a new
# frame) rather than by item assignment, avoiding SettingWithCopyWarning.
disgenet_ad_do_df = disgenet_ad_do_df.assign(data_source='DisGeNET')
disgenet_ad_df = disgenet_ad_df.assign(data_source='DisGeNET')

# if we don't have the CUSTOM subdirectory, create it
Path("CUSTOM").mkdir(exist_ok=True)


# ---- scripts/alzkb_parse_dorothea.py (new file) ----
import pandas as pd
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri

# dorothea
# Run the R script (created and saved in RStudio) inside the embedded R session.
r = robjects.r
r['source']('./dorothea.R')

# Fetch the object the R script stored under the name "net".
# list(robjects.globalenv.keys())
net_r = robjects.globalenv['net']

# R data frame -> pandas DataFrame
with (robjects.default_converter + pandas2ri.converter).context():
    dorothea = robjects.conversion.get_conversion().rpy2py(net_r)
# dorothea['source'].nunique()  # 643 TFs


# trrust
trrust_rawdata = pd.read_csv(
    './trrust_rawdata.human.tsv',
    sep='\t',
    header=None,
    names=["TF", "Gene", "Interaction", "PMID"],
)
# trrust_rawdata['TF'].nunique()  # 795 TFs, matches https://www.grnpedia.org/trrust/downloadnetwork.php


# combine: keep only the TF-gene pairs present in both resources
df_comb = trrust_rawdata.merge(
    dorothea,
    left_on=["TF", "Gene"],
    right_on=["source", "target"],
    how='inner',
)
df_comb['sourceDB'] = 'DoRothEA & TRRUST'
df_comb.to_csv('./tf.tsv', sep="\t", header=True, index=False)


# ---- scripts/alzkb_parse_drugbank.py (new file) ----
import pandas as pd
from pathlib import Path

df = pd.read_csv('./drug_links.csv')
print(df.shape)

# add "data_resource" column
+df['data_resource'] ='DrugBank' + +# if we don't have the CUSTOM subdirectory, create it +Path("CUSTOM").mkdir(exist_ok=True) + +df.to_csv("./CUSTOM/drug_links.tsv", sep="\t", header=True, index=False) +print(df.shape) \ No newline at end of file diff --git a/scripts/alzkb_parse_ncbigene.py b/scripts/alzkb_parse_ncbigene.py index d502efe..4ef351f 100644 --- a/scripts/alzkb_parse_ncbigene.py +++ b/scripts/alzkb_parse_ncbigene.py @@ -29,13 +29,15 @@ def filterLargeTextFile(source, destination, delimiter, keep_index): #load body for line in r: - if line is not None: + #if line is not None: + if line.startswith('9606'): #filter to Homo sapiens (human) w.write(keepDesiredColums(line, keep_index, delimiter) + '\n') r.close(), w.close() def fileIndexFinder(source, destination, keep_set, compare_column_index, separator): + count_rows =0 with open(source, "r") as r, open(destination, "w") as w: - w.write('Ensembl' + separator + r.readline()) + w.write('data_resource' + separator + 'Ensembl' + separator + r.readline()) for line in r: columns = line.split(separator) @@ -46,23 +48,26 @@ def fileIndexFinder(source, destination, keep_set, compare_column_index, separat if len(parsed_column_split) > 2: parsed_column = parsed_column_split[2].replace('Ensembl:', '') - if parsed_column in keep_set: - w.write(parsed_column + separator + line) + #if parsed_column in keep_set: # keep all instead of filtering to brain + w.write('NCBI Gene' + separator + parsed_column + separator + line) + count_rows +=1 + + print(count_rows) r.close() -brain_file='./Homo_sapiens_expr_advanced_development.tsv' #https://bgee.org/?page=download&action=expr_calls#id1 +brain_file='./Homo_sapiens_expr_advanced.tsv' #https://bgee.org/?page=download&action=expr_calls#id1 Homo_sapiens_expr_advanced_development gene_file='../Homo_sapiens.gene_info' #https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz gene_dest_file='./Homo_sapiens.gene_info_filtered' final_out='./output.tsv' 
delimiter = '\t' -keep_index = [1,2,4,5,6,8,9] +keep_index = [1,2,4,5,6,8,9,11] compare_index = 0 processLargeTextFile(brain_file, compare_index, delimiter) +print(len(my_set)) filterLargeTextFile(gene_file, gene_dest_file, delimiter, keep_index) - -fileIndexFinder(gene_dest_file, final_out, my_set, 3, delimiter) \ No newline at end of file +fileIndexFinder(gene_dest_file, final_out, my_set, 3, delimiter) \ No newline at end of file diff --git a/scripts/dorothea.R b/scripts/dorothea.R new file mode 100644 index 0000000..6ce3e0c --- /dev/null +++ b/scripts/dorothea.R @@ -0,0 +1,6 @@ +library(dorothea) +library(decoupleR) +library(ggplot2) +library(dplyr) + +net <- decoupleR::get_dorothea(levels = c('A', 'B', 'C', 'D')) \ No newline at end of file