From 80b09ec1d6611d4ce307387a5e974c6522da3f83 Mon Sep 17 00:00:00 2001 From: Haibao Tang Date: Sun, 5 May 2024 07:17:24 -0700 Subject: [PATCH] Fix FTP calls and remove loading_bar (#294) * partially remove loading_bar (unused) * remove loading_bar * Replace FTP call with ftpretty * Fix loading_bar in tests --- goatools/anno/dnld_ebi_goa.py | 62 +- goatools/associations.py | 147 +- goatools/base.py | 70 +- goatools/cli/compare_gos.py | 252 +-- goatools/cli/wr_sections.py | 74 +- goatools/godag/prttime.py | 17 +- goatools/grouper/grprdflts.py | 22 +- goatools/test_data/nature3102_goea.py | 14 +- setup.cfg | 11 +- tests/godagtimed_old.py | 9 +- tests/i148_semsim_lin.py | 60 +- tests/test_altid_godag.py | 7 +- tests/test_anno_rd_gene2go.py | 84 +- tests/test_annotations_gaf.py | 41 +- tests/test_assc_stats.py | 37 +- tests/test_asscs_ns.py | 59 +- tests/test_cli_write_hierarchy.py | 42 +- tests/test_cmds_find_enrichment_md.py | 66 +- tests/test_dcnt_r01.py | 78 +- tests/test_dnlds.py | 43 +- tests/test_find_enrichment_overlap.py | 27 +- tests/test_find_enrichment_run.py | 18 +- tests/test_genes_cell_cycle.py | 47 +- tests/test_get_godag.py | 3 +- tests/test_go_print.py | 9 +- tests/test_goea_errors.py | 25 +- tests/test_goea_local.py | 61 +- tests/test_goea_quiet.py | 41 +- tests/test_goea_rpt_bonferroni.py | 96 +- tests/test_gosearch_emptydict.py | 13 +- tests/test_gosubdag_relationships.py | 35 +- tests/test_gosubdag_relationships_i126.py | 165 +- tests/test_grpr_get_sections_2d.py | 42 +- tests/test_grprobj.py | 308 +++- tests/test_i122_goea.py | 68 +- tests/test_i96_goea_ncbi.py | 1366 ++++++++++++++++- tests/test_ncbi_entrez_annotations.py | 83 +- tests/test_plot_relationship_part_of.py | 60 +- .../test_propagate_counts_w_relationships.py | 56 +- tests/test_pvalcalc.py | 46 +- tests/test_read_gaf_allow_nd.py | 19 +- tests/test_relationships_usr.py | 153 +- tests/test_rpt_gene2go_evidencecodes.py | 25 +- tests/test_semantic_similarity.py | 74 +- tests/test_semantic_similarity_best4lex.py | 39 +- tests/test_sorter.py | 200 +-- tests/test_sorter_desc2nts.py | 154 +- tests/test_sorter_sections.py | 36 +- tests/test_tcntobj_relationships.py | 36 +- tests/test_termcounts_asscs.py | 63 +- tests/test_typedefs.py | 6 +- tests/test_wr_sections_txt.py | 57 +- tests/test_write_hier_ns.py | 127 +- tests/test_write_summary_cnts.py | 15 +- tests/utils.py | 29 +- 55 files changed, 3459 insertions(+), 1338 deletions(-) diff --git a/goatools/anno/dnld_ebi_goa.py b/goatools/anno/dnld_ebi_goa.py index 30243375..5f9e8eaa 100644 --- a/goatools/anno/dnld_ebi_goa.py +++ b/goatools/anno/dnld_ebi_goa.py @@ -1,61 +1,67 @@ """Download GOA files from the Gene Ontology Annotation (GOA) resource http://www.ebi.ac.uk/GOA.""" -__copyright__ = "Copyright (C) 2016-present, DV Klopfenstein, H Tang. All rights reserved." +__copyright__ = ( + "Copyright (C) 2016-present, DV Klopfenstein, H Tang. All rights reserved." +) __author__ = "DV Klopfenstein" import os import sys -from goatools.base import dnld_file + +from ..base import dnld_file + class DnldGoa: """Download files from the Gene Ontology Annotation (GOA) resource http://www.ebi.ac.uk/GOA.""" # European Bioinformatics Institute (EMBL-EBI) ftp site - ftp_pub = 'ftp://ftp.ebi.ac.uk/pub/' + ftp_pub = "ftp://ftp.ebi.ac.uk/pub/" # Species available from ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/ # Example: https://ftp.ebi.ac.uk/pub/databases/GO/goa/CHICKEN/ species = [ - 'arabidopsis', - 'chicken', - 'cow', - 'dicty', - 'dog', - 'fly', - 'human', - 'mouse', + "arabidopsis", + "chicken", + "cow", + "dicty", + "dog", + "fly", + "human", + "mouse", #'pdb', - 'pig', - 'rat', - 'uniprot', - 'worm', - 'yeast', - 'zebrafish', + "pig", + "rat", + "uniprot", + "worm", + "yeast", + "zebrafish", ] - species_items = ['complex', 'isoform', 'rna'] - exts = ['gaf', 'gpa', 'gpi'] + species_items = ["complex", "isoform", "rna"] + exts = ["gaf", "gpa", "gpi"] def __init__(self): - self.ftp_src_goa = os.path.join(self.ftp_pub, 'databases/GO/goa/') + self.ftp_src_goa = os.path.join(self.ftp_pub, "databases/GO/goa/") - def dnld_goa(self, species, ext='gaf', item=None, fileout=None): + def dnld_goa(self, species, ext="gaf", item=None, fileout=None): """Download GOA source file name on EMBL-EBI ftp server.""" basename = self.get_basename(species, ext, item) - src = os.path.join(self.ftp_src_goa, species.upper(), "{F}.gz".format(F=basename)) + src = os.path.join( + self.ftp_src_goa, species.upper(), "{F}.gz".format(F=basename) + ) dst = os.path.join(os.getcwd(), basename) if fileout is None else fileout - dnld_file(src, dst, prt=sys.stdout, loading_bar=None) + dnld_file(src, dst, prt=sys.stdout) return dst - def get_basename(self, species, ext='gaf', item=None): + def get_basename(self, species, ext="gaf", item=None): """Get GOA basename for a specific species. Ex: goa_human.gaf""" assert ext in self.exts, " ".join(self.exts) - if species == 'uniprot': - species = 'uniprot_all' if item != 'gcrp' else 'uniprot_gcrp' + if species == "uniprot": + species = "uniprot_all" if item != "gcrp" else "uniprot_gcrp" if item is None: - return 'goa_{SPECIES}.{EXT}'.format(SPECIES=species, EXT=ext) + return "goa_{SPECIES}.{EXT}".format(SPECIES=species, EXT=ext) assert item in self.species_items - return 'goa_{SPECIES}_{ITEM}.{EXT}'.format(SPECIES=species, ITEM=item, EXT=ext) + return "goa_{SPECIES}_{ITEM}.{EXT}".format(SPECIES=species, ITEM=item, EXT=ext) # Copyright (C) 2016-present, DV Klopfenstein, H Tang. All rights reserved." diff --git a/goatools/associations.py b/goatools/associations.py index fcf1a3db..795b30a8 100755 --- a/goatools/associations.py +++ b/goatools/associations.py @@ -5,21 +5,23 @@ __copyright__ = "Copyright (C) 2010-present, H Tang et al. All rights reserved." __author__ = "various" -from collections import defaultdict +import gzip import os import sys -from goatools.base import dnld_file -from goatools.base import ftp_get -from goatools.anno.factory import get_objanno -from goatools.anno.factory import get_anno_desc -from goatools.anno.factory import get_objanno_g_kws -from goatools.semantic import TermCounts -from goatools.anno.gaf_reader import GafReader -from goatools.anno.genetogo_reader import Gene2GoReader -from goatools.anno.opts import AnnoOptions -from goatools.utils import get_b2aset as utils_get_b2aset - -def dnld_assc(assc_name, go2obj=None, namespace='BP', prt=sys.stdout): + +from collections import defaultdict + +from .anno.factory import get_anno_desc, get_objanno, get_objanno_g_kws +from .anno.gaf_reader import GafReader +from .anno.genetogo_reader import Gene2GoReader +from .anno.opts import AnnoOptions +from .semantic import TermCounts +from .utils import get_b2aset as utils_get_b2aset + +from .base import dnld_file, ftp_get + + +def dnld_assc(assc_name, go2obj=None, namespace="BP", prt=sys.stdout): """Download association from http://geneontology.org/gene-associations.""" # Example assc_name: "tair.gaf" # Download the Association @@ -39,113 +41,144 @@ def dnld_assc(assc_name, go2obj=None, namespace='BP', prt=sys.stdout): assc[gene] = goids_cur.intersection(goids_dag) return assc + def dnld_annotation(assc_file, prt=sys.stdout): """Download gaf, gpad, or gpi from http://current.geneontology.org/annotations/""" if not os.path.isfile(assc_file): - # assc_http = "http://geneontology.org/gene-associations/" assc_http = "http://current.geneontology.org/annotations/" _, assc_base = os.path.split(assc_file) src = os.path.join(assc_http, "{ASSC}.gz".format(ASSC=assc_base)) - dnld_file(src, assc_file, prt, loading_bar=None) + dnld_file(src, assc_file, prt) -def read_associations(assoc_fn, anno_type='id2gos', namespace='BP', **kws): + +def read_associations(assoc_fn, anno_type="id2gos", namespace="BP", **kws): """Return associatinos in id2gos format""" # kws get_objanno: taxids hdr_only prt allow_missing_symbol obj = get_objanno(assoc_fn, anno_type, **kws) # kws get_id2gos: ev_include ev_exclude keep_ND keep_NOT b_geneid2gos go2geneids return obj.get_id2gos(namespace, **kws) -def get_assoc_ncbi_taxids(taxids, force_dnld=False, loading_bar=True, **kws): + +def get_assoc_ncbi_taxids(taxids, force_dnld=False, **kws): """Download NCBI's gene2go. Return annotations for user-specified taxid(s).""" - print('DEPRECATED read_ncbi_gene2go: USE Gene2GoReader FROM goatools.anno.genetogo_reader') + print( + "DEPRECATED read_ncbi_gene2go: USE Gene2GoReader FROM goatools.anno.genetogo_reader" + ) # pylint: disable=protected-access frm = sys._getframe().f_back.f_code - print('DEPRECATED read_ncbi_gene2go CALLED FROM: {PY} BY {FNC}'.format( - PY=frm.co_filename, FNC=frm.co_name)) - fin = kws['gene2go'] if 'gene2go' in kws else os.path.join(os.getcwd(), "gene2go") - dnld_ncbi_gene_file(fin, force_dnld, loading_bar=loading_bar) + print( + "DEPRECATED read_ncbi_gene2go CALLED FROM: {PY} BY {FNC}".format( + PY=frm.co_filename, FNC=frm.co_name + ) + ) + fin = kws["gene2go"] if "gene2go" in kws else os.path.join(os.getcwd(), "gene2go") + dnld_ncbi_gene_file(fin, force_dnld) return read_ncbi_gene2go(fin, taxids, **kws) + # pylint: disable=unused-argument -def dnld_ncbi_gene_file(fin, force_dnld=False, log=sys.stdout, loading_bar=True): +def dnld_ncbi_gene_file(fin, force_dnld=False, log=sys.stdout): """Download a file from NCBI Gene's ftp server.""" if not os.path.exists(fin) or force_dnld: - import gzip fin_dir, fin_base = os.path.split(fin) fin_gz = "{F}.gz".format(F=fin_base) fin_gz = os.path.join(fin_dir, fin_gz) if os.path.exists(fin_gz): os.remove(fin_gz) fin_ftp = "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/{F}.gz".format(F=fin_base) - ## if log is not None: - ## log.write(" DOWNLOADING GZIP: {GZ}\n".format(GZ=fin_ftp)) - ## if loading_bar: - ## loading_bar = wget.bar_adaptive - ## wget.download(fin_ftp, bar=loading_bar) - ## rsp = wget(fin_ftp) ftp_get(fin_ftp, fin_gz) - with gzip.open(fin_gz, 'rb') as zstrm: + with gzip.open(fin_gz, "rb") as zstrm: if log is not None: log.write("\n READ GZIP: {F}\n".format(F=fin_gz)) - with open(fin, 'wb') as ostrm: + with open(fin, "wb") as ostrm: ostrm.write(zstrm.read()) if log is not None: log.write(" WROTE UNZIPPED: {F}\n".format(F=fin)) + def dnld_annofile(fin_anno, anno_type): """Download annotation file, if needed""" if os.path.exists(fin_anno): return anno_type = get_anno_desc(fin_anno, anno_type) - if anno_type == 'gene2go': + if anno_type == "gene2go": dnld_ncbi_gene_file(fin_anno) - if anno_type in {'gaf', 'gpad'}: + if anno_type in {"gaf", "gpad"}: dnld_annotation(fin_anno) -def read_ncbi_gene2go(fin_gene2go, taxids=None, namespace='BP', **kws): + +def read_ncbi_gene2go(fin_gene2go, taxids=None, namespace="BP", **kws): """Read NCBI's gene2go. Return gene2go data for user-specified taxids.""" - print('DEPRECATED read_ncbi_gene2go: USE Gene2GoReader FROM goatools.anno.genetogo_reader') + print( + "DEPRECATED read_ncbi_gene2go: USE Gene2GoReader FROM goatools.anno.genetogo_reader" + ) # pylint: disable=protected-access frm = sys._getframe().f_back.f_code - print('DEPRECATED read_ncbi_gene2go CALLED FROM: {PY} BY {FNC}'.format( - PY=frm.co_filename, FNC=frm.co_name)) + print( + "DEPRECATED read_ncbi_gene2go CALLED FROM: {PY} BY {FNC}".format( + PY=frm.co_filename, FNC=frm.co_name + ) + ) obj = Gene2GoReader(fin_gene2go, taxids=taxids) # By default, return id2gos. User can cause go2geneids to be returned by: # >>> read_ncbi_gene2go(..., go2geneids=True - if 'taxid2asscs' not in kws: + if "taxid2asscs" not in kws: if len(obj.taxid2asscs) == 1: taxid = next(iter(obj.taxid2asscs)) - kws_ncbi = {k:v for k, v in kws.items() if k in AnnoOptions.keys_exp} - kws_ncbi['taxid'] = taxid + kws_ncbi = {k: v for k, v in kws.items() if k in AnnoOptions.keys_exp} + kws_ncbi["taxid"] = taxid return obj.get_id2gos(namespace, **kws_ncbi) # Optional detailed associations split by taxid and having both ID2GOs & GO2IDs # e.g., taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)) t2asscs_ret = obj.get_taxid2asscs(taxids, **kws) - t2asscs_usr = kws.get('taxid2asscs', defaultdict(lambda: defaultdict(lambda: defaultdict(set)))) - if 'taxid2asscs' in kws: + t2asscs_usr = kws.get( + "taxid2asscs", defaultdict(lambda: defaultdict(lambda: defaultdict(set))) + ) + if "taxid2asscs" in kws: obj.fill_taxid2asscs(t2asscs_usr, t2asscs_ret) return obj.get_id2gos_all(t2asscs_ret) + def get_gaf_hdr(fin_gaf): """Read Gene Association File (GAF). Return GAF version and data info.""" return GafReader(fin_gaf, hdr_only=True).hdr + # pylint: disable=line-too-long -def read_gaf(fin_gaf, prt=sys.stdout, hdr_only=False, namespace='BP', allow_missing_symbol=False, **kws): +def read_gaf( + fin_gaf, + prt=sys.stdout, + hdr_only=False, + namespace="BP", + allow_missing_symbol=False, + **kws +): """Read Gene Association File (GAF). Return data.""" return GafReader( - fin_gaf, hdr_only=hdr_only, prt=prt, allow_missing_symbol=allow_missing_symbol, godag=kws.get('godag')).get_id2gos( - namespace, **kws) + fin_gaf, + hdr_only=hdr_only, + prt=prt, + allow_missing_symbol=allow_missing_symbol, + godag=kws.get("godag"), + ).get_id2gos(namespace, **kws) + def get_b2aset(a2bset): """Given gene2gos, return go2genes. Given go2genes, return gene2gos.""" - print('DEPRECATED get_b2aset MOVED: USE get_b2aset IN goatools.utils') + print("DEPRECATED get_b2aset MOVED: USE get_b2aset IN goatools.utils") # pylint: disable=protected-access frm = sys._getframe().f_back.f_code - print('DEPRECATED get_b2aset CALLED FROM: {PY} BY {FNC}'.format(PY=frm.co_filename, FNC=frm.co_name)) + print( + "DEPRECATED get_b2aset CALLED FROM: {PY} BY {FNC}".format( + PY=frm.co_filename, FNC=frm.co_name + ) + ) return utils_get_b2aset(a2bset) -def get_assc_pruned(assc_geneid2gos, min_genecnt=None, max_genecnt=None, prt=sys.stdout): + +def get_assc_pruned( + assc_geneid2gos, min_genecnt=None, max_genecnt=None, prt=sys.stdout +): """Remove GO IDs associated with large numbers of genes. Used in stochastic simulations.""" # DEFN WAS: get_assc_pruned(assc_geneid2gos, max_genecnt=None, prt=sys.stdout): # ADDED min_genecnt argument and functionality @@ -156,22 +189,27 @@ def get_assc_pruned(assc_geneid2gos, min_genecnt=None, max_genecnt=None, prt=sys go2genes_prun = {} for goid, genes in go2genes_orig.items(): num_genes = len(genes) - if (min_genecnt is None or num_genes >= min_genecnt) and \ - (max_genecnt is None or num_genes <= max_genecnt): + if (min_genecnt is None or num_genes >= min_genecnt) and ( + max_genecnt is None or num_genes <= max_genecnt + ): go2genes_prun[goid] = genes num_was = len(go2genes_orig) num_now = len(go2genes_prun) gos_rm = set(go2genes_orig.keys()).difference(set(go2genes_prun.keys())) - assert num_was-num_now == len(gos_rm) + assert num_was - num_now == len(gos_rm) if prt is not None: if min_genecnt is None: min_genecnt = 1 if max_genecnt is None: max_genecnt = "Max" - prt.write("{N:4} GO IDs pruned. Kept {NOW} GOs assc w/({m} to {M} genes)\n".format( - m=min_genecnt, M=max_genecnt, N=num_was-num_now, NOW=num_now)) + prt.write( + "{N:4} GO IDs pruned. Kept {NOW} GOs assc w/({m} to {M} genes)\n".format( + m=min_genecnt, M=max_genecnt, N=num_was - num_now, NOW=num_now + ) + ) return utils_get_b2aset(go2genes_prun), gos_rm + def read_annotations(**kws): """Read annotations from either a GAF file or NCBI's gene2go file.""" # Read and save annotation lines @@ -179,6 +217,7 @@ def read_annotations(**kws): # Return associations return objanno.get_id2gos(**kws) if objanno is not None else {} + def get_tcntobj(go2obj, **kws): """Return a TermCounts object if the user provides an annotation file, otherwise None.""" # kws: gpad gaf gene2go id2gos diff --git a/goatools/base.py b/goatools/base.py index b7749054..8cfa3b7c 100644 --- a/goatools/base.py +++ b/goatools/base.py @@ -10,25 +10,27 @@ import traceback import zlib -from ftplib import FTP from os.path import isfile from subprocess import PIPE, Popen from urllib.request import urlopen import requests +from ftpretty import ftpretty from rich.logging import RichHandler def get_logger(name: str): - """Return a logger with a default ColoredFormatter.""" - logger = logging.getLogger(name) - if logger.hasHandlers(): - logger.handlers.clear() - logger.addHandler(RichHandler()) - logger.propagate = False - logger.setLevel(logging.INFO) - return logger + """ + Return a logger with a default ColoredFormatter. + """ + log = logging.getLogger(name) + if log.hasHandlers(): + log.handlers.clear() + log.addHandler(RichHandler()) + log.propagate = False + log.setLevel(logging.INFO) + return log logger = get_logger("goatools") @@ -70,7 +72,6 @@ def nopen(f, mode="r"): stderr=sys.stderr if mode == "r" else PIPE, shell=True, bufsize=-1, # use system default for buffering - preexec_fn=prefunc, close_fds=False, executable=os.environ.get("SHELL"), ) @@ -79,8 +80,6 @@ def nopen(f, mode="r"): if mode != "r": p.stderr = io.TextIOWrapper(p.stderr) - if mode and mode[0] == "r": - return process_iter(p, f[1:]) return p if f.startswith(("http://", "https://", "ftp://")): @@ -96,7 +95,11 @@ def nopen(f, mode="r"): fh = bz2.BZ2File(f, mode) return io.TextIOWrapper(fh) - return {"r": sys.stdin, "w": sys.stdout}[mode[0]] if f == "-" else open(f, mode) + return ( + {"r": sys.stdin, "w": sys.stdout}[mode[0]] + if f == "-" + else open(f, mode, encoding="utf-8") + ) def ungzipper(fh, blocksize=16384): @@ -116,27 +119,27 @@ def ungzipper(fh, blocksize=16384): data[0] = save + data[0] -def download_go_basic_obo(obo="go-basic.obo", prt=sys.stdout, loading_bar=True): +def download_go_basic_obo(obo="go-basic.obo", prt=sys.stdout): """Download Ontologies, if necessary.""" if not isfile(obo): http = "http://purl.obolibrary.org/obo/go" if "slim" in obo: http = "http://www.geneontology.org/ontology/subsets" obo_remote = f"{http}/{op.basename(obo)}" - dnld_file(obo_remote, obo, prt, loading_bar) + dnld_file(obo_remote, obo, prt) else: if prt: prt.write(" EXISTS: {FILE}\n".format(FILE=obo)) return obo -def download_ncbi_associations(gene2go="gene2go", prt=sys.stdout, loading_bar=True): +def download_ncbi_associations(gene2go="gene2go", prt=sys.stdout): """Download associations from NCBI, if necessary""" # Download: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz gzip_file = "{GENE2GO}.gz".format(GENE2GO=gene2go) if not isfile(gene2go): file_remote = f"ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/{op.basename(gzip_file)}" - dnld_file(file_remote, gene2go, prt, loading_bar) + dnld_file(file_remote, gene2go, prt) else: if prt is not None: prt.write(" EXISTS: {FILE}\n".format(FILE=gene2go)) @@ -151,22 +154,20 @@ def gunzip(gzip_file, file_gunzip=None): return file_gunzip -def get_godag( - fin_obo="go-basic.obo", prt=sys.stdout, loading_bar=True, optional_attrs=None -): +def get_godag(fin_obo="go-basic.obo", prt=sys.stdout, optional_attrs=None): """Return GODag object. Initialize, if necessary.""" from .obo_parser import GODag - download_go_basic_obo(fin_obo, prt, loading_bar) + download_go_basic_obo(fin_obo, prt) return GODag(fin_obo, optional_attrs, load_obsolete=False, prt=prt) -def dnld_gaf(species_txt, prt=sys.stdout, loading_bar=True): +def dnld_gaf(species_txt, prt=sys.stdout): """Download GAF file if necessary.""" - return dnld_gafs([species_txt], prt, loading_bar)[0] + return dnld_gafs([species_txt], prt)[0] -def dnld_gafs(species_list, prt=sys.stdout, loading_bar=True): +def dnld_gafs(species_list, prt=sys.stdout): """Download GAF files if necessary.""" # Example GAF files in http://current.geneontology.org/annotations/: # http://current.geneontology.org/annotations/mgi.gaf.gz @@ -180,7 +181,7 @@ def dnld_gafs(species_list, prt=sys.stdout, loading_bar=True): gaf_base = "{ABC}.gaf".format(ABC=species_txt) # goa_human.gaf gaf_cwd = os.path.join(cwd, gaf_base) # {CWD}/goa_human.gaf remove_filename = "{HTTP}/{GAF}.gz".format(HTTP=http, GAF=gaf_base) - dnld_file(remove_filename, gaf_cwd, prt, loading_bar) + dnld_file(remove_filename, gaf_cwd, prt) fin_gafs.append(gaf_cwd) return fin_gafs @@ -188,7 +189,7 @@ def dnld_gafs(species_list, prt=sys.stdout, loading_bar=True): def http_get(url, fout=None): """Download a file from http. Save it in a file named by fout""" print("requests.get({URL}, stream=True)".format(URL=url)) - rsp = requests.get(url, stream=True) + rsp = requests.get(url, stream=True, timeout=10) if rsp.status_code == 200 and fout is not None: with open(fout, "wb") as prt: for chunk in rsp: # .iter_content(chunk_size=128): @@ -200,8 +201,10 @@ def http_get(url, fout=None): return rsp -def ftp_get(fin_src, fout): - """Download a file from an ftp server""" +def ftp_get(fin_src: str, fout: str): + """ + Download a file from an ftp server, e.g., ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz + """ assert fin_src[:6] == "ftp://", fin_src dir_full, fin_ftp = os.path.split(fin_src[6:]) pt0 = dir_full.find("/") @@ -213,15 +216,11 @@ def ftp_get(fin_src, fout): HOST=ftphost, DIR=chg_dir, SRC=fin_ftp, DST=fout ) ) - ftp = FTP(ftphost) # connect to host, default port ftp.ncbi.nlm.nih.gov - ftp.login() # user anonymous, passwd anonymous@ - ftp.cwd(chg_dir) # change into "debian" directory gene/DATA - cmd = "RETR {F}".format(F=fin_ftp) # gene2go.gz - ftp.retrbinary(cmd, open(fout, "wb").write) # /usr/home/gene2go.gz - ftp.quit() + ftp = ftpretty(ftphost, "anonymous", "anonymous@") + ftp.get(chg_dir + "/" + fin_ftp, fout) -def dnld_file(src_ftp, dst_file, prt=sys.stdout, loading_bar=True): +def dnld_file(src_ftp, dst_file, prt=sys.stdout): """Download specified file if necessary.""" if isfile(dst_file): return @@ -231,7 +230,6 @@ def dnld_file(src_ftp, dst_file, prt=sys.stdout, loading_bar=True): cmd_msg = "get({SRC} out={DST})\n".format(SRC=src_ftp, DST=dst_gz) try: print("$ get {SRC}".format(SRC=src_ftp)) - #### wget.download(src_ftp, out=dst_gz, bar=loading_bar) if src_ftp[:4] == "http": http_get(src_ftp, dst_gz) else: diff --git a/goatools/cli/compare_gos.py b/goatools/cli/compare_gos.py index c8df612c..46b86dc1 100644 --- a/goatools/cli/compare_gos.py +++ b/goatools/cli/compare_gos.py @@ -24,81 +24,109 @@ from __future__ import print_function -__copyright__ = "Copyright (C) 2016-present, DV Klopfenstein, H Tang. All rights reserved." +__copyright__ = ( + "Copyright (C) 2016-present, DV Klopfenstein, H Tang. All rights reserved." +) __author__ = "DV Klopfenstein" import os import sys -from collections import namedtuple -# from collections import OrderedDict -from goatools.base import get_godag -from goatools.associations import get_tcntobj -from goatools.godag.relationship_str import RelationshipStr +from collections import namedtuple -from goatools.cli.docopt_parse import DocOptParse -from goatools.cli.gos_get import GetGOs -from goatools.cli.grouped import Grouped +from ..associations import get_tcntobj +from ..base import get_godag, logger +from ..godag.relationship_str import RelationshipStr +from ..gosubdag.gosubdag import GoSubDag +from ..gosubdag.rpt.wr_xlsx import GoDepth1LettersWr +from ..grouper.sorter import Sorter +from ..grouper.wrxlsx import WrXlsxSortedGos -from goatools.gosubdag.gosubdag import GoSubDag -from goatools.gosubdag.rpt.wr_xlsx import GoDepth1LettersWr -from goatools.grouper.sorter import Sorter -from goatools.grouper.wrxlsx import WrXlsxSortedGos +from .docopt_parse import DocOptParse +from .gos_get import GetGOs +from .grouped import Grouped # pylint: disable=too-few-public-methods class CompareGOsCli: """Class for command-line interface for creating GO term diagrams""" - kws_dict = set(['GO_FILE', - 'sections', 'S', - 'obo', 'slims', - 'ofile', 'xlsx', - 'gaf', 'gene2go', 'taxid', - ]) - kws_set = set(['verbose']) + kws_dict = set( + [ + "GO_FILE", + "sections", + "S", + "obo", + "slims", + "ofile", + "xlsx", + "gaf", + "gene2go", + "taxid", + ] + ) + kws_set = set(["verbose"]) # Print fields to exclude, unless verbose is used - excl_flds = {'level', 'reldepth', 'alt', 'D1', 'childcnt', - 'format_txt', 'num_usrgos', 'is_hdrgo', 'is_usrgo', 'hdr_idx', 'hdr1usr01', - 'REL', 'REL_short', 'rel', 'id'} + excl_flds = { + "level", + "reldepth", + "alt", + "D1", + "childcnt", + "format_txt", + "num_usrgos", + "is_hdrgo", + "is_usrgo", + "hdr_idx", + "hdr1usr01", + "REL", + "REL_short", + "rel", + "id", + } def __init__(self, **kws): _objdoc = DocOptParse(__doc__, self.kws_dict, self.kws_set) self.kws = _objdoc.get_docargs(prt=None) if not kws else kws - self.godag = get_godag(self.kws.get('obo'), prt=sys.stdout, - loading_bar=False, optional_attrs=['relationship']) + self.godag = get_godag( + self.kws.get("obo"), prt=sys.stdout, optional_attrs=["relationship"] + ) _ini = _Init(self.godag) - self.go_ntsets = _ini.get_go_ntsets(self.kws.get('GO_FILE')) + self.go_ntsets = _ini.get_go_ntsets(self.kws.get("GO_FILE")) self.go_all = set.union(*[nt.go_set for nt in self.go_ntsets]) _tcntobj = _ini.get_tcntobj(self.go_all, **self.kws) # Gets TermCounts or None - self.gosubdag = GoSubDag(self.go_all, self.godag, True, tcntobj=_tcntobj, prt=sys.stdout) - self.objgrpd = _ini.get_grouped(self.go_ntsets, self.go_all, self.gosubdag, **self.kws) + self.gosubdag = GoSubDag( + self.go_all, self.godag, True, tcntobj=_tcntobj, prt=sys.stdout + ) + self.objgrpd = _ini.get_grouped( + self.go_ntsets, self.go_all, self.gosubdag, **self.kws + ) # KWS: sortby hdrgo_sortby section_sortby def write(self, fout_xlsx=None, fout_txt=None, verbose=False): """Command-line interface for go_draw script.""" - sortby = self._get_fncsortnt(self.objgrpd.grprobj.gosubdag.prt_attr['flds']) - kws_sort = {'sortby' if verbose else 'section_sortby': sortby} + sortby = self._get_fncsortnt(self.objgrpd.grprobj.gosubdag.prt_attr["flds"]) + kws_sort = {"sortby" if verbose else "section_sortby": sortby} sortobj = Sorter(self.objgrpd.grprobj, **kws_sort) # KWS: hdrgo_prt=True section_prt=None top_n=None use_sections=True # RET: {sortobj, sections, hdrgo_prt} or {sortobj flat hdrgo_prt} desc2nts = sortobj.get_desc2nts_fnc( - hdrgo_prt=verbose, - section_prt=True, - top_n=None, - use_sections=True) + hdrgo_prt=verbose, section_prt=True, top_n=None, use_sections=True + ) # print('FFFF', desc2nts['flds']) # Write user GO IDs in sections objgowr = WrXlsxSortedGos("init", sortobj, self.objgrpd.ver_list) if fout_xlsx is not None: - kws_xlsx = {'shade_hdrgos':verbose} + kws_xlsx = {"shade_hdrgos": verbose} if not verbose: - kws_xlsx['prt_flds'] = [f for f in desc2nts['flds'] if f not in self.excl_flds] + kws_xlsx["prt_flds"] = [ + f for f in desc2nts["flds"] if f not in self.excl_flds + ] self._adj_hdrs(kws_xlsx, desc2nts) objgowr.wr_xlsx_nts(fout_xlsx, desc2nts, **kws_xlsx) - fout_desc = '{BASE}_desc.txt'.format(BASE=os.path.splitext(fout_xlsx)[0]) + fout_desc = "{BASE}_desc.txt".format(BASE=os.path.splitext(fout_xlsx)[0]) self._wr_ver_n_key(fout_desc, verbose) if fout_txt is not None: self._wr_txt_nts(fout_txt, desc2nts, objgowr, verbose) @@ -108,98 +136,121 @@ def write(self, fout_xlsx=None, fout_txt=None, verbose=False): summary_dct = objgowr.prt_txt_desc2nts(sys.stdout, desc2nts, prtfmt) self._prt_ver_n_key(sys.stdout, verbose) if summary_dct: - print("\n{N} GO IDs in {S} sections".format( - N=desc2nts['num_items'], S=desc2nts['num_sections'])) + print( + "\n{N} GO IDs in {S} sections".format( + N=desc2nts["num_items"], S=desc2nts["num_sections"] + ) + ) def _adj_hdrs(self, kws_xlsx, desc2nts): """Replace xlsx column header, fileN, with base input filenames""" filehdrs = [nt.hdr for nt in self.go_ntsets] num_files = len(filehdrs) if num_files == len(set(filehdrs)): - kws_xlsx['hdrs'] = filehdrs + list(desc2nts['flds'][num_files:]) + kws_xlsx["hdrs"] = filehdrs + list(desc2nts["flds"][num_files:]) def _get_prtfmt(self, objgowr, verbose): """Get print format containing markers.""" - prtfmt = objgowr.get_prtfmt('fmt') - prtfmt = prtfmt.replace('# ', '') + prtfmt = objgowr.get_prtfmt("fmt") + prtfmt = prtfmt.replace("# ", "") if not verbose: - prtfmt = prtfmt.replace('{hdr1usr01:2}', '') - prtfmt = prtfmt.replace('{childcnt:3} L{level:02} ', '') - prtfmt = prtfmt.replace('{num_usrgos:>4} uGOs ', '') - prtfmt = prtfmt.replace('{D1:5} {REL} {rel}', '') - prtfmt = prtfmt.replace('R{reldepth:02} ', '') - marks = ''.join(['{{{}}}'.format(nt.fileN) for nt in self.go_ntsets]) - return '{MARKS} {PRTFMT}'.format(MARKS=marks, PRTFMT=prtfmt) + prtfmt = prtfmt.replace("{hdr1usr01:2}", "") + prtfmt = prtfmt.replace("{childcnt:3} L{level:02} ", "") + prtfmt = prtfmt.replace("{num_usrgos:>4} uGOs ", "") + prtfmt = prtfmt.replace("{D1:5} {REL} {rel}", "") + prtfmt = prtfmt.replace("R{reldepth:02} ", "") + marks = "".join(["{{{}}}".format(nt.fileN) for nt in self.go_ntsets]) + return "{MARKS} {PRTFMT}".format(MARKS=marks, PRTFMT=prtfmt) @staticmethod def _get_fncsortnt(flds): """Return a sort function for sorting header GO IDs found in sections.""" - if 'tinfo' in flds: - return lambda ntgo: [ntgo.NS, -1*ntgo.tinfo, ntgo.depth, ntgo.alt] - if 'dcnt' in flds: - return lambda ntgo: [ntgo.NS, -1*ntgo.dcnt, ntgo.depth, ntgo.alt] - return lambda ntgo: [ntgo.NS, -1*ntgo.depth, ntgo.alt] + if "tinfo" in flds: + return lambda ntgo: [ntgo.NS, -1 * ntgo.tinfo, ntgo.depth, ntgo.alt] + if "dcnt" in flds: + return lambda ntgo: [ntgo.NS, -1 * ntgo.dcnt, ntgo.depth, ntgo.alt] + return lambda ntgo: [ntgo.NS, -1 * ntgo.depth, ntgo.alt] def _wr_txt_nts(self, fout_txt, desc2nts, objgowr, verbose): """Write grouped and sorted GO IDs to GOs.""" - with open(fout_txt, 'w') as prt: + with open(fout_txt, "w", encoding="utf-8") as prt: self._prt_ver_n_key(prt, verbose) - prt.write('\n\n') - prt.write('# ----------------------------------------------------------------\n') - prt.write('# - Sections and GO IDs\n') - prt.write('# ----------------------------------------------------------------\n') + prt.write("\n\n") + prt.write( + "# ----------------------------------------------------------------\n" + ) + prt.write("# - Sections and GO IDs\n") + prt.write( + "# ----------------------------------------------------------------\n" + ) prtfmt = self._get_prtfmt(objgowr, verbose) summary_dct = objgowr.prt_txt_desc2nts(prt, desc2nts, prtfmt) if summary_dct: - print(" {N:>5} GO IDs WROTE: {FOUT} ({S} sections)".format( - N=desc2nts['num_items'], FOUT=fout_txt, S=desc2nts['num_sections'])) + print( + " {N:>5} GO IDs WROTE: {FOUT} ({S} sections)".format( + N=desc2nts["num_items"], + FOUT=fout_txt, + S=desc2nts["num_sections"], + ) + ) else: print(" WROTE: {TXT}".format(TXT=fout_txt)) def _wr_ver_n_key(self, fout_txt, verbose): """Write GO DAG version and key indicating presence of GO ID in a list.""" - with open(fout_txt, 'w') as prt: + with open(fout_txt, "w", encoding="utf-8") as prt: self._prt_ver_n_key(prt, verbose) - print(' WROTE: {TXT}'.format(TXT=fout_txt)) - + print(" WROTE: {TXT}".format(TXT=fout_txt)) def _prt_ver_n_key(self, prt, verbose): """Print GO DAG version and key indicating presence of GO ID in a list.""" - pre = '# ' - prt.write('# ----------------------------------------------------------------\n') - prt.write('# - Description of GO ID fields\n') - prt.write('# ----------------------------------------------------------------\n') - prt.write("# Versions:\n# {VER}\n".format(VER="\n# ".join(self.objgrpd.ver_list))) - prt.write('\n# Marker keys:\n') + pre = "# " + prt.write( + "# ----------------------------------------------------------------\n" + ) + prt.write("# - Description of GO ID fields\n") + prt.write( + "# ----------------------------------------------------------------\n" + ) + prt.write( + "# Versions:\n# {VER}\n".format( + VER="\n# ".join(self.objgrpd.ver_list) + ) + ) + prt.write("\n# Marker keys:\n") for ntgos in self.go_ntsets: - prt.write('# X -> GO is present in {HDR}\n'.format(HDR=ntgos.hdr)) + prt.write("# X -> GO is present in {HDR}\n".format(HDR=ntgos.hdr)) if verbose: - prt.write('\n# Markers for header GO IDs and user GO IDs:\n') + prt.write("\n# Markers for header GO IDs and user GO IDs:\n") prt.write("# '**' -> GO term is both a header and a user GO ID\n") prt.write("# '* ' -> GO term is a header, but not a user GO ID\n") prt.write("# ' ' -> GO term is a user GO ID\n") - prt.write('\n# GO Namspaces:\n') - prt.write('# BP -> Biological Process\n') - prt.write('# MF -> Molecular Function\n') - prt.write('# CC -> Cellular Component\n') + prt.write("\n# GO Namspaces:\n") + prt.write("# BP -> Biological Process\n") + prt.write("# MF -> Molecular Function\n") + prt.write("# CC -> Cellular Component\n") if verbose: - prt.write('\n# Example fields: 5 uGOs 362 47 L04 D04 R04\n') - prt.write('# N uGOs -> number of user GO IDs under this GO header\n') - prt.write('# First integer -> number of GO descendants\n') - prt.write('# Second integer -> number of GO children for the current GO ID\n') - prt.write('\n# Depth information:\n') + prt.write("\n# Example fields: 5 uGOs 362 47 L04 D04 R04\n") + prt.write( + "# N uGOs -> number of user GO IDs under this GO header\n" + ) + prt.write("# First integer -> number of GO descendants\n") + prt.write( + "# Second integer -> number of GO children for the current GO ID\n" + ) + prt.write("\n# Depth information:\n") if not verbose: - prt.write('# int -> number of GO descendants\n') + prt.write("# int -> number of GO descendants\n") if verbose: - prt.write('# Lnn -> level (minimum distance from root to node)\n') - prt.write('# Dnn -> depth (maximum distance from root to node)\n') + prt.write("# Lnn -> level (minimum distance from root to node)\n") + prt.write("# Dnn -> depth (maximum distance from root to node)\n") if verbose: - prt.write('# Rnn -> depth accounting for relationships\n\n') + prt.write("# Rnn -> depth accounting for relationships\n\n") RelationshipStr().prt_keys(prt, pre) if verbose: - prt.write('\n') + prt.write("\n") objd1 = GoDepth1LettersWr(self.gosubdag.rcntobj) - objd1.prt_header(prt, 'DEPTH-01 GO terms and their aliases', pre) + objd1.prt_header(prt, "DEPTH-01 GO terms and their aliases", pre) objd1.prt_txt(prt, pre) @@ -212,7 +263,7 @@ def __init__(self, godag): def get_tcntobj(self, go_all, **kws): """Get a TermCounts object if the user provides an annotation file, otherwise None.""" # kws: gaf (gene2go taxid) - if 'gaf' in kws or 'gene2go' in kws: + if "gaf" in kws or "gene2go" in kws: # Get a reduced go2obj set for TermCounts _gosubdag = GoSubDag(go_all, self.godag, rcntobj=False, prt=None) return get_tcntobj(_gosubdag.go2obj, **kws) # TermCounts @@ -220,8 +271,8 @@ def get_tcntobj(self, go_all, **kws): def get_grouped(self, go_ntsets, go_all, gosubdag, **kws): """Get Grouped object.""" - kws_grpd = {k:v for k, v in kws.items() if k in Grouped.kws_dict} - kws_grpd['go2nt'] = self._init_go2ntpresent(go_ntsets, go_all, gosubdag) + kws_grpd = {k: v for k, v in kws.items() if k in Grouped.kws_dict} + kws_grpd["go2nt"] = self._init_go2ntpresent(go_ntsets, go_all, gosubdag) return Grouped(gosubdag, self.godag.version, **kws_grpd) @staticmethod @@ -229,16 +280,16 @@ def _init_go2ntpresent(go_ntsets, go_all, gosubdag): """Mark all GO IDs with an X if present in the user GO list.""" go2ntpresent = {} flds = " ".join(nt.fileN for nt in go_ntsets) - ntobj = namedtuple('NtPresent', flds) + ntobj = namedtuple("NtPresent", flds) # Get present marks for GO sources for goid_all in go_all: present_true = [goid_all in nt.go_set for nt in go_ntsets] - present_str = ['X' if tf else '.' for tf in present_true] + present_str = ["X" if tf else "." for tf in present_true] go2ntpresent[goid_all] = ntobj._make(present_str) # Get present marks for all other GO ancestors goids_ancestors = set(gosubdag.go2obj).difference(go2ntpresent) assert not goids_ancestors.intersection(go_all) - strmark = ['.' for _ in range(len(go_ntsets))] + strmark = ["." for _ in range(len(go_ntsets))] for goid in goids_ancestors: go2ntpresent[goid] = ntobj._make(strmark) return go2ntpresent @@ -247,7 +298,7 @@ def get_go_ntsets(self, go_fins): """For each file containing GOs, extract GO IDs, store filename and header.""" nts = [] go_fins = list(go_fins) - ntobj = namedtuple('NtGOFiles', 'fileN hdr go_set go_fin') + ntobj = namedtuple("NtGOFiles", "fileN hdr go_set go_fin") go_sets = self._init_go_sets(go_fins) hdrs = [os.path.splitext(os.path.basename(f))[0] for f in go_fins] assert len(go_fins) == len(go_sets) @@ -256,10 +307,14 @@ def get_go_ntsets(self, go_fins): for idx, (hdr, go_set, go_fin) in enumerate(zip(hdrs, go_sets, go_fins), 1): goids.update(go_set) if not go_set: - print('**WARNING: NO GO IDs FOUND IN {FIN}'.format(FIN=go_fin)) - nts.append(ntobj(fileN='file{I}'.format(I=idx), hdr=hdr, go_set=go_set, go_fin=go_fin)) + logger.warning("NO GO IDs FOUND IN %s", go_fin) + nts.append( + ntobj( + fileN="file{I}".format(I=idx), hdr=hdr, go_set=go_set, go_fin=go_fin + ) + ) if not goids: - print('**WARNING: NO GO IDs FOUND') + logger.warning("NO GO IDs FOUND") sys.exit(1) return nts @@ -268,7 +323,8 @@ def _init_go_sets(self, go_fins): go_sets = [] assert go_fins, "EXPECTED FILES CONTAINING GO IDs" assert len(go_fins) >= 2, "EXPECTED 2+ GO LISTS. FOUND: {L}".format( - L=' '.join(go_fins)) + L=" ".join(go_fins) + ) obj = GetGOs(self.godag) for fin in go_fins: assert os.path.exists(fin), "GO FILE({F}) DOES NOT EXIST".format(F=fin) diff --git a/goatools/cli/wr_sections.py b/goatools/cli/wr_sections.py index ab65dd03..6dc9f50c 100644 --- a/goatools/cli/wr_sections.py +++ b/goatools/cli/wr_sections.py @@ -31,32 +31,40 @@ import os import sys -from goatools.base import get_godag -from goatools.associations import get_tcntobj +from ..associations import get_tcntobj +from ..base import get_godag +from ..gosubdag.gosubdag import GoSubDag +from ..grouper.grprdflts import GrouperDflts +from ..grouper.grprobj import Grouper +from ..grouper.hdrgos import HdrgosSections +from ..grouper.read_goids import read_sections +from ..grouper.sorter import Sorter +from ..grouper.wr_sections import WrSectionsPy, WrSectionsTxt +from ..grouper.wrxlsx import WrXlsxSortedGos -from goatools.cli.docopt_parse import DocOptParse -from goatools.cli.gos_get import GetGOs - -from goatools.gosubdag.gosubdag import GoSubDag - -from goatools.grouper.read_goids import read_sections -from goatools.grouper.grprdflts import GrouperDflts -from goatools.grouper.hdrgos import HdrgosSections -from goatools.grouper.grprobj import Grouper -from goatools.grouper.wr_sections import WrSectionsTxt -from goatools.grouper.wr_sections import WrSectionsPy -from goatools.grouper.sorter import Sorter -from goatools.grouper.wrxlsx import WrXlsxSortedGos +from .docopt_parse import DocOptParse +from .gos_get import GetGOs # pylint: disable=too-few-public-methods class WrSectionsCli(object): """Class for command-line interface for creating GO term diagrams""" - kws_dict = set(['GO_FILE', 'obo', 'slims', - 'ifile', 'ofile', 'txt', - 'py', 'xlsx', - 'gaf', 'gene2go', 'taxid']) + kws_dict = set( + [ + "GO_FILE", + "obo", + "slims", + "ifile", + "ofile", + "txt", + "py", + "xlsx", + "gaf", + "gene2go", + "taxid", + ] + ) kws_set = set() def __init__(self, gosubdag=None): @@ -66,30 +74,32 @@ def __init__(self, gosubdag=None): def cli(self, prt=sys.stdout): """Command-line interface for go_draw script.""" kws = self.objdoc.get_docargs(prt=None) - godag = get_godag(kws['obo'], prt=None, loading_bar=False, optional_attrs=['relationship']) - usrgos = GetGOs(godag, max_gos=200).get_usrgos(kws.get('GO_FILE'), prt) + godag = get_godag(kws["obo"], prt=None, optional_attrs=["relationship"]) + usrgos = GetGOs(godag, max_gos=200).get_usrgos(kws.get("GO_FILE"), prt) tcntobj = self._get_tcntobj(usrgos, godag, **kws) # Gets TermCounts or None - self.gosubdag = GoSubDag(usrgos, godag, relationships=True, tcntobj=tcntobj, prt=None) - grprdflt = GrouperDflts(self.gosubdag, kws['slims']) + self.gosubdag = GoSubDag( + usrgos, godag, relationships=True, tcntobj=tcntobj, prt=None + ) + grprdflt = GrouperDflts(self.gosubdag, kws["slims"]) ver_list = [godag.version, grprdflt.ver_goslims] prt.write("{VER}\n".format(VER="\n".join(ver_list))) - sections = self._read_sections(kws['ifile']) + sections = self._read_sections(kws["ifile"]) # print("SECSECSEC", sections) hdrobj = HdrgosSections(self.gosubdag, grprdflt.hdrgos_dflt, sections) grprobj = Grouper("init", usrgos, hdrobj, self.gosubdag) # Write sections objsecwr = WrSectionsTxt(grprobj, ver_list) - if not os.path.exists(kws['ifile']): - objsecwr.wr_txt_section_hdrgos(kws['ifile']) - objsecwr.wr_txt_section_hdrgos(kws['ofile']) + if not os.path.exists(kws["ifile"]): + objsecwr.wr_txt_section_hdrgos(kws["ifile"]) + objsecwr.wr_txt_section_hdrgos(kws["ofile"]) objsecpy = WrSectionsPy(grprobj, ver_list) - if 'py' in kws: - objsecpy.wr_py_sections(kws['py'], sections, doc=godag.version) + if "py" in kws: + objsecpy.wr_py_sections(kws["py"], sections, doc=godag.version) # Write user GO IDs in sections sortobj = Sorter(grprobj) objgowr = WrXlsxSortedGos("init", sortobj, ver_list) - objgowr.wr_txt_gos(kws['txt'], sortby=objsecpy.fncsortnt) - #objwr.wr_txt_section_hdrgos(kws['ofile'], sortby=objwr.fncsortnt) + objgowr.wr_txt_gos(kws["txt"], sortby=objsecpy.fncsortnt) + # objwr.wr_txt_section_hdrgos(kws['ofile'], sortby=objwr.fncsortnt) self._prt_cnt_usrgos(usrgos, sys.stdout) @staticmethod @@ -109,7 +119,7 @@ def _prt_cnt_usrgos(self, usrgos_read, prt): def _get_tcntobj(goids, go2obj, **kws): """Get a TermCounts object if the user provides an annotation file, otherwise None.""" # kws: gaf (gene2go taxid) - if 'gaf' in kws or 'gene2go' in kws: + if "gaf" in kws or "gene2go" in kws: # Get a reduced go2obj set for TermCounts _gosubdag = GoSubDag(goids, go2obj, rcntobj=False, prt=None) return get_tcntobj(_gosubdag.go2obj, **kws) # TermCounts diff --git a/goatools/godag/prttime.py b/goatools/godag/prttime.py index b86b2a92..3df0e9fb 100755 --- a/goatools/godag/prttime.py +++ b/goatools/godag/prttime.py @@ -8,8 +8,9 @@ import sys import timeit import datetime -from goatools.obo_parser import GODag -from goatools.base import download_go_basic_obo + +from ..base import download_go_basic_obo +from ..obo_parser import GODag # pylint: disable=too-few-public-methods @@ -21,12 +22,14 @@ def __init__(self, fin_obo, opt_field=None, keep_alt_ids=False): self.obo = fin_obo self._init_dnld_dag() self.godag = self.load_dag(self.opt) - self.go2obj = self.godag if keep_alt_ids else {o.id:o for o in self.godag.values()} + self.go2obj = ( + self.godag if keep_alt_ids else {o.id: o for o in self.godag.values()} + ) def _init_dnld_dag(self): """If dag does not exist, download it.""" if not os.path.exists(self.obo): - download_go_basic_obo(self.obo, loading_bar=None) + download_go_basic_obo(self.obo) def load_dag(self, opt_fields=None): """Run numerous tests for various self.reports.""" @@ -34,13 +37,15 @@ def load_dag(self, opt_fields=None): dag = GODag(self.obo, opt_fields) toc = timeit.default_timer() msg = "Elapsed HMS for OBO DAG load: {HMS} OPTIONAL_ATTR({O})\n".format( - HMS=str(datetime.timedelta(seconds=(toc-tic))), O=opt_fields) + HMS=str(datetime.timedelta(seconds=(toc - tic))), O=opt_fields + ) sys.stdout.write(msg) return dag + def prt_hms(tic, msg, prt=sys.stdout): """Print elapsed time including Hours, Minutes, and seconds with a user message.""" - hms = str(datetime.timedelta(seconds=(timeit.default_timer()-tic))) + hms = str(datetime.timedelta(seconds=(timeit.default_timer() - tic))) prt.write("{HMS}: {MSG}\n".format(HMS=hms, MSG=msg)) return timeit.default_timer() diff --git a/goatools/grouper/grprdflts.py b/goatools/grouper/grprdflts.py index 6d44cb75..784158af 100644 --- a/goatools/grouper/grprdflts.py +++ b/goatools/grouper/grprdflts.py @@ -16,23 +16,26 @@ __author__ = "DV Klopfenstein" import sys -from goatools.base import get_godag -from goatools.gosubdag.gosubdag import GoSubDag + +from ..base import get_godag +from ..gosubdag.gosubdag import GoSubDag class GrouperDflts(object): """Holds objects that we would like to load and initialize once. - Files used for grouping GO IDs: + Files used for grouping GO IDs: - http://geneontology.org/ontology/go-basic.obo - http://geneontology.org/ontology/subsets/goslim_generic.obo + http://geneontology.org/ontology/go-basic.obo + http://geneontology.org/ontology/subsets/goslim_generic.obo """ - def __init__(self, gosubdag=None, goslim_filename="goslim_generic.obo", hdrgos=None): + def __init__( + self, gosubdag=None, goslim_filename="goslim_generic.obo", hdrgos=None + ): self.gosubdag = self.get_gosubdag(gosubdag) - _dagslim = get_godag(goslim_filename, prt=sys.stdout, loading_bar=False) + _dagslim = get_godag(goslim_filename, prt=sys.stdout) self.ver_goslims = _dagslim.version self.goslims = self._init_goslims(_dagslim) self.hdrgos_dflt = self._init_hdrgos() if hdrgos is None else hdrgos # goid set @@ -42,7 +45,6 @@ def _init_hdrgos(self): # Get all GO terms that are at depth-00 or depth-01 hdrgos = self.get_gos_d0d1() hdrgos |= self.goslims - # self.gosubdag.prt_goids(hdrgos) return hdrgos def _init_goslims(self, dagslim): @@ -55,7 +57,9 @@ def _init_goslims(self, dagslim): def get_gos_d0d1(self): """Return GO IDs whose depth is 0 (BP, MF, CC) or depth is 1.""" - return set([o.id for d in [0, 1] for o in self.gosubdag.rcntobj.depth2goobjs.get(d)]) + return set( + [o.id for d in [0, 1] for o in self.gosubdag.rcntobj.depth2goobjs.get(d)] + ) def _get_goslimids_norel(self, dagslim): """Get all GO slim GO IDs that do not have a relationship.""" diff --git a/goatools/test_data/nature3102_goea.py b/goatools/test_data/nature3102_goea.py index 4624fbdf..03a43d78 100644 --- a/goatools/test_data/nature3102_goea.py +++ b/goatools/test_data/nature3102_goea.py @@ -7,11 +7,13 @@ import pandas as pd from tests.utils import repofn -from goatools.test_data.genes_NCBI_10090_ProteinCoding import GENEID2NT as GeneID2nt_mus -from goatools.base import get_godag -from goatools.associations import dnld_ncbi_gene_file -from goatools.go_enrichment import GOEnrichmentStudy -from goatools.anno.genetogo_reader import Gene2GoReader + +from ..anno.genetogo_reader import Gene2GoReader +from ..associations import dnld_ncbi_gene_file +from ..base import get_godag +from ..go_enrichment import GOEnrichmentStudy + +from .genes_NCBI_10090_ProteinCoding import GENEID2NT as GeneID2nt_mus def get_goea_results(keep_if=None): @@ -55,7 +57,7 @@ def get_geneid2symbol(fin_xlsx): def get_goeaobj(method, geneids_pop, taxid, nspc="BP"): """Load: ontologies, associations, and population geneids.""" fin_obo = os.path.join(os.getcwd(), "go-basic.obo") - godag = get_godag(fin_obo, loading_bar=None) + godag = get_godag(fin_obo) assoc_geneid2gos = get_annotations(taxid, nspc) goeaobj = GOEnrichmentStudy( geneids_pop, diff --git a/setup.cfg b/setup.cfg index 9efe894b..4c9f4a1f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -37,16 +37,17 @@ packages = goatools.semsim goatools.semsim.termwise install_requires = - pandas + docopt + ftpretty numpy - scipy - xlsxwriter - statsmodels openpyxl - docopt + pandas pydot requests rich + scipy + statsmodels + xlsxwriter include_package_data = True scripts = scripts/wr_sections.py diff --git a/tests/godagtimed_old.py b/tests/godagtimed_old.py index c8de9a29..f2c4049e 100755 --- a/tests/godagtimed_old.py +++ b/tests/godagtimed_old.py @@ -3,20 +3,23 @@ import os import timeit + from goatools.test_data.godag_timed import GoDagTimed from goatools.test_data.godag_timed import prt_hms from goatools.base import download_go_basic_obo REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") + def test_deprecatedloc_godagtimed(): """Test deprecated location of GoDagTimed""" tic = timeit.default_timer() - prt_hms(tic, 'prt_hms TESTED') + prt_hms(tic, "prt_hms TESTED") fin_go_obo = os.path.join(REPO, "go-basic.obo") - download_go_basic_obo(fin_go_obo, loading_bar=None) + download_go_basic_obo(fin_go_obo) GoDagTimed(fin_go_obo) -if __name__ == '__main__': + +if __name__ == "__main__": test_deprecatedloc_godagtimed() diff --git a/tests/i148_semsim_lin.py b/tests/i148_semsim_lin.py index 773ff052..3f515b0f 100755 --- a/tests/i148_semsim_lin.py +++ b/tests/i148_semsim_lin.py @@ -4,57 +4,59 @@ import os import sys + from itertools import combinations_with_replacement as combo_w_rplc -from goatools.base import get_godag -from goatools.associations import dnld_annofile + from goatools.anno.gpad_reader import GpadReader -#### from goatools.semantic import semantic_similarity -from goatools.semantic import TermCounts -#### from goatools.semantic import get_info_content -#### from goatools.semantic import deepest_common_ancestor -from goatools.semantic import resnik_sim +from goatools.associations import dnld_annofile +from goatools.base import get_godag from goatools.semantic import lin_sim -#### from goatools.godag.consts import NS2GO +from goatools.semantic import TermCounts REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") -def test_i148_semsim_lin(prt=sys.stdout): + +def test_i148_semsim_lin(): """Test for issue 148, Lin Similarity if a term has no annotations""" - fin_gpad = os.path.join(REPO, 'goa_human.gpad') - dnld_annofile(fin_gpad, 'gpad') + fin_gpad = os.path.join(REPO, "goa_human.gpad") + dnld_annofile(fin_gpad, "gpad") - godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None) + godag = get_godag(os.path.join(REPO, "go-basic.obo")) annoobj = GpadReader(fin_gpad, godag=godag) goids = [ - 'GO:0042581', - 'GO:0101002', - 'GO:0042582', - 'GO:0070820', - 'GO:0008021', - 'GO:0005766', - 'GO:0016591'] - - associations = annoobj.get_id2gos('CC') + "GO:0042581", + "GO:0101002", + "GO:0042582", + "GO:0070820", + "GO:0008021", + "GO:0005766", + "GO:0016591", + ] + + associations = annoobj.get_id2gos("CC") termcounts = TermCounts(godag, associations) # Calculate Lin values - p2v = {frozenset([a, b]): lin_sim(a, b, godag, termcounts) for a, b in combo_w_rplc(goids, 2)} + p2v = { + frozenset([a, b]): lin_sim(a, b, godag, termcounts) + for a, b in combo_w_rplc(goids, 2) + } _prt_values(goids, p2v, prt=sys.stdout) def _prt_values(goids, p2v, prt=sys.stdout): """Print values""" - prt.write(' {HDR}\n'.format(HDR=' '.join(goids))) - none = 'None ' + prt.write(" {HDR}\n".format(HDR=" ".join(goids))) + none = "None " for go_row in goids: - prt.write('{GO} '.format(GO=go_row)) + prt.write("{GO} ".format(GO=go_row)) for go_col in goids: val = p2v[frozenset([go_row, go_col])] - txt = '{L:<9.6} '.format(L=val) if val is not None else none - prt.write('{T:10} '.format(T=txt)) - prt.write('\n') + txt = "{L:<9.6} ".format(L=val) if val is not None else none + prt.write("{T:10} ".format(T=txt)) + prt.write("\n") -if __name__ == '__main__': +if __name__ == "__main__": test_i148_semsim_lin() diff --git a/tests/test_altid_godag.py b/tests/test_altid_godag.py index bdca9690..94b197a8 100644 --- a/tests/test_altid_godag.py +++ b/tests/test_altid_godag.py @@ -2,14 +2,16 @@ from goatools.base import get_godag + def test_alt_id(): """Ensure that alternate GO IDs.""" - obo_dag = get_godag("go-basic.obo", loading_bar=None) + obo_dag = get_godag("go-basic.obo") alt_ids = get_altids(obo_dag) obo_goids = obo_dag.keys() obo_goids_set = set(obo_goids) assert len(alt_ids.intersection(obo_goids_set)) == len(alt_ids) + def get_altids(obo_dag): """Get all alternate GO ids for entire go-basic.obo DAG.""" alt_ids_all = set() @@ -19,5 +21,6 @@ def get_altids(obo_dag): alt_ids_all |= set(alt_ids_cur) return alt_ids_all -if __name__ == '__main__': + +if __name__ == "__main__": test_alt_id() diff --git a/tests/test_anno_rd_gene2go.py b/tests/test_anno_rd_gene2go.py index da28910e..7b0275cd 100755 --- a/tests/test_anno_rd_gene2go.py +++ b/tests/test_anno_rd_gene2go.py @@ -1,64 +1,69 @@ #!/usr/bin/env python """Ensure NEW results are equal to OLD results: read_ncbi_gene2go.""" -from __future__ import print_function - import os import sys + from collections import defaultdict + from goatools.associations import dnld_ncbi_gene_file from goatools.anno.genetogo_reader import Gene2GoReader REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") + def test_anno_read(): """Test reading an NCBI gene2go annotation file.""" - fin_anno = os.path.join(REPO, 'gene2go') + fin_anno = os.path.join(REPO, "gene2go") _dnld_anno(fin_anno) - #godag = get_godag(os.path.join(REPO, 'go-basic.obo'), loading_bar=None) - print('\nTEST STORING ONLY ONE SPECIES') + print("\nTEST STORING ONLY ONE SPECIES") obj = Gene2GoReader(fin_anno) assert len(obj.taxid2asscs) == 1 obj.prt_summary_anno2ev() - print('\nTEST STORING ALL SPECIES') + print("\nTEST STORING ALL SPECIES") obj = Gene2GoReader(fin_anno, taxids=True) - assert len(obj.taxid2asscs) > 1, '**EXPECTED MORE: len(taxid2asscs) == {N}'.format( - N=len(obj.taxid2asscs)) + assert len(obj.taxid2asscs) > 1, "**EXPECTED MORE: len(taxid2asscs) == {N}".format( + N=len(obj.taxid2asscs) + ) obj.prt_summary_anno2ev() - print('\nTEST GETTING ASSOCIATIONS FOR ONE SPECIES') + print("\nTEST GETTING ASSOCIATIONS FOR ONE SPECIES") print("\nTEST read_ncbi_gene2go_old: [9606]") old_g2go_hsa = read_ncbi_gene2go_old(fin_anno, [9606]) ## new_g2go_hsa = read_ncbi_gene2go(fin_anno, [9606]) new_g2go_hsa = obj.get_id2gos_nss(taxids=[9606]) - assert old_g2go_hsa == new_g2go_hsa, \ - 'OLD({O}) != NEW({N})'.format(O=len(old_g2go_hsa), N=len(new_g2go_hsa)) + assert old_g2go_hsa == new_g2go_hsa, "OLD({O}) != NEW({N})".format( + O=len(old_g2go_hsa), N=len(new_g2go_hsa) + ) print("\nTEST read_ncbi_gene2go_old: 9606") ## assert old_g2go_hsa == read_ncbi_gene2go(fin_anno, 9606) assert old_g2go_hsa == obj.get_id2gos_nss(taxid=9606) - print('\nTEST GETTING REVERSE ASSOCIATIONS: GO2GENES') + print("\nTEST GETTING REVERSE ASSOCIATIONS: GO2GENES") go2geneids = True print("\nTEST read_ncbi_gene2go_old: 9606 go2geneids=True") old_go2gs_hsa = read_ncbi_gene2go_old(fin_anno, [9606], go2geneids=go2geneids) ## new_go2gs_hsa = read_ncbi_gene2go(fin_anno, 9606, go2geneids=go2geneids) new_go2gs_hsa = obj.get_id2gos_nss(taxid=9606, go2geneids=go2geneids) - print('OLD:', next(iter(old_go2gs_hsa.items()))) - print('NEW:', next(iter(new_go2gs_hsa.items()))) - assert old_go2gs_hsa == new_go2gs_hsa, \ - 'OLD({O}) != NEW({N})'.format(O=len(old_go2gs_hsa), N=len(new_go2gs_hsa)) + print("OLD:", next(iter(old_go2gs_hsa.items()))) + print("NEW:", next(iter(new_go2gs_hsa.items()))) + assert old_go2gs_hsa == new_go2gs_hsa, "OLD({O}) != NEW({N})".format( + O=len(old_go2gs_hsa), N=len(new_go2gs_hsa) + ) - print('\nTEST RETURNING ASSOCIATIONS FOR SELECTED EVIDENCE CODES') - evcodes = set(['ISO', 'IKR']) + print("\nTEST RETURNING ASSOCIATIONS FOR SELECTED EVIDENCE CODES") + evcodes = set(["ISO", "IKR"]) print("\nTEST read_ncbi_gene2go_old: 9606 evcodes=True") - old_gene2gos_evc = read_ncbi_gene2go_old(fin_anno, taxids=[9606], ev_include=evcodes) + old_gene2gos_evc = read_ncbi_gene2go_old( + fin_anno, taxids=[9606], ev_include=evcodes + ) ## new_gene2gos_evc = read_ncbi_gene2go(fin_anno, 9606, ev_include=evcodes) new_gene2gos_evc = obj.get_id2gos_nss(taxid=9606, ev_include=evcodes) - print('OLD:', next(iter(old_gene2gos_evc.items()))) - print('NEW:', next(iter(new_gene2gos_evc.items()))) + print("OLD:", next(iter(old_gene2gos_evc.items()))) + print("NEW:", next(iter(new_gene2gos_evc.items()))) assert old_gene2gos_evc == new_gene2gos_evc @@ -67,10 +72,11 @@ def _dnld_anno(file_anno): if os.path.exists(file_anno): assert os.path.getsize(file_anno) > 1000000, "BAD ANNO({F})".format(F=file_anno) return - dnld_ncbi_gene_file(file_anno, loading_bar=None) + dnld_ncbi_gene_file(file_anno) assert os.path.isfile(file_anno), "MISSING ANNO({F})".format(F=file_anno) assert os.path.getsize(file_anno) > 1000000, "BAD ANNO({F})".format(F=file_anno) + # Fomerly in goatools/associations.py file def read_ncbi_gene2go_old(fin_gene2go, taxids=None, **kws): """Read NCBI's gene2go. Return gene2go data for user-specified taxids.""" @@ -79,26 +85,30 @@ def read_ncbi_gene2go_old(fin_gene2go, taxids=None, **kws): id2gos = defaultdict(set) # Optional detailed associations split by taxid and having both ID2GOs & GO2IDs # e.g., taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)) - taxid2asscs = kws.get('taxid2asscs', None) - evs = kws.get('ev_include', None) + taxid2asscs = kws.get("taxid2asscs", None) + evs = kws.get("ev_include", None) # By default, return id2gos. User can cause go2geneids to be returned by: # >>> read_ncbi_gene2go(..., go2geneids=True - b_geneid2gos = not kws.get('go2geneids', False) - if taxids is None: # Default taxid is Human + b_geneid2gos = not kws.get("go2geneids", False) + if taxids is None: # Default taxid is Human taxids = [9606] - with open(fin_gene2go) as ifstrm: + with open(fin_gene2go, encoding="utf-8") as ifstrm: # pylint: disable=too-many-nested-blocks for line in ifstrm: - if line[0] != '#': # Line contains data. Not a comment - line = line.rstrip() # chomp - flds = line.split('\t') + if line[0] != "#": # Line contains data. Not a comment + line = line.rstrip() # chomp + flds = line.split("\t") if len(flds) >= 5: taxid_curr, geneid, go_id, evidence, qualifier = flds[:5] taxid_curr = int(taxid_curr) # NOT: Used when gene is expected to have function F, but does NOT. # ND : GO function not seen after exhaustive annotation attempts to the gene. ## if taxid_curr in taxids and qualifier != 'NOT' and evidence != 'ND': - if taxid_curr in taxids and 'NOT' not in qualifier and evidence != 'ND': + if ( + taxid_curr in taxids + and "NOT" not in qualifier + and evidence != "ND" + ): # Optionally specify a subset of GOs based on their evidence. if evs is None or evidence in evs: geneid = int(geneid) @@ -107,11 +117,13 @@ def read_ncbi_gene2go_old(fin_gene2go, taxids=None, **kws): else: id2gos[go_id].add(geneid) if taxid2asscs is not None: - taxid2asscs[taxid_curr]['ID2GOs'][geneid].add(go_id) - taxid2asscs[taxid_curr]['GO2IDs'][go_id].add(geneid) - sys.stdout.write(" {N:,} items READ: {ASSC}\n".format(N=len(id2gos), ASSC=fin_gene2go)) - return id2gos # return simple associations + taxid2asscs[taxid_curr]["ID2GOs"][geneid].add(go_id) + taxid2asscs[taxid_curr]["GO2IDs"][go_id].add(geneid) + sys.stdout.write( + " {N:,} items READ: {ASSC}\n".format(N=len(id2gos), ASSC=fin_gene2go) + ) + return id2gos # return simple associations -if __name__ == '__main__': +if __name__ == "__main__": test_anno_read() diff --git a/tests/test_annotations_gaf.py b/tests/test_annotations_gaf.py index 320354be..770ebea8 100755 --- a/tests/test_annotations_gaf.py +++ b/tests/test_annotations_gaf.py @@ -11,23 +11,24 @@ import sys from collections import defaultdict + from goatools.associations import read_gaf from goatools.base import dnld_gafs def test_gaf_read_goa_human(): """Get associations for human(9606).""" - _test_gaf_read_species(['goa_human']) + _test_gaf_read_species(["goa_human"]) def test_gaf_read_mgi(): """Get associations for mouse(10090).""" - _test_gaf_read_species(['mgi']) + _test_gaf_read_species(["mgi"]) def test_gaf_read_fb(): """Get associations for fly(7227).""" - _test_gaf_read_species(['fb']) + _test_gaf_read_species(["fb"]) def _test_gaf_read_species(species_ids, log=sys.stdout): @@ -37,33 +38,40 @@ def _test_gaf_read_species(species_ids, log=sys.stdout): _test_gaf_read(msg, species_ids, None, log) # Read GAF associations msg = "Read GAF associations; keepif is default in goatools.associations.read_gaf" - keepif = lambda nt: 'NOT' not in nt.Qualifier and nt.Evidence_Code != 'ND' + keepif = lambda nt: "NOT" not in nt.Qualifier and nt.Evidence_Code != "ND" _test_gaf_read(msg, species_ids, keepif, log) # Read GAF associations, allowing ND Evidence codes msg = "Read GAF associations; Allow ND Evidence codes" - keepif = lambda nt: 'NOT' not in nt.Qualifier + keepif = lambda nt: "NOT" not in nt.Qualifier _test_gaf_read(msg, species_ids, keepif, log) # Read GAF associations, allowing ND entries and NOT Qualifiers msg = "Read GAF associations; Allow ND Evidence codes and NOT Qualifiers" keepif = lambda nt: True - #_test_gaf_read(msg, species_ids, keepif, log) + # _test_gaf_read(msg, species_ids, keepif, log) # Limit number of tests for speed _test_gaf_read(msg, species_ids[-1:], keepif, log) + def _test_gaf_read(msg, species_ids, keepif, log=sys.stdout): # (optional) multi-level dictionary separate associations by taxid taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set))) local_dir = os.path.dirname(os.path.abspath(__file__)) - for fin_gaf in dnld_gafs(species_ids, loading_bar=None): + for fin_gaf in dnld_gafs(species_ids): fin_gaf = os.path.join(local_dir, fin_gaf) log.write("\n") id2gos_bp = read_gaf(fin_gaf, taxid2asscs=taxid2asscs, keepif=keepif) - id2gos_all = read_gaf(fin_gaf, taxid2asscs=taxid2asscs, keepif=keepif, namespace='all') + id2gos_all = read_gaf( + fin_gaf, taxid2asscs=taxid2asscs, keepif=keepif, namespace="all" + ) assert len(id2gos_all) > len(id2gos_bp) if "mgi.gaf" in fin_gaf: _chk_key(id2gos_bp, "MGI:") - log.write(" {N:>6,} IDs found in BP {F}\n".format(N=len(id2gos_bp), F=fin_gaf)) - log.write(" {N:>6,} IDs found in ALL {F}\n".format(N=len(id2gos_all), F=fin_gaf)) + log.write( + " {N:>6,} IDs found in BP {F}\n".format(N=len(id2gos_bp), F=fin_gaf) + ) + log.write( + " {N:>6,} IDs found in ALL {F}\n".format(N=len(id2gos_all), F=fin_gaf) + ) go2ids = read_gaf(fin_gaf, go2geneids=True, keepif=keepif) _chk_key(go2ids, "GO:") log.write(" {N:>6,} GOs found in {F}\n".format(N=len(go2ids), F=fin_gaf)) @@ -71,22 +79,25 @@ def _test_gaf_read(msg, species_ids, keepif, log=sys.stdout): log.write("\n{MSG}\n".format(MSG=msg)) txtpat = " {N:>6,} GOs and {M:>6,} annotated gene ids for tax_id: {TAXID:>6}\n" for taxid, asscs in taxid2asscs.items(): - num_gene2gos = len(asscs.get('ID2GOs')) - num_go2genes = len(asscs.get('GO2IDs')) + num_gene2gos = len(asscs.get("ID2GOs")) + num_go2genes = len(asscs.get("GO2IDs")) log.write(txtpat.format(TAXID=taxid, N=num_go2genes, M=num_gene2gos)) # Basic check to ensure gene2go was downloaded and data was returned. assert num_gene2gos > 11000 assert num_go2genes > 6000 + def _chk_key(a2bs, pattern): """Confirm format of dictionary key.""" for key in a2bs.keys(): if pattern in key: return - raise RuntimeError("PATTERN({P}) NOT FOUND IN KEY({K})".format( - P=pattern, K=key)) + raise RuntimeError( + "PATTERN({P}) NOT FOUND IN KEY({K})".format(P=pattern, K=key) + ) + -if __name__ == '__main__': +if __name__ == "__main__": test_gaf_read_fb() diff --git a/tests/test_assc_stats.py b/tests/test_assc_stats.py index 339317c0..179b16a9 100644 --- a/tests/test_assc_stats.py +++ b/tests/test_assc_stats.py @@ -2,39 +2,49 @@ import sys import os -from goatools.base import get_godag + from goatools.associations import dnld_assc -from goatools.utils import get_b2aset +from goatools.base import get_godag from goatools.statsdescribe import StatsDescribe +from goatools.utils import get_b2aset REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") + def test_assc_stats(prt=sys.stdout): """Test association statistics.""" associations = [ - ('hsa', 'goa_human.gaf'), # human - ('mus', 'mgi.gaf'), # mouse - ('dme', 'fb.gaf')] # fly - godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None) + ("hsa", "goa_human.gaf"), # human + ("mus", "mgi.gaf"), # mouse + ("dme", "fb.gaf"), + ] # fly + godag = get_godag(os.path.join(REPO, "go-basic.obo")) describe_go2obj(godag, prt) - obj = StatsDescribe('Assc', "{:6,}") + obj = StatsDescribe("Assc", "{:6,}") obj.prt_hdr(prt, "Assc.") for org, assc_name in associations: fin_assc = os.path.join(REPO, assc_name) describe_assc(org, fin_assc, godag, obj, prt) + def describe_go2obj(go2obj, prt): """Describe distribution of parent and child GO term counts.""" # Related GO | # GO | range | 25th | median | 75th | mean | stddev # -----------|-------|----------|------|--------|------|------|------- # Parents | 44961 | 0 to 8 | 1 | 1 | 2 | 2 | 1 # Children | 17597 | 1 to 480 | 1 | 2 | 4 | 4 | 10 - cnts_all = [(len(o.children), len(o.parents)) for go, o in go2obj.items() if go == o.id] + cnts_all = [ + (len(o.children), len(o.parents)) for go, o in go2obj.items() if go == o.id + ] cnts_c, cnts_p = zip(*cnts_all) - cnts_c = [n for n in cnts_c if n != 0] # Remove leaf-level counts from reported stats - cnts_p = [n for n in cnts_p if n != 0] # Remove top-level counts from reported stats - obj = StatsDescribe('GO', "{:6,}") + cnts_c = [ + n for n in cnts_c if n != 0 + ] # Remove leaf-level counts from reported stats + cnts_p = [ + n for n in cnts_p if n != 0 + ] # Remove top-level counts from reported stats + obj = StatsDescribe("GO", "{:6,}") obj.prt_hdr(prt, "Related GO") obj.prt_data("Parents", cnts_p, prt) obj.prt_data("Children", cnts_c, prt) @@ -52,7 +62,7 @@ def describe_assc(org, fin_assc, go2obj, obj, prt): # # dme GO/gene | 12551 | 1 to 137 | 2 | 4 | 8 | 6 | 7 # dme gene/GO | 7878 | 1 to 1,675 | 1 | 3 | 7 | 10 | 41 - gene2gos = dnld_assc(fin_assc, go2obj, prt=None) # Associations + gene2gos = dnld_assc(fin_assc, go2obj, prt=None) # Associations go2genes = get_b2aset(gene2gos) assert gene2gos assert go2genes @@ -61,5 +71,6 @@ def describe_assc(org, fin_assc, go2obj, obj, prt): obj.prt_data("{ORG} GO/gene".format(ORG=org), cnts_gos_p_gene, prt) obj.prt_data("{ORG} gene/GO".format(ORG=org), cnts_genes_p_go, prt) -if __name__ == '__main__': + +if __name__ == "__main__": test_assc_stats() diff --git a/tests/test_asscs_ns.py b/tests/test_asscs_ns.py index 4249975c..fd0ec655 100755 --- a/tests/test_asscs_ns.py +++ b/tests/test_asscs_ns.py @@ -1,27 +1,25 @@ #!/usr/bin/env python """Test TermCounts object used in Resnik and Lin similarity calculations.""" -from __future__ import print_function - +import collections as cx +import datetime import os import sys import timeit -import datetime -import collections as cx + from goatools.base import get_godag from goatools.test_data.gafs import ASSOCIATIONS from goatools.associations import dnld_annotation from goatools.anno.gaf_reader import GafReader from goatools.godag.consts import NS2NAMESPACE -from goatools.godag.consts import NAMESPACE2NS -# from goatools.godag.consts import NAMESPACE2GO TIC = timeit.default_timer() REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") + def test_semantic_similarity(usr_assc=None): """Computing basic semantic similarities between GO terms.""" - not_these = {'goa_uniprot_all.gaf', 'goa_uniprot_all_noiea.gaf'} + not_these = {"goa_uniprot_all.gaf", "goa_uniprot_all_noiea.gaf"} assc_names = sorted(ASSOCIATIONS.difference(not_these)) go2obj = get_go2obj() # http://current.geneontology.org/annotations/ @@ -30,7 +28,6 @@ def test_semantic_similarity(usr_assc=None): not_found = set() gaf2errs = cx.defaultdict(list) for assc_name in assc_names: # Limit test numbers for speed - tic = timeit.default_timer() # Get all the annotations from arabidopsis. fin_gaf = os.path.join(REPO, assc_name) if not os.path.exists(fin_gaf): @@ -44,47 +41,61 @@ def test_semantic_similarity(usr_assc=None): gaf2errs[assc_name].append(nta) else: not_found.add(nta.GO_ID) - print('{HMS} {N} Associations'.format(HMS=_hms(TIC), N=len(assc_names))) + print("{HMS} {N} Associations".format(HMS=_hms(TIC), N=len(assc_names))) if not_found: _prt_not_found(not_found) if gaf2errs: - _wr_errs('namespace_errors.txt', gaf2errs, go2obj) + _wr_errs("namespace_errors.txt", gaf2errs, go2obj) + def _wr_errs(fout_err, gaf2errs, go2obj): """Write errors in namespaces seen in annotation files""" - with open(fout_err, 'w') as prt: + with open(fout_err, "w", encoding="utf-8") as prt: err_cnt = 0 gaf_errs = sorted(gaf2errs.items(), key=lambda t: len(t[1])) for gaf, errs in gaf_errs: err_cnt += len(errs) - msg = '{N} mismarked namespaces in {GAF}'.format(GAF=gaf, N=len(errs)) + msg = "{N} mismarked namespaces in {GAF}".format(GAF=gaf, N=len(errs)) print(msg) - prt.write('\n{TITLE}\n'.format(TITLE=msg)) + prt.write("\n{TITLE}\n".format(TITLE=msg)) for nta in errs: - prt.write('\n{GO} ACTUAL({ns}) EXPECTED({NS}) {GAF}:\n'.format( - GO=nta.GO_ID, ns=NS2NAMESPACE[nta.NS], NS=go2obj[nta.GO_ID].namespace, GAF=gaf)) + prt.write( + "\n{GO} ACTUAL({ns}) EXPECTED({NS}) {GAF}:\n".format( + GO=nta.GO_ID, + ns=NS2NAMESPACE[nta.NS], + NS=go2obj[nta.GO_ID].namespace, + GAF=gaf, + ) + ) for fld, val in nta._asdict().items(): - prt.write('{FLD:20}: {VAL}\n'.format(FLD=fld, VAL=val)) + prt.write("{FLD:20}: {VAL}\n".format(FLD=fld, VAL=val)) - print(' {N} GAFs WITH {E} TOTAL ERRORS WROTE: {TXT}'.format( - N=len(gaf_errs), E=err_cnt, TXT=fout_err)) + print( + " {N} GAFs WITH {E} TOTAL ERRORS WROTE: {TXT}".format( + N=len(gaf_errs), E=err_cnt, TXT=fout_err + ) + ) def _prt_not_found(not_found): - print('**WARNING: {N} EMPTY ASSOCIATIONS:'.format(N=len(not_found))) + print("**WARNING: {N} EMPTY ASSOCIATIONS:".format(N=len(not_found))) for idx, assc in enumerate(not_found): - print(' {I}) {ASSC}'.format(I=idx, ASSC=assc)) + print(" {I}) {ASSC}".format(I=idx, ASSC=assc)) + def _hms(tic): """Get Timing.""" - return '{HMS}'.format(HMS=str(datetime.timedelta(seconds=(timeit.default_timer()-tic)))) + return "{HMS}".format( + HMS=str(datetime.timedelta(seconds=(timeit.default_timer() - tic))) + ) def get_go2obj(): """Read GODag and return go2obj.""" - godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None) - return {go:o for go, o in godag.items() if not o.is_obsolete} + godag = get_godag(os.path.join(REPO, "go-basic.obo")) + return {go: o for go, o in godag.items() if not o.is_obsolete} + -if __name__ == '__main__': +if __name__ == "__main__": ASSC_NAME = None if len(sys.argv) == 1 else sys.argv[1] test_semantic_similarity(ASSC_NAME) diff --git a/tests/test_cli_write_hierarchy.py b/tests/test_cli_write_hierarchy.py index 89862bdb..8a6229fc 100755 --- a/tests/test_cli_write_hierarchy.py +++ b/tests/test_cli_write_hierarchy.py @@ -1,13 +1,15 @@ #!/usr/bin/env python """Test that hierarchy below specified GO terms is printed.""" -from __future__ import print_function - -__copyright__ = "Copyright (c) 2017-present, DV Klopfenstein. Haiboa Tang. All rights reserved." +__copyright__ = ( + "Copyright (c) 2017-present, DV Klopfenstein. Haiboa Tang. All rights reserved." +) import os -from goatools.cli.wr_hierarchy import WrHierCli + from goatools.base import download_go_basic_obo +from goatools.cli.wr_hierarchy import WrHierCli + REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") # --o Output file in ASCII text format @@ -17,22 +19,28 @@ # --concise If a branch has already been printed, do not re-print. # Print '===' instead of dashes to note the point of compression + def test_cli(): """Add and remove markers for a file.""" # pylint: disable=bad-whitespace args_exp = [ # args exp_set expected_dict # -------- ------- --------------------- - ([], {'dag':'go-basic.obo', 'dash_len':6}), - (['--dag=go-basic.obo'], {'dag':'go-basic.obo', 'dash_len':6}), - (['-o rpt.txt'], {'dag':'go-basic.obo', 'dash_len':6, 'o':'rpt.txt'}), - (['--max_indent=7'], {'dag':'go-basic.obo', 'dash_len':6, 'max_indent':7}), - (['CC', '--concise'], {'dag':'go-basic.obo', 'dash_len':6, 'GO':['CC'], 'concise':True}), - (['--no_indent'], {'dag':'go-basic.obo', 'dash_len':6, 'no_indent':True}), - (['--concise', '--no_indent'], {'dag':'go-basic.obo', 'dash_len':6, - 'concise':True, 'no_indent':True}), + ([], {"dag": "go-basic.obo", "dash_len": 6}), + (["--dag=go-basic.obo"], {"dag": "go-basic.obo", "dash_len": 6}), + (["-o rpt.txt"], {"dag": "go-basic.obo", "dash_len": 6, "o": "rpt.txt"}), + (["--max_indent=7"], {"dag": "go-basic.obo", "dash_len": 6, "max_indent": 7}), + ( + ["CC", "--concise"], + {"dag": "go-basic.obo", "dash_len": 6, "GO": ["CC"], "concise": True}, + ), + (["--no_indent"], {"dag": "go-basic.obo", "dash_len": 6, "no_indent": True}), + ( + ["--concise", "--no_indent"], + {"dag": "go-basic.obo", "dash_len": 6, "concise": True, "no_indent": True}, + ), ] - download_go_basic_obo('go-basic.obo', loading_bar=None) + download_go_basic_obo("go-basic.obo") for idx, (args, exp_dict) in enumerate(args_exp): print("ARGS={ARGS}".format(ARGS=args)) print("EXP={EXP}".format(EXP=exp_dict)) @@ -43,13 +51,13 @@ def test_cli(): print("") # Test writing to a file if obj.goids: - fout_txt = os.path.join(REPO, 'wrhier{N}.txt'.format(N=idx)) - os.system('rm -f {FILE}'.format(FILE=fout_txt)) + fout_txt = os.path.join(REPO, "wrhier{N}.txt".format(N=idx)) + os.system("rm -f {FILE}".format(FILE=fout_txt)) obj.wrtxt_hier(fout_txt) - assert os.path.exists(fout_txt), 'FILE NOT FOUND({F})'.format(F=fout_txt) + assert os.path.exists(fout_txt), "FILE NOT FOUND({F})".format(F=fout_txt) -if __name__ == '__main__': +if __name__ == "__main__": test_cli() # Copyright (c) 2017-present, DV Klopfenstein, Haibao Tang. All rights reserved. diff --git a/tests/test_cmds_find_enrichment_md.py b/tests/test_cmds_find_enrichment_md.py index 30a245e2..2a12ffff 100755 --- a/tests/test_cmds_find_enrichment_md.py +++ b/tests/test_cmds_find_enrichment_md.py @@ -1,60 +1,68 @@ #!/usr/bin/env python3 """Test running an enrichment using any annotation file format.""" -__copyright__ = "Copyright (C) 2010-present, DV Klopfenstein, H Tang. All rights reserved." +__copyright__ = ( + "Copyright (C) 2010-present, DV Klopfenstein, H Tang. All rights reserved." +) from os import system from os.path import join import sys -from goatools.base import get_godag -from goatools.associations import dnld_annotation + from tests.utils import REPO +from goatools.associations import dnld_annotation +from goatools.base import get_godag + def test_find_enrichment(run_all=False): """RUn an enrichments using all annotation file formats""" if run_all: - fin_obo = join(REPO, 'go-basic.obo') - get_godag(fin_obo, optional_attrs={'relationship'}, loading_bar=None) - fin_gaf = join(REPO, 'goa_human.gaf') + fin_obo = join(REPO, "go-basic.obo") + get_godag(fin_obo, optional_attrs={"relationship"}) + fin_gaf = join(REPO, "goa_human.gaf") dnld_annotation(fin_gaf) for idx, cmd in enumerate(_get_cmds()): - print('------------------- TEST {I} ------------------------------------'.format(I=idx)) - print('CMD: {CMD}'.format(CMD=cmd)) + print( + "------------------- TEST {I} ------------------------------------".format( + I=idx + ) + ) + print("CMD: {CMD}".format(CMD=cmd)) assert system(cmd) == 0 print("TEST PASSED") else: - print('RUN THIS TEST WITH AN ARGUMENT') + print("RUN THIS TEST WITH AN ARGUMENT") def _get_cmds(): """Get commands used in ./doc/md/README_find_enrichment.md""" # pylint: disable=line-too-long return [ - 'python3 scripts/find_enrichment.py data/study data/population data/association --outfile=goea.xlsx,goea.tsv --pval_field=fdr_bh', # 0 - 'python3 scripts/find_enrichment.py data/study data/population data/association --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_id2gos.xlsx', # 1 - 'python3 scripts/find_enrichment.py ids_stu_gaf.txt ids_pop_gaf.txt goa_human.gaf --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gaf.xlsx', # 2 - 'python3 scripts/find_enrichment.py ids_stu_gpad.txt ids_pop_gpad.txt goa_human.gpad --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gpad.xlsx', # 3 - 'python3 scripts/find_enrichment.py ids_stu_gpad.txt ids_pop_gpad.txt goa_human.gpad --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gpad.xlsx -r', # 4 - 'python3 scripts/find_enrichment.py ids_stu_gpad.txt ids_pop_gpad.txt goa_human.gpad --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gpad.xlsx --relationships=regulates,negatively_regulates,positively_regulates', - 'python3 scripts/find_enrichment.py ids_stu_gene2go_9606.txt ids_pop_gene2go_9606.txt gene2go --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gene2go_9606.xlsx', - 'python3 scripts/find_enrichment.py ids_stu_gene2go_10090.txt ids_pop_gene2go_10090.txt gene2go --taxid=10090 --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gene2go_10090.xlsx', - 'python3 scripts/find_enrichment.py ids_stu_gpad.txt ids_pop_gpad.txt goa_human.gpad --ev_exc=IEA --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gpad.xlsx', - 'python3 scripts/find_enrichment.py ids_stu_gpad.txt ids_pop_gpad.txt goa_human.gpad --ev_inc=Experimental --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gpad.xlsx', - 'python3 scripts/find_enrichment.py ids_stu_gpad.txt ids_pop_gpad.txt goa_human.gpad --ev_inc=EXP,IDA,IPI,IMP,IGI,IEP --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gpad.xlsx', - 'python3 scripts/find_enrichment.py --ev_help', - 'python3 scripts/find_enrichment.py data/study data/population data/association', - 'python3 scripts/find_enrichment.py data/study data/population data/association --sections=goatools.test_data.sections.data2018_07_find_enrichment', - 'python3 scripts/find_enrichment.py data/study data/population data/association --outfile=goea_uncorr.xlsx', - 'python3 scripts/find_enrichment.py data/study data/population data/association --outfile=goea_fdr_bh.xlsx,goea_fdr_bh.tsv --pval_field=fdr_bh', - 'python3 scripts/find_enrichment.py data/study data/population data/association --outfile=goea_fdr_bh_flat.xlsx --method=fdr_bh', - 'python3 scripts/find_enrichment.py data/study data/population data/association --outfile=goea_fdr_bh_grpd.xlsx --method=fdr_bh --sections=goatools.test_data.sections.data2018_07_find_enrichment', - 'python3 scripts/find_enrichment.py data/study data/population data/association --outfile=goea_all.xlsx,goea_all.tsv --pval=-1', + "python3 scripts/find_enrichment.py data/study data/population data/association --outfile=goea.xlsx,goea.tsv --pval_field=fdr_bh", # 0 + "python3 scripts/find_enrichment.py data/study data/population data/association --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_id2gos.xlsx", # 1 + "python3 scripts/find_enrichment.py ids_stu_gaf.txt ids_pop_gaf.txt goa_human.gaf --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gaf.xlsx", # 2 + "python3 scripts/find_enrichment.py ids_stu_gpad.txt ids_pop_gpad.txt goa_human.gpad --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gpad.xlsx", # 3 + "python3 scripts/find_enrichment.py ids_stu_gpad.txt ids_pop_gpad.txt goa_human.gpad --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gpad.xlsx -r", # 4 + "python3 scripts/find_enrichment.py ids_stu_gpad.txt ids_pop_gpad.txt goa_human.gpad --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gpad.xlsx --relationships=regulates,negatively_regulates,positively_regulates", + "python3 scripts/find_enrichment.py ids_stu_gene2go_9606.txt ids_pop_gene2go_9606.txt gene2go --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gene2go_9606.xlsx", + "python3 scripts/find_enrichment.py ids_stu_gene2go_10090.txt ids_pop_gene2go_10090.txt gene2go --taxid=10090 --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gene2go_10090.xlsx", + "python3 scripts/find_enrichment.py ids_stu_gpad.txt ids_pop_gpad.txt goa_human.gpad --ev_exc=IEA --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gpad.xlsx", + "python3 scripts/find_enrichment.py ids_stu_gpad.txt ids_pop_gpad.txt goa_human.gpad --ev_inc=Experimental --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gpad.xlsx", + "python3 scripts/find_enrichment.py ids_stu_gpad.txt ids_pop_gpad.txt goa_human.gpad --ev_inc=EXP,IDA,IPI,IMP,IGI,IEP --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gpad.xlsx", + "python3 scripts/find_enrichment.py --ev_help", + "python3 scripts/find_enrichment.py data/study data/population data/association", + "python3 scripts/find_enrichment.py data/study data/population data/association --sections=goatools.test_data.sections.data2018_07_find_enrichment", + "python3 scripts/find_enrichment.py data/study data/population data/association --outfile=goea_uncorr.xlsx", + "python3 scripts/find_enrichment.py data/study data/population data/association --outfile=goea_fdr_bh.xlsx,goea_fdr_bh.tsv --pval_field=fdr_bh", + "python3 scripts/find_enrichment.py data/study data/population data/association --outfile=goea_fdr_bh_flat.xlsx --method=fdr_bh", + "python3 scripts/find_enrichment.py data/study data/population data/association --outfile=goea_fdr_bh_grpd.xlsx --method=fdr_bh --sections=goatools.test_data.sections.data2018_07_find_enrichment", + "python3 scripts/find_enrichment.py data/study data/population data/association --outfile=goea_all.xlsx,goea_all.tsv --pval=-1", ] -if __name__ == '__main__': +if __name__ == "__main__": test_find_enrichment(len(sys.argv) != 1) # Copyright (C) 2010-present, DV Klopfenstein, H Tang. All rights reserved. diff --git a/tests/test_dcnt_r01.py b/tests/test_dcnt_r01.py index 692ebec5..95cd428a 100755 --- a/tests/test_dcnt_r01.py +++ b/tests/test_dcnt_r01.py @@ -1,8 +1,6 @@ #!/usr/bin/env python """Ancestors/Descendants.""" -from __future__ import print_function - import os import sys import timeit @@ -11,9 +9,9 @@ from scipy import stats from goatools.base import download_go_basic_obo -from goatools.obo_parser import GODag from goatools.godag.prttime import prt_hms from goatools.gosubdag.gosubdag import GoSubDag +from goatools.obo_parser import GODag def test_go_pools(): @@ -35,7 +33,7 @@ def test_go_pools(): gosubdag_r1 = objr.get_gosubdag_r1(goids) assert gosubdag_r0.go_sources == gosubdag_r1.go_sources assert set(gosubdag_r0.go2obj).issubset(gosubdag_r1.go2obj) - cnts = {'r0_u':[], 'r1_u':[], 'r0_d':[], 'r1_d':[]} + cnts = {"r0_u": [], "r1_u": [], "r0_d": [], "r1_d": []} for goid in gosubdag_r0.go2obj: r1_d = gosubdag_r1.rcntobj.go2descendants.get(goid, set()) r0_d = gosubdag_r0.rcntobj.go2descendants.get(goid, set()) @@ -43,10 +41,10 @@ def test_go_pools(): r0_u = gosubdag_r0.rcntobj.go2ancestors.get(goid, set()) r1_u = gosubdag_r1.rcntobj.go2ancestors.get(goid, set()) assert r0_u.issubset(r1_u), "R1({}) R0({})".format(len(r1_u), len(r0_u)) - cnts['r0_u'].append(len(r0_u)) - cnts['r1_u'].append(len(r1_u)) - cnts['r0_d'].append(len(r0_d)) - cnts['r1_d'].append(len(r1_d)) + cnts["r0_u"].append(len(r0_u)) + cnts["r1_u"].append(len(r1_u)) + cnts["r0_d"].append(len(r0_d)) + cnts["r1_d"].append(len(r1_d)) objr.prt_cnts(cnts) @@ -56,23 +54,33 @@ class _Run: obo = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../go-basic.obo") def __init__(self): - download_go_basic_obo(self.obo, sys.stdout, loading_bar=None) + download_go_basic_obo(self.obo, sys.stdout) self.godag_r0 = GODag(self.obo) - self.godag_r1 = GODag(self.obo, optional_attrs=set(['relationship'])) + self.godag_r1 = GODag(self.obo, optional_attrs=set(["relationship"])) self.goids = list(set(o.id for o in self.godag_r0.values())) # GoSubDag (plain) tic = timeit.default_timer() self.gosubdag_r0 = GoSubDag(self.goids, self.godag_r0, prt=None) - prt_hms(tic, "GoSubDag r0 {N:4} GOs {S:3} srcs".format( - N=len(self.gosubdag_r0.go2obj), S=len(self.gosubdag_r0.go_sources))) + prt_hms( + tic, + "GoSubDag r0 {N:4} GOs {S:3} srcs".format( + N=len(self.gosubdag_r0.go2obj), S=len(self.gosubdag_r0.go_sources) + ), + ) # GoSubDag with relationships - self.gosubdag_r1 = GoSubDag(self.goids, self.godag_r1, prt=None, relationships=True) - prt_hms(tic, "GoSubDag r1 {N:4} GOs {S:3} srcs".format( - N=len(self.gosubdag_r1.go2obj), S=len(self.gosubdag_r1.go_sources))) + self.gosubdag_r1 = GoSubDag( + self.goids, self.godag_r1, prt=None, relationships=True + ) + prt_hms( + tic, + "GoSubDag r1 {N:4} GOs {S:3} srcs".format( + N=len(self.gosubdag_r1.go2obj), S=len(self.gosubdag_r1.go_sources) + ), + ) def prt_cnts(self, cnts): """Compare ancestor/descendant counts with relatives=False/True.""" - k2v = {k:self.str_stats(v) for k, v in cnts.items()} + k2v = {k: self.str_stats(v) for k, v in cnts.items()} print(k2v) @staticmethod @@ -85,21 +93,37 @@ def str_stats(vals): def get_gosubdag_r0(self, goids): """Return a GoSubDag with N randomly chosen GO sources.""" tic = timeit.default_timer() - gosubdag = GoSubDag(goids, self.godag_r0, relationships=None, - #rcntobj=self.gosubdag_r0.rcntobj, - prt=None) - prt_hms(tic, "GoSubDag r0 {N:4} GOs {S:3} srcs".format( - N=len(gosubdag.go2obj), S=len(gosubdag.go_sources))) + gosubdag = GoSubDag( + goids, + self.godag_r0, + relationships=None, + # rcntobj=self.gosubdag_r0.rcntobj, + prt=None, + ) + prt_hms( + tic, + "GoSubDag r0 {N:4} GOs {S:3} srcs".format( + N=len(gosubdag.go2obj), S=len(gosubdag.go_sources) + ), + ) return gosubdag def get_gosubdag_r1(self, goids): """Return a GoSubDag with N randomly chosen GO sources.""" tic = timeit.default_timer() - gosubdag = GoSubDag(goids, self.godag_r1, relationships=True, - #rcntobj=self.gosubdag_r1.rcntobj, - prt=None) - prt_hms(tic, "GoSubDag r1 {N:4} GOs {S:3} srcs".format( - N=len(gosubdag.go2obj), S=len(gosubdag.go_sources))) + gosubdag = GoSubDag( + goids, + self.godag_r1, + relationships=True, + # rcntobj=self.gosubdag_r1.rcntobj, + prt=None, + ) + prt_hms( + tic, + "GoSubDag r1 {N:4} GOs {S:3} srcs".format( + N=len(gosubdag.go2obj), S=len(gosubdag.go_sources) + ), + ) return gosubdag def get_goids_rand(self, qty): @@ -108,5 +132,5 @@ def get_goids_rand(self, qty): return self.goids[:qty] -if __name__ == '__main__': +if __name__ == "__main__": test_go_pools() diff --git a/tests/test_dnlds.py b/tests/test_dnlds.py index e928ea81..58f63a04 100644 --- a/tests/test_dnlds.py +++ b/tests/test_dnlds.py @@ -1,48 +1,43 @@ """tests/test_dnlds.py: Test downloading go-basic.obo file and goslims obo/owl.""" import os -import sys + from goatools.base import download_go_basic_obo -from goatools.base import download_ncbi_associations REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") -def test_dnlds(prt=sys.stdout): +def test_dnlds(): """Test downloads of ontologies and NCBI associations.""" goslims = [ - 'goslim_aspergillus', - 'goslim_candida', + "goslim_aspergillus", + "goslim_candida", #'goslim_chembl', # 404 not found - 'goslim_generic', - 'goslim_metagenomics', - 'goslim_pir', - 'goslim_plant', - 'goslim_pombe', - 'goslim_synapse', - 'goslim_virus', - 'goslim_yeast'] + "goslim_generic", + "goslim_metagenomics", + "goslim_pir", + "goslim_plant", + "goslim_pombe", + "goslim_synapse", + "goslim_virus", + "goslim_yeast", + ] # Test downloads of ontologies. dnld_ontology(os.path.join(REPO, "go-basic.obo")) # Test downloads of go-slim ontologies. for goslim in goslims: - for ext in ['obo', 'owl']: + for ext in ["obo", "owl"]: file_dst = os.path.join(REPO, "{DAG}.{EXT}".format(DAG=goslim, EXT=ext)) dnld_ontology(file_dst) - # Test downloading of associations from NCBI. - file_assc = os.path.join(REPO, "gene2go") - #os.system("rm -f {FILE}".format(FILE=file_assc)) - #download_ncbi_associations(file_assc, prt, loading_bar=None) - #download_ncbi_associations(file_assc, prt, loading_bar=None) - #assert os.path.isfile(file_assc), "FILE({F}) EXPECTED TO EXIST".format(F=file_assc) + def dnld_ontology(filename): """Test downloading of ontologies.""" - # download_go_basic_obo(filename, loading_bar=None) os.system("rm -f {FILE}".format(FILE=filename)) - download_go_basic_obo(filename, loading_bar=None) - download_go_basic_obo(filename, loading_bar=None) + download_go_basic_obo(filename) + download_go_basic_obo(filename) assert os.path.isfile(filename), "FILE({F}) EXPECTED TO EXIST".format(F=filename) -if __name__ == '__main__': + +if __name__ == "__main__": test_dnlds() diff --git a/tests/test_find_enrichment_overlap.py b/tests/test_find_enrichment_overlap.py index 43d99dc5..3af91013 100755 --- a/tests/test_find_enrichment_overlap.py +++ b/tests/test_find_enrichment_overlap.py @@ -1,11 +1,10 @@ #!/usr/bin/env python """fraction of genes/proteins in study are found in the population background""" -from __future__ import print_function - __copyright__ = "Copyright (C) 2010-2019, DV Klopfenstein, H Tang. All rights reserved." import os + from goatools.base import get_godag from goatools.cli.find_enrichment import GoeaCliFnc from goatools.test_data.cli.find_enrichment_dflts import ArgsDict @@ -13,38 +12,44 @@ REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") + # This test is not on Travis because non-code file, data/study, is not found by Travis-CI # This test can and should be run before any pull requests using 'make test' def test_find_enrichment(): """Recreate run in run.sh.""" fin_genes = os.path.join(REPO, "data/study") - pop = set(_.strip() for _ in open(fin_genes) if _.strip()) + pop = set(_.strip() for _ in open(fin_genes, encoding="utf-8") if _.strip()) stu_orig = pop num_pop = len(pop) objtest = ArgsDict() - get_godag(objtest.namespace['obo'], loading_bar=None) - for min_overlap in [.25, .50, .75]: - objtest.namespace['min_overlap'] = min_overlap + get_godag(objtest.namespace["obo"]) + for min_overlap in [0.25, 0.50, 0.75]: + objtest.namespace["min_overlap"] = min_overlap args = objtest.ntobj(**objtest.namespace) objcli = GoeaCliFnc(args) - num_stu_in_pop = int(round(min_overlap*num_pop)) + 10 + num_stu_in_pop = int(round(min_overlap * num_pop)) + 10 study = _get_studygenes(stu_orig, num_stu_in_pop) overlap = objcli.get_overlap(study, pop) - print("{N:3} of {M} ({OL}%) in study in pop".format( - N=num_stu_in_pop, M=num_pop, OL=100.0*overlap)) + print( + "{N:3} of {M} ({OL}%) in study in pop".format( + N=num_stu_in_pop, M=num_pop, OL=100.0 * overlap + ) + ) objcli.chk_genes(study, pop) print("TEST PASSED") + def _get_studygenes(study_orig, num_stu_in_pop): """Get a study set having genes not found in the population.""" study = set() for idx, gene in enumerate(study_orig): if idx > num_stu_in_pop: - gene += 'A' + gene += "A" study.add(gene) return study -if __name__ == '__main__': + +if __name__ == "__main__": test_find_enrichment() # Copyright (C) 2010-2019, DV Klopfenstein, H Tang. All rights reserved. diff --git a/tests/test_find_enrichment_run.py b/tests/test_find_enrichment_run.py index 879e73e0..5e800fbe 100755 --- a/tests/test_find_enrichment_run.py +++ b/tests/test_find_enrichment_run.py @@ -4,11 +4,10 @@ $ python find_enrichment.py --pval=0.05 --indent data/study data/population data/association. """ -from __future__ import print_function - __copyright__ = "Copyright (C) 2010-2018, DV Klopfenstein, H Tang. All rights reserved." import collections as cx + from goatools.base import get_godag from goatools.cli.find_enrichment import GoeaCliFnc from goatools.test_data.cli.find_enrichment_dflts import ArgsDict @@ -20,15 +19,14 @@ def test_find_enrichment(): """Recreate run in run.sh.""" # Set params objtest = ArgsDict() - get_godag(objtest.namespace['obo'], loading_bar=None) - objtest.namespace['indent'] = True + get_godag(objtest.namespace["obo"]) + objtest.namespace["indent"] = True args = objtest.ntobj(**objtest.namespace) # Run test objcli = GoeaCliFnc(args) # Check results - ## expected_cnts = {'fdr_bh': 17, 'sidak': 5, 'holm': 5, 'bonferroni': 5} - expected_cnts = {'fdr_bh': 19, 'sidak': 9, 'holm': 9, 'bonferroni': 9} + expected_cnts = {"fdr_bh": 19, "sidak": 9, "holm": 9, "bonferroni": 9} _chk_results(objcli.results_all, expected_cnts, objcli) print("TEST PASSED") @@ -41,12 +39,12 @@ def _chk_results(results, expected_cnts, objcli): for method in objcli.methods: ctr[method] += getattr(ntres, "p_{METHOD}".format(METHOD=method)) < alpha for method, num_sig in ctr.most_common(): - assert num_sig == expected_cnts[method], '{EXP} {ACT}'.format( - EXP=expected_cnts, ACT=ctr.most_common()) - + assert num_sig == expected_cnts[method], "{EXP} {ACT}".format( + EXP=expected_cnts, ACT=ctr.most_common() + ) -if __name__ == '__main__': +if __name__ == "__main__": test_find_enrichment() # Copyright (C) 2010-2018, DV Klopfenstein, H Tang. All rights reserved. diff --git a/tests/test_genes_cell_cycle.py b/tests/test_genes_cell_cycle.py index c9849763..4c85820b 100755 --- a/tests/test_genes_cell_cycle.py +++ b/tests/test_genes_cell_cycle.py @@ -3,39 +3,43 @@ import sys import os import re + from collections import defaultdict + +from goatools.associations import get_assoc_ncbi_taxids from goatools.base import download_go_basic_obo from goatools.go_search import GoSearch -from goatools.associations import get_assoc_ncbi_taxids from goatools.wr_tbl import prt_txt __copyright__ = "Copyright (C) 2010-2019, DV Klopfenstein, H Tang, All rights reserved." __author__ = "DV Klopfenstein" + def test_cell_cycle(taxid=9606, log=sys.stdout): """Get all genes related to cell cycle. Write results to file.""" geneids = get_genes_cell_cycle(taxid, log) fout = "cell_cycle_genes_{TAXID}.txt".format(TAXID=taxid) prt_genes(fout, geneids, taxid, log) + def get_genes_cell_cycle(taxid=9606, log=sys.stdout): """Test GOEA with local multipletest correction methods for cell cycle.""" # Download ontologies and annotations, if necessary fin_go_obo = os.path.join(os.getcwd(), "go-basic.obo") - download_go_basic_obo(fin_go_obo, loading_bar=None) + download_go_basic_obo(fin_go_obo) # Because get_assoc_ncbi_taxids returns id2gos, we will opt to # use the (optional) multi-level dictionary separate associations by taxid # taxid2asscs contains both GO2IDs and ID2GOs. taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set))) - get_assoc_ncbi_taxids([taxid], taxid2asscs=taxid2asscs, loading_bar=None) + get_assoc_ncbi_taxids([taxid], taxid2asscs=taxid2asscs) # Initialize GO-search helper object with obo and annotations(go2items) - srch = GoSearch(fin_go_obo, go2items=taxid2asscs[taxid]['GO2IDs']) + srch = GoSearch(fin_go_obo, go2items=taxid2asscs[taxid]["GO2IDs"]) # Compile search pattern for 'cell cycle' - cell_cycle = re.compile(r'cell cycle', flags=re.IGNORECASE) + cell_cycle = re.compile(r"cell cycle", flags=re.IGNORECASE) # Find ALL GOs that have 'cell cycle'. Store results in file. fout_allgos = "cell_cycle_gos_{TAXID}.log".format(TAXID=taxid) - with open(fout_allgos, "w") as prt: + with open(fout_allgos, "w", encoding="utf-8") as prt: # Search for 'cell cycle' in GO terms gos_cc_all = srch.get_matching_gos(cell_cycle, prt=prt) # Researcher carefully reviews GO results and finds GO:0005764(lysosome) @@ -43,44 +47,55 @@ def get_genes_cell_cycle(taxid=9606, log=sys.stdout): # cell cycle-independent # Researcher removes 'lysosome' from 'cell cycle' results # by removing any GOs matching 'cell cycle-independent' - cell_cycle_ind = re.compile(r'cell cycle.independent', flags=re.IGNORECASE) + cell_cycle_ind = re.compile(r"cell cycle.independent", flags=re.IGNORECASE) gos_no_cc = srch.get_matching_gos(cell_cycle_ind, gos=gos_cc_all, prt=prt) gos = gos_cc_all.difference(gos_no_cc) # Add children GOs of cell cycle GOs gos_all = srch.add_children_gos(gos) if log is not None: - log.write(' taxid {TAXID:>5}\n'.format(TAXID=taxid)) - log.write(' FOUND {N:>5} GOs: {F}\n'.format( - N=len(gos_all), F=fout_allgos)) + log.write(" taxid {TAXID:>5}\n".format(TAXID=taxid)) + log.write( + " FOUND {N:>5} GOs: {F}\n".format(N=len(gos_all), F=fout_allgos) + ) # Get Entrez GeneIDs for cell cycle GOs geneids = srch.get_items(gos_all) return geneids + def prt_genes(fout_genes, geneids, taxid, log): """Print 'cell cycle' geneids, with or without Symbol and description information.""" fin_symbols = "genes_NCBI_{TAXID}_All.py".format(TAXID=taxid) # If gene Symbol information is available, print geneid and Symbol if os.path.isfile(fin_symbols): import importlib + module_name = "".join(["goatools.test_data.", fin_symbols[:-3]]) module = importlib.import_module(module_name) geneid2nt = module.GENEID2NT fmtstr = "{GeneID:>9} {Symbol:<16} {description}\n" nts = [geneid2nt[geneid] for geneid in sorted(geneids) if geneid in geneid2nt] - with open(fout_genes, 'w') as prt: + with open(fout_genes, "w", encoding="utf-8") as prt: prt_txt(prt, nts, fmtstr) if log is not None: - log.write(" WROTE {N:>5} genes: {FOUT}\n".format(FOUT=fout_genes, N=len(nts))) + log.write( + " WROTE {N:>5} genes: {FOUT}\n".format( + FOUT=fout_genes, N=len(nts) + ) + ) # Just print geneids else: - with open(fout_genes, 'w') as prt: + with open(fout_genes, "w", encoding="utf-8") as prt: for geneid in geneids: prt.write("{geneid}\n".format(geneid=geneid)) if log is not None: - log.write(" WROTE {N:>5} genes: {FOUT}\n".format( - FOUT=fout_genes, N=len(geneids))) + log.write( + " WROTE {N:>5} genes: {FOUT}\n".format( + FOUT=fout_genes, N=len(geneids) + ) + ) + -if __name__ == '__main__': +if __name__ == "__main__": test_cell_cycle(9606) test_cell_cycle(10090) diff --git a/tests/test_get_godag.py b/tests/test_get_godag.py index 12d58cbe..96ae439a 100755 --- a/tests/test_get_godag.py +++ b/tests/test_get_godag.py @@ -3,6 +3,7 @@ import os import sys + from goatools.base import get_godag @@ -14,7 +15,7 @@ def test_godag(prt=sys.stdout): for fin_obo in ["go-basic.obo", "goslim_generic.obo"]: fin_full = os.path.join(cwd, fin_obo) os.system("rm -f {OBO}".format(OBO=fin_obo)) - godag = get_godag(fin_full, prt, loading_bar=None) # Get GODag object + godag = get_godag(fin_full, prt) # Get GODag object assert godag, "GO-DAG({OBO}) NOT PROPERLY LOADED".format(OBO=fin_obo) diff --git a/tests/test_go_print.py b/tests/test_go_print.py index 87f410e3..fe01b841 100755 --- a/tests/test_go_print.py +++ b/tests/test_go_print.py @@ -4,13 +4,15 @@ import os import sys import goatools + from goatools.base import download_go_basic_obo + def test_go_print(prt=sys.stdout): """Test that all GO Terms can be printed, even if level/depth are not assigned.""" prt_pypath(prt) file_obo = os.path.join(os.getcwd(), "go-basic.obo") - obo_file = download_go_basic_obo(file_obo, prt=prt, loading_bar=None) + obo_file = download_go_basic_obo(file_obo, prt=prt) reader = goatools.obo_parser.OBOReader(obo_file) go_terms = list(reader) prt.write("Python Version: {VER}\n\n".format(VER=sys.version)) @@ -21,9 +23,10 @@ def test_go_print(prt=sys.stdout): for idx, go_rec in enumerate(go_terms): prt.write("{I:>7,} {RECORD}\n".format(I=idx, RECORD=go_rec)) + def prt_pypath(prt): """Print PYTHONPATH contents.""" - pypathes = os.environ.get('PYTHONPATH', None) + pypathes = os.environ.get("PYTHONPATH", None) if pypathes: prt.write("\nPYTHONPATH:\n") for idx, pypath in enumerate(pypathes.split(os.pathsep)): @@ -31,5 +34,5 @@ def prt_pypath(prt): prt.write("\n") -if __name__ == '__main__': +if __name__ == "__main__": test_go_print() diff --git a/tests/test_goea_errors.py b/tests/test_goea_errors.py index c23e70af..3e6169c6 100755 --- a/tests/test_goea_errors.py +++ b/tests/test_goea_errors.py @@ -8,6 +8,7 @@ import sys import os + from goatools.base import get_godag from goatools.go_enrichment import GOEnrichmentStudy from goatools.associations import read_associations @@ -17,18 +18,18 @@ def init_goea(**kws): """Initialize GODag and GOEnrichmentStudy.""" - godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None) + godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo")) fin_assc = ROOT + "association" - assoc = read_associations(fin_assc, 'id2gos', no_top=True) + assoc = read_associations(fin_assc, "id2gos", no_top=True) popul_ids = [line.rstrip() for line in open(ROOT + "population")] - methods = kws['methods'] if 'methods' in kws else ['not_bonferroni'] + methods = kws["methods"] if "methods" in kws else ["not_bonferroni"] study_ids = [line.rstrip() for line in open(ROOT + "study")] return GOEnrichmentStudy(popul_ids, assoc, godag, methods=methods), study_ids def run_method_bad_ini(): """Test attempting to use an unsupported method in initialization.""" - goea, study_ids = init_goea(methods=['not_fdr']) + goea, study_ids = init_goea(methods=["not_fdr"]) # Test that method(s) set during initialization are valid goea.run_study(study_ids) @@ -37,7 +38,7 @@ def run_method_bad_run(): """Test attempting to use an unsupported method in run.""" goea, study_ids = init_goea() # Test that method(s) set while running a GOEA on a study are valid - goea.run_study(study_ids, methods=['invalid_method']) + goea.run_study(study_ids, methods=["invalid_method"]) def test_all(log=sys.stdout): @@ -52,14 +53,18 @@ def test_all(log=sys.stdout): except Exception as inst: # Run next test if str(inst).startswith(exp_errmsg): - log.write("Test PASSED. Expected error message seen: {EXP}\n".format( - EXP=exp_errmsg)) + log.write( + "Test PASSED. Expected error message seen: {EXP}\n".format( + EXP=exp_errmsg + ) + ) else: - raise Exception("EXPECTED({EXP}). ACTUAL({ACT})".format( - EXP=exp_errmsg, ACT=inst)) + raise Exception( + "EXPECTED({EXP}). ACTUAL({ACT})".format(EXP=exp_errmsg, ACT=inst) + ) -if __name__ == '__main__': +if __name__ == "__main__": test_all() # Copyright (C) 2016-2018, DV Klopfenstein, H Tang. All rights reserved. diff --git a/tests/test_goea_local.py b/tests/test_goea_local.py index 453eddda..576dfe92 100755 --- a/tests/test_goea_local.py +++ b/tests/test_goea_local.py @@ -1,42 +1,49 @@ """Test Gene Ontology Enrichement Analysis.""" -import sys import os -from goatools.go_enrichment import GOEnrichmentStudy +import sys + from goatools.associations import read_associations -from goatools.godag.prtfncs import GoeaPrintFunctions from goatools.base import get_godag +from goatools.go_enrichment import GOEnrichmentStudy +from goatools.godag.prtfncs import GoeaPrintFunctions __copyright__ = "Copyright (C) 2010-2019, H Tang et al., All rights reserved." REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") + def test_unknown_gos(): """Ensure that a study with only unknown GO Terms will run gracefully.""" - #pylint: disable=bad-whitespace - code = os.system("python {SCR} --alpha=0.05 {STUDY} {POP} {ASSN} --obo={OBO}".format( - SCR ="{REPO}/scripts/find_enrichment.py".format(REPO=REPO), - OBO ="{REPO}/go-basic.obo".format(REPO=REPO), - STUDY="{REPO}/tests/data/study_unknown".format(REPO=REPO), - POP ="{REPO}/tests/data/small_population".format(REPO=REPO), - ASSN ="{REPO}/tests/data/small_association".format(REPO=REPO))) + # pylint: disable=bad-whitespace + code = os.system( + "python {SCR} --alpha=0.05 {STUDY} {POP} {ASSN} --obo={OBO}".format( + SCR="{REPO}/scripts/find_enrichment.py".format(REPO=REPO), + OBO="{REPO}/go-basic.obo".format(REPO=REPO), + STUDY="{REPO}/tests/data/study_unknown".format(REPO=REPO), + POP="{REPO}/tests/data/small_population".format(REPO=REPO), + ASSN="{REPO}/tests/data/small_association".format(REPO=REPO), + ) + ) assert code != 0, "**FAILED: Simple find_enrichment test" + def test_goea_fdr_dflt(): """Test GOEA with method, fdr. Print original summary""" goeaobj = get_goeaobj() study_fin = "{REPO}/tests/data/small_study".format(REPO=REPO) - study_ids = [line.rstrip() for line in open(study_fin)] + study_ids = [line.rstrip() for line in open(study_fin, encoding="utf-8")] goea_results = goeaobj.run_study(study_ids) objprtres = GoeaPrintFunctions() objprtres.print_results(goea_results) objprtres.print_date() + def test_goea_local(log=sys.stdout): """Test GOEA with local multipletest correction methods for local.""" goeaobj = get_goeaobj() study_fin = "{REPO}/tests/data/small_study".format(REPO=REPO) - study_ids = [line.rstrip() for line in open(study_fin)] + study_ids = [line.rstrip() for line in open(study_fin, encoding="utf-8")] # prt_if = lambda nt: nt.p_uncorrected < 0.00005 prt_if = None for method in ("fdr", "bonferroni", "sidak", "holm"): @@ -45,16 +52,21 @@ def test_goea_local(log=sys.stdout): # "{NS} {p_uncorrected:5.3e} {p_fdr:5.3e} {name} ({study_count} gene(s))\n" # "{NS} {p_uncorrected:5.3e} {p_bonferroni:5.3e} {name} ({study_count} gene(s))\n" # "{NS} {p_uncorrected:5.3e} {p_sidak:5.3e} {name} ({study_count} gene(s))\n" - fmtstr = "".join(["{NS} {p_uncorrected:5.3e} {", - "p_{M}:5.3e".format(M=method), - "} {name} ({study_count} gene(s))\n"]) + fmtstr = "".join( + [ + "{NS} {p_uncorrected:5.3e} {", + "p_{M}:5.3e".format(M=method), + "} {name} ({study_count} gene(s))\n", + ] + ) goeaobj.prt_txt(log, goea_results, fmtstr, prt_if=prt_if) + def test_goea_bonferroni(): """Test GOEA with method, bonferroni.""" - goeaobj = get_goeaobj(['bonferroni']) + goeaobj = get_goeaobj(["bonferroni"]) study_fin = "{REPO}/tests/data/small_study".format(REPO=REPO) - study_ids = [line.rstrip() for line in open(study_fin)] + study_ids = [line.rstrip() for line in open(study_fin, encoding="utf-8")] fout_xlsx = "{REPO}/goea_bonferroni_usrflds.xlsx".format(REPO=REPO) goea_results = goeaobj.run_study(study_ids) @@ -70,24 +82,28 @@ def test_goea_bonferroni(): # prt_if = lambda nt: nt.p_bonferroni < 0.05 prt_if = None # Print to tab-separated table and Excel spreadsheet - goeaobj.wr_tsv("{REPO}/goea_bonferroni.tsv".format(REPO=REPO), goea_results, prt_if=prt_if) + goeaobj.wr_tsv( + "{REPO}/goea_bonferroni.tsv".format(REPO=REPO), goea_results, prt_if=prt_if + ) # Print level in addition to all the regular fields # User can control which fields are printed and the order that they appear in the table prt_flds = "NS level GO name ratio_in_study ratio_in_pop p_uncorrected p_bonferroni".split() fout_xlsx = "{REPO}/goea_bonferroni_lev.xlsx".format(REPO=REPO) goeaobj.wr_xlsx(fout_xlsx, goea_results, prt_if=prt_if, prt_flds=prt_flds) + def get_goeaobj(methods=None): """Test GOEA with method, fdr.""" obo_fin = os.path.join(REPO, "go-basic.obo") - obo_dag = get_godag(obo_fin, loading_bar=None) + obo_dag = get_godag(obo_fin) fin_assc = "{REPO}/tests/data/small_association".format(REPO=REPO) - assoc = read_associations(fin_assc, 'id2gos', no_top=True) + assoc = read_associations(fin_assc, "id2gos", no_top=True) popul_fin = "{REPO}/tests/data/small_population".format(REPO=REPO) - popul_ids = [line.rstrip() for line in open(popul_fin)] + popul_ids = [line.rstrip() for line in open(popul_fin, encoding="utf-8")] goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods) return goeaobj + def run_all(): """Run all local multiple tests.""" test_unknown_gos() @@ -95,7 +111,8 @@ def run_all(): test_goea_local() test_goea_bonferroni() -if __name__ == '__main__': + +if __name__ == "__main__": run_all() # Copyright (C) 2010-2019, H Tang et al., All rights reserved. diff --git a/tests/test_goea_quiet.py b/tests/test_goea_quiet.py index bef96a6b..58488da6 100755 --- a/tests/test_goea_quiet.py +++ b/tests/test_goea_quiet.py @@ -1,9 +1,10 @@ """Test that a Gene Ontology Enrichement Analysis can be run quietly""" import os -from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS + from goatools.anno.idtogos_reader import IdToGosReader from goatools.base import get_godag +from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS __copyright__ = "Copyright (C) 2010-present, H Tang et al., All rights reserved." @@ -14,20 +15,19 @@ def test_goea_quiet(): """Test that a Gene Ontology Enrichement Analysis can be run quietly""" goeaobj = _get_goeaobj() study_fin = "{REPO}/tests/data/small_study".format(REPO=REPO) - study_ids = [line.rstrip() for line in open(study_fin)] - print('\nTEST 1: GOEA run_study(study_ids)') + study_ids = [line.rstrip() for line in open(study_fin, encoding="utf-8")] + print("\nTEST 1: GOEA run_study(study_ids)") goea_results1 = goeaobj.run_study(study_ids) - print('{N} GOEA results for verbose GOEA'.format(N=len(goea_results1))) + print("{N} GOEA results for verbose GOEA".format(N=len(goea_results1))) - print('\nTEST 2: GOEA run_study(study_ids, prt=None)') + print("\nTEST 2: GOEA run_study(study_ids, prt=None)") goea_results2 = goeaobj.run_study(study_ids, prt=None) - print('{N} GOEA results for quiet GOEA'.format(N=len(goea_results2))) + print("{N} GOEA results for quiet GOEA".format(N=len(goea_results2))) # Original keyword is 'log' - print('\nTEST 3: GOEA run_study(study_ids, log=None)') + print("\nTEST 3: GOEA run_study(study_ids, log=None)") goea_results3 = goeaobj.run_study(study_ids, log=None) - print('{N} GOEA results for quiet GOEA'.format(N=len(goea_results3))) - + print("{N} GOEA results for quiet GOEA".format(N=len(goea_results3))) _chk_results(goea_results1, goea_results2) _chk_results(goea_results1, goea_results3) @@ -37,27 +37,36 @@ def _get_goeaobj(methods=None): """Test GOEA with method, fdr.""" # REad GODag obo_fin = os.path.join(REPO, "go-basic.obo") - obo_dag = get_godag(obo_fin, loading_bar=None) + obo_dag = get_godag(obo_fin) # Read association fin_assc = "{REPO}/tests/data/small_association".format(REPO=REPO) objanno = IdToGosReader(fin_assc, godag=obo_dag) ns2assc = objanno.get_ns2assc() popul_fin = "{REPO}/tests/data/small_population".format(REPO=REPO) - popul_ids = [line.rstrip() for line in open(popul_fin)] + popul_ids = [line.rstrip() for line in open(popul_fin, encoding="utf-8")] goeaobj = GOEnrichmentStudyNS(popul_ids, ns2assc, obo_dag, methods=methods) return goeaobj + def _chk_results(results1, results2): """Check that results match""" # pylint: disable=line-too-long for res1, res2 in zip(results1, results2): - assert res1.GO == res2.GO, '\nRES1: {R1}\nRES2: {R2}\n\n'.format(R1=res1, R2=res2) - assert res1.p_bonferroni == res2.p_bonferroni, '\nRES1: {R1}\nRES2: {R2}\n\n'.format(R1=res1, R2=res2) - assert res1.p_sidak == res2.p_sidak, '\nRES1: {R1}\nRES2: {R2}\n\n'.format(R1=res1, R2=res2) - assert res1.p_holm == res2.p_holm, '\nRES1: {R1}\nRES2: {R2}\n\n'.format(R1=res1, R2=res2) + assert res1.GO == res2.GO, "\nRES1: {R1}\nRES2: {R2}\n\n".format( + R1=res1, R2=res2 + ) + assert ( + res1.p_bonferroni == res2.p_bonferroni + ), "\nRES1: {R1}\nRES2: {R2}\n\n".format(R1=res1, R2=res2) + assert res1.p_sidak == res2.p_sidak, "\nRES1: {R1}\nRES2: {R2}\n\n".format( + R1=res1, R2=res2 + ) + assert res1.p_holm == res2.p_holm, "\nRES1: {R1}\nRES2: {R2}\n\n".format( + R1=res1, R2=res2 + ) -if __name__ == '__main__': +if __name__ == "__main__": test_goea_quiet() # Copyright (C) 2010-present, H Tang et al., All rights reserved. diff --git a/tests/test_goea_rpt_bonferroni.py b/tests/test_goea_rpt_bonferroni.py index 6e1a9c02..d19cedcf 100755 --- a/tests/test_goea_rpt_bonferroni.py +++ b/tests/test_goea_rpt_bonferroni.py @@ -10,12 +10,13 @@ import os import sys -from goatools.base import get_godag from goatools.associations import read_associations +from goatools.base import get_godag from goatools.go_enrichment import GOEnrichmentStudy REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") + def test_bonferroni(): """Do Gene Ontology Enrichment Analysis w/Bonferroni multipletest. Print results 3 ways.""" # --------------------------------------------------------------------- @@ -31,8 +32,16 @@ def test_bonferroni(): fout_xls = "goea_bonferroni.xlsx" # print these in tsv and xlsx - print_fields = ['NS', 'study_count', 'p_uncorrected', 'p_bonferroni', - 'level', 'depth', 'GO', 'name'] + print_fields = [ + "NS", + "study_count", + "p_uncorrected", + "p_bonferroni", + "level", + "depth", + "GO", + "name", + ] # 1. Print results to screen using format in prtfmt. For example: # # BP 22 3.073e-03 L06 D07 GO:0006468 protein phosphorylation @@ -43,27 +52,63 @@ def test_bonferroni(): # BP 2 1.492e-02 L04 D06 GO:0006909 phagocytosis # BP 2 1.492e-02 L03 D03 GO:0051322 anaphase # ... - prtfmt = " ".join(["{NS} {study_count:3} {p_uncorrected:5.3e}", - "{p_bonferroni:5.3e} L{level:02} D{depth:02} {GO} {name}\n"]) + prtfmt = " ".join( + [ + "{NS} {study_count:3} {p_uncorrected:5.3e}", + "{p_bonferroni:5.3e} L{level:02} D{depth:02} {GO} {name}\n", + ] + ) prt_if = lambda nt: nt.p_uncorrected < 0.05 goea.prt_txt(log, results_nt, prtfmt, prt_if=prt_if) # 2. Write results to tsv file # Optional user defined formatting for specific fields - fld2fmt = {'p_bonferroni':'{:8.2e}', 'p_uncorrected':'{:8.2e}'} + fld2fmt = {"p_bonferroni": "{:8.2e}", "p_uncorrected": "{:8.2e}"} # Sort by: 1st) BP, MF, CC; 2nd) By GO depth, deepest GO first. - sort_by = lambda nt: [nt.NS, -1*nt.depth] - goea.wr_tsv(fout_tsv, results_nt, - prt_if=prt_if, sort_by=sort_by, fld2fmt=fld2fmt, prt_flds=print_fields) + sort_by = lambda nt: [nt.NS, -1 * nt.depth] + goea.wr_tsv( + fout_tsv, + results_nt, + prt_if=prt_if, + sort_by=sort_by, + fld2fmt=fld2fmt, + prt_flds=print_fields, + ) # 3. Write results to xlsx file, including specific study genes assc. w/significant GOs # Use these headers instead of the print_fields for the xlsx header - hdrs = ['NS', 'pval', 'bonferroni', 'L', 'D', 'Term', 'Ontology Term Name', 'Cnt', 'Genes'] - print_fields = ['NS', 'p_uncorrected', 'p_bonferroni', - 'level', 'depth', 'GO', 'name', 'study_count', 'study_items'] - goea.wr_xlsx(fout_xls, results_nt, - # optional key-word args (ie, kwargs, kws) - prt_if=prt_if, sort_by=sort_by, hdrs=hdrs, fld2fmt=fld2fmt, prt_flds=print_fields) + hdrs = [ + "NS", + "pval", + "bonferroni", + "L", + "D", + "Term", + "Ontology Term Name", + "Cnt", + "Genes", + ] + print_fields = [ + "NS", + "p_uncorrected", + "p_bonferroni", + "level", + "depth", + "GO", + "name", + "study_count", + "study_items", + ] + goea.wr_xlsx( + fout_xls, + results_nt, + # optional key-word args (ie, kwargs, kws) + prt_if=prt_if, + sort_by=sort_by, + hdrs=hdrs, + fld2fmt=fld2fmt, + prt_flds=print_fields, + ) def run_bonferroni(): @@ -72,17 +117,26 @@ def run_bonferroni(): # Run Gene Ontology Analysis (GOEA) # # 1. Initialize - godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None) + godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo")) fin_assc = os.path.join(REPO, "data/association") - assoc = read_associations(fin_assc, 'id2gos', no_top=True) - popul_ids = [line.rstrip() for line in open(os.path.join(REPO, "data/population"))] - study_ids = [line.rstrip() for line in open(os.path.join(REPO, "data/study"))] + assoc = read_associations(fin_assc, "id2gos", no_top=True) + popul_ids = [ + line.rstrip() + for line in open(os.path.join(REPO, "data/population"), encoding="utf-8") + ] + study_ids = [ + line.rstrip() + for line in open(os.path.join(REPO, "data/study"), encoding="utf-8") + ] # 2. Run enrichment analysis - goea = GOEnrichmentStudy(popul_ids, assoc, godag, alpha=0.05, methods=['bonferroni']) + goea = GOEnrichmentStudy( + popul_ids, assoc, godag, alpha=0.05, methods=["bonferroni"] + ) results_nt = goea.run_study(study_ids) return results_nt, goea -if __name__ == '__main__': + +if __name__ == "__main__": test_bonferroni() # Copyright (C) 2016-2018, DV Klopfenstein, H Tang. All rights reserved. diff --git a/tests/test_gosearch_emptydict.py b/tests/test_gosearch_emptydict.py index 719a4dc7..25261e9c 100755 --- a/tests/test_gosearch_emptydict.py +++ b/tests/test_gosearch_emptydict.py @@ -3,38 +3,41 @@ import os import sys + from collections import defaultdict + +from goatools.associations import get_assoc_ncbi_taxids from goatools.base import download_go_basic_obo from goatools.go_search import GoSearch -from goatools.associations import get_assoc_ncbi_taxids __copyright__ = "Copyright (C) 2010-2019, DV Klopfenstein, H Tang, All rights reserved." __author__ = "DV Klopfenstein" REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") + def test_gosearch(log=sys.stdout): """Test GoSearch class with no annotations.""" taxids = [9606, 10090] # Download ontologies and annotations, if necessary fin_go_obo = os.path.join(REPO, "go-basic.obo") - download_go_basic_obo(fin_go_obo, loading_bar=None) + download_go_basic_obo(fin_go_obo) # Because get_assoc_ncbi_taxids returns id2gos, we will opt to # use the (optional) multi-level dictionary separate associations by taxid # taxid2asscs contains both GO2IDs and ID2GOs. taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set))) - get_assoc_ncbi_taxids(taxids, taxid2asscs=taxid2asscs, loading_bar=None) + get_assoc_ncbi_taxids(taxids, taxid2asscs=taxid2asscs) # Initialize GO-search helper object with obo and annotations(go2items) for taxid in taxids: - obj = GoSearch(fin_go_obo, go2items=taxid2asscs[taxid]['GO2IDs'], log=log) + obj = GoSearch(fin_go_obo, go2items=taxid2asscs[taxid]["GO2IDs"], log=log) assert len(obj.obo_dag) > 40000 GoSearch(fin_go_obo, dict(), log=log) assert len(obj.obo_dag) > 40000 # GoSearch('go.obo', dict(), log=log) -if __name__ == '__main__': +if __name__ == "__main__": test_gosearch() # Copyright (C) 2010-2019, DV Klopfenstein, H Tang, All rights reserved. diff --git a/tests/test_gosubdag_relationships.py b/tests/test_gosubdag_relationships.py index a42abefa..853bc1c7 100755 --- a/tests/test_gosubdag_relationships.py +++ b/tests/test_gosubdag_relationships.py @@ -1,35 +1,38 @@ #!/usr/bin/env python """Plot both the standard 'is_a' field and the optional 'part_of' relationship.""" -from __future__ import print_function - __copyright__ = "Copyright (C) 2016-2018, DV Klopfenstein, H Tang, All rights reserved." +import datetime import os import sys import timeit -import datetime + from goatools.base import download_go_basic_obo -from goatools.obo_parser import GODag from goatools.gosubdag.gosubdag import GoSubDag +from goatools.obo_parser import GODag REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../..") + def test_gosubdag_relationships(prt=sys.stdout): """Plot both the standard 'is_a' field and the 'part_of' relationship.""" - goids = set([ - "GO:0032501", - "GO:0044707", # alt_id: GO:0032501 # BP 1011 L01 D01 B multicellular organismal process - "GO:0050874", - "GO:0007608", # sensory perception of smell - "GO:0050911"]) # detection of chemical stimulus involved in sensory perception of smell + goids = set( + [ + "GO:0032501", + "GO:0044707", # alt_id: GO:0032501 + "GO:0050874", + "GO:0007608", # sensory perception of smell + "GO:0050911", + ] + ) # detection of chemical stimulus involved in sensory perception of smell # Load GO-DAG: Load optional 'relationship' fin_obo = os.path.join(REPO, "go-basic.obo") - download_go_basic_obo(fin_obo, prt, loading_bar=None) + download_go_basic_obo(fin_obo, prt) go2obj_plain = GODag(fin_obo) - go2obj_relat = GODag(fin_obo, optional_attrs=['relationship']) + go2obj_relat = GODag(fin_obo, optional_attrs=["relationship"]) print("\nCreate GoSubDag with GO DAG containing no relationships.") tic = timeit.default_timer() @@ -48,7 +51,7 @@ def test_gosubdag_relationships(prt=sys.stdout): assert goids_plain == goids_false print("\nCreate GoSubDag while loading only the 'part_of' relationship") - gosubdag = GoSubDag(goids, go2obj_relat, relationships=['part_of'], prt=prt) + gosubdag = GoSubDag(goids, go2obj_relat, relationships=["part_of"], prt=prt) # gosubdag.prt_goids(gosubdag.go2obj) goids_part_of = set(gosubdag.go2obj) tic = _rpt_hms(tic, len(gosubdag.go2obj)) @@ -63,13 +66,15 @@ def test_gosubdag_relationships(prt=sys.stdout): assert goids_part_of.intersection(goids_true) == goids_part_of assert len(goids_true) >= len(goids_part_of) + def _rpt_hms(tic, num_goids): """Report the elapsed time for particular events.""" - elapsed_time = str(datetime.timedelta(seconds=(timeit.default_timer()-tic))) + elapsed_time = str(datetime.timedelta(seconds=timeit.default_timer() - tic)) print("Elapsed HMS: {HMS} {N} GO IDs".format(HMS=elapsed_time, N=num_goids)) return timeit.default_timer() -if __name__ == '__main__': + +if __name__ == "__main__": test_gosubdag_relationships() # Copyright (C) 2016-2018, DV Klopfenstein, H Tang, All rights reserved. diff --git a/tests/test_gosubdag_relationships_i126.py b/tests/test_gosubdag_relationships_i126.py index 1fcc6d94..4cb84832 100755 --- a/tests/test_gosubdag_relationships_i126.py +++ b/tests/test_gosubdag_relationships_i126.py @@ -1,27 +1,19 @@ #!/usr/bin/env python """Test that GoSubDag contains ancestors from only the user-specified relationships""" -# tests/test_gosubdag_relationships_i126.py -# goatools/gosubdag/gosubdag.py -# goatools/gosubdag/godag_rcnt.py -# goatools/gosubdag/godag_rcnt_init.py -# goatools/godag/go_tasks.py -# goatools/obo_parser.py - -from __future__ import print_function __copyright__ = "Copyright (C) 2016-2019, DV Klopfenstein, H Tang, All rights reserved." -from os.path import join -from os import system -import sys -## import timeit -## import datetime import collections as cx +import sys + +from os import system +from os.path import join +from tests.utils import REPO + from goatools.base import get_godag from goatools.godag.consts import RELATIONSHIP_SET from goatools.gosubdag.gosubdag import GoSubDag from goatools.test_data.wr_subobo import WrSubObo -from tests.utils import REPO # pylint: disable=line-too-long,unused-variable @@ -29,12 +21,12 @@ def test_gosubdag_relationships(wr_new_obo_subset=False): """Test that GoSubDag contains ancestors from only the user-specified relationships""" # Leaf GO: viral triggering of virus induced gene silencing - goid_chosen = 'GO:0060150' + goid_chosen = "GO:0060150" # Load GODag with all relationships fin_obo = join(REPO, "tests/data/i126/viral_gene_silence.obo") # "go-basic.obo") - godag_r0 = get_godag(fin_obo, loading_bar=None) - godag_r1 = get_godag(fin_obo, loading_bar=None, optional_attrs=['relationship']) + godag_r0 = get_godag(fin_obo) + godag_r1 = get_godag(fin_obo, optional_attrs=["relationship"]) file_sub = join(REPO, "tests/data/viral_gene_silence.obo") @@ -49,35 +41,40 @@ def test_gosubdag_relationships(wr_new_obo_subset=False): # RELATIONSHIPS: ALL gosubdag_r1 = GoSubDag(set([goid_chosen]), godag_r1, relationships=True) assert gosubdag_r1.relationships == RELATIONSHIP_SET - #### set(['part_of', 'regulates', 'positively_regulates', 'negatively_regulates']) assert len(gosubdag_r1.rcntobj.go2ancestors[goid_chosen]) == 50 # RELATIONSHIPS: part_of - gosubdag_rp = GoSubDag(set([goid_chosen]), godag_r1, relationships={'part_of'}) - assert gosubdag_rp.relationships == set(['part_of']) + gosubdag_rp = GoSubDag(set([goid_chosen]), godag_r1, relationships={"part_of"}) + assert gosubdag_rp.relationships == set(["part_of"]) rp_par = gosubdag_rp.rcntobj.go2ancestors[goid_chosen] - assert 'GO:0016441' not in gosubdag_rp.go2obj, '**FATAL: REGULATION TERM GoSubDag(part_of) go2obj' - assert 'GO:0016441' not in rp_par, '**FATAL: REGULATION TERM GoSubDag(part_of) go2parents' + assert ( + "GO:0016441" not in gosubdag_rp.go2obj + ), "**FATAL: REGULATION TERM GoSubDag(part_of) go2obj" + assert ( + "GO:0016441" not in rp_par + ), "**FATAL: REGULATION TERM GoSubDag(part_of) go2parents" # RELATIONSHIPS: regulates - gosubdag_rr = GoSubDag(set([goid_chosen]), godag_r1, relationships={'regulates'}) - assert gosubdag_rr.relationships == set(['regulates']) + gosubdag_rr = GoSubDag(set([goid_chosen]), godag_r1, relationships={"regulates"}) + assert gosubdag_rr.relationships == set(["regulates"]) rp_par = gosubdag_rr.rcntobj.go2ancestors[goid_chosen] - # assert 'GO:0016441' not in gosubdag_rp.go2obj, '**FATAL: REGULATION TERM GoSubDag(part_of) go2obj' - # assert 'GO:0016441' not in rp_par, '**FATAL: REGULATION TERM GoSubDag(part_of) go2parents' # RELATIONSHIPS: positively_regulates - gosubdag_rp = GoSubDag(set([goid_chosen]), godag_r1, relationships={'positively_regulates'}) - assert gosubdag_rp.relationships == set(['positively_regulates']) + gosubdag_rp = GoSubDag( + set([goid_chosen]), godag_r1, relationships={"positively_regulates"} + ) + assert gosubdag_rp.relationships == set(["positively_regulates"]) rp_par = gosubdag_rp.rcntobj.go2ancestors[goid_chosen] # RELATIONSHIPS: negatively_regulates - gosubdag_rn = GoSubDag(set([goid_chosen]), godag_r1, relationships={'negatively_regulates'}) - assert gosubdag_rn.relationships == set(['negatively_regulates']) + gosubdag_rn = GoSubDag( + set([goid_chosen]), godag_r1, relationships={"negatively_regulates"} + ) + assert gosubdag_rn.relationships == set(["negatively_regulates"]) rp_par = gosubdag_rn.rcntobj.go2ancestors[goid_chosen] # RELATIONSHIPS: regulates positively_regulates negatively_regulates - regs = {'positively_regulates', 'negatively_regulates'} + regs = {"positively_regulates", "negatively_regulates"} gosubdag_rnp = GoSubDag(set([goid_chosen]), godag_r1, relationships=regs) assert gosubdag_rnp.relationships == regs rp_par = gosubdag_rnp.rcntobj.go2ancestors[goid_chosen] @@ -89,25 +86,6 @@ def test_gosubdag_relationships(wr_new_obo_subset=False): ancestors_r1 = gosubdag_r1.rcntobj.go2ancestors.get(goid, set()) assert ancestors_r1 == term.get_all_upper() - #### # Test that - #### gosubdag_rp = GoSubDag(set([goid_chosen]), godag_r1, relationships={'part_of'}, prt=sys.stdout) - #### for goid, dag_term in godag_r1.items(): - #### if goid in gosubdag_r1.rcntobj.go2ancestors: - #### ancestors = gosubdag_rp.rcntobj.go2ancestors[goid] - #### sub_term = gosubdag_rp.go2obj[goid] - #### reldict = sub_term.relationship.items() - #### # print(goid) - #### # print('DAG', sorted(dag_term.get_all_upper())) - #### # print('SUB', sorted(sub_term.get_all_upper())) - #### # print('ANS', sorted(ancestors)) - #### # for rel, pterms in cx.OrderedDict(reldict).items(): - #### # print(rel, ' '.join(sorted(o.id for o in pterms))) - #### # print('') - #### print(gosubdag_rp.relationships) - #### #assert 'GO:0016441' not in gosubdag_rp.rcntobj.go2ancestors['GO:0060150'] - #### assert 'GO:0016441' in gosubdag_r1.go2nt - #### assert 'GO:0010467' in gosubdag_r1.go2nt - def _run_baseline_r0(gosubdag_r0, gosubdag_r1): """BASELINE r0: Test that GOTerm.get_all_parents() == GoSubDag ancestors""" @@ -121,41 +99,53 @@ def _run_baseline_r0(gosubdag_r0, gosubdag_r1): if len(ancestors_r0) < len(ancestors_r1): r1_ancestors_more.add(goid) assert r1_ancestors_more - print('{N} r1 GO terms in GoSubDag have more ancestors than r0'.format( - N=len(r1_ancestors_more))) + print( + "{N} r1 GO terms in GoSubDag have more ancestors than r0".format( + N=len(r1_ancestors_more) + ) + ) # scripts/go_plot.py --go_file=i126_goids_baseline.txt -r --obo=tests/data/viral_gene_silence.obo -o i126_goids_baseline.png - fout_gos = 'i126_goids_baseline.txt' - with open(fout_gos, 'w') as prt: - prt.write('#cafffb {SRC_GO}\n'.format(SRC_GO=next(iter(gosubdag_r0.go_sources)))) + fout_gos = "i126_goids_baseline.txt" + with open(fout_gos, "w", encoding="utf-8") as prt: + prt.write( + "#cafffb {SRC_GO}\n".format(SRC_GO=next(iter(gosubdag_r0.go_sources))) + ) _prt_goterms(r1_ancestors_more, gosubdag_r1.go2nt, prt) - print(' WROTE: {GOs}'.format(GOs=fout_gos)) + print(" WROTE: {GOs}".format(GOs=fout_gos)) + def _prt_goterms(goids, go2nt, prt): """Print details of GO terms""" - fmt = ('#ffd1df {GO} # {NS} {dcnt:5} {childcnt:3} ' - 'L{level:02} D{depth:02} R{reldepth:02} {D1:5} {REL} {rel} {GO_name}\n') + fmt = ( + "#ffd1df {GO} # {NS} {dcnt:5} {childcnt:3} " + "L{level:02} D{depth:02} R{reldepth:02} {D1:5} {REL} {rel} {GO_name}\n" + ) nts = [nt for go, nt in go2nt.items() if go in goids] for ntd in sorted(nts, key=lambda nt: nt.dcnt, reverse=True): prt.write(fmt.format(**ntd._asdict())) -#cafffb GO:0060150 -#ffd1df GO:0050794 # BP 8278 64 D03 R03 regulation of cellular process -#ffd1df GO:0019222 # BP 3382 20 D03 R03 regulation of metabolic process -#ffd1df GO:0048522 # BP 2417 65 D04 R04 positive regulation of cellular process -#ffd1df GO:0060255 # BP 2130 20 D04 R04 regulation of macromolecule metabolic process -#ffd1df GO:0010468 # BP 862 20 D05 R05 regulation of gene expression -#ffd1df GO:0060968 # BP 53 4 D06 R08 regulation of gene silencing -#ffd1df GO:0060147 # BP 24 4 D07 R09 regulation of posttranscriptional gene silencing -#ffd1df GO:0060148 # BP 8 3 D08 R10 positive regulation of posttranscriptional gene silencing -#ffd1df GO:0060150 # BP 0 0 D09 R11 viral triggering of virus induced gene silencing + +# cafffb GO:0060150 +# ffd1df GO:0050794 # BP 8278 64 D03 R03 regulation of cellular process +# ffd1df GO:0019222 # BP 3382 20 D03 R03 regulation of metabolic process +# ffd1df GO:0048522 # BP 2417 65 D04 R04 positive regulation of cellular process +# ffd1df GO:0060255 # BP 2130 20 D04 R04 regulation of macromolecule metabolic process +# ffd1df GO:0010468 # BP 862 20 D05 R05 regulation of gene expression +# ffd1df GO:0060968 # BP 53 4 D06 R08 regulation of gene silencing +# ffd1df GO:0060147 # BP 24 4 D07 R09 regulation of posttranscriptional gene silencing +# ffd1df GO:0060148 # BP 8 3 D08 R10 positive regulation of posttranscriptional gene silencing +# ffd1df GO:0060150 # BP 0 0 D09 R11 viral triggering of virus induced gene silencing + # - Generate GO DAG subset for this test --------------------------------------------------------- def _wr_sub_obo(fout_obo, goid_chosen, godag_r1, fin_obo): """Sub plot used for visualizing this test file's elements""" # Load GO-DAG: Load optional 'relationship' - godag = {go:o for go, o in godag_r1.items() if go == o.item_id} + godag = {go: o for go, o in godag_r1.items() if go == o.item_id} _prt_rtel_ctr(godag) - rels_all = set(['part_of', 'regulates', 'negatively_regulates', 'positively_regulates']) + rels_all = set( + ["part_of", "regulates", "negatively_regulates", "positively_regulates"] + ) goids_leaf_all = set(o.id for o in godag.values() if not o.children) gosubdag_r1 = GoSubDag(goids_leaf_all, godag, relationships=True, prt=sys.stdout) goids_src_r1_all = _get_leafs_w_relsinhier(rels_all, gosubdag_r1) @@ -163,14 +153,23 @@ def _wr_sub_obo(fout_obo, goid_chosen, godag_r1, fin_obo): # Pick one of the GO IDs as a source for the subset DAG gosubdag_viral = GoSubDag({goid_chosen}, godag, relationships=True, prt=sys.stdout) goids_viral = set(gosubdag_viral.go2obj.keys()) - with open(fout_obo, 'w') as prt: + with open(fout_obo, "w", encoding="utf-8") as prt: WrSubObo.prt_goterms(fin_obo, goids_viral, prt) - print('{N} GO IDs WROTE: {OBO}'.format(N=len(goids_viral), OBO=fout_obo)) + print("{N} GO IDs WROTE: {OBO}".format(N=len(goids_viral), OBO=fout_obo)) # Plot obo subset - pat_r1 = '{REPO}/scripts/go_plot.py {GO} -o {PNG} -r' - pat_r0 = '{REPO}/scripts/go_plot.py {GO} -o {PNG}' - system(pat_r1.format(REPO=REPO, PNG=fout_obo.replace('.obo', '_r1.png'), GO=goid_chosen)) - system(pat_r0.format(REPO=REPO, PNG=fout_obo.replace('.obo', '_r0.png'), GO=goid_chosen)) + pat_r1 = "{REPO}/scripts/go_plot.py {GO} -o {PNG} -r" + pat_r0 = "{REPO}/scripts/go_plot.py {GO} -o {PNG}" + system( + pat_r1.format( + REPO=REPO, PNG=fout_obo.replace(".obo", "_r1.png"), GO=goid_chosen + ) + ) + system( + pat_r0.format( + REPO=REPO, PNG=fout_obo.replace(".obo", "_r0.png"), GO=goid_chosen + ) + ) + def _get_leafs_w_relsinhier(rels_usr, gosubdag_r1): """Get GO IDs that have all relationships up their hierarchy.""" @@ -178,25 +177,25 @@ def _get_leafs_w_relsinhier(rels_usr, gosubdag_r1): goids_leaf = set(o.id for o in gosubdag_r1.go2obj.values() if not o.children) for goid in goids_leaf: go_parents = gosubdag_r1.rcntobj.go2ancestors[goid] - rels = set(k for p in go_parents for k in gosubdag_r1.go2obj[p].relationship.keys()) + rels = set( + k for p in go_parents for k in gosubdag_r1.go2obj[p].relationship.keys() + ) if rels == rels_usr: gos_r1_relsinhier.add(goid) return gos_r1_relsinhier + def _prt_rtel_ctr(godag): """Print the count of relationships.""" objs_r1_all = set(o for o in godag.values() if o.relationship.keys()) octr = cx.Counter(k for o in objs_r1_all for k in o.relationship.keys()) # objs_r1_sub = set(o.id for o in objs_r1_all if not rels_all.isdisjoint(o.relationship.keys())) - print('{N:6,} GO Terms have relationships.'.format(N=len(objs_r1_all))) + print("{N:6,} GO Terms have relationships.".format(N=len(objs_r1_all))) for key, cnt in octr.most_common(): - print('{N:6,} {REL}'.format(N=cnt, REL=key)) + print("{N:6,} {REL}".format(N=cnt, REL=key)) -# def _chk_child_parent(go2o_dag, go2o_sub): -# """Check the differences between the two go2obb dicts.""" -# pass -if __name__ == '__main__': +if __name__ == "__main__": test_gosubdag_relationships(len(sys.argv) != 1) # Copyright (C) 2016-2019, DV Klopfenstein, H Tang, All rights reserved. diff --git a/tests/test_grpr_get_sections_2d.py b/tests/test_grpr_get_sections_2d.py index 27251239..0de1c2d8 100755 --- a/tests/test_grpr_get_sections_2d.py +++ b/tests/test_grpr_get_sections_2d.py @@ -9,42 +9,55 @@ import os import sys + from goatools.base import get_godag from goatools.gosubdag.gosubdag import GoSubDag -from goatools.test_data.gjoneska_goea_consistent_increase import goea_results -from goatools.test_data.sections.gjoneska_pfenning import SECTIONS from goatools.grouper.grprdflts import GrouperDflts -from goatools.grouper.hdrgos import HdrgosSections from goatools.grouper.grprobj import Grouper +from goatools.grouper.hdrgos import HdrgosSections +from goatools.test_data.gjoneska_goea_consistent_increase import goea_results +from goatools.test_data.sections.gjoneska_pfenning import SECTIONS PRT = sys.stdout REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") + def test_fnc(): """Test function, get_sections_2d, in the Grouper class.""" - usrgo2nt = {getattr(nt, 'GO'):nt for nt in goea_results if getattr(nt, 'p_fdr_bh') < 0.05} + usrgo2nt = { + getattr(nt, "GO"): nt for nt in goea_results if getattr(nt, "p_fdr_bh") < 0.05 + } usrgos = usrgo2nt.keys() grprdflt = _get_grprdflt() - hdrobj = HdrgosSections(grprdflt.gosubdag, grprdflt.hdrgos_dflt, sections=SECTIONS, hdrgos=None) + hdrobj = HdrgosSections( + grprdflt.gosubdag, grprdflt.hdrgos_dflt, sections=SECTIONS, hdrgos=None + ) grprobj = Grouper("test", usrgos, hdrobj, grprdflt.gosubdag, go2nt=usrgo2nt) assert set(usrgos) == grprobj.usrgos sections_act = grprobj.get_sections_2d() chk_results(sections_act, grprobj) + def chk_results(sections_act, grprobj): """Get expected results.""" hdrgos_act = grprobj.get_hdrgos() hdrgos_sec_act = set([g for _, gs in sections_act for g in gs]) assert hdrgos_act == hdrgos_sec_act num_gos_orig = sum([len(gs) for _, gs in SECTIONS]) - PRT.write("{N} of {M} Sections Header GOs for {U} user GOs, {H} headers\n".format( - N=len(hdrgos_sec_act), M=num_gos_orig, U=len(grprobj.usrgos), H=len(hdrgos_act))) - sections_act_dict = {s:hs for s, hs in sections_act} + PRT.write( + "{N} of {M} Sections Header GOs for {U} user GOs, {H} headers\n".format( + N=len(hdrgos_sec_act), + M=num_gos_orig, + U=len(grprobj.usrgos), + H=len(hdrgos_act), + ) + ) + sections_act_dict = {s: hs for s, hs in sections_act} # Check that order of actual header GOs is the same as found in the sections 2-d list for section_name, hdrgos_all in SECTIONS: hdrgos_act = sections_act_dict.get(section_name, None) if hdrgos_act is not None: - h2i = {h:i for i, h in enumerate(hdrgos_act)} + h2i = {h: i for i, h in enumerate(hdrgos_act)} idx_act = None for hdrgo in hdrgos_all: idx = h2i.get(hdrgo, "") @@ -59,17 +72,20 @@ def chk_results(sections_act, grprobj): assert idx == 0 and idx_act is None or idx_act == idx - 1 idx_act = idx + def _get_gosubdag(): """Get GO DAG.""" - fin = os.path.join(REPO, 'go-basic.obo') - godag = get_godag(fin, prt=sys.stdout, loading_bar=False, optional_attrs=['relationship']) + fin = os.path.join(REPO, "go-basic.obo") + godag = get_godag(fin, prt=sys.stdout, optional_attrs=["relationship"]) return GoSubDag(None, godag) + def _get_grprdflt(): """Get Grouper defaults.""" gosubdag = _get_gosubdag() - fin_slim = os.path.join(REPO, 'goslim_generic.obo') + fin_slim = os.path.join(REPO, "goslim_generic.obo") return GrouperDflts(gosubdag, fin_slim) -if __name__ == '__main__': + +if __name__ == "__main__": test_fnc() diff --git a/tests/test_grprobj.py b/tests/test_grprobj.py index 873cb89e..b6da0b56 100755 --- a/tests/test_grprobj.py +++ b/tests/test_grprobj.py @@ -3,11 +3,12 @@ import os import sys + from goatools.base import get_godag from goatools.gosubdag.gosubdag import GoSubDag from goatools.grouper.grprdflts import GrouperDflts -from goatools.grouper.hdrgos import HdrgosSections from goatools.grouper.grprobj import Grouper +from goatools.grouper.hdrgos import HdrgosSections from goatools.grouper.wr_sections import WrSectionsTxt REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") @@ -15,92 +16,287 @@ def test_grouper_d2(do_plot=False): """Group depth-02 GO terms under their most specific depth-01 GO parent(s).""" - print('CWD', os.getcwd()) + print("CWD", os.getcwd()) # Get GOs to be grouped # Since no "Grouping GOs" were provided, depth-01 GOs are used for grouping. grprdflt = _get_grprdflt() - hdrobj = HdrgosSections(grprdflt.gosubdag, grprdflt.hdrgos_dflt, sections=None, hdrgos=None) - grprobj = Grouper("Transient Increase", get_data0(), hdrobj, _get_gosubdag(), go2nt=None) + hdrobj = HdrgosSections( + grprdflt.gosubdag, grprdflt.hdrgos_dflt, sections=None, hdrgos=None + ) + grprobj = Grouper( + "Transient Increase", get_data0(), hdrobj, _get_gosubdag(), go2nt=None + ) objwr = WrSectionsTxt(grprobj) objwr.wr_txt_section_hdrgos("transient_increase_hdrgos.txt") objwr.wr_txt_grouping_gos() if do_plot: # Don't run in Travis-CI because it does not contain 'dot' from goatools.grouper.grprplt import GrouperPlot + GrouperPlot(grprobj).plot_groups_unplaced() chk_hdrs(grprobj) + def chk_hdrs(grprobj, prt=sys.stdout): """Check GO group headers.""" hdrgos_all = grprobj.get_hdrgos() hdrgos_u0 = grprobj.get_hdrgos_u0() hdrgos_u1 = grprobj.get_hdrgos_u1() - prt.write("{N} hdrgos ({U} are also user GO IDs) used out of {M} available\n".format( - N=len(hdrgos_all), U=len(hdrgos_u1), M=len(grprobj.hdrobj.hdrgos))) + prt.write( + "{N} hdrgos ({U} are also user GO IDs) used out of {M} available\n".format( + N=len(hdrgos_all), U=len(hdrgos_u1), M=len(grprobj.hdrobj.hdrgos) + ) + ) assert hdrgos_u0.union(hdrgos_u1) == hdrgos_all + def get_data0(): """Nature GO ids.""" return [ - "GO:0007049", "GO:0022402", "GO:0022403", "GO:0000279", "GO:0006259", - "GO:0000278", "GO:0051301", "GO:0000087", "GO:0007067", "GO:0000280", - "GO:0048285", "GO:0006996", "GO:0006260", "GO:0006974", "GO:0033554", - "GO:0006281", "GO:0016043", "GO:0051716", "GO:0009987", "GO:0006323", - "GO:0051276", "GO:0007059", "GO:0006139", "GO:0065004", "GO:0051726", - "GO:0007017", "GO:0031497", "GO:0034728", "GO:0006950", "GO:0034641", - "GO:0006334", "GO:0006333", "GO:0034621", "GO:0006807", "GO:0006261", - "GO:0007126", "GO:0051327", "GO:0051321", "GO:0034622", "GO:0044260", - "GO:0043933", "GO:0000226", "GO:0042770", "GO:0000075", "GO:0006270", - "GO:0065003", "GO:0006310", "GO:0010564", "GO:0022607", "GO:0006325", - "GO:0043170", "GO:0008629", "GO:0007346", "GO:0044085", "GO:0008630", - "GO:0051052", "GO:0050896", "GO:0031570", "GO:0051053", "GO:0007018", - "GO:0007051", "GO:0007093", "GO:0006275", "GO:0009411", "GO:0034645", - "GO:0000910", "GO:0009059", "GO:0044237", "GO:0010212", "GO:0000077", - "GO:0030261", "GO:0009615", "GO:0002376", "GO:0006955", "GO:0006952", - "GO:0002682", "GO:0050896", "GO:0048518", "GO:0048002", "GO:0050776", - "GO:0009605", "GO:0048583", "GO:0050778", "GO:0051707", "GO:0048584", - "GO:0019882", "GO:0009607", "GO:0019884", "GO:0002684", "GO:0048522", - "GO:0009611", "GO:0051704", "GO:0002252", "GO:0001819", "GO:0006954", - "GO:0002478", "GO:0002474", "GO:0030029", "GO:0006950", "GO:0030036", - "GO:0007155", "GO:0022610", "GO:0048856", "GO:0050793", "GO:0048731", - "GO:0048518", "GO:0006629", "GO:0007275", "GO:0032502", "GO:0051239", - "GO:0048513", "GO:0048522", "GO:0044255", "GO:0048869", "GO:0009611", - "GO:0031589", "GO:0009605", "GO:0010646", "GO:0002376", "GO:0043436", - "GO:0019752", "GO:0006082", "GO:0032787", "GO:0042127", "GO:0009987", - "GO:0030154", "GO:0042180", "GO:0001944", "GO:0065008", "GO:0006631", - "GO:0009966", "GO:0048583", "GO:0002682", "GO:0001568", "GO:0009653", - "GO:0007399", "GO:0007160", "GO:0045321", "GO:0001775", "GO:0080134", - "GO:0051093", "GO:0048519", "GO:0030155", "GO:0007167", "GO:0042221", - "GO:0045595", "GO:0048514", "GO:0042060", "GO:0030029", "GO:0048523", - "GO:0002684", "GO:0051234", "GO:0006810", "GO:0051179", "GO:0007268", - "GO:0019226", "GO:0051649", "GO:0051641", "GO:0009987", "GO:0007267", - "GO:0006811", "GO:0007154", "GO:0007611", "GO:0015672", "GO:0006812", - "GO:0006813", "GO:0046907", "GO:0006793", "GO:0006796", "GO:0030001", - "GO:0006091", "GO:0007612", "GO:0015031", "GO:0043632", "GO:0019941", - "GO:0045184", "GO:0045333", "GO:0031175", "GO:0048812", "GO:0044057", - "GO:0048858", "GO:0007399", "GO:0007409", "GO:0030030", "GO:0010646", - "GO:0032990", "GO:0006816", "GO:0048666", "GO:0051179", "GO:0048667", - "GO:0048699", "GO:0030001", "GO:0006811", "GO:0022008", "GO:0050804", - "GO:0009987", "GO:0006812", "GO:0000904", "GO:0031644", "GO:0006796", - "GO:0006793", "GO:0051969", "GO:0030182", "GO:0016310", "GO:0015674", - "GO:0007242", "GO:0006468", "GO:0006810", "GO:0051234", "GO:0007268", - "GO:0000902", "GO:0019226", "GO:0051056", "GO:0043687", "GO:0032989", - "GO:0006464", "GO:0016192", "GO:0016043", "GO:0007411", "GO:0043412", - "GO:0007610", "GO:0007267", "GO:0009966", "GO:0048468", "GO:0007154", - "GO:0048731", "GO:0006928", "GO:0015672"] + "GO:0007049", + "GO:0022402", + "GO:0022403", + "GO:0000279", + "GO:0006259", + "GO:0000278", + "GO:0051301", + "GO:0000087", + "GO:0007067", + "GO:0000280", + "GO:0048285", + "GO:0006996", + "GO:0006260", + "GO:0006974", + "GO:0033554", + "GO:0006281", + "GO:0016043", + "GO:0051716", + "GO:0009987", + "GO:0006323", + "GO:0051276", + "GO:0007059", + "GO:0006139", + "GO:0065004", + "GO:0051726", + "GO:0007017", + "GO:0031497", + "GO:0034728", + "GO:0006950", + "GO:0034641", + "GO:0006334", + "GO:0006333", + "GO:0034621", + "GO:0006807", + "GO:0006261", + "GO:0007126", + "GO:0051327", + "GO:0051321", + "GO:0034622", + "GO:0044260", + "GO:0043933", + "GO:0000226", + "GO:0042770", + "GO:0000075", + "GO:0006270", + "GO:0065003", + "GO:0006310", + "GO:0010564", + "GO:0022607", + "GO:0006325", + "GO:0043170", + "GO:0008629", + "GO:0007346", + "GO:0044085", + "GO:0008630", + "GO:0051052", + "GO:0050896", + "GO:0031570", + "GO:0051053", + "GO:0007018", + "GO:0007051", + "GO:0007093", + "GO:0006275", + "GO:0009411", + "GO:0034645", + "GO:0000910", + "GO:0009059", + "GO:0044237", + "GO:0010212", + "GO:0000077", + "GO:0030261", + "GO:0009615", + "GO:0002376", + "GO:0006955", + "GO:0006952", + "GO:0002682", + "GO:0050896", + "GO:0048518", + "GO:0048002", + "GO:0050776", + "GO:0009605", + "GO:0048583", + "GO:0050778", + "GO:0051707", + "GO:0048584", + "GO:0019882", + "GO:0009607", + "GO:0019884", + "GO:0002684", + "GO:0048522", + "GO:0009611", + "GO:0051704", + "GO:0002252", + "GO:0001819", + "GO:0006954", + "GO:0002478", + "GO:0002474", + "GO:0030029", + "GO:0006950", + "GO:0030036", + "GO:0007155", + "GO:0022610", + "GO:0048856", + "GO:0050793", + "GO:0048731", + "GO:0048518", + "GO:0006629", + "GO:0007275", + "GO:0032502", + "GO:0051239", + "GO:0048513", + "GO:0048522", + "GO:0044255", + "GO:0048869", + "GO:0009611", + "GO:0031589", + "GO:0009605", + "GO:0010646", + "GO:0002376", + "GO:0043436", + "GO:0019752", + "GO:0006082", + "GO:0032787", + "GO:0042127", + "GO:0009987", + "GO:0030154", + "GO:0042180", + "GO:0001944", + "GO:0065008", + "GO:0006631", + "GO:0009966", + "GO:0048583", + "GO:0002682", + "GO:0001568", + "GO:0009653", + "GO:0007399", + "GO:0007160", + "GO:0045321", + "GO:0001775", + "GO:0080134", + "GO:0051093", + "GO:0048519", + "GO:0030155", + "GO:0007167", + "GO:0042221", + "GO:0045595", + "GO:0048514", + "GO:0042060", + "GO:0030029", + "GO:0048523", + "GO:0002684", + "GO:0051234", + "GO:0006810", + "GO:0051179", + "GO:0007268", + "GO:0019226", + "GO:0051649", + "GO:0051641", + "GO:0009987", + "GO:0007267", + "GO:0006811", + "GO:0007154", + "GO:0007611", + "GO:0015672", + "GO:0006812", + "GO:0006813", + "GO:0046907", + "GO:0006793", + "GO:0006796", + "GO:0030001", + "GO:0006091", + "GO:0007612", + "GO:0015031", + "GO:0043632", + "GO:0019941", + "GO:0045184", + "GO:0045333", + "GO:0031175", + "GO:0048812", + "GO:0044057", + "GO:0048858", + "GO:0007399", + "GO:0007409", + "GO:0030030", + "GO:0010646", + "GO:0032990", + "GO:0006816", + "GO:0048666", + "GO:0051179", + "GO:0048667", + "GO:0048699", + "GO:0030001", + "GO:0006811", + "GO:0022008", + "GO:0050804", + "GO:0009987", + "GO:0006812", + "GO:0000904", + "GO:0031644", + "GO:0006796", + "GO:0006793", + "GO:0051969", + "GO:0030182", + "GO:0016310", + "GO:0015674", + "GO:0007242", + "GO:0006468", + "GO:0006810", + "GO:0051234", + "GO:0007268", + "GO:0000902", + "GO:0019226", + "GO:0051056", + "GO:0043687", + "GO:0032989", + "GO:0006464", + "GO:0016192", + "GO:0016043", + "GO:0007411", + "GO:0043412", + "GO:0007610", + "GO:0007267", + "GO:0009966", + "GO:0048468", + "GO:0007154", + "GO:0048731", + "GO:0006928", + "GO:0015672", + ] + def _get_gosubdag(): """Get GO DAG.""" - fin = os.path.join(REPO, 'go-basic.obo') - godag = get_godag(fin, prt=sys.stdout, loading_bar=False, optional_attrs=['relationship']) + fin = os.path.join(REPO, "go-basic.obo") + godag = get_godag(fin, prt=sys.stdout, optional_attrs=["relationship"]) return GoSubDag(None, godag) + def _get_grprdflt(): """Get Grouper defaults.""" gosubdag = _get_gosubdag() - fin_slim = os.path.join(REPO, 'goslim_generic.obo') + fin_slim = os.path.join(REPO, "goslim_generic.obo") return GrouperDflts(gosubdag, fin_slim) -if __name__ == '__main__': +if __name__ == "__main__": test_grouper_d2(True) diff --git a/tests/test_i122_goea.py b/tests/test_i122_goea.py index 70902265..bbfc7f51 100755 --- a/tests/test_i122_goea.py +++ b/tests/test_i122_goea.py @@ -1,13 +1,12 @@ #!/usr/bin/env python3 """Test to re-produce issue#122: Passes currently.""" -from __future__ import print_function - import os import xlrd -from goatools.base import get_godag + from goatools.associations import dnld_ncbi_gene_file from goatools.associations import read_ncbi_gene2go +from goatools.base import get_godag from goatools.go_enrichment import GOEnrichmentStudy REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") @@ -15,67 +14,82 @@ def test_i122(): """Test to re-produce issue#122: Passes currently.""" - obj = _Run(9606, 'gene2go', 'go-basic.obo') + obj = _Run(9606, "gene2go", "go-basic.obo") study_ids, population_ids = obj.get_genes_study_n_bg() # Result is the same whether fisher_scipy_stats of fisher - pvalcalc = 'fisher_scipy_stats' - goeaobj = GOEnrichmentStudy(population_ids, obj.gene2go, obj.godag, methods=['bonferroni', 'fdr_bh'], pvalcalc=pvalcalc) + pvalcalc = "fisher_scipy_stats" + goeaobj = GOEnrichmentStudy( + population_ids, + obj.gene2go, + obj.godag, + methods=["bonferroni", "fdr_bh"], + pvalcalc=pvalcalc, + ) # Run GOEA Gene Ontology Enrichment Analysis results_goeas = goeaobj.run_study_nts(study_ids) - print('NS GO p stu_ratio pop_ratio p-uncorr bonferro fdr_bh stu ') + print("NS GO p stu_ratio pop_ratio p-uncorr bonferro fdr_bh stu ") for ntd in results_goeas: if ntd.study_count == 0: doprt = False if ntd.p_bonferroni < 0.05: - assert ntd.enrichment == 'p' + assert ntd.enrichment == "p" doprt = True if ntd.p_fdr_bh < 0.05: - assert ntd.enrichment == 'p' + assert ntd.enrichment == "p" doprt = True if doprt: print(obj.str_nt(ntd)) # print(next(iter(results_goeas))._fields) -class _Run(): + +class _Run: """Run test.""" - patrec = '{NS} {GO} {e} {RS} {RP:>12} {PVAL:8.2e} {BONF:8.2e} {BH:8.2e} {STU}' + patrec = "{NS} {GO} {e} {RS} {RP:>12} {PVAL:8.2e} {BONF:8.2e} {BH:8.2e} {STU}" def __init__(self, taxid, fin_gene2go, fin_gobasic): _fin = os.path.join(REPO, fin_gene2go) - dnld_ncbi_gene_file(_fin, loading_bar=None) + dnld_ncbi_gene_file(_fin) self.gene2go = read_ncbi_gene2go(_fin, [taxid]) _fin_obo = os.path.join(REPO, fin_gobasic) - self.godag = get_godag(_fin_obo, loading_bar=None) + self.godag = get_godag(_fin_obo) def str_nt(self, ntd): return self.patrec.format( - NS=ntd.NS, GO=ntd.GO, - RS=ntd.ratio_in_study, RP=str(ntd.ratio_in_pop), + NS=ntd.NS, + GO=ntd.GO, + RS=ntd.ratio_in_study, + RP=str(ntd.ratio_in_pop), e=ntd.enrichment, - PVAL=ntd.p_uncorrected, BONF=ntd.p_bonferroni, BH=ntd.p_fdr_bh, - STU=ntd.study_items) + PVAL=ntd.p_uncorrected, + BONF=ntd.p_bonferroni, + BH=ntd.p_fdr_bh, + STU=ntd.study_items, + ) def get_genes_study_n_bg(self): """Get the study and background genes.""" - genes = {'stu':set(), 'pop':set()} - fin_xlsx = 'data/i122/study_and_bg_genes.xlsx' + genes = {"stu": set(), "pop": set()} + fin_xlsx = "data/i122/study_and_bg_genes.xlsx" book = xlrd.open_workbook(os.path.join(fin_xlsx)) sheet = book.sheet_by_index(0) for rownum in range(sheet.nrows): gene_stu = self._get_gene(rownum, 0, sheet) gene_pop = self._get_gene(rownum, 1, sheet) if gene_stu: - genes['stu'].add(gene_stu) + genes["stu"].add(gene_stu) if gene_pop: - genes['pop'].add(gene_pop) - print('{N} Study genes. {P} Background genes READ: {XLSX}'.format( - N=len(genes['stu']), P=len(genes['pop']), XLSX=fin_xlsx)) - return genes['stu'], genes['pop'] - - @staticmethod + genes["pop"].add(gene_pop) + print( + "{N} Study genes. {P} Background genes READ: {XLSX}".format( + N=len(genes["stu"]), P=len(genes["pop"]), XLSX=fin_xlsx + ) + ) + return genes["stu"], genes["pop"] + + @staticmethod def _get_gene(row, col, sheet): """Return the gene ID, if a gene is found.""" gene_id = sheet.cell_value(row, col) @@ -83,5 +97,5 @@ def _get_gene(row, col, sheet): return int(gene_id) -if __name__ == '__main__': +if __name__ == "__main__": test_i122() diff --git a/tests/test_i96_goea_ncbi.py b/tests/test_i96_goea_ncbi.py index 9036936b..5bbb61a6 100755 --- a/tests/test_i96_goea_ncbi.py +++ b/tests/test_i96_goea_ncbi.py @@ -1,12 +1,10 @@ #!/usr/bin/env python """Test to re-produce issue#96: Passes currently.""" -from __future__ import print_function - import os + from goatools.base import get_godag -from goatools.associations import dnld_ncbi_gene_file -from goatools.associations import read_ncbi_gene2go +from goatools.associations import dnld_ncbi_gene_file, read_ncbi_gene2go from goatools.go_enrichment import GOEnrichmentStudy from goatools.test_data.genes_NCBI_9606_All import GENEID2NT @@ -17,99 +15,1309 @@ def test_i96(): """Test to re-produce issue#96: Passes currently.""" # Trying to duplicate: ValueError("All values in table must be nonnegative. # Get genes - print('CWD', os.getcwd()) + print("CWD", os.getcwd()) study_ids = _get_geneids() population_ids = GENEID2NT.keys() # Get databases print(os.getcwd()) - fin = os.path.join(REPO, 'gene2go') - dnld_ncbi_gene_file(fin, loading_bar=None) + fin = os.path.join(REPO, "gene2go") + dnld_ncbi_gene_file(fin) gene2go = read_ncbi_gene2go(fin, [9606]) fin_obo = os.path.join(REPO, "go-basic.obo") - godag = get_godag(fin_obo, loading_bar=None) - goeaobj = GOEnrichmentStudy(population_ids, gene2go, godag, methods=['fdr_bh']) + godag = get_godag(fin_obo) + goeaobj = GOEnrichmentStudy(population_ids, gene2go, godag, methods=["fdr_bh"]) # Run GOEA Gene Ontology Enrichment Analysis - results_goeas = goeaobj.run_study(study_ids) + goeaobj.run_study(study_ids) # pylint: disable=line-too-long, bad-continuation def _get_geneids(): """Return study gene set.""" - symbol2geneid = {nt.Symbol:g for g, nt in GENEID2NT.items()} - symbols = ['MICAL2', 'MIR1231', 'ZMIZ1', 'CRIM1', 'SMAD3', 'EFEMP1', 'CRIM1', 'ANXA2', 'VGLL3', 'FHL2', 'FSTL1', 'KIAA1456', 'MIR4316', 'MYH9', 'SIPA1L1', 'C15orf53', 'TRAM2', 'IGFBP7-AS1', 'CALD1', - 'RP5-1120P11.1', 'WNT2B', 'DDAH1', 'MIR1203', 'NRG1', 'SEC24D', 'NHSL2', 'ERGIC1', 'RPL37A', 'PTPN14', 'FEZ2', 'VEGFC', 'C2orf61', 'MIR30A', 'CAPZB', 'SMAD3', 'AAGAB', 'EPS8', 'ITGB5', 'LRP1-AS', - 'NRP1', 'WWTR1-AS1', 'CDK6', 'ENTPD6', 'THBS1', 'AC016735.2', 'ZCCHC24', 'LINC00592', 'HSPG2', 'MIRLET7A2', 'SMAD6', 'STARD13', 'EMP1', 'LINC00656', 'CALD1', 'C10orf142', 'ARID5B', 'MIR6809', - 'MIR5191', 'SNORA59A', 'KIAA1462', 'FERMT2', 'ADAMTS6', 'RBMS1', 'MIR8073', 'MBNL1-AS1', 'TGFBR1', 'SH2D4A', 'DST', 'OTX2-AS1', 'LAMA4', 'ASAP1', 'RP11-161M6.2', 'DST', 'SMAD7', 'AFAP1-AS1', - 'MIR4803', 'RP5-1158E12.3', 'LPCAT2', 'NDST1', 'FAM105A', 'SMURF2', 'RP4-673D20.3', 'ZMIZ1', 'NLRP1', 'NAV2-AS5', 'CCNL1', 'MICAL2', 'SH3RF1', 'IL1R2', 'LINC00161', 'MIR1294', 'MYLK-AS2', 'THBS1', - 'RNF24', 'TNS4', 'FBN1', 'DCUN1D3', 'CREB3L2', 'RSRP1', 'RP1-63G5.7', 'LINC01365', 'RPL23AP87', 'SNORA59A', 'DUSP7', 'TMEM163', 'EXT1', 'ATXN10', 'MIR4316', 'MYL9', 'ABHD2', 'ADCY5', 'LRTM2', - 'FAM83C', 'DLC1', 'LINC01057', 'SMAD6', 'NAV1', 'MIR584', 'TMEM212', 'COL15A1', 'PLCXD2', 'PRRX1', 'BCL10', 'MIR2278', 'FEM1B', 'ABTB2', 'NIPAL2', 'CDCP1', 'CCDC80', 'FBN1', 'DEFB104A', 'RP11-30P6.6', - 'LINC01085', 'AKAP2', 'ADAMTS20', 'MIR3152', 'LMO7-AS1', 'RP11-887P2.5', 'ARHGAP26-IT1', 'TSPAN17', 'CYP3A43', 'CORO2B', 'RP3-332B22.1', 'RARA', 'MSN', 'UMODL1', 'C12orf74', 'TRAM2', 'LHFPL2', - 'TRIP13', 'PALLD', 'NRP2', 'LINC00607', 'COL6A3', 'CLMP', 'MIR4316', 'PTPN14', 'LINC01354', 'CBR4', 'FNDC3B', 'LINC01426', 'WISP2', 'NUDT6', 'MIR6083', 'GRHL1', 'KIF13A', 'VCL', 'MIR125B1', - 'FOXP1-AS1', 'CLCF1', 'CDK5RAP2', 'RP11-356I2.4', 'NEK6', 'CTLA4', 'SLC8A1-AS1', 'BACH1', 'MIR100', 'MIR3619', 'CD44', 'GOLT1A', 'LUM', 'BCAS3', 'MIR1208', 'NGF', 'INHBA', 'MRPS23', 'STK40', 'HK1', - 'ASAP2', 'WBP1L', 'HMCN2', 'DEFB104B', 'VAC14', 'MYOF', 'PTGIS', 'KIRREL', 'MAP4K4', 'GALNT10', 'LHFPL3-AS2', 'EGFR', 'TGM2', 'DMRTB1', 'YPEL5', 'EPG5', 'FNDC3B', 'SND1', 'MIR6827', 'MIR3126', - 'LAMA4', 'PPIF', 'ITGB1', 'SEMA5B', 'TEAD1', 'PAMR1', 'GCLM', 'NFIB', 'TM4SF4', 'LINC01132', 'BASP1', 'SPECC1', 'MAPKAPK2', 'CSRP1', 'NID2', 'DST', 'ERG', 'GLI2', 'TMEM50A', 'MIR29A', 'NBPF26', - 'LRRC8D', 'LINC01119', 'FNDC3B', 'LINC00917', 'GADD45A', 'SP3', 'LINC01625', 'ADAMTS9-AS1', 'TCF23', 'NPLOC4', 'MIR1252', 'ZEB2-AS1', 'NEURL1-AS1', 'SLC25A51', 'XPNPEP1', 'WASF2', 'SLC25A3P1', - 'GPR88', 'RXFP2', 'RUNX1T1', 'ME3', 'RAI14', 'PNMA2', 'TOM1L2', 'CWC22', 'R3HCC1', 'MYLK-AS2', 'ERVFRD-1', 'FENDRR', 'TIPARP', 'SMAD6', 'TPM1', 'RP11-266L9.4', 'B4GALT1', 'REP15', 'MIR8052', 'LAMB1', - 'FRMD6-AS1', 'ETS1', 'COL1A2', 'MIR4289', 'MAP1B', 'NABP1', 'SLC8A1-AS1', 'NOTCH2', 'ZMIZ1', 'GRHL2', 'CTD-2015G9.1', 'DLC1', 'CCNL1', 'NBPF14', 'ITGA11', 'TNS3', 'TTPAL', 'NBPF19', 'STAT3', - 'MIR4668', 'LINC00607', 'CARMN', 'COL22A1', 'MIR583', 'WT1', 'NEK7', 'PLEKHG4B', 'TIGD6', 'RP11-778O17.4', 'NOTCH2', 'SPACA1', 'TEAD1', 'TOX2', 'GALNT2', 'EVX2', 'SALRNA3', 'DEFB103B', 'CCDC12', - 'ITGAE', 'EXT1', 'RRBP1', 'TMEM189-UBE2V1', 'SEPT11', 'EXT1', 'ZBTB38', 'ASTN2-AS1', 'RAI1', 'LETM2', 'HIPK2', 'CLMP', 'PDLIM5', 'UBR5', 'RP1-102K2.8', 'RREB1', 'MIR1260B', 'NEK6', 'ARHGEF12', - 'PRR16', 'STRA6', 'MIR222', 'SH3RF1', 'STK35', 'RDH10-AS1', 'RP5-1070A16.1', 'AP3M1', 'ANKRD33B', 'SNORA3B', 'ANKRD40', 'PRRC2B', 'LAMC1', 'ADAMTSL1', 'EDN2', 'FMN1', 'CACNA1C-AS4', 'RP11-90C4.2', - 'MBNL1-AS1', 'TARS', 'LIPC', 'ASAP1', 'MIR6090', 'PAX2', 'CHD2', 'DLC1', 'LPCAT1', 'CITF22-92A6.2', 'NAV1', 'HIPK2', 'CCNI', 'C6orf89', 'VAC14-AS1', 'LINC01225', 'TRIM8', 'ADAM12', 'LMCD1', - 'RP11-205K6.1', 'PPFIBP1', 'SLC4A4', 'TBX15', 'ACOXL', 'FAM83C', 'DYRK1A', 'AC109344.1', 'DUSP1', 'RPL22L1', 'NCOA4', 'LOXL1-AS1', 'CORO1C', 'CMSS1', 'CYP11A1', 'SYNJ2-IT1', 'EPS8L3', 'RP11-366L20.2', - 'RP4-569M23.5', 'PAX8', 'MIR3152', 'ACKR3', 'LTBP2', 'ARSJ', 'RFTN1', 'FHL1', 'NARS2', 'EXT1', 'SH3BGRL3', 'ADAMTS2', 'COL8A1', 'IRF2BPL', 'NREP', 'NOTCH2NL', 'USP2', 'HSPH1', 'SKOR2', 'C16orf72', - 'ENPP2', 'TSPAN18', 'IRF1', 'TNC', 'OPTC', 'PDLIM5', 'PGBD3', 'CCDC167', 'COL12A1', 'RIMBP3', 'TFPI', 'SOCS5', 'HSPG2', 'SH2B3', 'LINC00316', 'DDR2', 'BCL10', 'LINC01132', 'ABHD5', 'TGFB3', 'DYSF', - 'TRPA1', 'TRIM8', 'UCK2', 'SOST', 'COL5A2', 'CA4', 'IGDCC3', 'SIPA1L1', 'SRCIN1', 'AP001626.1', 'AC010091.1', 'RAI1-AS1', 'MIR1260B', 'TCHP', 'RP11-168P8.5', 'BDKRB1', 'TSPYL5', 'MYLK-AS2', 'FOSL2', - 'MIR4743', 'TGFBI', 'AC020571.3', 'THBS1', 'NANOS1', 'LINC01447', 'LMO7-AS1', 'AP2S1', 'RMDN2', 'MIR4316', 'DCLK2', 'MIR4280', 'NFATC3', 'SEMA5B', 'MYO1B', 'ROR1', 'MED15', 'NFX1', 'HIC1', 'MIR1203', - 'STX1A', 'ANKFN1', 'CTB-113D17.1', 'MIR205HG', 'FAM129B', 'SH3BP4', 'RP11-478P10.1', 'MIR151A', 'CACNG4', 'CRYBA1', 'APBB2', 'CCDC80', 'TRIO', 'F2R', 'RAF1', 'CYTOR', 'ITGB6', 'PITPNB', 'DHRS3', - 'WDFY2', 'RNF141', 'ARL6IP5', 'MIR4435-2', 'MAP4K4', 'MIRLET7I', 'RND3', 'CXXC5', 'SNORA70E', 'ANXA2', 'KCCAT211', 'PLCB1', 'TMCC2', 'EXT1', 'RP11-366L20.2', 'WDR86-AS1', 'HMG20A', 'RP11-38F22.1', - 'FYCO1', 'LPP', 'HABP2', 'TSPEAR', 'ABLIM1', 'RP11-443B7.1', 'FAM20B', 'RASSF10', 'XPC', 'TNIP3', 'ACSL4', 'MTMR2', 'TNIK', 'RELT', 'CRIPT', 'RP11-572C15.5', 'CCDC81', 'CMSS1', 'C6orf223', 'SHISA4', - 'PDGFA', 'HS1BP3-IT1', 'MYPN', 'XCR1', 'ZFP36L1', 'CBR4', 'TRAPPC3', 'MIR802', 'CSRP1', 'DAPK2', 'SPESP1', 'RP11-890B15.3', 'SHB', 'INSIG2', 'ADGRG1', 'GPC6-AS1', 'KIRREL3-AS3', 'LXN', 'CBR4', 'CPA1', - 'MINPP1', 'NFIX', 'FLRT2', 'MIR6070', 'USP3', 'PRR16', 'ALPL', 'RP11-379K22.2', 'ADAMTSL1', 'RRBP1', 'RP11-430H10.2', 'MAPKAPK3', 'ABHD16B', 'CDKN3', 'STEAP3-AS1', 'RTP3', 'SLA', 'CYP1B1-AS1', - 'MIR6888', 'HIVEP3', 'LINC01119', 'CCDC71L', 'MACF1', 'EFNB1', 'CBLB', 'MIR760', 'NAMA', 'LNX1-AS1', 'KMT2E', 'PYROXD2', 'LMO7DN', 'EML4', 'CCDC80', 'SEC22A', 'COL21A1', 'CDC42EP3', 'EPHA2', 'CAPZA2', - 'PHLDB2', 'TPPP', 'MIR3129', 'LIMA1', 'PDE1C', 'RUNX2', 'SPRED2', 'C1QTNF1', 'EPHA2', 'IRF1', 'MIR4263', 'RXFP2', 'MTNR1A', 'CUEDC1', 'GCNT1', 'MIR3152', 'ST5', 'ITGA11', 'RP11-366L20.2', 'MAGI2', - 'KCCAT211', 'MIR6090', 'EMX1', 'CDC42EP3', 'PKP4', 'BCL10', 'SERPINB7', 'IKBKE', 'AGMO', 'RUNX1', 'PHC2', 'SH2D7', 'PARVA', 'B4GALT5', 'STAT4', 'ACTN4', 'RTKN2', 'MIR1260B', 'SH3PXD2B', 'ACTN1-AS1', - 'LINC00882', 'SLC8A1', 'NREP-AS1', 'THADA', 'DDAH1', 'MIR4274', 'SERPINE1', 'ASAP1-IT2', 'APH1B', 'IGF2BP2-AS1', 'MUSK', 'TRAF3IP2', 'COLEC12', 'EXT1', 'FLJ22447', 'CTB-113P19.1', 'RBPJ', - 'RP11-230G5.2', 'PALLD', 'SLC1A2', 'MIR190A', 'FHOD3', 'LHFPL2', 'C2CD5', 'SLCO4A1', 'SYPL1', 'ARHGAP18', 'MIR4703', 'SOX1', 'DIXDC1', 'TM2D3', 'MIR4743', 'CASC23', 'KDM4C', 'RAI14', 'MYOZ3', - 'MAP3K12', 'MIR2278', 'HPCAL1', 'C2orf78', 'DNASE2B', 'RP1-111D6.4', 'AC007246.3', 'SRGAP2B', 'CTD-2078B5.2', 'RMI2', 'PDGFC', 'PSAP', 'KLF4', 'MYO1B', 'PDGFC', 'EPHA2', 'MGC27382', 'FAM188B', 'FLNB', - 'ATP2B4', 'NR3C1', 'DUOX1', 'UPF1', 'MIR802', 'DLD', 'EDN2', 'ZNF703', 'IRAK1', 'ASB6', 'FENDRR', 'FAM105A', 'NAV2-AS5', 'SMG9', 'ELL', 'TNFRSF11B', 'LINC00619', 'LAMC1', 'MYLK', 'FAAP100', 'MIR8052', - 'ACSL4', 'ANK3', 'COL5A1-AS1', 'PXN', 'TSPAN18', 'PCSK1', 'TOR3A', 'COPS8', 'RRAS2', 'ERRFI1', 'CDH2', 'TMOD3', 'NFIB', 'AFAP1', 'SMIM14', 'PTRH2', 'EIF2D', 'PTPN1', 'MIR8079', 'NRP1', 'TCF21', - 'IL1R1', 'FAP', 'TNS3', 'COBL', 'PDLIM5', 'RAD51B', 'C9orf152', 'RFLNA', 'SYT2', 'LINC01101', 'DLX4', 'ZMIZ1', 'PLCE1-AS2', 'JPH3', 'SEC61A1', 'AC007163.3', 'NKX3-2', 'COL8A1', 'LINC01151', 'DERA', - 'Z99756.1', 'CEL', 'GYG1', 'MIR551A', 'TNKS1BP1', 'FAM20A', 'RPTN', 'NSMAF', 'DEFB103A', 'MIR3937', 'BRD4', 'MIR4256', 'DTWD1', 'DDX25', 'SMYD2', 'ADAMTS8', 'ITGA3', 'ZNF705B', 'SLC35F4', 'LINC00656', - 'RASA4', 'ADAMTS2', 'KCNK3', 'MYEOV', 'SKI', 'CAMK2D', 'NAA20', 'ITGA3', 'RP5-1172A22.1', 'NEK7', 'CASZ1', 'LINC00882', 'DPP4', 'CTD-2193G5.1', 'HEPHL1', 'SEMA5B', 'SLC6A9', 'ADM', 'LUCAT1', - 'RP11-284G10.1', 'RP11-177H13.2', 'SPECC1L', 'EXT1', 'PTX3', 'LUZP4', 'SLC6A6', 'RAP1B', 'FBN1', 'AKAP2', 'MPRIP', 'CFDP1', 'CTB-113P19.1', 'NRG2', 'KBTBD12', 'PPARD', 'SH3BP4', 'STK24', 'MIR4725', - 'TEAD1', 'LINC01118', 'EPSTI1', 'PVT1', 'CUZD1', 'IGFBP7-AS1', 'SLC7A1', 'AC013461.1', 'BCL2L14', 'IGFBP5', 'MIR620', 'NRG1-IT3', 'BRD4', 'ADAM19', 'ATP6V0D1', 'NFIX', 'PCA3', 'ACTL8', 'TGM4', - 'MIR1-1', 'CORO2B', 'SH3PXD2B', 'PLXNA1', 'EPS8L3', 'LAMB1', 'SLC22A23', 'ALX4', 'EHD4', 'DUSP6', 'FRMD6-AS2', 'ENTPD7', 'SIPA1L1', 'SH3PXD2A-AS1', 'TUBB2B', 'SNORA72', 'MIR1293', 'TRIL', 'GDF6', - 'FAT1', 'ARHGAP35', 'HTRA1', 'IGFL3', 'NTM', 'H1F0', 'EBF2', 'CCL2', 'SGIP1', 'TGFBI', 'MED9', 'SNTB1', 'SCRT1', 'AMPD3', 'COL6A3', 'RARRES1', 'FERMT2', 'CEBPG', 'SOCS5', 'RMDN2', 'DENND5B', 'PDZD4', - 'MIR8056', 'TARID', 'MAGI1-AS1', 'SYNPO2', 'ARNT2', 'KDM5C', 'HSD52', 'ZDHHC13', 'C9orf152', 'MED30', 'SBF2-AS1', 'CTD-2078B5.2', 'CAPZB', 'SHISA8', 'MIR1262', 'HOPX', 'PRDM12', 'SEMA3D', - 'RP5-1142A6.8', 'MAPKAPK2', 'PDE4B', 'LINC00882', 'NBPF14', 'DLEU1-AS1', 'NR5A1', 'MIR6720', 'ADAM12', 'ADAM30', 'RAPGEF4', 'AC006262.6', 'RGS3', 'SMAD6', 'SYNJ2-IT1', 'TEX2', 'LINC00381', 'DST', - 'MPZL1', 'BNC2', 'MIR4674', 'ARFGAP3', 'ADRA1D', 'SLIT3', 'COL27A1', 'SLC25A51', 'ATG16L2', 'ZMIZ1', 'MIR129-1', 'MTDH', 'MIR7848', 'MIR4430', 'HOXC13', 'PARD3', 'CAMK2B', 'ABHD5', 'RPLP1', 'MPEG1', - 'MLK7-AS1', 'QKI', 'SMYD2', 'ZBTB17', 'C6orf222', 'NAV2-AS5', 'OVAAL', 'LINC01048', 'TNFSF8', 'DHRS12', 'MACF1', 'PLEKHA2', 'LINC00926', 'EHF', 'KCNJ6', 'MYO1B', 'MIR2278', 'WASL', 'SRP9', - 'MIR4520-1', 'LINC01500', 'SLC30A2', 'PRPH2', 'RP11-221J22.1', 'SNORA31', 'SIPA1L1', 'SERAC1', 'EGFR', 'SLC38A2', 'FRMPD3', 'RP11-430H10.2', 'NUAK2', 'CELF1', 'CD36', 'CHSY1', 'ERGIC1', 'ZNF488', - 'SEMA6B', 'CISTR', 'MFNG', 'HS1BP3-IT1', 'NBPF10', 'CYTOR', 'CPED1', 'MIR378G', 'BMPER', 'ARFGAP3', 'RAB8B', 'LINC00917', 'ABCA17P', 'FHL2', 'MIR620', 'SPRED2', 'RAI14', 'ZSCAN20', 'MMD', 'MIR548Q', - 'SNORA3B', 'MIR7854', 'CEP350', 'RGAG4', 'HRCT1', 'CCL8', 'IL24', 'TBC1D4', 'IRS1', 'SLC8A1', 'PLS3-AS1', 'SIL1', 'LMO2', 'FGF3', 'RNVU1-19', 'LEPROT', 'MIR4774', 'CREB3L2', 'TSPAN2', 'KCNIP3', - 'MAP3K14', 'RASSF8', 'MIR4419A', 'EFHC1', 'MIR297', 'KSR1', 'GSN', 'ATG4A', 'VASN', 'BBS12', 'NUFIP1', 'CAMK2D', 'PPL', 'H2AFY', 'CRTAM', 'RP11-47I22.2', 'BLID', 'HTR2B', 'GPR183', 'UQCC2', 'WDR36', - 'ACTL8', 'COLEC12', 'CCDC85B', 'LINC00607', 'ITGA11', 'MBNL1-AS1', 'KRT85', 'LINC01478', 'LTBP2', 'NRG1', 'ALX4', 'ITGA11', 'MBOAT2', 'HELT', 'ZFP36L1', 'CISTR', 'FAM76B', 'CISTR', 'CDK5RAP2', - 'TINAGL1', 'SNORD96B', 'RP11-1008C21.1', 'CMSS1', 'RP11-367G18.1', 'MRPL33', 'AC012462.2', 'ZMIZ1', 'TSPAN18', 'PANDAR', 'MIR6888', 'MIR548AH', 'RP11-162I7.1', 'MIR548AZ', 'ABHD5', 'GALNT5', 'ANKRD1', - 'RP11-554D15.1', 'CALD1', 'CAV1', 'OLFML2B', 'RP11-363D14.1', 'RP11-443B7.1', 'MIR629', 'ASB1', 'GFRA1', 'CHAMP1', 'MIR2052', 'ADAMTSL4', 'NAIF1', 'MIR190A', 'THSD4', 'KCNIP3', 'TIAF1', 'MIR4265', - 'RP11-245G13.2', 'PYGL', 'OXCT1-AS1', 'GLI2', 'CDC42EP3', 'PLAU', 'CLIC4', 'ERMN', 'USP12', 'PLXNB2', 'WARS', 'SNORA72', 'LATS2', 'PLXNA4', 'RP5-898J17.1', 'PSD3', 'CTSC', 'TBX20', 'GDNF', 'TOP1', - 'DOK1', 'GPR12', 'LINC01082', 'MIR7973-1', 'MIR548AJ1', 'PPP1R3C', 'NPIPB11', 'MLXIP', 'RP11-890B15.3', 'RASA4B', 'MIR619', 'NFATC4', 'ABRA', 'CHML', 'SYNDIG1', 'PEAR1', 'LIFR-AS1', 'HIF1AN', - 'FOXP1-AS1', 'SELPLG', 'EFCAB1', 'MIR4743', 'WBSCR28', 'NHS-AS1', 'ABCA1', 'SOGA1', 'ARID5B', 'BICC1', 'CRY2', 'RASD2', 'MINOS1', 'MIR4277', 'PLAC9', 'MYLK-AS2', 'TCF21', 'WIPF1', 'MYF6', - 'AC007880.1', 'LINC00189', 'FST', 'TBC1D2', 'IL34', 'MIR101-2', 'NRG1-IT3', 'APOA1', 'CAV2', 'MIR4642', 'ATP8B1', 'ATG16L2', 'PDGFRL', 'MYO1B', 'TIMP3', 'CD82', 'MIR4319', 'ANKEF1', 'NPFFR1', 'ZHX2', - 'SHQ1', 'NGF', 'AC002480.2', 'LINC01592', 'SRGAP2', 'RP11-148B18.1', 'LOXL1-AS1', 'KIF6', 'GGTA1P', 'ROS1', 'GNB1L', 'THY1', 'SIX6', 'ABL1', 'AC002480.3', 'SFTPB', 'KSR1', 'FMNL1', 'ARHGEF16', - 'CREB5', 'NPC1', 'MIR760', 'ADAMTS20', 'DAPL1', 'SLC22A12', 'LINC01276', 'SLC22A5', 'EML4', 'TPST1', 'FLJ45079', 'ARRB1', 'CCDC34', 'NEK7', 'AQP9', 'BBOX1-AS1', 'ASAP1-IT2', 'WDR61', 'CTA-929C8.6', - 'LFNG', 'FAM188B', 'CALD1', 'ZFAND3', 'GTF3A', 'CYP26B1', 'KIF3A', 'MIR1252', 'MIR4424', 'MYO1B', 'SVIL', 'REV1', 'YWHAH', 'TSPAN14', 'PNMA1', 'RP11-111I12.1', 'CTGF', 'PDGFRA', 'CRISPLD2', 'HAND2', - 'RP11-1124B17.1', 'SLC4A4', 'TMEM75', 'NABP1', 'RUSC2', 'PDGFC', 'MIR4476', 'ARHGEF10L', 'PIK3R1', 'RARB', 'C14orf132', 'PDGFC', 'GFRA2', 'MIR4711', 'XYLT1', 'PLOD2', 'DIEXF', 'VEGFC', 'CTTN', - 'ZNF618', 'MIR3937', 'MED15', 'NRP2', 'AC064834.3', 'FOXS1', 'AQP1', 'UTRN', 'MRPL33', 'CMTM4', 'AKR1C1', 'ALX4', 'TMEM241', 'SAMD12', 'FST', 'TFAP2A', 'MATN1-AS1', 'LINC00484', 'KIF12', 'PHLDA3', - 'C8orf86', 'LINC01143', 'AFAP1-AS1', 'FNBP1L', 'SPAG6', 'RIPK2', 'KY', 'VSX1', 'SACM1L', 'CYTOR', 'STK17B', 'MAPRE1', 'TLDC1', 'PLXND1', 'GUSBP5', 'MIR4293', 'DCAF5', 'DOCK1', 'PRRX1', 'CRAT40', - 'AL450226.2', 'CISH', 'RP11-137H2.4', 'BIN1', 'LINC00837', 'CCNYL1', 'UBE2H', 'LIFR-AS1', 'CCDC171', 'CACNG4', 'ZBTB40', 'ANXA2', 'MYRIP', 'EVC', 'DCPS', 'ZBTB38', 'RTN4', 'S100A11', 'PSMB7', 'SP6', - 'TSPEAR', 'INSC', 'FLJ42102', 'MIR1205', 'ISLR'] + symbol2geneid = {nt.Symbol: g for g, nt in GENEID2NT.items()} + symbols = [ + "MICAL2", + "MIR1231", + "ZMIZ1", + "CRIM1", + "SMAD3", + "EFEMP1", + "CRIM1", + "ANXA2", + "VGLL3", + "FHL2", + "FSTL1", + "KIAA1456", + "MIR4316", + "MYH9", + "SIPA1L1", + "C15orf53", + "TRAM2", + "IGFBP7-AS1", + "CALD1", + "RP5-1120P11.1", + "WNT2B", + "DDAH1", + "MIR1203", + "NRG1", + "SEC24D", + "NHSL2", + "ERGIC1", + "RPL37A", + "PTPN14", + "FEZ2", + "VEGFC", + "C2orf61", + "MIR30A", + "CAPZB", + "SMAD3", + "AAGAB", + "EPS8", + "ITGB5", + "LRP1-AS", + "NRP1", + "WWTR1-AS1", + "CDK6", + "ENTPD6", + "THBS1", + "AC016735.2", + "ZCCHC24", + "LINC00592", + "HSPG2", + "MIRLET7A2", + "SMAD6", + "STARD13", + "EMP1", + "LINC00656", + "CALD1", + "C10orf142", + "ARID5B", + "MIR6809", + "MIR5191", + "SNORA59A", + "KIAA1462", + "FERMT2", + "ADAMTS6", + "RBMS1", + "MIR8073", + "MBNL1-AS1", + "TGFBR1", + "SH2D4A", + "DST", + "OTX2-AS1", + "LAMA4", + "ASAP1", + "RP11-161M6.2", + "DST", + "SMAD7", + "AFAP1-AS1", + "MIR4803", + "RP5-1158E12.3", + "LPCAT2", + "NDST1", + "FAM105A", + "SMURF2", + "RP4-673D20.3", + "ZMIZ1", + "NLRP1", + "NAV2-AS5", + "CCNL1", + "MICAL2", + "SH3RF1", + "IL1R2", + "LINC00161", + "MIR1294", + "MYLK-AS2", + "THBS1", + "RNF24", + "TNS4", + "FBN1", + "DCUN1D3", + "CREB3L2", + "RSRP1", + "RP1-63G5.7", + "LINC01365", + "RPL23AP87", + "SNORA59A", + "DUSP7", + "TMEM163", + "EXT1", + "ATXN10", + "MIR4316", + "MYL9", + "ABHD2", + "ADCY5", + "LRTM2", + "FAM83C", + "DLC1", + "LINC01057", + "SMAD6", + "NAV1", + "MIR584", + "TMEM212", + "COL15A1", + "PLCXD2", + "PRRX1", + "BCL10", + "MIR2278", + "FEM1B", + "ABTB2", + "NIPAL2", + "CDCP1", + "CCDC80", + "FBN1", + "DEFB104A", + "RP11-30P6.6", + "LINC01085", + "AKAP2", + "ADAMTS20", + "MIR3152", + "LMO7-AS1", + "RP11-887P2.5", + "ARHGAP26-IT1", + "TSPAN17", + "CYP3A43", + "CORO2B", + "RP3-332B22.1", + "RARA", + "MSN", + "UMODL1", + "C12orf74", + "TRAM2", + "LHFPL2", + "TRIP13", + "PALLD", + "NRP2", + "LINC00607", + "COL6A3", + "CLMP", + "MIR4316", + "PTPN14", + "LINC01354", + "CBR4", + "FNDC3B", + "LINC01426", + "WISP2", + "NUDT6", + "MIR6083", + "GRHL1", + "KIF13A", + "VCL", + "MIR125B1", + "FOXP1-AS1", + "CLCF1", + "CDK5RAP2", + "RP11-356I2.4", + "NEK6", + "CTLA4", + "SLC8A1-AS1", + "BACH1", + "MIR100", + "MIR3619", + "CD44", + "GOLT1A", + "LUM", + "BCAS3", + "MIR1208", + "NGF", + "INHBA", + "MRPS23", + "STK40", + "HK1", + "ASAP2", + "WBP1L", + "HMCN2", + "DEFB104B", + "VAC14", + "MYOF", + "PTGIS", + "KIRREL", + "MAP4K4", + "GALNT10", + "LHFPL3-AS2", + "EGFR", + "TGM2", + "DMRTB1", + "YPEL5", + "EPG5", + "FNDC3B", + "SND1", + "MIR6827", + "MIR3126", + "LAMA4", + "PPIF", + "ITGB1", + "SEMA5B", + "TEAD1", + "PAMR1", + "GCLM", + "NFIB", + "TM4SF4", + "LINC01132", + "BASP1", + "SPECC1", + "MAPKAPK2", + "CSRP1", + "NID2", + "DST", + "ERG", + "GLI2", + "TMEM50A", + "MIR29A", + "NBPF26", + "LRRC8D", + "LINC01119", + "FNDC3B", + "LINC00917", + "GADD45A", + "SP3", + "LINC01625", + "ADAMTS9-AS1", + "TCF23", + "NPLOC4", + "MIR1252", + "ZEB2-AS1", + "NEURL1-AS1", + "SLC25A51", + "XPNPEP1", + "WASF2", + "SLC25A3P1", + "GPR88", + "RXFP2", + "RUNX1T1", + "ME3", + "RAI14", + "PNMA2", + "TOM1L2", + "CWC22", + "R3HCC1", + "MYLK-AS2", + "ERVFRD-1", + "FENDRR", + "TIPARP", + "SMAD6", + "TPM1", + "RP11-266L9.4", + "B4GALT1", + "REP15", + "MIR8052", + "LAMB1", + "FRMD6-AS1", + "ETS1", + "COL1A2", + "MIR4289", + "MAP1B", + "NABP1", + "SLC8A1-AS1", + "NOTCH2", + "ZMIZ1", + "GRHL2", + "CTD-2015G9.1", + "DLC1", + "CCNL1", + "NBPF14", + "ITGA11", + "TNS3", + "TTPAL", + "NBPF19", + "STAT3", + "MIR4668", + "LINC00607", + "CARMN", + "COL22A1", + "MIR583", + "WT1", + "NEK7", + "PLEKHG4B", + "TIGD6", + "RP11-778O17.4", + "NOTCH2", + "SPACA1", + "TEAD1", + "TOX2", + "GALNT2", + "EVX2", + "SALRNA3", + "DEFB103B", + "CCDC12", + "ITGAE", + "EXT1", + "RRBP1", + "TMEM189-UBE2V1", + "SEPT11", + "EXT1", + "ZBTB38", + "ASTN2-AS1", + "RAI1", + "LETM2", + "HIPK2", + "CLMP", + "PDLIM5", + "UBR5", + "RP1-102K2.8", + "RREB1", + "MIR1260B", + "NEK6", + "ARHGEF12", + "PRR16", + "STRA6", + "MIR222", + "SH3RF1", + "STK35", + "RDH10-AS1", + "RP5-1070A16.1", + "AP3M1", + "ANKRD33B", + "SNORA3B", + "ANKRD40", + "PRRC2B", + "LAMC1", + "ADAMTSL1", + "EDN2", + "FMN1", + "CACNA1C-AS4", + "RP11-90C4.2", + "MBNL1-AS1", + "TARS", + "LIPC", + "ASAP1", + "MIR6090", + "PAX2", + "CHD2", + "DLC1", + "LPCAT1", + "CITF22-92A6.2", + "NAV1", + "HIPK2", + "CCNI", + "C6orf89", + "VAC14-AS1", + "LINC01225", + "TRIM8", + "ADAM12", + "LMCD1", + "RP11-205K6.1", + "PPFIBP1", + "SLC4A4", + "TBX15", + "ACOXL", + "FAM83C", + "DYRK1A", + "AC109344.1", + "DUSP1", + "RPL22L1", + "NCOA4", + "LOXL1-AS1", + "CORO1C", + "CMSS1", + "CYP11A1", + "SYNJ2-IT1", + "EPS8L3", + "RP11-366L20.2", + "RP4-569M23.5", + "PAX8", + "MIR3152", + "ACKR3", + "LTBP2", + "ARSJ", + "RFTN1", + "FHL1", + "NARS2", + "EXT1", + "SH3BGRL3", + "ADAMTS2", + "COL8A1", + "IRF2BPL", + "NREP", + "NOTCH2NL", + "USP2", + "HSPH1", + "SKOR2", + "C16orf72", + "ENPP2", + "TSPAN18", + "IRF1", + "TNC", + "OPTC", + "PDLIM5", + "PGBD3", + "CCDC167", + "COL12A1", + "RIMBP3", + "TFPI", + "SOCS5", + "HSPG2", + "SH2B3", + "LINC00316", + "DDR2", + "BCL10", + "LINC01132", + "ABHD5", + "TGFB3", + "DYSF", + "TRPA1", + "TRIM8", + "UCK2", + "SOST", + "COL5A2", + "CA4", + "IGDCC3", + "SIPA1L1", + "SRCIN1", + "AP001626.1", + "AC010091.1", + "RAI1-AS1", + "MIR1260B", + "TCHP", + "RP11-168P8.5", + "BDKRB1", + "TSPYL5", + "MYLK-AS2", + "FOSL2", + "MIR4743", + "TGFBI", + "AC020571.3", + "THBS1", + "NANOS1", + "LINC01447", + "LMO7-AS1", + "AP2S1", + "RMDN2", + "MIR4316", + "DCLK2", + "MIR4280", + "NFATC3", + "SEMA5B", + "MYO1B", + "ROR1", + "MED15", + "NFX1", + "HIC1", + "MIR1203", + "STX1A", + "ANKFN1", + "CTB-113D17.1", + "MIR205HG", + "FAM129B", + "SH3BP4", + "RP11-478P10.1", + "MIR151A", + "CACNG4", + "CRYBA1", + "APBB2", + "CCDC80", + "TRIO", + "F2R", + "RAF1", + "CYTOR", + "ITGB6", + "PITPNB", + "DHRS3", + "WDFY2", + "RNF141", + "ARL6IP5", + "MIR4435-2", + "MAP4K4", + "MIRLET7I", + "RND3", + "CXXC5", + "SNORA70E", + "ANXA2", + "KCCAT211", + "PLCB1", + "TMCC2", + "EXT1", + "RP11-366L20.2", + "WDR86-AS1", + "HMG20A", + "RP11-38F22.1", + "FYCO1", + "LPP", + "HABP2", + "TSPEAR", + "ABLIM1", + "RP11-443B7.1", + "FAM20B", + "RASSF10", + "XPC", + "TNIP3", + "ACSL4", + "MTMR2", + "TNIK", + "RELT", + "CRIPT", + "RP11-572C15.5", + "CCDC81", + "CMSS1", + "C6orf223", + "SHISA4", + "PDGFA", + "HS1BP3-IT1", + "MYPN", + "XCR1", + "ZFP36L1", + "CBR4", + "TRAPPC3", + "MIR802", + "CSRP1", + "DAPK2", + "SPESP1", + "RP11-890B15.3", + "SHB", + "INSIG2", + "ADGRG1", + "GPC6-AS1", + "KIRREL3-AS3", + "LXN", + "CBR4", + "CPA1", + "MINPP1", + "NFIX", + "FLRT2", + "MIR6070", + "USP3", + "PRR16", + "ALPL", + "RP11-379K22.2", + "ADAMTSL1", + "RRBP1", + "RP11-430H10.2", + "MAPKAPK3", + "ABHD16B", + "CDKN3", + "STEAP3-AS1", + "RTP3", + "SLA", + "CYP1B1-AS1", + "MIR6888", + "HIVEP3", + "LINC01119", + "CCDC71L", + "MACF1", + "EFNB1", + "CBLB", + "MIR760", + "NAMA", + "LNX1-AS1", + "KMT2E", + "PYROXD2", + "LMO7DN", + "EML4", + "CCDC80", + "SEC22A", + "COL21A1", + "CDC42EP3", + "EPHA2", + "CAPZA2", + "PHLDB2", + "TPPP", + "MIR3129", + "LIMA1", + "PDE1C", + "RUNX2", + "SPRED2", + "C1QTNF1", + "EPHA2", + "IRF1", + "MIR4263", + "RXFP2", + "MTNR1A", + "CUEDC1", + "GCNT1", + "MIR3152", + "ST5", + "ITGA11", + "RP11-366L20.2", + "MAGI2", + "KCCAT211", + "MIR6090", + "EMX1", + "CDC42EP3", + "PKP4", + "BCL10", + "SERPINB7", + "IKBKE", + "AGMO", + "RUNX1", + "PHC2", + "SH2D7", + "PARVA", + "B4GALT5", + "STAT4", + "ACTN4", + "RTKN2", + "MIR1260B", + "SH3PXD2B", + "ACTN1-AS1", + "LINC00882", + "SLC8A1", + "NREP-AS1", + "THADA", + "DDAH1", + "MIR4274", + "SERPINE1", + "ASAP1-IT2", + "APH1B", + "IGF2BP2-AS1", + "MUSK", + "TRAF3IP2", + "COLEC12", + "EXT1", + "FLJ22447", + "CTB-113P19.1", + "RBPJ", + "RP11-230G5.2", + "PALLD", + "SLC1A2", + "MIR190A", + "FHOD3", + "LHFPL2", + "C2CD5", + "SLCO4A1", + "SYPL1", + "ARHGAP18", + "MIR4703", + "SOX1", + "DIXDC1", + "TM2D3", + "MIR4743", + "CASC23", + "KDM4C", + "RAI14", + "MYOZ3", + "MAP3K12", + "MIR2278", + "HPCAL1", + "C2orf78", + "DNASE2B", + "RP1-111D6.4", + "AC007246.3", + "SRGAP2B", + "CTD-2078B5.2", + "RMI2", + "PDGFC", + "PSAP", + "KLF4", + "MYO1B", + "PDGFC", + "EPHA2", + "MGC27382", + "FAM188B", + "FLNB", + "ATP2B4", + "NR3C1", + "DUOX1", + "UPF1", + "MIR802", + "DLD", + "EDN2", + "ZNF703", + "IRAK1", + "ASB6", + "FENDRR", + "FAM105A", + "NAV2-AS5", + "SMG9", + "ELL", + "TNFRSF11B", + "LINC00619", + "LAMC1", + "MYLK", + "FAAP100", + "MIR8052", + "ACSL4", + "ANK3", + "COL5A1-AS1", + "PXN", + "TSPAN18", + "PCSK1", + "TOR3A", + "COPS8", + "RRAS2", + "ERRFI1", + "CDH2", + "TMOD3", + "NFIB", + "AFAP1", + "SMIM14", + "PTRH2", + "EIF2D", + "PTPN1", + "MIR8079", + "NRP1", + "TCF21", + "IL1R1", + "FAP", + "TNS3", + "COBL", + "PDLIM5", + "RAD51B", + "C9orf152", + "RFLNA", + "SYT2", + "LINC01101", + "DLX4", + "ZMIZ1", + "PLCE1-AS2", + "JPH3", + "SEC61A1", + "AC007163.3", + "NKX3-2", + "COL8A1", + "LINC01151", + "DERA", + "Z99756.1", + "CEL", + "GYG1", + "MIR551A", + "TNKS1BP1", + "FAM20A", + "RPTN", + "NSMAF", + "DEFB103A", + "MIR3937", + "BRD4", + "MIR4256", + "DTWD1", + "DDX25", + "SMYD2", + "ADAMTS8", + "ITGA3", + "ZNF705B", + "SLC35F4", + "LINC00656", + "RASA4", + "ADAMTS2", + "KCNK3", + "MYEOV", + "SKI", + "CAMK2D", + "NAA20", + "ITGA3", + "RP5-1172A22.1", + "NEK7", + "CASZ1", + "LINC00882", + "DPP4", + "CTD-2193G5.1", + "HEPHL1", + "SEMA5B", + "SLC6A9", + "ADM", + "LUCAT1", + "RP11-284G10.1", + "RP11-177H13.2", + "SPECC1L", + "EXT1", + "PTX3", + "LUZP4", + "SLC6A6", + "RAP1B", + "FBN1", + "AKAP2", + "MPRIP", + "CFDP1", + "CTB-113P19.1", + "NRG2", + "KBTBD12", + "PPARD", + "SH3BP4", + "STK24", + "MIR4725", + "TEAD1", + "LINC01118", + "EPSTI1", + "PVT1", + "CUZD1", + "IGFBP7-AS1", + "SLC7A1", + "AC013461.1", + "BCL2L14", + "IGFBP5", + "MIR620", + "NRG1-IT3", + "BRD4", + "ADAM19", + "ATP6V0D1", + "NFIX", + "PCA3", + "ACTL8", + "TGM4", + "MIR1-1", + "CORO2B", + "SH3PXD2B", + "PLXNA1", + "EPS8L3", + "LAMB1", + "SLC22A23", + "ALX4", + "EHD4", + "DUSP6", + "FRMD6-AS2", + "ENTPD7", + "SIPA1L1", + "SH3PXD2A-AS1", + "TUBB2B", + "SNORA72", + "MIR1293", + "TRIL", + "GDF6", + "FAT1", + "ARHGAP35", + "HTRA1", + "IGFL3", + "NTM", + "H1F0", + "EBF2", + "CCL2", + "SGIP1", + "TGFBI", + "MED9", + "SNTB1", + "SCRT1", + "AMPD3", + "COL6A3", + "RARRES1", + "FERMT2", + "CEBPG", + "SOCS5", + "RMDN2", + "DENND5B", + "PDZD4", + "MIR8056", + "TARID", + "MAGI1-AS1", + "SYNPO2", + "ARNT2", + "KDM5C", + "HSD52", + "ZDHHC13", + "C9orf152", + "MED30", + "SBF2-AS1", + "CTD-2078B5.2", + "CAPZB", + "SHISA8", + "MIR1262", + "HOPX", + "PRDM12", + "SEMA3D", + "RP5-1142A6.8", + "MAPKAPK2", + "PDE4B", + "LINC00882", + "NBPF14", + "DLEU1-AS1", + "NR5A1", + "MIR6720", + "ADAM12", + "ADAM30", + "RAPGEF4", + "AC006262.6", + "RGS3", + "SMAD6", + "SYNJ2-IT1", + "TEX2", + "LINC00381", + "DST", + "MPZL1", + "BNC2", + "MIR4674", + "ARFGAP3", + "ADRA1D", + "SLIT3", + "COL27A1", + "SLC25A51", + "ATG16L2", + "ZMIZ1", + "MIR129-1", + "MTDH", + "MIR7848", + "MIR4430", + "HOXC13", + "PARD3", + "CAMK2B", + "ABHD5", + "RPLP1", + "MPEG1", + "MLK7-AS1", + "QKI", + "SMYD2", + "ZBTB17", + "C6orf222", + "NAV2-AS5", + "OVAAL", + "LINC01048", + "TNFSF8", + "DHRS12", + "MACF1", + "PLEKHA2", + "LINC00926", + "EHF", + "KCNJ6", + "MYO1B", + "MIR2278", + "WASL", + "SRP9", + "MIR4520-1", + "LINC01500", + "SLC30A2", + "PRPH2", + "RP11-221J22.1", + "SNORA31", + "SIPA1L1", + "SERAC1", + "EGFR", + "SLC38A2", + "FRMPD3", + "RP11-430H10.2", + "NUAK2", + "CELF1", + "CD36", + "CHSY1", + "ERGIC1", + "ZNF488", + "SEMA6B", + "CISTR", + "MFNG", + "HS1BP3-IT1", + "NBPF10", + "CYTOR", + "CPED1", + "MIR378G", + "BMPER", + "ARFGAP3", + "RAB8B", + "LINC00917", + "ABCA17P", + "FHL2", + "MIR620", + "SPRED2", + "RAI14", + "ZSCAN20", + "MMD", + "MIR548Q", + "SNORA3B", + "MIR7854", + "CEP350", + "RGAG4", + "HRCT1", + "CCL8", + "IL24", + "TBC1D4", + "IRS1", + "SLC8A1", + "PLS3-AS1", + "SIL1", + "LMO2", + "FGF3", + "RNVU1-19", + "LEPROT", + "MIR4774", + "CREB3L2", + "TSPAN2", + "KCNIP3", + "MAP3K14", + "RASSF8", + "MIR4419A", + "EFHC1", + "MIR297", + "KSR1", + "GSN", + "ATG4A", + "VASN", + "BBS12", + "NUFIP1", + "CAMK2D", + "PPL", + "H2AFY", + "CRTAM", + "RP11-47I22.2", + "BLID", + "HTR2B", + "GPR183", + "UQCC2", + "WDR36", + "ACTL8", + "COLEC12", + "CCDC85B", + "LINC00607", + "ITGA11", + "MBNL1-AS1", + "KRT85", + "LINC01478", + "LTBP2", + "NRG1", + "ALX4", + "ITGA11", + "MBOAT2", + "HELT", + "ZFP36L1", + "CISTR", + "FAM76B", + "CISTR", + "CDK5RAP2", + "TINAGL1", + "SNORD96B", + "RP11-1008C21.1", + "CMSS1", + "RP11-367G18.1", + "MRPL33", + "AC012462.2", + "ZMIZ1", + "TSPAN18", + "PANDAR", + "MIR6888", + "MIR548AH", + "RP11-162I7.1", + "MIR548AZ", + "ABHD5", + "GALNT5", + "ANKRD1", + "RP11-554D15.1", + "CALD1", + "CAV1", + "OLFML2B", + "RP11-363D14.1", + "RP11-443B7.1", + "MIR629", + "ASB1", + "GFRA1", + "CHAMP1", + "MIR2052", + "ADAMTSL4", + "NAIF1", + "MIR190A", + "THSD4", + "KCNIP3", + "TIAF1", + "MIR4265", + "RP11-245G13.2", + "PYGL", + "OXCT1-AS1", + "GLI2", + "CDC42EP3", + "PLAU", + "CLIC4", + "ERMN", + "USP12", + "PLXNB2", + "WARS", + "SNORA72", + "LATS2", + "PLXNA4", + "RP5-898J17.1", + "PSD3", + "CTSC", + "TBX20", + "GDNF", + "TOP1", + "DOK1", + "GPR12", + "LINC01082", + "MIR7973-1", + "MIR548AJ1", + "PPP1R3C", + "NPIPB11", + "MLXIP", + "RP11-890B15.3", + "RASA4B", + "MIR619", + "NFATC4", + "ABRA", + "CHML", + "SYNDIG1", + "PEAR1", + "LIFR-AS1", + "HIF1AN", + "FOXP1-AS1", + "SELPLG", + "EFCAB1", + "MIR4743", + "WBSCR28", + "NHS-AS1", + "ABCA1", + "SOGA1", + "ARID5B", + "BICC1", + "CRY2", + "RASD2", + "MINOS1", + "MIR4277", + "PLAC9", + "MYLK-AS2", + "TCF21", + "WIPF1", + "MYF6", + "AC007880.1", + "LINC00189", + "FST", + "TBC1D2", + "IL34", + "MIR101-2", + "NRG1-IT3", + "APOA1", + "CAV2", + "MIR4642", + "ATP8B1", + "ATG16L2", + "PDGFRL", + "MYO1B", + "TIMP3", + "CD82", + "MIR4319", + "ANKEF1", + "NPFFR1", + "ZHX2", + "SHQ1", + "NGF", + "AC002480.2", + "LINC01592", + "SRGAP2", + "RP11-148B18.1", + "LOXL1-AS1", + "KIF6", + "GGTA1P", + "ROS1", + "GNB1L", + "THY1", + "SIX6", + "ABL1", + "AC002480.3", + "SFTPB", + "KSR1", + "FMNL1", + "ARHGEF16", + "CREB5", + "NPC1", + "MIR760", + "ADAMTS20", + "DAPL1", + "SLC22A12", + "LINC01276", + "SLC22A5", + "EML4", + "TPST1", + "FLJ45079", + "ARRB1", + "CCDC34", + "NEK7", + "AQP9", + "BBOX1-AS1", + "ASAP1-IT2", + "WDR61", + "CTA-929C8.6", + "LFNG", + "FAM188B", + "CALD1", + "ZFAND3", + "GTF3A", + "CYP26B1", + "KIF3A", + "MIR1252", + "MIR4424", + "MYO1B", + "SVIL", + "REV1", + "YWHAH", + "TSPAN14", + "PNMA1", + "RP11-111I12.1", + "CTGF", + "PDGFRA", + "CRISPLD2", + "HAND2", + "RP11-1124B17.1", + "SLC4A4", + "TMEM75", + "NABP1", + "RUSC2", + "PDGFC", + "MIR4476", + "ARHGEF10L", + "PIK3R1", + "RARB", + "C14orf132", + "PDGFC", + "GFRA2", + "MIR4711", + "XYLT1", + "PLOD2", + "DIEXF", + "VEGFC", + "CTTN", + "ZNF618", + "MIR3937", + "MED15", + "NRP2", + "AC064834.3", + "FOXS1", + "AQP1", + "UTRN", + "MRPL33", + "CMTM4", + "AKR1C1", + "ALX4", + "TMEM241", + "SAMD12", + "FST", + "TFAP2A", + "MATN1-AS1", + "LINC00484", + "KIF12", + "PHLDA3", + "C8orf86", + "LINC01143", + "AFAP1-AS1", + "FNBP1L", + "SPAG6", + "RIPK2", + "KY", + "VSX1", + "SACM1L", + "CYTOR", + "STK17B", + "MAPRE1", + "TLDC1", + "PLXND1", + "GUSBP5", + "MIR4293", + "DCAF5", + "DOCK1", + "PRRX1", + "CRAT40", + "AL450226.2", + "CISH", + "RP11-137H2.4", + "BIN1", + "LINC00837", + "CCNYL1", + "UBE2H", + "LIFR-AS1", + "CCDC171", + "CACNG4", + "ZBTB40", + "ANXA2", + "MYRIP", + "EVC", + "DCPS", + "ZBTB38", + "RTN4", + "S100A11", + "PSMB7", + "SP6", + "TSPEAR", + "INSC", + "FLJ42102", + "MIR1205", + "ISLR", + ] geneids = [symbol2geneid[symbol] for symbol in symbols if symbol in symbol2geneid] print("{N} study Symbols".format(N=len(symbols))) print("{N} study GeneIDs".format(N=len(geneids))) return geneids -if __name__ == '__main__': +if __name__ == "__main__": test_i96() diff --git a/tests/test_ncbi_entrez_annotations.py b/tests/test_ncbi_entrez_annotations.py index d13863ae..214cdb46 100755 --- a/tests/test_ncbi_entrez_annotations.py +++ b/tests/test_ncbi_entrez_annotations.py @@ -9,11 +9,12 @@ import os import sys + from collections import defaultdict -from goatools.associations import dnld_ncbi_gene_file -from goatools.associations import read_ncbi_gene2go -from goatools.test_data.genes_NCBI_9606_ProteinCoding import GENEID2NT as GeneID2nt_hsa + +from goatools.associations import dnld_ncbi_gene_file, read_ncbi_gene2go from goatools.test_data.genes_NCBI_7227_ProteinCoding import GENEID2NT as GeneID2nt_dme +from goatools.test_data.genes_NCBI_9606_ProteinCoding import GENEID2NT as GeneID2nt_hsa REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") @@ -21,75 +22,89 @@ def test_ncbi_gene2go(log=sys.stdout): """Return GO associations to Entrez GeneIDs. Download if necessary. - Example report generated with Feb 22, 2013 download of: - NCBI Gene tables and associations in gene2go + Example report generated with Feb 22, 2013 download of: + NCBI Gene tables and associations in gene2go - 49672 items found in gene2go from NCBI's ftp server + 49672 items found in gene2go from NCBI's ftp server - taxid GOs GeneIDs Description - ----- ------ ------- ----------- - 10090 16,807 18,971 all DNA items - 7227 7,022 12,019 all DNA items - 7227 6,956 10,590 76% GO coverage of 13,919 protein-coding genes - 9606 16,299 18,680 all DNA items - 9606 16,296 18,253 87% GO coverage of 20,913 protein-coding genes + taxid GOs GeneIDs Description + ----- ------ ------- ----------- + 10090 16,807 18,971 all DNA items + 7227 7,022 12,019 all DNA items + 7227 6,956 10,590 76% GO coverage of 13,919 protein-coding genes + 9606 16,299 18,680 all DNA items + 9606 16,296 18,253 87% GO coverage of 20,913 protein-coding genes """ # Get associations for human(9606), mouse(10090), and fly(7227) # (optional) multi-level dictionary separate associations by taxid # Simple dictionary containing id2gos - taxid2asscs = _get_id2gos('gene2go', [9606, 10090, 7227], log) - taxid2pc = {9606:GeneID2nt_hsa, 7227:GeneID2nt_dme} + taxid2asscs = _get_id2gos("gene2go", [9606, 10090, 7227], log) + taxid2pc = {9606: GeneID2nt_hsa, 7227: GeneID2nt_dme} # Report findings log.write(" taxid GOs GeneIDs Description\n") log.write(" ----- ------ ------- -----------\n") assert taxid2asscs for taxid, asscs in taxid2asscs.items(): - num_gene2gos_all = len(asscs['ID2GOs']) - num_go2genes_all = len(asscs['GO2IDs']) - log.write(" {TAXID:>6} {N:>6,} {M:>7,} all DNA items\n".format( - TAXID=taxid, N=num_go2genes_all, M=num_gene2gos_all)) + num_gene2gos_all = len(asscs["ID2GOs"]) + num_go2genes_all = len(asscs["GO2IDs"]) + log.write( + " {TAXID:>6} {N:>6,} {M:>7,} all DNA items\n".format( + TAXID=taxid, N=num_go2genes_all, M=num_gene2gos_all + ) + ) # Basic check to ensure gene2go was downloaded and data was returned. assert num_gene2gos_all > 11000 assert num_go2genes_all > 6000 if taxid in taxid2pc.keys(): rpt_coverage(taxid, asscs, taxid2pc[taxid], log) + def _get_id2gos(file_assc, taxids, log): """Return associations.""" taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set))) fin = os.path.join(REPO, file_assc) - dnld_ncbi_gene_file(fin, loading_bar=None) + dnld_ncbi_gene_file(fin) id2gos = read_ncbi_gene2go(fin, taxids, taxid2asscs=taxid2asscs) - log.write(" {N} items found in gene2go from NCBI's ftp server\n".format(N=len(id2gos))) + log.write( + " {N} items found in gene2go from NCBI's ftp server\n".format(N=len(id2gos)) + ) return taxid2asscs + def rpt_coverage(taxid, asscs, pc2nt, log): """Calculate and report GO coverage on protein-coding genes. - Example report generated with Feb 22, 2013 download of: - NCBI Gene tables and associations in gene2go + Example report generated with Feb 22, 2013 download of: + NCBI Gene tables and associations in gene2go - taxid GOs GeneIDs Description - ----- ------ ------- ----------- - 7227 6,956 10,590 76% GO coverage of 13,919 protein-coding genes - 9606 16,296 18,253 87% GO coverage of 20,913 protein-coding genes + taxid GOs GeneIDs Description + ----- ------ ------- ----------- + 7227 6,956 10,590 76% GO coverage of 13,919 protein-coding genes + 9606 16,296 18,253 87% GO coverage of 20,913 protein-coding genes """ # List of all protein-coding genes have GO terms associated with them - geneid2gos = asscs['ID2GOs'] + geneid2gos = asscs["ID2GOs"] pcgene_w_gos = set(geneid2gos.keys()).intersection(set(pc2nt.keys())) num_pcgene_w_gos = len(pcgene_w_gos) num_pc_genes = len(pc2nt) - perc_cov = 100.0*num_pcgene_w_gos/num_pc_genes + perc_cov = 100.0 * num_pcgene_w_gos / num_pc_genes # Get list of GOs associated with protein-coding genes gos_pcgenes = set() for geneid in pcgene_w_gos: gos_pcgenes |= geneid2gos[geneid] txt = " {TAXID:>6} {N:>6,} {M:>7,} {COV:2.0f}% GO coverage of {TOT:,} protein-coding genes\n" - log.write(txt.format( - TAXID=taxid, N=len(gos_pcgenes), M=num_pcgene_w_gos, COV=perc_cov, TOT=num_pc_genes)) - - -if __name__ == '__main__': + log.write( + txt.format( + TAXID=taxid, + N=len(gos_pcgenes), + M=num_pcgene_w_gos, + COV=perc_cov, + TOT=num_pc_genes, + ) + ) + + +if __name__ == "__main__": test_ncbi_gene2go() diff --git a/tests/test_plot_relationship_part_of.py b/tests/test_plot_relationship_part_of.py index 3b909263..fa3124fd 100755 --- a/tests/test_plot_relationship_part_of.py +++ b/tests/test_plot_relationship_part_of.py @@ -1,46 +1,46 @@ #!/usr/bin/env python """Plot both the standard 'is_a' field and the optional 'part_of' relationship.""" -from __future__ import print_function - __copyright__ = "Copyright (C) 2016-2018, DV Klopfenstein, H Tang, All rights reserved." import os import sys -from goatools.test_data.wr_subobo import WrSubObo + from goatools.base import download_go_basic_obo -from goatools.obo_parser import GODag from goatools.gosubdag.gosubdag import GoSubDag from goatools.gosubdag.plot.gosubdag_plot import GoSubDagPlot - +from goatools.obo_parser import GODag +from goatools.test_data.wr_subobo import WrSubObo REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../") NAME2GOIDS = { - 'smell': set([ - "GO:0007608", # sensory perception of smell - "GO:0050911"]), # detection of chemical stimulus involved in sensory perception of smell - - 'secretory':set([ - "GO:0030141", # CC 20 L06 D07 AC P... p... secretory granule - "GO:0034774"]), # CC 8 L05 D06 ABD P... .... secretory granule lumen - + "smell": set( + ["GO:0007608", "GO:0050911"] # sensory perception of smell + ), # detection of chemical stimulus involved in sensory perception of smell + "secretory": set( + ["GO:0030141", "GO:0034774"] # CC 20 L06 D07 AC P... p... secretory granule + ), # CC 8 L05 D06 ABD P... .... secretory granule lumen # MISSING Edge source(GO:0007507); target(GO:0072359) # MISSING: GO:0061371 # BP 0 L06 D06 B P p determination of heart left/right asymmetry # ERROR: GO:0007507 - 'heartjogging':set([ - "GO:0003304", # BP 0 L06 D06 A P . myocardial epithelial involution in heart jogging - "GO:0003146"]) # BP 0 L05 D07 A P p heart jogging + "heartjogging": set( + [ + "GO:0003304", # BP 0 L06 D06 A P . myocardial epithelial involution in heart jogging + "GO:0003146", + ] + ), # BP 0 L05 D07 A P p heart jogging } + def test_plot_part_of(): """Plot both the standard 'is_a' field and the 'part_of' relationship.""" fout_log = "plot_relationship_part_of.log" obj = _Run() names = NAME2GOIDS # names = ["heartjogging"] - with open(fout_log, 'w') as prt: + with open(fout_log, "w", encoding="utf-8") as prt: for name in names: goids = NAME2GOIDS[name] obj.plot_all(goids, name, prt) @@ -55,7 +55,7 @@ class _Run(object): def __init__(self): _fin_obo = os.path.join(REPO, "go-basic.obo") - self.go2obj = GODag(_fin_obo, optional_attrs=['relationship']) + self.go2obj = GODag(_fin_obo, optional_attrs=["relationship"]) def plot_all(self, goids, name, prt=sys.stdout): """Create plots with various numbers of relationships.""" @@ -64,17 +64,21 @@ def plot_all(self, goids, name, prt=sys.stdout): gosubdag_orig.prt_goids(gosubdag_orig.go2obj, prt=prt) prt.write("{N} GO IDS".format(N=len(gosubdag_orig.go2obj))) gopltdag = GoSubDagPlot(gosubdag_orig, mark_alt_id=True) - gopltdag.plt_dag(os.path.join(REPO, "a_relationship_{NAME}_r0.png".format(NAME=name))) + gopltdag.plt_dag( + os.path.join(REPO, "a_relationship_{NAME}_r0.png".format(NAME=name)) + ) # goids.update(['GO:0007507'], ['GO:0072359']) prt.write("\nCreate GoSubDag while loading only the 'part_of' relationship") - gosubdag = GoSubDag(goids, self.go2obj, relationships=['part_of'], prt=prt) + gosubdag = GoSubDag(goids, self.go2obj, relationships=["part_of"], prt=prt) gosubdag.prt_goids(gosubdag.go2obj, prt=prt) prt.write("{N} GO IDS".format(N=len(gosubdag.go2obj))) gopltdag = GoSubDagPlot(gosubdag, mark_alt_id=True) prt.write("GO SOURCES:") gosubdag.prt_goids(gosubdag.go_sources, prt=prt) - gopltdag.plt_dag(os.path.join(REPO, "a_relationship_{NAME}_partof.png".format(NAME=name))) + gopltdag.plt_dag( + os.path.join(REPO, "a_relationship_{NAME}_partof.png".format(NAME=name)) + ) prt.write("\nCreate GoSubDag while loading all relationships") gosubdag = GoSubDag(goids, self.go2obj, relationships=True, prt=prt) @@ -83,12 +87,14 @@ def plot_all(self, goids, name, prt=sys.stdout): prt.write("2 GO SOURCES:") gosubdag.prt_goids(gosubdag.go_sources, prt=prt) goids_new = set(gosubdag.go2obj).difference(set(gosubdag_orig.go2obj)) - go2color = {go:'#d5ffff' for go in goids_new} + go2color = {go: "#d5ffff" for go in goids_new} prt.write("{N} NEW GO IDS:".format(N=len(goids_new))) gosubdag.prt_goids(goids_new, prt=prt) prt.write("{N} GO IDS".format(N=len(gosubdag.go2obj))) gopltdag = GoSubDagPlot(gosubdag, mark_alt_id=True, go2color=go2color) - gopltdag.plt_dag(os.path.join(REPO, "a_relationship_{NAME}_r1.png".format(NAME=name))) + gopltdag.plt_dag( + os.path.join(REPO, "a_relationship_{NAME}_r1.png".format(NAME=name)) + ) def wr_subobo(self): """Write a subset obo to be used for testing.""" @@ -96,8 +102,8 @@ def wr_subobo(self): for name, goids in NAME2GOIDS.items(): fout_obo = self.get_obo_name(name) fin_obo = os.path.join(REPO, "go-basic.obo") - download_go_basic_obo(fin_obo, prt=sys.stdout, loading_bar=None) - obj = WrSubObo(fin_obo, optional_attrs=['relationship']) + download_go_basic_obo(fin_obo, prt=sys.stdout) + obj = WrSubObo(fin_obo, optional_attrs=["relationship"]) # obj = WrSubObo(fin_obo) obj.wrobo(fout_obo, goids) @@ -106,8 +112,8 @@ def get_obo_name(self, name): return os.path.join(REPO, self.obopat.format(NAME=name)) -if __name__ == '__main__': - #_Run().wr_subobo() +if __name__ == "__main__": + # _Run().wr_subobo() test_plot_part_of() # Copyright (C) 2016-2018, DV Klopfenstein, H Tang, All rights reserved. diff --git a/tests/test_propagate_counts_w_relationships.py b/tests/test_propagate_counts_w_relationships.py index 8ada5878..33d92225 100755 --- a/tests/test_propagate_counts_w_relationships.py +++ b/tests/test_propagate_counts_w_relationships.py @@ -1,35 +1,45 @@ #!/usr/bin/env python """Test propagate_counts up relationships as well as parent-child links.""" -import sys import os -# from itertools import combinations -# import collections as cx +import sys -from goatools.go_enrichment import GOEnrichmentStudy +from goatools.associations import get_assoc_ncbi_taxids from goatools.base import get_godag +from goatools.go_enrichment import GOEnrichmentStudy from goatools.test_data.genes_NCBI_10090_ProteinCoding import GENEID2NT as GeneID2nt_mus from goatools.test_data.nature3102_goea import get_geneid2symbol -from goatools.associations import get_assoc_ncbi_taxids REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") + def test_pc_w_rels(prt=sys.stdout): """Test P-value calculations.""" file_obo = os.path.join(REPO, "go-basic.obo") - godag_r0 = get_godag(file_obo, prt, loading_bar=None) - godag_r1 = get_godag(file_obo, prt, loading_bar=None, optional_attrs=['relationship']) + godag_r0 = get_godag(file_obo, prt) + godag_r1 = get_godag(file_obo, prt, optional_attrs=["relationship"]) # pylint: disable=line-too-long # Check that relationships are used in propgate counts - results_r0 = _get_results(godag_r1, propagate_counts=True, relationships=False, prt=prt) - results_r1 = _get_results(godag_r1, propagate_counts=True, relationships=True, prt=prt) - results_rr = _get_results(godag_r1, propagate_counts=True, relationships={'regulates', 'negatively_regulates', 'positively_regulates'}, prt=prt) - results_rp = _get_results(godag_r1, propagate_counts=True, relationships={'part_of'}, prt=prt) - prt.write('{N} results with r0\n'.format(N=len(results_r0))) - prt.write('{N} results with r1\n'.format(N=len(results_r1))) - prt.write('{N} results with all regulates\n'.format(N=len(results_rr))) - prt.write('{N} results with part_of\n'.format(N=len(results_rp))) + results_r0 = _get_results( + godag_r1, propagate_counts=True, relationships=False, prt=prt + ) + results_r1 = _get_results( + godag_r1, propagate_counts=True, relationships=True, prt=prt + ) + results_rr = _get_results( + godag_r1, + propagate_counts=True, + relationships={"regulates", "negatively_regulates", "positively_regulates"}, + prt=prt, + ) + results_rp = _get_results( + godag_r1, propagate_counts=True, relationships={"part_of"}, prt=prt + ) + prt.write("{N} results with r0\n".format(N=len(results_r0))) + prt.write("{N} results with r1\n".format(N=len(results_r1))) + prt.write("{N} results with all regulates\n".format(N=len(results_rr))) + prt.write("{N} results with part_of\n".format(N=len(results_rp))) assert len(results_r1) > len(results_rr) assert len(results_rr) > len(results_rp) assert len(results_rp) > len(results_r0) @@ -37,18 +47,13 @@ def test_pc_w_rels(prt=sys.stdout): # TBD: Add warning message that relationships are ignored # Check that relationships are ignored in propagate counts if they were not loaded _get_results(godag_r0, propagate_counts=True, relationships=False, prt=prt) - ## results_r0b = _get_results(godag_r0, propagate_counts=True, relationships=True, prt=prt) - ## results_r0c = _get_results(godag_r0, propagate_counts=True, relationships={'regulates', 'negatively_regulates', 'positively_regulates'}, prt=prt) - ## results_r0d = _get_results(godag_r0, propagate_counts=True, relationships={'part_of'}, prt=prt) - ## assert len(results_r0a) == len(results_r0b) - ## assert len(results_r0b) == len(results_r0c) - ## assert len(results_r0c) == len(results_r0d) + def _get_results(godag, propagate_counts, relationships, prt=sys.stdout): """Run a GOEA. Return results""" - taxid = 10090 # Mouse study + taxid = 10090 # Mouse study geneids_pop = set(GeneID2nt_mus.keys()) - assoc_geneid2gos = get_assoc_ncbi_taxids([taxid], loading_bar=None) + assoc_geneid2gos = get_assoc_ncbi_taxids([taxid]) geneids_study = get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx") goeaobj = GOEnrichmentStudy( geneids_pop, @@ -57,9 +62,10 @@ def _get_results(godag, propagate_counts, relationships, prt=sys.stdout): propagate_counts=propagate_counts, relationships=relationships, alpha=0.05, - methods=['fdr_bh']) + methods=["fdr_bh"], + ) return goeaobj.run_study(geneids_study, prt=prt) -if __name__ == '__main__': +if __name__ == "__main__": test_pc_w_rels() diff --git a/tests/test_pvalcalc.py b/tests/test_pvalcalc.py index dd472341..be08d0c7 100644 --- a/tests/test_pvalcalc.py +++ b/tests/test_pvalcalc.py @@ -1,37 +1,53 @@ #!/usr/bin/env python """Test that two different but equivalent fishers functions give the similar results.""" -import sys +import collections as cx import os +import sys + from itertools import combinations -import collections as cx -from goatools.go_enrichment import GOEnrichmentStudy +from goatools.associations import get_assoc_ncbi_taxids from goatools.base import get_godag +from goatools.go_enrichment import GOEnrichmentStudy from goatools.test_data.genes_NCBI_10090_ProteinCoding import GENEID2NT as GeneID2nt_mus from goatools.test_data.nature3102_goea import get_geneid2symbol -from goatools.associations import get_assoc_ncbi_taxids + def test_pvalcalc(prt=sys.stdout): """Test P-value calculations.""" - pvalfnc_names = ['fisher_scipy_stats'] + pvalfnc_names = ["fisher_scipy_stats"] fisher2pvals = _get_pvals(pvalfnc_names) _chk_pvals(fisher2pvals, prt) + def _chk_pvals(fisher2pvals, prt): fmterr = "**ERROR: {GO} {N1}({P1:4.2f}) {N2}({P2:4.2f}) {N1}({p1}) {N2}({p2})\n" for fish1, fish2 in combinations(fisher2pvals.keys(), 2): ctr = cx.Counter() - pvals1 = cx.OrderedDict(sorted([(r.GO, r.p_uncorrected) for r in fisher2pvals[fish1]])) - pvals2 = cx.OrderedDict(sorted([(r.GO, r.p_uncorrected) for r in fisher2pvals[fish2]])) + pvals1 = cx.OrderedDict( + sorted([(r.GO, r.p_uncorrected) for r in fisher2pvals[fish1]]) + ) + pvals2 = cx.OrderedDict( + sorted([(r.GO, r.p_uncorrected) for r in fisher2pvals[fish2]]) + ) assert len(pvals1) == len(pvals2) for go_id, pval1 in pvals1.items(): pval2 = pvals2[go_id] ctr[pval1 == pval2] += 1 # Are values from 'fisher' and scipy stats 'fisher_exact' equivalent? if abs(pval1 - pval2) > 0.00001: - prt.write(fmterr.format( - GO=go_id, N1=fish1, N2=fish2, P1=pval1, P2=pval2, p1=pval1, p2=pval2)) + prt.write( + fmterr.format( + GO=go_id, + N1=fish1, + N2=fish2, + P1=pval1, + P2=pval2, + p1=pval1, + p2=pval2, + ) + ) # An exact match 10,984 times. A close match 6,683 times: # 10,984 1: fisher == fisher_scipy_stats # 6,683 0: fisher == fisher_scipy_stats @@ -39,13 +55,14 @@ def _chk_pvals(fisher2pvals, prt): for val, cnt in ctr.most_common(): prt.write(pat.format(N=cnt, RES=val, N1=fish1, N2=fish2)) + def _get_pvals(pvalfnc_names, prt=sys.stdout): fisher2pvals = {} - taxid = 10090 # Mouse study + taxid = 10090 # Mouse study file_obo = os.path.join(os.getcwd(), "go-basic.obo") - obo_dag = get_godag(file_obo, prt, loading_bar=None) + obo_dag = get_godag(file_obo, prt) geneids_pop = set(GeneID2nt_mus.keys()) - assoc_geneid2gos = get_assoc_ncbi_taxids([taxid], loading_bar=None) + assoc_geneid2gos = get_assoc_ncbi_taxids([taxid]) geneids_study = get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx") for fisher in pvalfnc_names: goeaobj = GOEnrichmentStudy( @@ -55,10 +72,11 @@ def _get_pvals(pvalfnc_names, prt=sys.stdout): propagate_counts=False, alpha=0.05, methods=None, - pvalcalc=fisher) + pvalcalc=fisher, + ) fisher2pvals[fisher] = goeaobj.get_pval_uncorr(geneids_study, prt) return fisher2pvals -if __name__ == '__main__': +if __name__ == "__main__": test_pvalcalc() diff --git a/tests/test_read_gaf_allow_nd.py b/tests/test_read_gaf_allow_nd.py index 3248c4b0..383e0100 100755 --- a/tests/test_read_gaf_allow_nd.py +++ b/tests/test_read_gaf_allow_nd.py @@ -5,9 +5,11 @@ __author__ = "DV Klopfenstein" import sys + from goatools.associations import read_gaf from goatools.base import dnld_gaf + def test_gaf_read(log=sys.stdout): """Return GO associations from a GAF file. Download if necessary.""" # On 2017/04/10, there were 3 GO IDs with ND Evidence Codes: @@ -18,7 +20,7 @@ def test_gaf_read(log=sys.stdout): # 639 GO:0008150 ND # Example species_ids: goa_human mgi fb - fin_gaf = dnld_gaf('goa_human', loading_bar=None) + fin_gaf = dnld_gaf("goa_human") # Example 1: Read GAF go2ids = read_gaf(fin_gaf, go2geneids=True) @@ -27,8 +29,11 @@ def test_gaf_read(log=sys.stdout): # Example 2: Read GAF using defaults (No NOT Qualifiers and no ND Evidence Codes) go2ids = read_gaf(fin_gaf, go2geneids=True, keep_ND=False, keep_NOT=False) - log.write("Read {N} GOs; keepif is default in goatools.associations.read_gaf\n\n".format( - N=len(go2ids))) + log.write( + "Read {N} GOs; keepif is default in goatools.associations.read_gaf\n\n".format( + N=len(go2ids) + ) + ) # Example 3: Read GAF allowing GOs with ND Evidence Codes go2ids = read_gaf(fin_gaf, go2geneids=True, keep_ND=True) @@ -36,10 +41,14 @@ def test_gaf_read(log=sys.stdout): # Example 4: Read GAF allowing all GOs, even those with NOT Qualifiers or ND Evidence Codes go2ids = read_gaf(fin_gaf, go2geneids=True, keep_ND=True, keep_NOT=True) - log.write("Read {N} GOs; Allow ND Evidence codes and NOT Qualifiers\n\n".format(N=len(go2ids))) + log.write( + "Read {N} GOs; Allow ND Evidence codes and NOT Qualifiers\n\n".format( + N=len(go2ids) + ) + ) -if __name__ == '__main__': +if __name__ == "__main__": test_gaf_read() # Copyright (C) 2016-2019, DV Klopfenstein, H Tang. All rights reserved. diff --git a/tests/test_relationships_usr.py b/tests/test_relationships_usr.py index ec4a6203..4865ec96 100755 --- a/tests/test_relationships_usr.py +++ b/tests/test_relationships_usr.py @@ -2,6 +2,7 @@ """Test a user providing both valid and unexpected relationships""" import os + from goatools.base import get_godag from goatools.godag.relationship_combos import RelationshipCombos @@ -12,37 +13,147 @@ def test_relationships_usr(): """Test a user providing unexpected relationships""" # Set up test fin_godag = os.path.join(REPO, "go-basic.obo") - godag_r1 = get_godag(fin_godag, optional_attrs=['relationship'], loading_bar=None) + godag_r1 = get_godag(fin_godag, optional_attrs=["relationship"]) obj_r1 = RelationshipCombos(godag_r1) assert obj_r1.get_set(True) == obj_r1.dag_rels assert obj_r1.get_set(False) == set() - assert obj_r1.get_set(set(['part_of',])) == {'part_of',} - assert obj_r1.get_set({'part_of',}) == {'part_of',} - assert obj_r1.get_set(['part_of',]) == {'part_of',} - assert obj_r1.get_set('part_of') == {'part_of',} + assert obj_r1.get_set( + set( + [ + "part_of", + ] + ) + ) == { + "part_of", + } + assert obj_r1.get_set( + { + "part_of", + } + ) == { + "part_of", + } + assert obj_r1.get_set( + [ + "part_of", + ] + ) == { + "part_of", + } + assert obj_r1.get_set("part_of") == { + "part_of", + } # Run tests: bad user relationships - assert obj_r1.get_set('BAD_REL') == set() - assert obj_r1.get_set(['BAD_REL',]) == set() - assert obj_r1.get_set({'BAD_REL',}) == set() - assert obj_r1.get_set(set(['BAD_REL',])) == set() - assert obj_r1.get_set(set(['part_of', 'BAD_REL',])) == {'part_of',} + assert obj_r1.get_set("BAD_REL") == set() + assert ( + obj_r1.get_set( + [ + "BAD_REL", + ] + ) + == set() + ) + assert ( + obj_r1.get_set( + { + "BAD_REL", + } + ) + == set() + ) + assert ( + obj_r1.get_set( + set( + [ + "BAD_REL", + ] + ) + ) + == set() + ) + assert obj_r1.get_set( + set( + [ + "part_of", + "BAD_REL", + ] + ) + ) == { + "part_of", + } # Run tests: expected relationships - godag_r0 = get_godag(fin_godag, loading_bar=None) + godag_r0 = get_godag(fin_godag) obj_r0 = RelationshipCombos(godag_r0) assert obj_r0.get_set(True) == set() assert obj_r0.get_set(False) == set() - assert obj_r0.get_set(set(['part_of',])) == set() - assert obj_r0.get_set({'part_of',}) == set() - assert obj_r0.get_set(['part_of',]) == set() - assert obj_r0.get_set('part_of') == set() + assert ( + obj_r0.get_set( + set( + [ + "part_of", + ] + ) + ) + == set() + ) + assert ( + obj_r0.get_set( + { + "part_of", + } + ) + == set() + ) + assert ( + obj_r0.get_set( + [ + "part_of", + ] + ) + == set() + ) + assert obj_r0.get_set("part_of") == set() # Run tests: bad user relationships - assert obj_r0.get_set('BAD_REL') == set() - assert obj_r0.get_set(['BAD_REL',]) == set() - assert obj_r0.get_set({'BAD_REL',}) == set() - assert obj_r0.get_set(set(['BAD_REL',])) == set() - assert obj_r0.get_set(set(['part_of', 'BAD_REL',])) == set() + assert obj_r0.get_set("BAD_REL") == set() + assert ( + obj_r0.get_set( + [ + "BAD_REL", + ] + ) + == set() + ) + assert ( + obj_r0.get_set( + { + "BAD_REL", + } + ) + == set() + ) + assert ( + obj_r0.get_set( + set( + [ + "BAD_REL", + ] + ) + ) + == set() + ) + assert ( + obj_r0.get_set( + set( + [ + "part_of", + "BAD_REL", + ] + ) + ) + == set() + ) -if __name__ == '__main__': +if __name__ == "__main__": test_relationships_usr() diff --git a/tests/test_rpt_gene2go_evidencecodes.py b/tests/test_rpt_gene2go_evidencecodes.py index 2a339844..03d024d8 100755 --- a/tests/test_rpt_gene2go_evidencecodes.py +++ b/tests/test_rpt_gene2go_evidencecodes.py @@ -1,8 +1,6 @@ #!/usr/bin/env python3 """Tests that all evidence codes seen in NCBI's gene2go have description.""" -from __future__ import print_function - __copyright__ = "Copyright (C) 2016-2019, DV Klopfenstein, H Tang. All rights reserved." __author__ = "DV Klopfenstein" @@ -10,35 +8,38 @@ from goatools.associations import dnld_ncbi_gene_file from goatools.evidence_codes import EvidenceCodes + REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../") + def test_ev(): """Return GO associations from a GAF file. Download if necessary.""" - evs = _get_evidencecodes('gene2go') + evs = _get_evidencecodes("gene2go") obj = EvidenceCodes() missing = evs.difference(obj.code2nt) - assert not missing, 'MISSING({EV})'.format(EV=missing) + assert not missing, "MISSING({EV})".format(EV=missing) + def _get_evidencecodes(fin_gene2go): """Get all evidence codes and qualifiers.""" evs = set() - fin_gene2go = os.path.join(REPO, 'gene2go') - dnld_ncbi_gene_file(fin_gene2go, force_dnld=False, loading_bar=False) - with open(fin_gene2go) as ifstrm: + fin_gene2go = os.path.join(REPO, "gene2go") + dnld_ncbi_gene_file(fin_gene2go, force_dnld=False) + with open(fin_gene2go, encoding="utf-8") as ifstrm: for line in ifstrm: - if line[0] != '#': # Line contains data. Not a comment - line = line.rstrip() # chomp - flds = line.split('\t') + if line[0] != "#": # Line contains data. Not a comment + line = line.rstrip() # chomp + flds = line.split("\t") if len(flds) >= 5: # taxid_curr, geneid, go_id, evidence, qualifier = flds[:5] evidence = flds[3] assert len(evidence) >= 2, flds evs.add(evidence) - print('{N} evidence codes in {FIN}'.format(N=len(evs), FIN=fin_gene2go)) + print("{N} evidence codes in {FIN}".format(N=len(evs), FIN=fin_gene2go)) return evs -if __name__ == '__main__': +if __name__ == "__main__": test_ev() # Copyright (C) 2016-2019, DV Klopfenstein, H Tang. All rights reserved. diff --git a/tests/test_semantic_similarity.py b/tests/test_semantic_similarity.py index 93561e38..dd550b2e 100755 --- a/tests/test_semantic_similarity.py +++ b/tests/test_semantic_similarity.py @@ -1,44 +1,47 @@ #!/usr/bin/env python -"""Code as found in notebooks/semantic_similarity.ipynb.""" +"""Code as found in notebooks/semantic_similarity.ipynb. -from __future__ import print_function +Computing basic semantic similarities between GO terms +Adapted from book chapter written by _Alex Warwick Vesztrocy and Christophe Dessimoz_ +How to compute semantic similarity between GO terms. +""" -# Computing basic semantic similarities between GO terms +import os -# Adapted from book chapter written by _Alex Warwick Vesztrocy and Christophe Dessimoz_ +from goatools.associations import dnld_assc +from goatools.base import get_godag +from goatools.godag.consts import NS2GO +from goatools.semantic import ( + deepest_common_ancestor, + get_info_content, + lin_sim, + resnik_sim, + semantic_similarity, + TermCounts, +) -# How to compute semantic similarity between GO terms. +REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") # First we need to write a function that calculates the minimum number # of branches connecting two GO terms. -import os -from goatools.base import get_godag -from goatools.associations import dnld_assc -from goatools.semantic import semantic_similarity -from goatools.semantic import TermCounts -from goatools.semantic import get_info_content -from goatools.semantic import deepest_common_ancestor -from goatools.semantic import resnik_sim -from goatools.semantic import lin_sim -from goatools.godag.consts import NS2GO - -REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") def test_semantic_similarity(): """Computing basic semantic similarities between GO terms.""" - godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None) + godag = get_godag(os.path.join(REPO, "go-basic.obo")) # Get all the annotations from arabidopsis. - associations = dnld_assc(os.path.join(REPO, 'tair.gaf'), godag) - + associations = dnld_assc(os.path.join(REPO, "tair.gaf"), godag) # Now we can calculate the semantic distance and semantic similarity, as so: # "The semantic similarity between terms GO:0048364 and GO:0044707 is 0.25. - go_id3 = 'GO:0048364' # BP level-03 depth-04 root development - go_id4 = 'GO:0044707' # BP level-02 depth-02 single-multicellular organism process + go_id3 = "GO:0048364" # BP level-03 depth-04 root development + go_id4 = "GO:0044707" # BP level-02 depth-02 single-multicellular organism process sim = semantic_similarity(go_id3, go_id4, godag) - print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format( - GO1=go_id3, GO2=go_id4, VAL=sim)) + print( + "\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.".format( + GO1=go_id3, GO2=go_id4, VAL=sim + ) + ) print(godag[go_id3]) print(godag[go_id4]) @@ -51,7 +54,7 @@ def test_semantic_similarity(): # Calculate the information content go_id = "GO:0048364" infocontent = get_info_content(go_id, termcounts) - print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=infocontent)) + print("\nInformation content ({GO}) = {INFO}\n".format(GO=go_id, INFO=infocontent)) assert infocontent, "FATAL INFORMATION CONTENT" # Resnik's similarity measure is defined as the information content of the most @@ -60,25 +63,30 @@ def test_semantic_similarity(): # Resnik similarity score (GO:0048364, GO:0044707) = 0.0 because DCA is BP top sim_r = resnik_sim(go_id3, go_id4, godag, termcounts) dca = deepest_common_ancestor([go_id3, go_id4], godag) - assert dca == NS2GO['BP'] + assert dca == NS2GO["BP"] assert sim_r == get_info_content(dca, termcounts) assert sim_r == 0.0 - print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format( - GO1=go_id3, GO2=go_id4, VAL=sim_r)) + print( + "Resnik similarity score ({GO1}, {GO2}) = {VAL}".format( + GO1=go_id3, GO2=go_id4, VAL=sim_r + ) + ) # Lin similarity score (GO:0048364, GO:0044707) = 0.0 because they are similar through BP top sim_l = lin_sim(go_id3, go_id4, godag, termcounts) - print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(GO1=go_id3, GO2=go_id4, VAL=sim_l)) + print( + "Lin similarity score ({GO1}, {GO2}) = {VAL}".format( + GO1=go_id3, GO2=go_id4, VAL=sim_l + ) + ) assert sim_l == 0.0, "FATAL LIN SCORE" - # - go_top_cc = NS2GO['CC'] + go_top_cc = NS2GO["CC"] sim_r = resnik_sim(go_top_cc, go_top_cc, godag, termcounts) assert sim_r == 0.0 sim_l = lin_sim(go_top_cc, go_top_cc, godag, termcounts) assert sim_l == 1.0 - -if __name__ == '__main__': +if __name__ == "__main__": test_semantic_similarity() diff --git a/tests/test_semantic_similarity_best4lex.py b/tests/test_semantic_similarity_best4lex.py index 84d703e4..63e09681 100755 --- a/tests/test_semantic_similarity_best4lex.py +++ b/tests/test_semantic_similarity_best4lex.py @@ -1,16 +1,12 @@ #!/usr/bin/env python """Computing basic semantic similarities between GO terms.""" -from __future__ import print_function - import os import itertools -from goatools.base import get_godag + from goatools.associations import dnld_assc -from goatools.semantic import TermCounts -from goatools.semantic import get_info_content -from goatools.semantic import resnik_sim -from goatools.semantic import lin_sim +from goatools.base import get_godag +from goatools.semantic import TermCounts, get_info_content, lin_sim, resnik_sim REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") @@ -27,17 +23,16 @@ def test_semantic_similarity(): ] # Get all the annotations from arabidopsis. associations = [ - ('human', 'goa_human.gaf'), - ('yeast', 'sgd.gaf'), + ("human", "goa_human.gaf"), + ("yeast", "sgd.gaf"), ] - - godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None) + godag = get_godag(os.path.join(REPO, "go-basic.obo")) for species, assc_name in associations: # Limit test numbers for speed print() # Get all the annotations for the current species fin_assc = os.path.join(REPO, assc_name) - assc_gene2gos = dnld_assc(fin_assc, godag, namespace='MF', prt=None) + assc_gene2gos = dnld_assc(fin_assc, godag, namespace="MF", prt=None) # Calculate the information content of the single term, GO:0048364 termcounts = TermCounts(godag, assc_gene2gos) @@ -45,8 +40,15 @@ def test_semantic_similarity(): for goid in sorted(goids): infocontent = get_info_content(goid, termcounts) term = godag[goid] - print('{SPECIES} Information content {INFO:8.6f} {NS} {GO} {NAME}'.format( - SPECIES=species, GO=goid, INFO=infocontent, NS=term.namespace, NAME=term.name)) + print( + "{SPECIES} Information content {INFO:8.6f} {NS} {GO} {NAME}".format( + SPECIES=species, + GO=goid, + INFO=infocontent, + NS=term.namespace, + NAME=term.name, + ) + ) # Print semantic similarities between each pair of GO terms print("GO #1 GO #2 Resnik Lin") @@ -57,11 +59,14 @@ def test_semantic_similarity(): sim_r = resnik_sim(go_a, go_b, godag, termcounts) # Lin similarity score (GO:0048364, GO:0044707) = -0.607721957763 sim_l = lin_sim(go_a, go_b, godag, termcounts) - print('{GO1} {GO2} {RESNIK:6.4f} {LIN:7.4f}'.format( - GO1=go_a, GO2=go_b, RESNIK=sim_r, LIN=sim_l)) + print( + "{GO1} {GO2} {RESNIK:6.4f} {LIN:7.4f}".format( + GO1=go_a, GO2=go_b, RESNIK=sim_r, LIN=sim_l + ) + ) assert sim_r >= 0.0, "FATAL RESNIK SCORE: {S}".format(S=sim_r) assert sim_l >= 0.0, "FATAL LIN SCORE: {S}".format(S=sim_l) -if __name__ == '__main__': +if __name__ == "__main__": test_semantic_similarity() diff --git a/tests/test_sorter.py b/tests/test_sorter.py index 726ebae7..25eeecb7 100755 --- a/tests/test_sorter.py +++ b/tests/test_sorter.py @@ -1,16 +1,14 @@ #!/usr/bin/env python """Test method, sorter, in class, CountRelatives.""" -from __future__ import print_function - import os import sys from goatools.base import get_godag from goatools.gosubdag.gosubdag import GoSubDag from goatools.grouper.grprdflts import GrouperDflts -from goatools.grouper.hdrgos import HdrgosSections from goatools.grouper.grprobj import Grouper +from goatools.grouper.hdrgos import HdrgosSections from goatools.grouper.sorter import Sorter REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") @@ -26,42 +24,52 @@ def test_dflthdrs(prt=sys.stdout, do_plt=False): data = get_data0() # This may need to be updated if default hdrgos are changed - exp_hdrs0 = set([ - "GO:0050789", # BP 11,095 L01 D01 B regulation of biological process - "GO:0044848", # BP 62 L01 D01 S biological phase - "GO:0050794", # BP 8,031 L02 D02 AB regulation of cellular process - "GO:0019222", # BP 3,227 L02 D02 AB regulation of metabolic process - "GO:0048583", # BP 2,377 L02 D02 AB regulation of response to stimulus - "GO:0050793", # BP 1,789 L02 D02 AB regulation of developmental process - "GO:0023051", # BP 1,364 L02 D02 AB regulation of signaling - "GO:0002682", # BP 1,183 L02 D02 AB regulation of immune system process - "GO:0007155", # BP 165 L02 D02 P cell adhesion - "GO:0080134", # BP 940 L03 D03 AB regulation of response to stress - "GO:0007165", # BP 717 L03 D03 AB signal transduction - "GO:0050877", # BP 96 L03 D03 K neurological system process - "GO:0007267"]) # BP 99 L03 D04 CDR cell-cell signaling - + exp_hdrs0 = set( + [ + "GO:0050789", # BP 11,095 L01 D01 B regulation of biological process + "GO:0044848", # BP 62 L01 D01 S biological phase + "GO:0050794", # BP 8,031 L02 D02 AB regulation of cellular process + "GO:0019222", # BP 3,227 L02 D02 AB regulation of metabolic process + "GO:0048583", # BP 2,377 L02 D02 AB regulation of response to stimulus + "GO:0050793", # BP 1,789 L02 D02 AB regulation of developmental process + "GO:0023051", # BP 1,364 L02 D02 AB regulation of signaling + "GO:0002682", # BP 1,183 L02 D02 AB regulation of immune system process + "GO:0007155", # BP 165 L02 D02 P cell adhesion + "GO:0080134", # BP 940 L03 D03 AB regulation of response to stress + "GO:0007165", # BP 717 L03 D03 AB signal transduction + "GO:0050877", # BP 96 L03 D03 K neurological system process + "GO:0007267", + ] + ) # BP 99 L03 D04 CDR cell-cell signaling # Since no "GO group headers" (None) were provided, depth-01 GOs are used for grouping. - hdrobj0 = HdrgosSections(grprdflt.gosubdag, grprdflt.hdrgos_dflt, sections=None, hdrgos=None) + hdrobj0 = HdrgosSections( + grprdflt.gosubdag, grprdflt.hdrgos_dflt, sections=None, hdrgos=None + ) grprobj0 = Grouper("dflt", data, hdrobj0, grprdflt.gosubdag, go2nt=None) _, _, nts0_go, act_hdrs0 = run(grprobj0, hdrobj0, exp_hdrs0) # Grouping GOs are provided, these are added to the depth-01 defaults GOs are used for grouping. - hdrgos = set([ - "GO:0099536", # BP 40 L04 D05 CDR regulation of response to stimulus - "GO:0051239", # BP 2,532 L02 D02 AB regulation of multicellular organismal process - "GO:0048519", # BP 3,293 L02 D02 AB negative regulation of biological process - "GO:0048518"])# BP 3,353 L02 D02 AB positive regulation of biological process + hdrgos = set( + [ + "GO:0099536", # BP 40 L04 D05 CDR regulation of response to stimulus + "GO:0051239", # BP 2,532 L02 D02 AB regulation of multicellular organismal process + "GO:0048519", # BP 3,293 L02 D02 AB negative regulation of biological process + "GO:0048518", + ] + ) # BP 3,353 L02 D02 AB positive regulation of biological process exp_hdrs1 = exp_hdrs0.union(hdrgos) name = "usrhdrs4" - hdrobj1 = HdrgosSections(grprdflt.gosubdag, grprdflt.hdrgos_dflt, sections=None, hdrgos=hdrgos) + hdrobj1 = HdrgosSections( + grprdflt.gosubdag, grprdflt.hdrgos_dflt, sections=None, hdrgos=hdrgos + ) grprobj1 = Grouper(name, data, hdrobj1, grprdflt.gosubdag, go2nt=None) sortobj1, _, nts1_go, act_hdrs1 = run(grprobj1, hdrobj1, exp_hdrs1) if do_plt: from goatools.grouper.grprplt import GrouperPlot + prt.write("\nPLOT DAG\n") GrouperPlot(grprobj1).plot_grouped_gos() @@ -78,7 +86,9 @@ def test_dflthdrs(prt=sys.stdout, do_plt=False): assert act_hdrs1.difference(act_hdrs0) == set(hdrgos) hdrgo_prt = False - sys.stdout.write("\n{NAME}: PRINT GOs hdrgo_prt({H}):\n".format(H=hdrgo_prt, NAME=name)) + sys.stdout.write( + "\n{NAME}: PRINT GOs hdrgo_prt({H}):\n".format(H=hdrgo_prt, NAME=name) + ) sortobj1.prt_gos(hdrgo_prt=hdrgo_prt) nts2 = sortobj1.get_nts_flat(hdrgo_prt) nts2_go = set([nt.GO for nt in nts2]) @@ -100,17 +110,20 @@ def run(grprobj, hdrobj, exp_hdrs, hdrgo_prt=True): # assert act_hdrs == exp_hdrs sortobj = Sorter(grprobj, hdrgo_prt=hdrgo_prt) - sys.stdout.write("\n{NAME} PRINT GOs hdrgo_prt({H}):\n".format( - H=hdrgo_prt, NAME=grprobj.grpname)) + sys.stdout.write( + "\n{NAME} PRINT GOs hdrgo_prt({H}):\n".format(H=hdrgo_prt, NAME=grprobj.grpname) + ) sortobj.prt_gos() nts = sortobj.get_nts_flat(hdrgo_prt) nts_go = set([nt.GO for nt in nts]) usrgos = grprobj.usrgos - assert nts_go.intersection(usrgos) == usrgos, \ - "ONLY {N} of {U} user gos found in grouped sorted GOs. MISSING: {GOs}".format( - N=len(nts_go.intersection(usrgos)), - GOs=" ".join(usrgos.difference(nts_go.intersection(usrgos))), - U=len(usrgos)) + assert ( + nts_go.intersection(usrgos) == usrgos + ), "ONLY {N} of {U} user gos found in grouped sorted GOs. MISSING: {GOs}".format( + N=len(nts_go.intersection(usrgos)), + GOs=" ".join(usrgos.difference(nts_go.intersection(usrgos))), + U=len(usrgos), + ) return sortobj, nts, nts_go, act_hdrs @@ -119,75 +132,84 @@ def chk_hdrs(grprobj, hdrobj, prt=sys.stdout): hdrgos_all = grprobj.get_hdrgos() hdrgos_u0 = grprobj.get_hdrgos_u0() hdrgos_u1 = grprobj.get_hdrgos_u1() - prt.write("{N} hdrgos ({U} are also user GO IDs) used out of {M} available\n".format( - N=len(hdrgos_all), U=len(hdrgos_u1), M=len(hdrobj.hdrgos))) + prt.write( + "{N} hdrgos ({U} are also user GO IDs) used out of {M} available\n".format( + N=len(hdrgos_all), U=len(hdrgos_u1), M=len(hdrobj.hdrgos) + ) + ) assert hdrgos_u0.union(hdrgos_u1) == hdrgos_all + def get_data0(): """Nature GO ids.""" - return set([ - #"GO:0050789", # BP 1 11,101 L01 D01 B reg. of biological process - "GO:0051969", # BP 5 L03 D05 AB reg. of transmission of nerve impulse - "GO:0008629", # BP 13 L05 D05 AB intrinsic apoptotic signaling pathway - "GO:0051056", # BP 26 L05 D06 AB reg. of small GTPase mediated signal transduction - "GO:0031644", # BP 30 L04 D04 AB reg. of neurological system process - "GO:0006275", # BP 50 L05 D06 AB reg. of DNA replication - "GO:0051053", # BP * 76 L05 D06 AB negative reg. of DNA metabolic process - "GO:0007167", # BP 121 L05 D05 AB enzyme linked receptor protein signaling pathway - "GO:0050804", # BP 120 L03 D04 AB modulation of synaptic transmission - "GO:0007242", # BP 135 L04 D04 AB intracellular signal transduction - "GO:0007346", # BP 157 L04 D04 AB reg. of mitotic cell cycle - "GO:0001819", # BP 154 L04 D04 AB positive reg. of cytokine production - "GO:0051052", # BP 225 L04 D05 AB reg. of DNA metabolic process - "GO:0050778", # BP 227 L04 D04 AB positive reg. of immune response - "GO:0030155", # BP 246 L02 D02 AB reg. of cell adhesion - "GO:0042127", # BP 268 L03 D03 AB reg. of cell proliferation - "GO:0010564", # BP 350 L04 D04 AB reg. of cell cycle process - "GO:0044057", # BP * 392 L03 D03 AB reg. of system process - "GO:0051726", # BP 404 L03 D03 AB reg. of cell cycle - "GO:0002684", # BP * 436 L03 D03 AB positive reg. of immune system process - "GO:0051093", # BP 549 L03 D03 AB negative reg. of developmental process - "GO:0050776", # BP 661 L03 D03 AB reg. of immune response - "GO:0048584", # BP 776 L03 D03 AB positive reg. of response to stimulus - "GO:0045595", # BP 828 L03 D03 AB reg. of cell differentiation - "GO:0080134", # BP 940 L03 D03 AB reg. of response to stress - "GO:0009966", # BP 1,108 L03 D04 AB reg. of signal transduction - "GO:0002682", # BP 1,183 L02 D02 AB reg. of immune system process - "GO:0010646", # BP 1,392 L03 D03 AB reg. of cell communication - "GO:0050793", # BP 1,789 L02 D02 AB reg. of developmental process - "GO:0048522", # BP 2,289 L03 D03 AB positive reg. of cellular process - "GO:0048523", # BP 2,372 L03 D03 AB negative reg. of cellular process - #"GO:0048583", # BP 2,377 L02 D02 AB reg. of response to stimulus - "GO:0051239", # BP 2,532 L02 D02 AB reg. of multicellular organismal process - "GO:0048519", # BP 3,293 L02 D02 AB negative reg. of biological process - "GO:0048518", # BP 3,353 L02 D02 AB positive reg. of biological process - #"GO:0044848", # BP 1 62 L01 D01 S biological phase - "GO:0000087", # BP 0 0 L04 D04 S mitotic M phase - "GO:0051327", # BP 0 0 L04 D04 S meiotic M phase - "GO:0000279", # BP 0 2 L03 D03 S M phase - "GO:0022403", # BP 0 46 L02 D02 S cell cycle phase - #"GO:0023052", # BP 1 116 L01 D01 R signaling - "GO:0019226", # BP 0 0 L04 D04 DKR transmission of nerve impulse - "GO:0007268", # BP 0 12 L07 D08 CDR chemical synaptic transmission - "GO:0007267", # BP 0 99 L03 D04 CDR cell-cell signaling - #"GO:0022610", # BP 1 194 L01 D01 P biological adhesion - "GO:0007155", # BP 0 165 L02 D02 P cell adhesion - #"GO:0007610", # BP 1 219 L01 D01 O behavior - "GO:0007612", # BP 0 14 L04 D06 DKO learning - "GO:0007611"])# BP 0 22 L03 D05 DKO learning or memory + return set( + [ + # "GO:0050789", # BP 1 11,101 L01 D01 B reg. of biological process + "GO:0051969", # BP 5 L03 D05 AB reg. of transmission of nerve impulse + "GO:0008629", # BP 13 L05 D05 AB intrinsic apoptotic signaling pathway + "GO:0051056", # BP 26 L05 D06 AB reg. of small GTPase mediated signal transduction + "GO:0031644", # BP 30 L04 D04 AB reg. of neurological system process + "GO:0006275", # BP 50 L05 D06 AB reg. of DNA replication + "GO:0051053", # BP * 76 L05 D06 AB negative reg. of DNA metabolic process + "GO:0007167", # BP 121 L05 D05 AB enzyme linked receptor protein signaling pathway + "GO:0050804", # BP 120 L03 D04 AB modulation of synaptic transmission + "GO:0007242", # BP 135 L04 D04 AB intracellular signal transduction + "GO:0007346", # BP 157 L04 D04 AB reg. of mitotic cell cycle + "GO:0001819", # BP 154 L04 D04 AB positive reg. of cytokine production + "GO:0051052", # BP 225 L04 D05 AB reg. of DNA metabolic process + "GO:0050778", # BP 227 L04 D04 AB positive reg. of immune response + "GO:0030155", # BP 246 L02 D02 AB reg. of cell adhesion + "GO:0042127", # BP 268 L03 D03 AB reg. of cell proliferation + "GO:0010564", # BP 350 L04 D04 AB reg. of cell cycle process + "GO:0044057", # BP * 392 L03 D03 AB reg. of system process + "GO:0051726", # BP 404 L03 D03 AB reg. of cell cycle + "GO:0002684", # BP * 436 L03 D03 AB positive reg. of immune system process + "GO:0051093", # BP 549 L03 D03 AB negative reg. of developmental process + "GO:0050776", # BP 661 L03 D03 AB reg. of immune response + "GO:0048584", # BP 776 L03 D03 AB positive reg. of response to stimulus + "GO:0045595", # BP 828 L03 D03 AB reg. of cell differentiation + "GO:0080134", # BP 940 L03 D03 AB reg. of response to stress + "GO:0009966", # BP 1,108 L03 D04 AB reg. of signal transduction + "GO:0002682", # BP 1,183 L02 D02 AB reg. of immune system process + "GO:0010646", # BP 1,392 L03 D03 AB reg. of cell communication + "GO:0050793", # BP 1,789 L02 D02 AB reg. of developmental process + "GO:0048522", # BP 2,289 L03 D03 AB positive reg. of cellular process + "GO:0048523", # BP 2,372 L03 D03 AB negative reg. of cellular process + # "GO:0048583", # BP 2,377 L02 D02 AB reg. of response to stimulus + "GO:0051239", # BP 2,532 L02 D02 AB reg. of multicellular organismal process + "GO:0048519", # BP 3,293 L02 D02 AB negative reg. of biological process + "GO:0048518", # BP 3,353 L02 D02 AB positive reg. of biological process + # "GO:0044848", # BP 1 62 L01 D01 S biological phase + "GO:0000087", # BP 0 0 L04 D04 S mitotic M phase + "GO:0051327", # BP 0 0 L04 D04 S meiotic M phase + "GO:0000279", # BP 0 2 L03 D03 S M phase + "GO:0022403", # BP 0 46 L02 D02 S cell cycle phase + # "GO:0023052", # BP 1 116 L01 D01 R signaling + "GO:0019226", # BP 0 0 L04 D04 DKR transmission of nerve impulse + "GO:0007268", # BP 0 12 L07 D08 CDR chemical synaptic transmission + "GO:0007267", # BP 0 99 L03 D04 CDR cell-cell signaling + # "GO:0022610", # BP 1 194 L01 D01 P biological adhesion + "GO:0007155", # BP 0 165 L02 D02 P cell adhesion + # "GO:0007610", # BP 1 219 L01 D01 O behavior + "GO:0007612", # BP 0 14 L04 D06 DKO learning + "GO:0007611", + ] + ) # BP 0 22 L03 D05 DKO learning or memory + def _get_gosubdag(): """Get GO DAG.""" - fin = os.path.join(REPO, 'go-basic.obo') - godag = get_godag(fin, prt=sys.stdout, loading_bar=False, optional_attrs=['relationship']) + fin = os.path.join(REPO, "go-basic.obo") + godag = get_godag(fin, prt=sys.stdout, optional_attrs=["relationship"]) return GoSubDag(None, godag) + def _get_grprdflt(): """Get Grouper defaults.""" gosubdag = _get_gosubdag() - fin_slim = os.path.join(REPO, 'goslim_generic.obo') + fin_slim = os.path.join(REPO, "goslim_generic.obo") return GrouperDflts(gosubdag, fin_slim) -if __name__ == '__main__': +if __name__ == "__main__": test_dflthdrs(do_plt=True) diff --git a/tests/test_sorter_desc2nts.py b/tests/test_sorter_desc2nts.py index 70ec26dc..3469e659 100755 --- a/tests/test_sorter_desc2nts.py +++ b/tests/test_sorter_desc2nts.py @@ -2,23 +2,42 @@ """Test various options while sorting when using sections.""" import os + from goatools.base import get_godag from goatools.gosubdag.gosubdag import GoSubDag from goatools.grouper.grprdflts import GrouperDflts -from goatools.grouper.hdrgos import HdrgosSections from goatools.grouper.grprobj import Grouper +from goatools.grouper.hdrgos import HdrgosSections from goatools.grouper.sorter import Sorter -from goatools.test_data.sorter import USER_GOS -from goatools.test_data.sorter import SECTIONS +from goatools.test_data.sorter import SECTIONS, USER_GOS REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") -GO_FLDS = ('format_txt', 'hdr_idx', 'is_hdrgo', 'is_usrgo', 'num_usrgos', 'hdr1usr01', - 'NS', 'level', 'depth', 'reldepth', 'GO', 'alt', 'GO_name', - 'dcnt', 'D1', 'childcnt', 'REL', 'REL_short', 'rel', 'id') - -D2_FLDS = set(['flds', 'num_items', 'num_sections', 'hdrgo_prt', 'sortobj', 'sections']) -D1_FLDS = set(['flds', 'num_items', 'num_sections', 'hdrgo_prt', 'sortobj', 'flat']) +GO_FLDS = ( + "format_txt", + "hdr_idx", + "is_hdrgo", + "is_usrgo", + "num_usrgos", + "hdr1usr01", + "NS", + "level", + "depth", + "reldepth", + "GO", + "alt", + "GO_name", + "dcnt", + "D1", + "childcnt", + "REL", + "REL_short", + "rel", + "id", +) + +D2_FLDS = set(["flds", "num_items", "num_sections", "hdrgo_prt", "sortobj", "sections"]) +D1_FLDS = set(["flds", "num_items", "num_sections", "hdrgo_prt", "sortobj", "flat"]) # pylint: disable=line-too-long @@ -26,86 +45,111 @@ def test_desc2nts(): """Test various options while sorting when using sections.""" sortobj = _get_sortobj() - desc2nts = sortobj.get_desc2nts_fnc(hdrgo_prt=True, section_prt=None, top_n=None, use_sections=True) - assert desc2nts['hdrgo_prt'] - h1a = set(nt.GO for sec, nts in desc2nts['sections'] for nt in nts) - num_sections = len(desc2nts['sections']) - assert desc2nts['sections'] == sortobj.get_desc2nts_fnc()['sections'], "**FAILED: Default values" + desc2nts = sortobj.get_desc2nts_fnc( + hdrgo_prt=True, section_prt=None, top_n=None, use_sections=True + ) + assert desc2nts["hdrgo_prt"] + h1a = set(nt.GO for sec, nts in desc2nts["sections"] for nt in nts) + num_sections = len(desc2nts["sections"]) + assert ( + desc2nts["sections"] == sortobj.get_desc2nts_fnc()["sections"] + ), "**FAILED: Default values" _chk_d2(desc2nts, num_sections) - desc2nts = sortobj.get_desc2nts_fnc(hdrgo_prt=True, section_prt=True, top_n=None, use_sections=True) - assert desc2nts['hdrgo_prt'] - assert set(nt.GO for sec, nts in desc2nts['sections'] for nt in nts) == h1a + desc2nts = sortobj.get_desc2nts_fnc( + hdrgo_prt=True, section_prt=True, top_n=None, use_sections=True + ) + assert desc2nts["hdrgo_prt"] + assert set(nt.GO for sec, nts in desc2nts["sections"] for nt in nts) == h1a _chk_d2(desc2nts, num_sections) - desc2nts = sortobj.get_desc2nts_fnc(hdrgo_prt=True, section_prt=False, top_n=None, use_sections=True) + desc2nts = sortobj.get_desc2nts_fnc( + hdrgo_prt=True, section_prt=False, top_n=None, use_sections=True + ) assert set(desc2nts.keys()) == D1_FLDS - assert desc2nts['hdrgo_prt'] - assert set(nt.GO for nt in desc2nts['flat']) == h1a - assert set(desc2nts['flds']).difference(set(GO_FLDS)) == set(['section']) - assert desc2nts['num_sections'] == num_sections - - - desc2nts = sortobj.get_desc2nts_fnc(hdrgo_prt=False, section_prt=None, top_n=None, use_sections=True) - h0a = set(nt.GO for sec, nts in desc2nts['sections'] for nt in nts) - assert not desc2nts['hdrgo_prt'] + assert desc2nts["hdrgo_prt"] + assert set(nt.GO for nt in desc2nts["flat"]) == h1a + assert set(desc2nts["flds"]).difference(set(GO_FLDS)) == set(["section"]) + assert desc2nts["num_sections"] == num_sections + + desc2nts = sortobj.get_desc2nts_fnc( + hdrgo_prt=False, section_prt=None, top_n=None, use_sections=True + ) + h0a = set(nt.GO for sec, nts in desc2nts["sections"] for nt in nts) + assert not desc2nts["hdrgo_prt"] assert len(h1a) > len(h0a), "**FAILED: MISSING HEADER GOs" _chk_d2(desc2nts, num_sections) - desc2nts = sortobj.get_desc2nts_fnc(hdrgo_prt=False, section_prt=True, top_n=None, use_sections=True) - assert not desc2nts['hdrgo_prt'] + desc2nts = sortobj.get_desc2nts_fnc( + hdrgo_prt=False, section_prt=True, top_n=None, use_sections=True + ) + assert not desc2nts["hdrgo_prt"] _chk_d2(desc2nts, num_sections) - desc2nts = sortobj.get_desc2nts_fnc(hdrgo_prt=False, section_prt=False, top_n=None, use_sections=True) + desc2nts = sortobj.get_desc2nts_fnc( + hdrgo_prt=False, section_prt=False, top_n=None, use_sections=True + ) assert set(desc2nts.keys()) == D1_FLDS - assert not desc2nts['hdrgo_prt'] - assert set(nt.GO for nt in desc2nts['flat']) == h0a - assert set(desc2nts['flds']).difference(set(GO_FLDS)) == set(['section']) - assert desc2nts['num_sections'] == num_sections - - - desc2nts = sortobj.get_desc2nts_fnc(hdrgo_prt=True, section_prt=None, top_n=None, use_sections=False) - assert desc2nts['hdrgo_prt'] + assert not desc2nts["hdrgo_prt"] + assert set(nt.GO for nt in desc2nts["flat"]) == h0a + assert set(desc2nts["flds"]).difference(set(GO_FLDS)) == set(["section"]) + assert desc2nts["num_sections"] == num_sections + + desc2nts = sortobj.get_desc2nts_fnc( + hdrgo_prt=True, section_prt=None, top_n=None, use_sections=False + ) + assert desc2nts["hdrgo_prt"] _chk_flat(desc2nts, h1a) - desc2nts = sortobj.get_desc2nts_fnc(hdrgo_prt=True, section_prt=True, top_n=None, use_sections=False) - assert desc2nts['hdrgo_prt'] + desc2nts = sortobj.get_desc2nts_fnc( + hdrgo_prt=True, section_prt=True, top_n=None, use_sections=False + ) + assert desc2nts["hdrgo_prt"] _chk_flat(desc2nts, h1a) - desc2nts = sortobj.get_desc2nts_fnc(hdrgo_prt=True, section_prt=False, top_n=None, use_sections=False) - assert desc2nts['hdrgo_prt'] + desc2nts = sortobj.get_desc2nts_fnc( + hdrgo_prt=True, section_prt=False, top_n=None, use_sections=False + ) + assert desc2nts["hdrgo_prt"] _chk_flat(desc2nts, h1a) - - desc2nts = sortobj.get_desc2nts_fnc(hdrgo_prt=False, section_prt=None, top_n=None, use_sections=False) - assert not desc2nts['hdrgo_prt'] + desc2nts = sortobj.get_desc2nts_fnc( + hdrgo_prt=False, section_prt=None, top_n=None, use_sections=False + ) + assert not desc2nts["hdrgo_prt"] _chk_flat(desc2nts, h0a) - desc2nts = sortobj.get_desc2nts_fnc(hdrgo_prt=False, section_prt=True, top_n=None, use_sections=False) - assert not desc2nts['hdrgo_prt'] + desc2nts = sortobj.get_desc2nts_fnc( + hdrgo_prt=False, section_prt=True, top_n=None, use_sections=False + ) + assert not desc2nts["hdrgo_prt"] _chk_flat(desc2nts, h0a) - desc2nts = sortobj.get_desc2nts_fnc(hdrgo_prt=False, section_prt=False, top_n=None, use_sections=False) - assert not desc2nts['hdrgo_prt'] + desc2nts = sortobj.get_desc2nts_fnc( + hdrgo_prt=False, section_prt=False, top_n=None, use_sections=False + ) + assert not desc2nts["hdrgo_prt"] _chk_flat(desc2nts, h0a) def _chk_d2(desc2nts, num_sections): """Check section fields.""" assert set(desc2nts.keys()) == D2_FLDS - assert desc2nts['flds'] == GO_FLDS - assert desc2nts['num_sections'] == num_sections + assert desc2nts["flds"] == GO_FLDS + assert desc2nts["num_sections"] == num_sections + def _chk_flat(desc2nts, h0a): """Check flat fields.""" assert set(desc2nts.keys()) == D1_FLDS - assert set(nt.GO for nt in desc2nts['flat']) == h0a - assert desc2nts['flds'] == GO_FLDS + assert set(nt.GO for nt in desc2nts["flat"]) == h0a + assert desc2nts["flds"] == GO_FLDS + def _get_sortobj(): """Get object for grouping GO IDs.""" fin_godag = os.path.join(REPO, "go-basic.obo") - godag = get_godag(fin_godag, prt=None, loading_bar=False, optional_attrs=['relationship']) + godag = get_godag(fin_godag, prt=None, optional_attrs=["relationship"]) gosubdag = GoSubDag(USER_GOS, godag, relationships=True, tcntobj=None) grprdflt = GrouperDflts(gosubdag) hdrobj = HdrgosSections(gosubdag, grprdflt.hdrgos_dflt, SECTIONS) @@ -113,5 +157,5 @@ def _get_sortobj(): return Sorter(grprobj) -if __name__ == '__main__': +if __name__ == "__main__": test_desc2nts() diff --git a/tests/test_sorter_sections.py b/tests/test_sorter_sections.py index f0ce4194..1a23b7b7 100755 --- a/tests/test_sorter_sections.py +++ b/tests/test_sorter_sections.py @@ -2,15 +2,15 @@ """Test various options while sorting when using sections.""" import os + from goatools.base import get_godag from goatools.gosubdag.gosubdag import GoSubDag from goatools.grouper.grprdflts import GrouperDflts -from goatools.grouper.hdrgos import HdrgosSections from goatools.grouper.grprobj import Grouper +from goatools.grouper.hdrgos import HdrgosSections from goatools.grouper.sorter import Sorter from goatools.grouper.wrxlsx import WrXlsxSortedGos -from goatools.test_data.sorter import USER_GOS -from goatools.test_data.sorter import SECTIONS +from goatools.test_data.sorter import SECTIONS, USER_GOS REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") @@ -20,9 +20,9 @@ def test_sort(): grprobj = _get_grprobj() # kws: hdrgo_prt section_prt use_sections - #------------------------------------------------------------ + # ------------------------------------------------------------ # TEST: hdrgo_prt and section_prt - #------------------------------------------------------------ + # ------------------------------------------------------------ # 1) Print sections: # * Print headers: ph(print header) ch(color header) _wr_xlsx("t_a1_ps1_ph1", grprobj) @@ -34,20 +34,20 @@ def test_sort(): # * Use (but don't print) headers. _wr_xlsx("t_a4_ps0_ph0", grprobj, section_prt=False, hdrgo_prt=False) - #------------------------------------------------------------ + # ------------------------------------------------------------ # TEST: use_sections=False # sections are ignored, but hdrgos defined in sections are used - #------------------------------------------------------------ + # ------------------------------------------------------------ _wr_xlsx("t_b1_us0_ph1", grprobj, use_sections=False) _wr_xlsx("t_b2_us0_ph0", grprobj, use_sections=False, hdrgo_prt=False) - #------------------------------------------------------------ + # ------------------------------------------------------------ # TEST: use_sections=True hdrgo_prt(T/F) # These conditions force hdrgo_prt = False, if hdrgo_prt was not mentioned # * section_sortby == True # * section_sortby = user_sort # * top_n == N - #------------------------------------------------------------ + # ------------------------------------------------------------ sortby = lambda nt: nt.depth _wr_xlsx("t_c1_ps1_ph0_hsortT", grprobj, section_sortby=True) _wr_xlsx("t_c2_ps1_ph0_hsortT_top3", grprobj, section_sortby=True, top_n=3) @@ -56,33 +56,39 @@ def test_sort(): # Most commonly used; User provides the sort. Users often like to sort by pval, if exists: _wr_xlsx("t_c4_ps1_ph0_usort", grprobj, section_sortby=sortby) _wr_xlsx("t_c5_ps1_ph0_usort_top3", grprobj, section_sortby=sortby, top_n=3) - _wr_xlsx("t_c6_ps0_ph0_usort_top3", grprobj, section_sortby=sortby, top_n=3, section_prt=False) + _wr_xlsx( + "t_c6_ps0_ph0_usort_top3", + grprobj, + section_sortby=sortby, + top_n=3, + section_prt=False, + ) def _wr_xlsx(name, grprobj, **kws): """Group, sort, and print xlsx file.""" - # print('\nTEST {} kws_sortobj: {}'.format(name, kws)) # KWS SORT OBJ - kws_sort = {'sortby', 'hdrgo_sortby', 'section_sortby'} + kws_sort = {"sortby", "hdrgo_sortby", "section_sortby"} # KWS SORT FUNC: hdrgo_prt section_prt top_n use_sections prtfmt # Exclude ungrouped "Misc." section of sections var(sec_rd) fout_xlsx = "{NAME}.xlsx".format(NAME=name) # kws Sorter: hdrgo_prt section_prt top_n use_sections - sortobj = Sorter(grprobj, **{k:v for k, v in kws.items() if k in kws_sort}) + sortobj = Sorter(grprobj, **{k: v for k, v in kws.items() if k in kws_sort}) desc2nts = sortobj.get_desc2nts(**kws) objwr = WrXlsxSortedGos(name, sortobj) # kws WrXlsxSortedGos wr_xlsx_nts: title hdrs objwr.wr_xlsx_nts(fout_xlsx, desc2nts, **kws) + def _get_grprobj(): """Get object for grouping GO IDs.""" fin_obo = os.path.join(REPO, "go-basic.obo") - godag = get_godag(fin_obo, prt=None, loading_bar=False, optional_attrs=['relationship']) + godag = get_godag(fin_obo, prt=None, optional_attrs=["relationship"]) gosubdag = GoSubDag(USER_GOS, godag, relationships=True, tcntobj=None) grprdflt = GrouperDflts(gosubdag) hdrobj = HdrgosSections(gosubdag, grprdflt.hdrgos_dflt, SECTIONS) return Grouper("wrusrgos", USER_GOS, hdrobj, gosubdag) -if __name__ == '__main__': +if __name__ == "__main__": test_sort() diff --git a/tests/test_tcntobj_relationships.py b/tests/test_tcntobj_relationships.py index 6afb607a..d0375454 100755 --- a/tests/test_tcntobj_relationships.py +++ b/tests/test_tcntobj_relationships.py @@ -3,41 +3,51 @@ import os import sys + +from goatools.anno.gpad_reader import GpadReader +from goatools.associations import dnld_annotation from goatools.base import download_go_basic_obo from goatools.obo_parser import GODag -from goatools.associations import dnld_annotation -from goatools.anno.gpad_reader import GpadReader from goatools.semantic import TermCounts REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") -NSS = ['BP', 'MF', 'CC', 'all'] -RELS = {'part_of',} +NSS = ["BP", "MF", "CC", "all"] +RELS = { + "part_of", +} + def test_tcntobj_relationships(prt=sys.stdout): """Test loading of relationships, like part_of, into TermCounts""" fin_obo = os.path.join(REPO, "go-basic.obo") - fin_anno = os.path.join(REPO, 'goa_human.gpad') + fin_anno = os.path.join(REPO, "goa_human.gpad") - download_go_basic_obo(fin_obo, prt, loading_bar=None) + download_go_basic_obo(fin_obo, prt) dnld_annotation(fin_anno) # Load ontologies go2obj_r0 = GODag(fin_obo) - go2obj_r1 = GODag(fin_obo, optional_attrs=['relationship']) + go2obj_r1 = GODag(fin_obo, optional_attrs=["relationship"]) # Load annotations annoobj = GpadReader(fin_anno, godag=go2obj_r0) # Create TermCounts objects - ns2tcntobj_r0 = {ns:TermCounts(go2obj_r0, annoobj.get_id2gos(ns)) for ns in NSS} - ns2tcntobj_r1 = {ns:TermCounts(go2obj_r1, annoobj.get_id2gos(ns), RELS) for ns in NSS} + ns2tcntobj_r0 = {ns: TermCounts(go2obj_r0, annoobj.get_id2gos(ns)) for ns in NSS} + ns2tcntobj_r1 = { + ns: TermCounts(go2obj_r1, annoobj.get_id2gos(ns), RELS) for ns in NSS + } _chk_pass_fail(ns2tcntobj_r0, ns2tcntobj_r1) def _chk_pass_fail(ns2tcntobj_r0, ns2tcntobj_r1): """Check to see that term counts are different w and wo/relationships""" - print('\nCOMPARE GO Counts wo/relationships and with: {Rs}'.format(Rs=' '.join(sorted(RELS)))) + print( + "\nCOMPARE GO Counts wo/relationships and with: {Rs}".format( + Rs=" ".join(sorted(RELS)) + ) + ) for nspc in NSS: cnt = 0 go2cnts_r1 = ns2tcntobj_r1[nspc].gocnts @@ -46,9 +56,11 @@ def _chk_pass_fail(ns2tcntobj_r0, ns2tcntobj_r1): assert cnt_r0 <= cnt_r1 if cnt_r0 != cnt_r1: cnt += 1 - print('{NS:3} {N:5,} more GO ID counts using relationships'.format(NS=nspc, N=cnt)) + print( + "{NS:3} {N:5,} more GO ID counts using relationships".format(NS=nspc, N=cnt) + ) assert ns2tcntobj_r0[nspc].gocnts != ns2tcntobj_r1[nspc].gocnts -if __name__ == '__main__': +if __name__ == "__main__": test_tcntobj_relationships() diff --git a/tests/test_termcounts_asscs.py b/tests/test_termcounts_asscs.py index 83c16d96..4aaaccfa 100755 --- a/tests/test_termcounts_asscs.py +++ b/tests/test_termcounts_asscs.py @@ -1,29 +1,26 @@ #!/usr/bin/env python """Test TermCounts object used in Resnik and Lin similarity calculations.""" -from __future__ import print_function - +import datetime import os import sys import timeit -import datetime + +from goatools.anno.gaf_reader import GafReader +from goatools.associations import dnld_annotation from goatools.base import get_godag -from goatools.semantic import TermCounts -from goatools.semantic import get_info_content +from goatools.semantic import TermCounts, get_info_content from goatools.test_data.gafs import ASSOCIATIONS -from goatools.associations import dnld_annotation -from goatools.anno.gaf_reader import GafReader -from goatools.godag.consts import NS2NAMESPACE TIC = timeit.default_timer() REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") + def test_semantic_similarity(usr_assc=None): """Computing basic semantic similarities between GO terms.""" - not_these = {'goa_uniprot_all.gaf', 'goa_uniprot_all_noiea.gaf'} + not_these = {"goa_uniprot_all.gaf", "goa_uniprot_all_noiea.gaf"} associations = sorted(ASSOCIATIONS.difference(not_these)) go2obj = get_go2obj() - # goids = go2obj.keys() # http://current.geneontology.org/annotations/ if usr_assc is not None: associations = [usr_assc] @@ -36,8 +33,7 @@ def test_semantic_similarity(usr_assc=None): if not os.path.exists(fin_gaf): dnld_annotation(fin_gaf) annoobj = GafReader(fin_gaf) - #### for nspc in ['BP', 'MF', 'CC']: - assc_gene2gos = annoobj.get_id2gos('all') + assc_gene2gos = annoobj.get_id2gos("all") if not assc_gene2gos: not_found.add(assc_name) continue @@ -49,42 +45,49 @@ def test_semantic_similarity(usr_assc=None): tcntobj = TermCounts(go2obj, assc_gene2gos) go_cnt = tcntobj.gocnts.most_common() - #print tcntobj.gocnts.most_common() - if go_cnt: print("{ASSC}".format(ASSC=assc_name)) print(tcntobj.aspect_counts) gocnt_max = go_cnt[0][1] prt_info(tcntobj, go_cnt, None) - prt_info(tcntobj, go_cnt, gocnt_max/2.0) - prt_info(tcntobj, go_cnt, gocnt_max/10.0) - print("{HMS} {hms} {ASSC}\n".format(ASSC=assc_name, HMS=_hms(TIC), hms=_hms(tic))) - print('{HMS} {N} Associations'.format(HMS=_hms(TIC), N=len(associations))) + prt_info(tcntobj, go_cnt, gocnt_max / 2.0) + prt_info(tcntobj, go_cnt, gocnt_max / 10.0) + print( + "{HMS} {hms} {ASSC}\n".format(ASSC=assc_name, HMS=_hms(TIC), hms=_hms(tic)) + ) + print("{HMS} {N} Associations".format(HMS=_hms(TIC), N=len(associations))) if not_found: _prt_not_found(not_found) if errs: - fout_err = 'namespace_errors.txt' - with open(fout_err, 'w') as prt: + fout_err = "namespace_errors.txt" + with open(fout_err, "w", encoding="utf-8") as prt: for err in errs: prt.write(err) - print(' {N} ERRORS WROTE: {TXT}'.format(N=len(errs), TXT=fout_err)) + print(" {N} ERRORS WROTE: {TXT}".format(N=len(errs), TXT=fout_err)) def _prt_not_found(not_found): - print('**WARNING: {N} EMPTY ASSOCIATIONS:'.format(N=len(not_found))) + print("**WARNING: {N} EMPTY ASSOCIATIONS:".format(N=len(not_found))) for idx, assc in enumerate(not_found): - print(' {I}) {ASSC}'.format(I=idx, ASSC=assc)) + print(" {I}) {ASSC}".format(I=idx, ASSC=assc)) + def _hms(tic): """Get Timing.""" - return '{HMS}'.format(HMS=str(datetime.timedelta(seconds=(timeit.default_timer()-tic)))) + return "{HMS}".format( + HMS=str(datetime.timedelta(seconds=(timeit.default_timer() - tic))) + ) + def prt_info(tcntobj, go_cnt, max_val): """Print the information content of a frequently used GO ID.""" go_id, cnt = get_goid(go_cnt, max_val) infocontent = get_info_content(go_id, tcntobj) - msg = 'Information content ({GO} {CNT:7,}) = {INFO:8.6f} {NAME}' - print(msg.format(GO=go_id, CNT=cnt, INFO=infocontent, NAME=tcntobj.go2obj[go_id].name)) + msg = "Information content ({GO} {CNT:7,}) = {INFO:8.6f} {NAME}" + print( + msg.format(GO=go_id, CNT=cnt, INFO=infocontent, NAME=tcntobj.go2obj[go_id].name) + ) + def get_goid(go_cnt, max_val): """Get frequently used GO ID.""" @@ -95,11 +98,13 @@ def get_goid(go_cnt, max_val): return go_cnt[-1][0], go_cnt[-1][1] return go_cnt[0][0], go_cnt[0][1] + def get_go2obj(): """Read GODag and return go2obj.""" - godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None) - return {go:o for go, o in godag.items() if not o.is_obsolete} + godag = get_godag(os.path.join(REPO, "go-basic.obo")) + return {go: o for go, o in godag.items() if not o.is_obsolete} + -if __name__ == '__main__': +if __name__ == "__main__": ASSC_NAME = None if len(sys.argv) == 1 else sys.argv[1] test_semantic_similarity(ASSC_NAME) diff --git a/tests/test_typedefs.py b/tests/test_typedefs.py index c62396fc..ed86b50d 100755 --- a/tests/test_typedefs.py +++ b/tests/test_typedefs.py @@ -6,9 +6,9 @@ def test_typedef(): """Ensure that alternate GO IDs.""" - obo_dag = get_godag("go-basic.obo", loading_bar=None) - print(obo_dag.typedefs['negatively_regulates']) + obo_dag = get_godag("go-basic.obo") + print(obo_dag.typedefs["negatively_regulates"]) -if __name__ == '__main__': +if __name__ == "__main__": test_typedef() diff --git a/tests/test_wr_sections_txt.py b/tests/test_wr_sections_txt.py index abed7e36..acf19335 100755 --- a/tests/test_wr_sections_txt.py +++ b/tests/test_wr_sections_txt.py @@ -3,28 +3,27 @@ import os import sys + from goatools.base import get_godag from goatools.gosubdag.gosubdag import GoSubDag from goatools.grouper.grprdflts import GrouperDflts -from goatools.grouper.hdrgos import HdrgosSections from goatools.grouper.grprobj import Grouper +from goatools.grouper.hdrgos import HdrgosSections from goatools.grouper.read_goids import read_sections -from goatools.grouper.wr_sections import WrSectionsTxt -from goatools.grouper.wr_sections import WrSectionsPy -# from goatools.cli.wr_sections import +from goatools.grouper.wr_sections import WrSectionsPy, WrSectionsTxt from goatools.test_data.gjoneska_goea_consistent_increase import goea_results REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") + def test_wr_sections_all(): """Test that all sections files generated by wr_sections have the same content.""" f_sec_rd = os.path.join(REPO, "data/gjoneska_pfenning/sections_in.txt") - f_sec_wr = os.path.join(REPO, "tmp_test_sections_out.txt") + f_sec_wr = os.path.join(REPO, "tmp_test_sections_out.txt") # Travis-CI path is cwd - f_sec_py = os.path.join(REPO, "tmp_test_sections.py") - # f_sec_mod = "tmp_test_sections" + f_sec_py = os.path.join(REPO, "tmp_test_sections.py") # Read user GO IDs. Setup to write sections text file and Python file - usrgos = [getattr(nt, 'GO') for nt in goea_results] + usrgos = [getattr(nt, "GO") for nt in goea_results] sec_rd = _read_sections(f_sec_rd) # Do preliminaries godag = _get_godag() @@ -40,26 +39,26 @@ def test_wr_sections_all(): objsecpy = WrSectionsPy(grprobj) objsecpy.wr_py_sections(os.path.join(REPO, f_sec_py), sec_rd, doc=godag.version) # Read text and Python sections files - sec_wr = _read_sections(f_sec_wr) - sec_py = _read_sections(f_sec_py) - # sec_mod = _read_sections(f_sec_mod) - # _chk_sections(sec_py, sec_mod) - # _chk_sections(sec_wr, sec_mod, hdrobj.secdflt) + _read_sections(f_sec_wr) + _read_sections(f_sec_py) + def _get_godag(): """Get GO DAG.""" - fin = os.path.join(REPO, 'go-basic.obo') - return get_godag(fin, prt=None, loading_bar=False, optional_attrs=['relationship']) + fin = os.path.join(REPO, "go-basic.obo") + return get_godag(fin, prt=None, optional_attrs=["relationship"]) + def _get_grprdflt(gosubdag=None): """Get Grouper defaults.""" - fin_slim = os.path.join(REPO, 'goslim_generic.obo') + fin_slim = os.path.join(REPO, "goslim_generic.obo") return GrouperDflts(gosubdag, fin_slim) + def test_wr_sections_txt(): """Group depth-02 GO terms under their most specific depth-01 GO parent(s).""" # Get GOs to be grouped - usrgos = [getattr(nt, 'GO') for nt in goea_results] + usrgos = [getattr(nt, "GO") for nt in goea_results] # Read OBO files once to save time grprdflt = _get_grprdflt() @@ -87,37 +86,47 @@ def test_wr_sections_txt(): def _chk_sections(sec_a, sec_b, hdrgos_dflt=None): """Do the two sections variables contain the same data?""" if hdrgos_dflt: - assert sec_a[-1][0] == hdrgos_dflt, "EXP({}) ACT({})".format(hdrgos_dflt, sec_a[-1][0]) + assert sec_a[-1][0] == hdrgos_dflt, "EXP({}) ACT({})".format( + hdrgos_dflt, sec_a[-1][0] + ) sec_a = sec_a[:-1] print("EXP({}) ACT({})".format(hdrgos_dflt, sec_b[-1][0])) # sec_b = sec_b[:-1] assert len(sec_a) == len(sec_b), "LENGTH MISMATCH: {A} != {B}".format( - A=len(sec_a), B=len(sec_b)) + A=len(sec_a), B=len(sec_b) + ) for (name_a, gos_a), (name_b, gos_b) in zip(sec_a, sec_b): assert name_a == name_b, "NAME MISMATCH: {A} != {B}".format(A=name_a, B=name_b) assert gos_a == gos_b, "{NM} GO IDs MISMATCH: {A} != {B}".format( - NM=name_a, A=gos_a, B=gos_b) + NM=name_a, A=gos_a, B=gos_b + ) + def _read_sections(sec): """Get sections variable from file.""" - if '/' in sec: + if "/" in sec: sec = os.path.join(REPO, sec) var = read_sections(sec) assert var, "EMPTY SECTIONS FILE({})".format(sec) return var + def _wr_sections_txt(fout_txt, usrgos, sections, grprdflt): """Given a list of usrgos and sections, write text file.""" try: - hdrobj = HdrgosSections(grprdflt.gosubdag, grprdflt.hdrgos_dflt, sections=sections) + hdrobj = HdrgosSections( + grprdflt.gosubdag, grprdflt.hdrgos_dflt, sections=sections + ) grprobj = Grouper(fout_txt, usrgos, hdrobj, grprdflt.gosubdag, go2nt=None) full_txt = os.path.join(REPO, fout_txt) - WrSectionsTxt(grprobj).wr_txt_section_hdrgos(full_txt, sortby=None, prt_section=True) + WrSectionsTxt(grprobj).wr_txt_section_hdrgos( + full_txt, sortby=None, prt_section=True + ) assert os.path.exists(full_txt) except RuntimeError as inst: sys.stdout.write("\n **FATAL: {MSG}\n\n".format(MSG=str(inst))) -if __name__ == '__main__': +if __name__ == "__main__": test_wr_sections_all() test_wr_sections_txt() diff --git a/tests/test_write_hier_ns.py b/tests/test_write_hier_ns.py index a2dbf554..3ff02ff7 100755 --- a/tests/test_write_hier_ns.py +++ b/tests/test_write_hier_ns.py @@ -15,67 +15,142 @@ REPO = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") + def test_write_hier_bp_mf_cc(): """Test that write hierarchy writes all: BP, MF, CC""" - fin_anno = os.path.join(REPO, 'gene2go') + fin_anno = os.path.join(REPO, "gene2go") fin_dag = os.path.join(REPO, "go-basic.obo") _dnld_anno(fin_anno) - #godag = get_godag(os.path.join(REPO, 'go-basic.obo'), loading_bar=None) - print('\nTEST STORING ONLY ONE SPECIES') + print("\nTEST STORING ONLY ONE SPECIES") #### obj = Gene2GoReader(fin_anno) godag = get_godag(fin_dag) - gene2gos = read_annotations(namespace='ALL') + gene2gos = read_annotations(namespace="ALL") tcntobj = TermCounts(godag, gene2gos) if gene2gos else None - gosubdag = GoSubDag(godag.keys(), godag, - relationships=False, - tcntobj=tcntobj, - children=True, - prt=sys.stdout) + gosubdag = GoSubDag( + godag.keys(), + godag, + relationships=False, + tcntobj=tcntobj, + children=True, + prt=sys.stdout, + ) - print('Test using no_dup True: concise printing with no GO branches repeated') + print("Test using no_dup True: concise printing with no GO branches repeated") objwr = WrHierGO(gosubdag, no_dup=True) - assert len(_wr_hier('nodup1', ['BP', 'MF', 'CC'], gosubdag.go2nt, objwr)) > 33000 - assert len(_wr_hier('nodup1', ['BP',], gosubdag.go2nt, objwr)) > 25000 - assert len(_wr_hier('nodup1', ['MF',], gosubdag.go2nt, objwr)) > 10000 - assert len(_wr_hier('nodup1', ['CC',], gosubdag.go2nt, objwr)) > 4000 + assert len(_wr_hier("nodup1", ["BP", "MF", "CC"], gosubdag.go2nt, objwr)) > 33000 + assert ( + len( + _wr_hier( + "nodup1", + [ + "BP", + ], + gosubdag.go2nt, + objwr, + ) + ) + > 25000 + ) + assert ( + len( + _wr_hier( + "nodup1", + [ + "MF", + ], + gosubdag.go2nt, + objwr, + ) + ) + > 10000 + ) + assert ( + len( + _wr_hier( + "nodup1", + [ + "CC", + ], + gosubdag.go2nt, + objwr, + ) + ) + > 4000 + ) - print('Test using no_dup False: verbose printing with duplicate copies branches') + print("Test using no_dup False: verbose printing with duplicate copies branches") objwr = WrHierGO(gosubdag) # 2020 11: # 594,748 GO lines under GO:0008150 # 23,199 GO lines under GO:0003674 # 6,259 GO lines under GO:0005575 # 624,206 items WROTE: tmp_test_wr_hier_BP_MF_CC.txt - assert len(_wr_hier('nodup0', ['BP', 'MF', 'CC'], gosubdag.go2nt, objwr)) > 580000 - assert len(_wr_hier('nodup0', ['BP',], gosubdag.go2nt, objwr)) > 500000 - assert len(_wr_hier('nodup0', ['MF',], gosubdag.go2nt, objwr)) > 20000 - assert len(_wr_hier('nodup0', ['CC',], gosubdag.go2nt, objwr)) > 5000 + assert len(_wr_hier("nodup0", ["BP", "MF", "CC"], gosubdag.go2nt, objwr)) > 580000 + assert ( + len( + _wr_hier( + "nodup0", + [ + "BP", + ], + gosubdag.go2nt, + objwr, + ) + ) + > 500000 + ) + assert ( + len( + _wr_hier( + "nodup0", + [ + "MF", + ], + gosubdag.go2nt, + objwr, + ) + ) + > 20000 + ) + assert ( + len( + _wr_hier( + "nodup0", + [ + "CC", + ], + gosubdag.go2nt, + objwr, + ) + ) + > 5000 + ) def _wr_hier(desc, nss, go2nt, objwr): """Write hierarchy""" goids = WrHierCli.init_goids(nss, None, go2nt) - fout_rpt = 'tmp_test_wr_hier_{DESC}_{NSs}.txt'.format( - DESC=desc, NSs='_'.join(nss)) + fout_rpt = "tmp_test_wr_hier_{DESC}_{NSs}.txt".format(DESC=desc, NSs="_".join(nss)) items_all = [] - with open(fout_rpt, 'w') as prt: + with open(fout_rpt, "w", encoding="utf-8") as prt: for goid in goids: items_cur = objwr.prt_hier_down(goid, prt) items_all.extend(items_cur) - print('{N:7,} GO lines under {GO}'.format(N=len(items_cur), GO=goid)) - print('{N:7,} items WROTE: {TXT}'.format(N=len(items_all), TXT=fout_rpt)) + print("{N:7,} GO lines under {GO}".format(N=len(items_cur), GO=goid)) + print("{N:7,} items WROTE: {TXT}".format(N=len(items_all), TXT=fout_rpt)) return items_all + def _dnld_anno(file_anno): """Download the annotation file, if needed.""" if os.path.exists(file_anno): assert os.path.getsize(file_anno) > 1000000, "BAD ANNO({F})".format(F=file_anno) return - dnld_ncbi_gene_file(file_anno, loading_bar=None) + dnld_ncbi_gene_file(file_anno) assert os.path.isfile(file_anno), "MISSING ANNO({F})".format(F=file_anno) assert os.path.getsize(file_anno) > 1000000, "BAD ANNO({F})".format(F=file_anno) -if __name__ == '__main__': +if __name__ == "__main__": test_write_hier_bp_mf_cc() diff --git a/tests/test_write_summary_cnts.py b/tests/test_write_summary_cnts.py index 00c1bf45..97ce4aa4 100755 --- a/tests/test_write_summary_cnts.py +++ b/tests/test_write_summary_cnts.py @@ -6,16 +6,18 @@ import os import sys + from collections import defaultdict +from goatools.associations import get_assoc_ncbi_taxids from goatools.base import get_godag from goatools.rpt.rpt_lev_depth import RptLevDepth -from goatools.associations import get_assoc_ncbi_taxids + def test_write_summary_cnts(log=sys.stdout): """Print level/depth summaries for various sets of GO terms.""" fin_obo = os.path.join(os.getcwd(), "go-basic.obo") - godag = get_godag(fin_obo, loading_bar=None) + godag = get_godag(fin_obo) rptobj = RptLevDepth(godag, log) # Report level/depth summary for all GOs in a dag log.write("\nSummary for all Ontologies:\n") @@ -25,11 +27,11 @@ def test_write_summary_cnts(log=sys.stdout): # (optional) multi-level dictionary separate associations by taxid taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set))) # Get associations for human fly and mouse - get_assoc_ncbi_taxids(taxids, taxid2asscs=taxid2asscs, loading_bar=None) - assert taxid2asscs, 'taxid2asscs EMPTY' + get_assoc_ncbi_taxids(taxids, taxid2asscs=taxid2asscs) + assert taxid2asscs, "taxid2asscs EMPTY" for taxid, assc in taxid2asscs.items(): log.write("\nSummary for Ontologies for taxid({T}):\n".format(T=taxid)) - go_ids = assc['GO2IDs'].keys() + go_ids = assc["GO2IDs"].keys() rptobj.write_summary_cnts(go_ids) log.write("\nSummary for Ontologies for taxid({T}):\n".format(T=taxid)) go_objs = [godag.get(goid) for goid in go_ids] @@ -37,7 +39,8 @@ def test_write_summary_cnts(log=sys.stdout): # Print GO depth count table for full GO DAG in LaTeX format rptobj.prttex_summary_cnts_all(prt=log) -if __name__ == '__main__': + +if __name__ == "__main__": test_write_summary_cnts() # Copyright (C) 2015-2019, DV Klopfenstein, H Tang, All rights reserved. diff --git a/tests/utils.py b/tests/utils.py index ee246aaa..d9667101 100755 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,40 +1,47 @@ #!/usr/bin/env python3 """Small test utilities used by multiple tests""" -__copyright__ = "Copyright (C) 2019-present, DV Klopfenstein, H Tang. All rights reserved." +__copyright__ = ( + "Copyright (C) 2019-present, DV Klopfenstein, H Tang. All rights reserved." +) __author__ = "DV Klopfenstein" +import sys +import timeit + from os.path import join from os.path import dirname from os.path import abspath -import sys -import timeit + from datetime import timedelta from goatools.base import get_godag as base_get_godag from goatools.associations import dnld_annotation from goatools.anno.factory import get_objanno as get_objanno_factory -# from goatools.gosubdag.gosubdag import GoSubDag + from goatools.semantic import TermCounts -# from goatools_alpha.geneprodsim.semanticcalcs import SemanticCalcs DIR_TEST = dirname(abspath(__file__)) REPO = abspath(join(DIR_TEST, "..")) + def prt_hms(tic, msg, prt=sys.stdout): """Print elapsed time and return current time""" toc = timeit.default_timer() - prt.write('{HMS} {MSG}\n'.format(HMS=str(timedelta(seconds=toc-tic)), MSG=msg)) + prt.write("{HMS} {MSG}\n".format(HMS=str(timedelta(seconds=toc - tic)), MSG=msg)) return toc + def repofn(fin): """Get a full filename, given a local file name from repo dir root""" return join(REPO, fin) + def get_godag(fin_godag, **kws): """Get GODAG containing only primary GO IDs (no alternate GO IDs)""" - godag = base_get_godag(join(REPO, fin_godag), loading_bar=False, **kws) - return {o.item_id:o for o in godag.values()} + godag = base_get_godag(join(REPO, fin_godag), **kws) + return {o.item_id: o for o in godag.values()} + def get_anno_fullname(fin_anno): """Get annotation filename""" @@ -42,12 +49,14 @@ def get_anno_fullname(fin_anno): dnld_annotation(fin_full) return fin_full -def get_objanno(fin_anno, godag, namespace='all'): + +def get_objanno(fin_anno, godag, namespace="all"): """Get annotation object""" fin_full = get_anno_fullname(fin_anno) return get_objanno_factory(fin_full, godag=godag, namespace=namespace) -def get_termcounts(fin_anno, godag, namespace='all', **kws): + +def get_termcounts(fin_anno, godag, namespace="all", **kws): """Get termcounts object""" objanno = get_objanno(fin_anno, godag, namespace) id2gos = objanno.get_id2gos(namespace=namespace, **kws)