From aecd43de10338f1914094f140efe74dc878a2386 Mon Sep 17 00:00:00 2001 From: David Johnson Date: Mon, 19 Jun 2017 20:23:49 +0100 Subject: [PATCH 01/18] Start reimplementing getMeasurementAndTech() from Java code, towards #218 --- isatools/magetab.py | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/isatools/magetab.py b/isatools/magetab.py index 56665f19..135b7676 100644 --- a/isatools/magetab.py +++ b/isatools/magetab.py @@ -8,6 +8,7 @@ import pandas as pd from io import StringIO from itertools import zip_longest +import re logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO) @@ -1153,20 +1154,31 @@ def get_single(values): # Comments in IDF - comment_keys = [x for x in squashed_table_dict.keys() if x.startswith("comment")] + comments_dict = dict(map(lambda x: (x[0][8:-1], get_single(x[1])), [x for x in squashed_table_dict.items() + if x[0].startswith("comment")])) - for key in comment_keys: - c = Comment(name=key[8:-1], value=get_single(squashed_table_dict[key])) - if c.name == "ArrayExpressAccession": - S.identifier = c.value # ArrayExpress adds this comment, so use it as the study ID if it's available + for key in comments_dict.keys(): + c = Comment(name=key, value=comments_dict[key]) S.comments.append(c) + if "ArrayExpressAccession" in comments_dict.keys(): + S.identifier = comments_dict["ArrayExpressAccession"] # ArrayExpress adds this, so use it as the study ID + + + design_type = None + + if "AEExperimentType" in comments_dict.keys(): + design_type = comments_dict["AEExperimentType"] + protocol_types = [x.protocol_type for x in S.protocols] hyb_prots_used = {"nucleic acid hybridization", "hybridization"}.intersection({squashstr(x.term) for x in protocol_types}) if sdrf_file is not None: S.filename = "s_{}".format(sdrf_file) a_filename = "a_{}".format(sdrf_file) + + + ttoa = None if technology_type is not None: ttoa = OntologyAnnotation(term=technology_type) @@ -1183,4 +1195,23 @@ def get_single(values): ISA.identifier = S.identifier ISA.title = S.title ISA.studies = [S] - return ISA \ No newline at end of file + return ISA + + +def get_measurement_and_type(design_type): + + if re.match("(?i).*ChIP-Chip.*", design_type): + return "protein-DNA binding site identification", "DNA microarray", "ChIP-Chip" + if re.match("(?i).*RNA-seq.*", design_type) or re.match("(?i).*RNA-Seq.*", design_type) or re.match( + "(?i).*transcription profiling by high throughput sequencing.*", design_type): + return "transcription profiling", "nucleotide sequencing", "RNA-Seq" + if re.match(".*transcription profiling by array.*", design_type) or re.match("dye_swap_design", design_type): + return "transcription profiling", "DNA microarray", "GeneChip" + if re.match("(?i).*methylation profiling by array.*", design_type): + return "DNA methylation profiling", "DNA microarray", "Me-Chip" + if re.match("(?i).*comparative genomic hybridization by array.*", design_type): + return "comparative genomic hybridization", "DNA microarray", "CGH-Chip" + if re.match(".*genotyping by array.*", design_type): + return "SNP analysis", "DNA microarray", "SNPChip" + if re.match("(?i).*ChIP-Seq.*", design_type) or re.match("(?i).*chip-seq.*", design_type): + return "protein-DNA binding site identification", "nucleotide sequencing", "ChIP-Seq" \ No newline at end of file From 830d4313f4fc85c2b46319e3bd86275dd2c89cf3 Mon Sep 17 00:00:00 2001 From: David Johnson Date: Tue, 20 Jun 2017 16:14:39 +0100 Subject: [PATCH 02/18] Fixes for #218 --- isatools/convert/magetab2isatab.py | 27 ++++------ isatools/magetab.py | 80 +++++++++++++++++++++++------- 2 files changed, 71 insertions(+), 36 deletions(-) diff --git a/isatools/convert/magetab2isatab.py b/isatools/convert/magetab2isatab.py index 85bedd98..41c68d3f 100644 --- a/isatools/convert/magetab2isatab.py +++ b/isatools/convert/magetab2isatab.py @@ -22,7 +22,7 @@ def convert(source_idf_fp, output_path, technology_type, measurement_type): for _, row in df.iterrows(): sdrf_file = row["SDRF File"] if isinstance(sdrf_file, str): - study_df, assay_df = split_tables(sdrf_path=os.path.join(os.path.dirname(source_idf_fp.name), sdrf_file)) + study_df, assay_df = magetab.split_tables(sdrf_path=os.path.join(os.path.dirname(source_idf_fp.name), sdrf_file)) study_df.columns = study_df.isatab_header assay_df.columns = assay_df.isatab_header # write out ISA table files @@ -45,19 +45,12 @@ def get_investigation_title(line, ISA): ISA.title = value -def split_tables(sdrf_path): - sdrf_df = isatab.read_tfile(sdrf_path) - sdrf_df_isatab_header = sdrf_df.isatab_header - if "Sample Name" in sdrf_df.columns: - sample_name_index = list(sdrf_df.columns).index("Sample Name") - elif "Extract Name" in sdrf_df.columns: - sample_name_index = list(sdrf_df.columns).index("Extract Name") - elif "Labeled Extract Name" in sdrf_df.columns: - sample_name_index = list(sdrf_df.columns).index("Labeled Extract Name") - else: - raise magetab.MageTabParserException("Could not split SDRF table as could not find suitable column to split on") - study_df = sdrf_df[sdrf_df.columns[0:sample_name_index+1]].drop_duplicates() - study_df.isatab_header = sdrf_df_isatab_header[0:sample_name_index+1] - assay_df = sdrf_df[sdrf_df.columns[sample_name_index:]] - assay_df.isatab_header = sdrf_df_isatab_header[sample_name_index:] - return study_df, assay_df \ No newline at end of file +def get_first_node_index(header): + sqaushed_header = map(lambda x: magetab.squashstr(x), header) + nodes = ["samplename", "extractname", "labeledextractname", "hybridizationname", "assayname"] + for node in nodes: + try: + index = sqaushed_header.index(node) + return index + except ValueError: + pass diff --git a/isatools/magetab.py b/isatools/magetab.py index 135b7676..94295888 100644 --- a/isatools/magetab.py +++ b/isatools/magetab.py @@ -486,15 +486,45 @@ def export_to_isatab(FP, output_dir): assay_df.to_csv(assay_fp, sep='\t', index=False, header=assay_df.isatab_header) +def get_first_node_index(header): + squashed_header = list(map(lambda x: squashstr(x), header)) + nodes = ["samplename", "extractname", "labeledextractname", "hybridizationname", "assayname"] + for node in nodes: + try: + index = squashed_header.index(node) + return index + except ValueError: + pass + + def split_tables(sdrf_path): + + def split_on_sample(sdrf_df): + sdrf_df_isatab_header = sdrf_df.isatab_header + sdrf_df_cols = list(sdrf_df.columns) + sample_name_index = sdrf_df_cols.index("Sample Name") + study_df = sdrf_df[sdrf_df.columns[0:sample_name_index + 1]].drop_duplicates() + study_df.isatab_header = sdrf_df_isatab_header[0:sample_name_index + 1] + assay_df = sdrf_df[sdrf_df.columns[sample_name_index:]] + assay_df.isatab_header = sdrf_df_isatab_header[sample_name_index:] + return study_df, assay_df + sdrf_df = isatab.read_tfile(sdrf_path) - sdrf_df_isatab_header = sdrf_df.isatab_header - sample_name_index = list(sdrf_df.columns).index("Sample Name") - study_df = sdrf_df[sdrf_df.columns[0:sample_name_index+1]].drop_duplicates() - study_df.isatab_header = sdrf_df_isatab_header[0:sample_name_index+1] - assay_df = sdrf_df[sdrf_df.columns[sample_name_index:]] - assay_df.isatab_header = sdrf_df_isatab_header[sample_name_index:] - return study_df, assay_df + + if "Sample Name" in sdrf_df.columns: + return split_on_sample(sdrf_df) + else: # insert Sample Name + sdrf_df_columns = list(sdrf_df.columns) + sdrf_df["Sample Name"] = sdrf_df[sdrf_df_columns[get_first_node_index(sdrf_df_columns)]] + sdrf_df_isatab_header = sdrf_df.isatab_header + sdrf_df_isatab_header.insert(get_first_node_index(sdrf_df_columns), "Sample Name") + + sdrf_df_columns.insert(get_first_node_index(sdrf_df_columns), "Sample Name") + + sdrf_df = sdrf_df[sdrf_df_columns] + sdrf_df.isatab_header = sdrf_df_isatab_header + + return split_on_sample(sdrf_df) idf_map = { @@ -731,7 +761,7 @@ def get_squashed(key): # for MAGE-TAB spec 2.1.7, deal with variants on labels return squashstr(key) -def parse_idf(file_path, technology_type=None, measurement_type=None): +def parse_idf(file_path, technology_type=None, measurement_type=None, technology_platform=None): def get_single(values): stripped_values = [x for x in values if x != ''] @@ -1164,32 +1194,44 @@ def get_single(values): if "ArrayExpressAccession" in comments_dict.keys(): S.identifier = comments_dict["ArrayExpressAccession"] # ArrayExpress adds this, so use it as the study ID - design_type = None if "AEExperimentType" in comments_dict.keys(): design_type = comments_dict["AEExperimentType"] - protocol_types = [x.protocol_type for x in S.protocols] - hyb_prots_used = {"nucleic acid hybridization", - "hybridization"}.intersection({squashstr(x.term) for x in protocol_types}) + inferred_t_type = None + inferred_m_type = None + inferred_t_plat = None + if design_type is not None: + inferred_t_type, inferred_m_type, inferred_t_plat = get_measurement_and_type(design_type=design_type) + if sdrf_file is not None: S.filename = "s_{}".format(sdrf_file) a_filename = "a_{}".format(sdrf_file) - - ttoa = None if technology_type is not None: ttoa = OntologyAnnotation(term=technology_type) - elif technology_type is None and len(hyb_prots_used) > 0: - print("Detected probable DNA microarray technology type") - ttoa = OntologyAnnotation(term="DNA microarray") + elif technology_type is None and inferred_t_type is not None: + print("Detected probable '{}' technology type".format(inferred_t_type)) + ttoa = OntologyAnnotation(term=inferred_t_type) + mtoa = None if measurement_type is not None: mtoa = OntologyAnnotation(term=measurement_type) + elif measurement_type is None and inferred_m_type is not None: + print("Detected probable '{}' measurement type".format(inferred_m_type)) + mtoa = OntologyAnnotation(term=inferred_m_type) + + tp = '' + if technology_platform is not None: + tp = technology_platform + elif technology_platform is None and inferred_t_plat is not None: + print("Detected probable '{}' technology platform".format(inferred_t_plat)) + tp = inferred_t_plat + S.assays = [ - Assay(filename=a_filename, technology_type=ttoa, measurement_type=mtoa) + Assay(filename=a_filename, technology_type=ttoa, measurement_type=mtoa, technology_platform=tp) ] ISA.identifier = S.identifier @@ -1214,4 +1256,4 @@ def get_measurement_and_type(design_type): if re.match(".*genotyping by array.*", design_type): return "SNP analysis", "DNA microarray", "SNPChip" if re.match("(?i).*ChIP-Seq.*", design_type) or re.match("(?i).*chip-seq.*", design_type): - return "protein-DNA binding site identification", "nucleotide sequencing", "ChIP-Seq" \ No newline at end of file + return "protein-DNA binding site identification", "nucleotide sequencing", "ChIP-Seq" From 6403fb54d68237a5e2e6c0948b859aeab23bf50d Mon Sep 17 00:00:00 2001 From: David Johnson Date: Thu, 22 Jun 2017 12:45:39 +0100 Subject: [PATCH 03/18] Implement get experiment MAGE TABs via FTP #220 --- isatools/io/ax.py | 58 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 isatools/io/ax.py diff --git a/isatools/io/ax.py b/isatools/io/ax.py new file mode 100644 index 00000000..950588db --- /dev/null +++ b/isatools/io/ax.py @@ -0,0 +1,58 @@ +import ftplib +import logging +import os +import tempfile + +EBI_FTP_SERVER = 'ftp.ebi.ac.uk' +AX_EXPERIMENT_BASE_DIR = '/pub/databases/arrayexpress/data/experiment/' + +logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO) +logger = logging.getLogger(__name__) + + +def get(arrayexpress_id, target_dir=None): + """ + This function downloads ISA content from the ArrayExpress FTP site. + + :param ax_experiment_id: Study identifier for ArrayExpress study to get, as a str (e.g. E-GEOD-59671) + :param target_dir: Path to write files to. If None, writes to temporary directory (generated on the fly) + :return: Path where the files were written to + + Example usage: + AX.get_study('E-GEOD-59671', '/tmp/ax') + """ + + idbits = arrayexpress_id.split('-') + exp_type = idbits[1] + + logging.info("Setting up ftp with {}".format(EBI_FTP_SERVER)) + ftp = ftplib.FTP(EBI_FTP_SERVER) + logging.info("Logging in as anonymous user...") + response = ftp.login() + if '230' in response: # 230 means Login successful + logging.info("Log in successful!") + try: + logging.info("Looking for experiment '{}'".format(arrayexpress_id)) + ftp.cwd('{base_dir}/{exp_type}/{arrayexpress_id}'.format(base_dir=AX_EXPERIMENT_BASE_DIR, exp_type=exp_type, + arrayexpress_id=arrayexpress_id)) + if target_dir is None: + target_dir = tempfile.mkdtemp() + logging.info("Using directory '{}'".format(target_dir)) + idf_filename = "{}.idf.txt".format(arrayexpress_id) + with open(os.path.join(target_dir, idf_filename), 'wb') as out_file: + logging.info("Retrieving file '{}'".format(EBI_FTP_SERVER + AX_EXPERIMENT_BASE_DIR + '/' + exp_type + '/' + + arrayexpress_id + '/' + idf_filename)) + ftp.retrbinary('RETR ' + idf_filename, out_file.write) + sdrf_filename = "{}.sdrf.txt".format(arrayexpress_id) + with open(os.path.join(target_dir, sdrf_filename), 'wb') as out_file: + logging.info("Retrieving file '{}'".format( + EBI_FTP_SERVER + AX_EXPERIMENT_BASE_DIR + '/' + exp_type + '/' + arrayexpress_id + '/' + + sdrf_filename)) + ftp.retrbinary('RETR ' + sdrf_filename, out_file.write) + except ftplib.error_perm as ftperr: + logger.fatal("Could not retrieve ArrayExpress study '{study}': {error}".format(study=arrayexpress_id, + error=ftperr)) + finally: + return target_dir + else: + raise ConnectionError("There was a problem connecting to ArrayExpress: " + response) From b3f7e5032672261ae90d503da309d026e933a128 Mon Sep 17 00:00:00 2001 From: David Johnson Date: Thu, 22 Jun 2017 12:46:05 +0100 Subject: [PATCH 04/18] Update some naming in mtbls io package --- isatools/io/mtbls.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/isatools/io/mtbls.py b/isatools/io/mtbls.py index b191d0d7..d51ca2cc 100644 --- a/isatools/io/mtbls.py +++ b/isatools/io/mtbls.py @@ -7,11 +7,10 @@ import glob from isatools.convert import isatab2json from isatools import isatab -from isatools.model.v1 import OntologyAnnotation, Process, ParameterValue -import networkx as nx +from isatools.model.v1 import OntologyAnnotation import pandas as pd -MTBLS_FTP_SERVER = 'ftp.ebi.ac.uk' +EBI_FTP_SERVER = 'ftp.ebi.ac.uk' MTBLS_BASE_DIR = '/pub/databases/metabolights/studies/public' INVESTIGATION_FILENAME = 'i_Investigation.txt' @@ -33,8 +32,8 @@ def get(mtbls_study_id, target_dir=None): Example usage: isa_json = MTBLS.get_study('MTBLS1', '/tmp/mtbls') """ - logging.info("Setting up ftp with {}".format(MTBLS_FTP_SERVER)) - ftp = ftplib.FTP(MTBLS_FTP_SERVER) + logging.info("Setting up ftp with {}".format(EBI_FTP_SERVER)) + ftp = ftplib.FTP(EBI_FTP_SERVER) logging.info("Logging in as anonymous user...") response = ftp.login() if '230' in response: # 230 means Login successful @@ -46,7 +45,7 @@ def get(mtbls_study_id, target_dir=None): target_dir = tempfile.mkdtemp() logging.info("Using directory '{}'".format(target_dir)) out_file = open(os.path.join(target_dir, INVESTIGATION_FILENAME), 'wb') - logging.info("Retrieving file '{}'".format(MTBLS_FTP_SERVER + MTBLS_BASE_DIR + '/' + mtbls_study_id + '/' + INVESTIGATION_FILENAME)) + logging.info("Retrieving file '{}'".format(EBI_FTP_SERVER + MTBLS_BASE_DIR + '/' + mtbls_study_id + '/' + INVESTIGATION_FILENAME)) ftp.retrbinary('RETR ' + INVESTIGATION_FILENAME, out_file.write) with open(out_file.name, encoding='utf-8') as i_fp: i_bytes = i_fp.read() @@ -55,14 +54,14 @@ def get(mtbls_study_id, target_dir=None): for s_filename in s_filenames: out_file = open(os.path.join(target_dir, s_filename), 'wb') logging.info("Retrieving file '{}'".format( - MTBLS_FTP_SERVER + MTBLS_BASE_DIR + '/' + mtbls_study_id + '/' + s_filename)) + EBI_FTP_SERVER + MTBLS_BASE_DIR + '/' + mtbls_study_id + '/' + s_filename)) ftp.retrbinary('RETR ' + s_filename, out_file.write) a_filenames_lines = [l.split('\t') for l in lines if 'Study Assay File Name' in l] for a_filename_line in a_filenames_lines: for a_filename in [f[1:-1] for f in a_filename_line[1:]]: out_file = open(os.path.join(target_dir, a_filename), 'wb') logging.info("Retrieving file '{}'".format( - MTBLS_FTP_SERVER + MTBLS_BASE_DIR + '/' + mtbls_study_id + '/' + a_filename)) + EBI_FTP_SERVER + MTBLS_BASE_DIR + '/' + mtbls_study_id + '/' + a_filename)) ftp.retrbinary('RETR ' + a_filename, out_file.write) except ftplib.error_perm as ftperr: logger.fatal("Could not retrieve MetaboLights study '{study}': {error}".format(study=mtbls_study_id, error=ftperr)) From 4b8bb35c6a7eae0e85850f054ff7386c4ab304e8 Mon Sep 17 00:00:00 2001 From: David Johnson Date: Thu, 22 Jun 2017 14:17:39 +0100 Subject: [PATCH 05/18] Add function to grab MAGE-TAB and use magetab2isatab to convert to ISA-Tab #220 --- isatools/io/ax.py | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/isatools/io/ax.py b/isatools/io/ax.py index 950588db..bb448761 100644 --- a/isatools/io/ax.py +++ b/isatools/io/ax.py @@ -2,6 +2,8 @@ import logging import os import tempfile +import shutil +from isatools.convert import magetab2isatab EBI_FTP_SERVER = 'ftp.ebi.ac.uk' AX_EXPERIMENT_BASE_DIR = '/pub/databases/arrayexpress/data/experiment/' @@ -12,14 +14,15 @@ def get(arrayexpress_id, target_dir=None): """ - This function downloads ISA content from the ArrayExpress FTP site. + This function downloads MAGE-TAB content from the ArrayExpress FTP site. - :param ax_experiment_id: Study identifier for ArrayExpress study to get, as a str (e.g. E-GEOD-59671) - :param target_dir: Path to write files to. If None, writes to temporary directory (generated on the fly) + :param ax_experiment_id: Experiment identifier for ArrayExpress study to get, as a str (e.g. E-GEOD-59671) + :param target_dir: Path to write MAGE-TAB files to. If None, writes to temporary directory (generated on the fly) :return: Path where the files were written to Example usage: - AX.get_study('E-GEOD-59671', '/tmp/ax') + from isatools.io import ax as AX + AX.get('E-GEOD-59671', '/tmp/ax') """ idbits = arrayexpress_id.split('-') @@ -56,3 +59,30 @@ def get(arrayexpress_id, target_dir=None): return target_dir else: raise ConnectionError("There was a problem connecting to ArrayExpress: " + response) + + +def get_isatab(arrayexpress_id, target_dir=None): + """ + This function downloads MAGE-TAB content as ISA-Tab from the ArrayExpress FTP site. + + :param ax_experiment_id: Experiment identifier for ArrayExpress study to get, as a str (e.g. E-GEOD-59671) + :param target_dir: Path to write ISA-Tab files to. If None, writes to temporary directory (generated on the fly) + :return: Path where the files were written to + + Example usage: + from isatools.io import ax as AX + AX.get_isatab('E-GEOD-59671', '/tmp/ax') + """ + tmp_dir = tempfile.mkdtemp() + try: + get(arrayexpress_id=arrayexpress_id, target_dir=tmp_dir) + if target_dir is None: + target_dir = tempfile.mkdtemp() + logging.info("Using directory '{}'".format(target_dir)) + with open(os.path.join(tmp_dir, "{}.idf.txt".format(arrayexpress_id))) as idf_fp: + magetab2isatab.convert(source_idf_fp=idf_fp, output_path=target_dir) + except Exception as e: + logger.fatal("Something went wrong: {}".format(e)) + finally: + shutil.rmtree(tmp_dir) + return target_dir From bb50680e1d5de3ef67273dbb1d8f3c67d8a2d195 Mon Sep 17 00:00:00 2001 From: David Johnson Date: Thu, 22 Jun 2017 14:24:04 +0100 Subject: [PATCH 06/18] Implement get MAGE as JSON from ArrayExpress #220 --- isatools/io/ax.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/isatools/io/ax.py b/isatools/io/ax.py index bb448761..4ff51815 100644 --- a/isatools/io/ax.py +++ b/isatools/io/ax.py @@ -3,7 +3,7 @@ import os import tempfile import shutil -from isatools.convert import magetab2isatab +from isatools.convert import magetab2isatab, magetab2json EBI_FTP_SERVER = 'ftp.ebi.ac.uk' AX_EXPERIMENT_BASE_DIR = '/pub/databases/arrayexpress/data/experiment/' @@ -86,3 +86,27 @@ def get_isatab(arrayexpress_id, target_dir=None): finally: shutil.rmtree(tmp_dir) return target_dir + + +def getj(arrayexpress_id): + """ + This function downloads MAGE-TAB content as ISA-JSON from the ArrayExpress FTP site. + + :param ax_experiment_id: Experiment identifier for ArrayExpress study to get, as a str (e.g. E-GEOD-59671) + :return: ISA-JSON representation of the MAGE-TAB content + + Example usage: + from isatools.io import ax as AX + my_json = AX.getj('E-GEOD-59671') + """ + tmp_dir = tempfile.mkdtemp() + mage_json = None + try: + get(arrayexpress_id=arrayexpress_id, target_dir=tmp_dir) + with open(os.path.join(tmp_dir, "{}.idf.txt".format(arrayexpress_id))) as idf_fp: + mage_json = magetab2json.convert(source_idf_fp=idf_fp) + except Exception as e: + logger.fatal("Something went wrong: {}".format(e)) + finally: + shutil.rmtree(tmp_dir) + return mage_json From 8b4f6f5eccbfe8b135f4e3db62633956dcfffe10 Mon Sep 17 00:00:00 2001 From: David Johnson Date: Thu, 22 Jun 2017 14:32:57 +0100 Subject: [PATCH 07/18] Fixes for ax.getj() #220; updates to tests --- isatools/convert/magetab2isatab.py | 6 +++--- isatools/convert/magetab2json.py | 16 ++++++++++------ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/isatools/convert/magetab2isatab.py b/isatools/convert/magetab2isatab.py index 41c68d3f..b5fcacfb 100644 --- a/isatools/convert/magetab2isatab.py +++ b/isatools/convert/magetab2isatab.py @@ -8,10 +8,10 @@ logger = logging.getLogger(__name__) -def convert(source_idf_fp, output_path, technology_type, measurement_type): +def convert(source_idf_fp, output_path, technology_type=None, measurement_type=None): """ Converter for MAGE-TAB to ISA-Tab :param source_idf_fp: File descriptor of input IDF file - :param output_dir: Path to directory to write output ISA-Tab files to + :param output_path: Path to directory to write output ISA-Tab files to """ df = pd.read_csv(source_idf_fp, names=range(0, 128), sep='\t', engine='python', encoding='utf-8', comment='#').dropna(axis=1, how='all') df = df.T # transpose @@ -46,7 +46,7 @@ def get_investigation_title(line, ISA): def get_first_node_index(header): - sqaushed_header = map(lambda x: magetab.squashstr(x), header) + sqaushed_header = list(map(lambda x: magetab.squashstr(x), header)) nodes = ["samplename", "extractname", "labeledextractname", "hybridizationname", "assayname"] for node in nodes: try: diff --git a/isatools/convert/magetab2json.py b/isatools/convert/magetab2json.py index 7f14342e..8386b255 100644 --- a/isatools/convert/magetab2json.py +++ b/isatools/convert/magetab2json.py @@ -7,11 +7,15 @@ import shutil -def convert(source_idf_fp, technology_type, measurement_type): +def convert(source_idf_fp, technology_type=None, measurement_type=None): tmp = tempfile.mkdtemp() - magetab2isatab.convert(source_idf_fp=source_idf_fp, output_path=tmp, technology_type=technology_type, - measurement_type=measurement_type) - with open(os.path.join(tmp, "i_investigation.txt")) as isa_inv_fp: - ISA = isatab.load(isa_inv_fp) + ISA = None + try: + magetab2isatab.convert(source_idf_fp=source_idf_fp, output_path=tmp, technology_type=technology_type, + measurement_type=measurement_type) + with open(os.path.join(tmp, "i_investigation.txt")) as isa_inv_fp: + ISA = isatab.load(isa_inv_fp) + finally: shutil.rmtree(tmp) - return json.loads(json.dumps(ISA, cls=ISAJSONEncoder)) + if ISA is not None: + return json.loads(json.dumps(ISA, cls=ISAJSONEncoder)) From 5b9e71c71992cc58503a1a1f83caded6007a829f Mon Sep 17 00:00:00 2001 From: David Johnson Date: Thu, 22 Jun 2017 15:19:48 +0100 Subject: [PATCH 08/18] Add logging if can't find config --- isatools/isatab.py | 77 ++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 34 deletions(-) diff --git a/isatools/isatab.py b/isatools/isatab.py index 78069ea6..cb0b04d1 100644 --- a/isatools/isatab.py +++ b/isatools/isatab.py @@ -2527,40 +2527,49 @@ def validate(fp, config_dir=default_config_dir, log_level=logging.INFO): technology_type = assay_df['Study Assay Technology Type'].tolist()[x] if assay_filename is not '': try: - logger.info("Loading... {}".format(assay_filename)) - with open(os.path.join(os.path.dirname(fp.name), assay_filename), encoding='utf-8') as a_fp: - assay_table = load_table(a_fp) - assay_table.filename = assay_filename - assay_tables.append(assay_table) - config = configs[(measurement_type, technology_type)] - logger.info( - "Validating {} against assay table configuration ({}, {})...".format( - assay_filename, measurement_type, technology_type)) - logger.info("Checking Factor Value presence...") - check_factor_value_presence(assay_table) # Rule 4007 - logger.info("Checking required fields...") - check_required_fields(assay_table, config) # Rule 4003-8, 4010 - logger.info("Checking generic fields...") - if not check_field_values(assay_table, config): # Rule 4011 - logger.warn( - "(W) There are some field value inconsistencies in {} against {} configuration".format( - assay_table.filename, (measurement_type, technology_type))) - logger.info("Checking unit fields...") - if not check_unit_field(assay_table, config): - logger.warn( - "(W) There are some unit value inconsistencies in {} against {} configuration".format( - assay_table.filename, (measurement_type, technology_type))) - logger.info("Checking protocol fields...") - if not check_protocol_fields(assay_table, config, protocol_names_and_types): # Rule 4009 - logger.warn("(W) There are some protocol inconsistencies in {} against {} " - "configuration".format(assay_table.filename, (measurement_type, technology_type))) - logger.info("Checking ontology fields...") - if not check_ontology_fields(assay_table, config): # Rule 3010 - logger.warn("(W) There are some ontology annotation inconsistencies in {} against {} " - "configuration".format(assay_table.filename, (measurement_type, technology_type))) - logger.info("Finished validation on {}".format(assay_filename)) - except FileNotFoundError: - pass + config = configs[(measurement_type, technology_type)] + except KeyError: + logger.error("Could not load config matching ({}, {})".format(measurement_type, technology_type)) + logger.error("Only have configs matching:") + for k in configs.keys(): + logger.error(k) + if config is None: + logger.warn("Skipping configuration validation as could not load config...") + else: + try: + logger.info("Loading... {}".format(assay_filename)) + with open(os.path.join(os.path.dirname(fp.name), assay_filename), encoding='utf-8') as a_fp: + assay_table = load_table(a_fp) + assay_table.filename = assay_filename + assay_tables.append(assay_table) + logger.info( + "Validating {} against assay table configuration ({}, {})...".format( + assay_filename, measurement_type, technology_type)) + logger.info("Checking Factor Value presence...") + check_factor_value_presence(assay_table) # Rule 4007 + logger.info("Checking required fields...") + check_required_fields(assay_table, config) # Rule 4003-8, 4010 + logger.info("Checking generic fields...") + if not check_field_values(assay_table, config): # Rule 4011 + logger.warn( + "(W) There are some field value inconsistencies in {} against {} configuration".format( + assay_table.filename, (measurement_type, technology_type))) + logger.info("Checking unit fields...") + if not check_unit_field(assay_table, config): + logger.warn( + "(W) There are some unit value inconsistencies in {} against {} configuration".format( + assay_table.filename, (measurement_type, technology_type))) + logger.info("Checking protocol fields...") + if not check_protocol_fields(assay_table, config, protocol_names_and_types): # Rule 4009 + logger.warn("(W) There are some protocol inconsistencies in {} against {} " + "configuration".format(assay_table.filename, (measurement_type, technology_type))) + logger.info("Checking ontology fields...") + if not check_ontology_fields(assay_table, config): # Rule 3010 + logger.warn("(W) There are some ontology annotation inconsistencies in {} against {} " + "configuration".format(assay_table.filename, (measurement_type, technology_type))) + logger.info("Finished validation on {}".format(assay_filename)) + except FileNotFoundError: + pass if study_sample_table is not None: logger.info("Checking consistencies between study sample table and assay tables...") check_sample_names(study_sample_table, assay_tables) From cd8aa7ec717bb78b24eb05073eea360fd9901ee8 Mon Sep 17 00:00:00 2001 From: David Johnson Date: Thu, 22 Jun 2017 15:20:37 +0100 Subject: [PATCH 09/18] Add detection of m/t types; ensure Experimental Design cast to Study Design Type #219 --- isatools/magetab.py | 62 ++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/isatools/magetab.py b/isatools/magetab.py index 94295888..da957254 100644 --- a/isatools/magetab.py +++ b/isatools/magetab.py @@ -847,12 +847,12 @@ def get_single(values): except KeyError: pass - if len(experimental_designs) > 0: - S.comments.append(Comment(name="Experimental Design", value=';'.join(experimental_designs))) - if len(experimental_design_tsrs) > 0: - S.comments.append(Comment(name="Experimental Design Term Source REF", value=';'.join(experimental_design_tsrs))) - if len(experimental_design_tans) > 0: - S.comments.append(Comment(name="Experimental Design Term Accession Number", value=';'.join(experimental_design_tans))) + for design, tsr, tan in zip_longest(experimental_designs, experimental_design_tsrs, experimental_design_tans): + try: + ts = ts_dict[tsr] + except KeyError: + ts = None + S.design_descriptors.append(OntologyAnnotation(term=design, term_source=ts, term_accession=tan)) # Experimental Factor section of IDF @@ -1194,16 +1194,19 @@ def get_single(values): if "ArrayExpressAccession" in comments_dict.keys(): S.identifier = comments_dict["ArrayExpressAccession"] # ArrayExpress adds this, so use it as the study ID - design_type = None + design_types = None - if "AEExperimentType" in comments_dict.keys(): - design_type = comments_dict["AEExperimentType"] + if "experimentaldesign" in squashed_table_dict.keys(): + design_types = experimental_designs + + elif "AEExperimentType" in comments_dict.keys(): + design_types = [comments_dict["AEExperimentType"]] - inferred_t_type = None inferred_m_type = None + inferred_t_type = None inferred_t_plat = None - if design_type is not None: - inferred_t_type, inferred_m_type, inferred_t_plat = get_measurement_and_type(design_type=design_type) + if design_types is not None: + inferred_m_type, inferred_t_type, inferred_t_plat = get_measurement_and_tech(design_types=design_types) if sdrf_file is not None: S.filename = "s_{}".format(sdrf_file) @@ -1240,20 +1243,21 @@ def get_single(values): return ISA -def get_measurement_and_type(design_type): - - if re.match("(?i).*ChIP-Chip.*", design_type): - return "protein-DNA binding site identification", "DNA microarray", "ChIP-Chip" - if re.match("(?i).*RNA-seq.*", design_type) or re.match("(?i).*RNA-Seq.*", design_type) or re.match( - "(?i).*transcription profiling by high throughput sequencing.*", design_type): - return "transcription profiling", "nucleotide sequencing", "RNA-Seq" - if re.match(".*transcription profiling by array.*", design_type) or re.match("dye_swap_design", design_type): - return "transcription profiling", "DNA microarray", "GeneChip" - if re.match("(?i).*methylation profiling by array.*", design_type): - return "DNA methylation profiling", "DNA microarray", "Me-Chip" - if re.match("(?i).*comparative genomic hybridization by array.*", design_type): - return "comparative genomic hybridization", "DNA microarray", "CGH-Chip" - if re.match(".*genotyping by array.*", design_type): - return "SNP analysis", "DNA microarray", "SNPChip" - if re.match("(?i).*ChIP-Seq.*", design_type) or re.match("(?i).*chip-seq.*", design_type): - return "protein-DNA binding site identification", "nucleotide sequencing", "ChIP-Seq" +def get_measurement_and_tech(design_types): + for design_type in design_types: + if re.match("(?i).*ChIP-Chip.*", design_type): + return "protein-DNA binding site identification", "DNA microarray", "ChIP-Chip" + if re.match("(?i).*RNA-seq.*", design_type) or re.match("(?i).*RNA-Seq.*", design_type) or re.match( + "(?i).*transcription profiling by high throughput sequencing.*", design_type): + return "transcription profiling", "nucleotide sequencing", "RNA-Seq" + if re.match(".*transcription profiling by array.*", design_type) or re.match("dye_swap_design", design_type): + return "transcription profiling", "DNA microarray", "GeneChip" + if re.match("(?i).*methylation profiling by array.*", design_type): + return "DNA methylation profiling", "DNA microarray", "Me-Chip" + if re.match("(?i).*comparative genomic hybridization by array.*", design_type): + return "comparative genomic hybridization", "DNA microarray", "CGH-Chip" + if re.match(".*genotyping by array.*", design_type): + return "SNP analysis", "DNA microarray", "SNPChip" + if re.match("(?i).*ChIP-Seq.*", design_type) or re.match("(?i).*chip-seq.*", design_type): + return "protein-DNA binding site identification", "nucleotide sequencing", "ChIP-Seq" + From 3af8da2e8b134e327d8fbefa27fbbc07771c0633 Mon Sep 17 00:00:00 2001 From: David Johnson Date: Thu, 22 Jun 2017 15:32:44 +0100 Subject: [PATCH 10/18] Implement tests for ArrayExpress IO; closes #220 --- tests/test_ax.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 tests/test_ax.py diff --git a/tests/test_ax.py b/tests/test_ax.py new file mode 100644 index 00000000..8dfb7db3 --- /dev/null +++ b/tests/test_ax.py @@ -0,0 +1,45 @@ +import unittest +from unittest.mock import patch, mock_open +from isatools.io import ax as AX +import shutil +import os + + +class TestArrayExpressIO(unittest.TestCase): + + def setUp(self): + pass # detect if MTBLS is reachable. If so, run test of real server, otherwise run Mocks only? + + def tearDown(self): + pass + + """Mock-only test on E-AFMX1""" + @patch('ftplib.FTP', autospec=True) + def test_get_experiment(self, mock_ftp_constructor): + mock_ftp = mock_ftp_constructor.return_value + mock_ftp.login.return_value = '230' # means login OK + tmp_dir = AX.get('E-AFMX-1') # only retrieves ISA files from MTBLS + self.assertTrue(mock_ftp.login.called) + mock_ftp_constructor.assert_called_with('ftp.ebi.ac.uk') + mock_ftp.cwd.assert_called_with('/pub/databases/arrayexpress/data/experiment/AFMX/E-AFMX-1') + shutil.rmtree(tmp_dir) + + """Tries to do actual call on ArrayExpress; uses E-AFMX-1 as not so big""" + def test_get_experiment_as_magetab(self): + tmp_dir = AX.get('E-AFMX-1') # gets E-AFMX-1 MAGE-TAB files + self.assertEqual(len(os.listdir(tmp_dir)), 2) + self.assertSetEqual(set(os.listdir(tmp_dir)), {'E-AFMX-1.sdrf.txt', 'E-AFMX-1.idf.txt'}) + shutil.rmtree(tmp_dir) + + def test_get_experiment_as_isatab(self): + tmp_dir = AX.get_isatab('E-AFMX-1') # gets E-AFMX-1 MAGE-TAB files + self.assertEqual(len(os.listdir(tmp_dir)), 3) + self.assertSetEqual(set(os.listdir(tmp_dir)), {'i_investigation.txt', 'a_E-AFMX-1.sdrf.txt', + 's_E-AFMX-1.sdrf.txt'}) + shutil.rmtree(tmp_dir) + + def test_get_experiment_as_json(self): + isa_json = AX.getj('E-AFMX-1') # loads E-AFMX-1 study into ISA-JSON + self.assertIsInstance(isa_json, dict) + self.assertEqual(isa_json['identifier'], 'E-AFMX-1') + self.assertEqual(isa_json['studies'][0]['people'][0]['email'], 'khaitovich@eva.mpg.de') From ef5520f1ac345be7863f3483c49779cc2eca8d17 Mon Sep 17 00:00:00 2001 From: David Johnson Date: Thu, 22 Jun 2017 16:17:23 +0100 Subject: [PATCH 11/18] Add missing file from last commit to close #220 --- isatools/io/ax.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/isatools/io/ax.py b/isatools/io/ax.py index 4ff51815..2d7aaf9f 100644 --- a/isatools/io/ax.py +++ b/isatools/io/ax.py @@ -6,7 +6,7 @@ from isatools.convert import magetab2isatab, magetab2json EBI_FTP_SERVER = 'ftp.ebi.ac.uk' -AX_EXPERIMENT_BASE_DIR = '/pub/databases/arrayexpress/data/experiment/' +AX_EXPERIMENT_BASE_DIR = '/pub/databases/arrayexpress/data/experiment' logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO) logger = logging.getLogger(__name__) @@ -43,8 +43,8 @@ def get(arrayexpress_id, target_dir=None): logging.info("Using directory '{}'".format(target_dir)) idf_filename = "{}.idf.txt".format(arrayexpress_id) with open(os.path.join(target_dir, idf_filename), 'wb') as out_file: - logging.info("Retrieving file '{}'".format(EBI_FTP_SERVER + AX_EXPERIMENT_BASE_DIR + '/' + exp_type + '/' + - arrayexpress_id + '/' + idf_filename)) + logging.info("Retrieving file '{}'".format(EBI_FTP_SERVER + AX_EXPERIMENT_BASE_DIR + '/' + exp_type + + '/' + arrayexpress_id + '/' + idf_filename)) ftp.retrbinary('RETR ' + idf_filename, out_file.write) sdrf_filename = "{}.sdrf.txt".format(arrayexpress_id) with open(os.path.join(target_dir, sdrf_filename), 'wb') as out_file: From 588a91974ee093c42d13d648ffe72d651131c8ad Mon Sep 17 00:00:00 2001 From: David Johnson Date: Thu, 22 Jun 2017 17:12:36 +0100 Subject: [PATCH 12/18] Work towards #219 --- isatools/magetab.py | 37 +++++++++++++++++++++++++++---------- isatools/utils.py | 3 ++- tests/test_ax.py | 2 +- 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/isatools/magetab.py b/isatools/magetab.py index da957254..478188d7 100644 --- a/isatools/magetab.py +++ b/isatools/magetab.py @@ -500,29 +500,34 @@ def get_first_node_index(header): def split_tables(sdrf_path): def split_on_sample(sdrf_df): - sdrf_df_isatab_header = sdrf_df.isatab_header sdrf_df_cols = list(sdrf_df.columns) + sample_name_index = sdrf_df_cols.index("Sample Name") + study_df = sdrf_df[sdrf_df.columns[0:sample_name_index + 1]].drop_duplicates() - study_df.isatab_header = sdrf_df_isatab_header[0:sample_name_index + 1] + study_df.isatab_header = sdrf_df.isatab_header[0:sample_name_index + 1] + assay_df = sdrf_df[sdrf_df.columns[sample_name_index:]] - assay_df.isatab_header = sdrf_df_isatab_header[sample_name_index:] + assay_df.isatab_header = sdrf_df.isatab_header[sample_name_index:] + return study_df, assay_df sdrf_df = isatab.read_tfile(sdrf_path) - if "Sample Name" in sdrf_df.columns: + sdrf_columns = list(sdrf_df.columns) + if "Hybridization Name" in sdrf_columns: + sdrf_df.columns = [x.replace("Hybridization Name", "Hybridization Assay Name") for x in sdrf_columns] + + if "Sample Name" in list(sdrf_df.columns): return split_on_sample(sdrf_df) else: # insert Sample Name sdrf_df_columns = list(sdrf_df.columns) sdrf_df["Sample Name"] = sdrf_df[sdrf_df_columns[get_first_node_index(sdrf_df_columns)]] - sdrf_df_isatab_header = sdrf_df.isatab_header - sdrf_df_isatab_header.insert(get_first_node_index(sdrf_df_columns), "Sample Name") + sdrf_df.isatab_header.insert(get_first_node_index(sdrf_df_columns), "Sample Name") sdrf_df_columns.insert(get_first_node_index(sdrf_df_columns), "Sample Name") sdrf_df = sdrf_df[sdrf_df_columns] - sdrf_df.isatab_header = sdrf_df_isatab_header return split_on_sample(sdrf_df) @@ -1233,9 +1238,21 @@ def get_single(values): print("Detected probable '{}' technology platform".format(inferred_t_plat)) tp = inferred_t_plat - S.assays = [ - Assay(filename=a_filename, technology_type=ttoa, measurement_type=mtoa, technology_platform=tp) - ] + A = Assay(filename=a_filename, technology_type=ttoa, measurement_type=mtoa, technology_platform=tp) + + if (A.measurement_type, A.technology_type) in [ + ("transcription profiling", "nucleotide sequencing"), + ("protein-DNA binding site identification", "nucleotide sequencing") + ]: + if "library construction" not in [x.name for x in S.protocols]: + logger.info("PROTOCOL INSERTION: {}, library construction".format(a_filename)) + S.protocols.append(Protocol(name="library construction", + protocol_type=OntologyAnnotation(term="library construction"))) + if "nucleic acid sequencing" not in [x.name for x in S.protocols]: + logger.info("PROTOCOL INSERTION: {}, nucleic acid sequencing".format(a_filename)) + S.protocols.append(Protocol(name="nucleic acid sequencing", + protocol_type=OntologyAnnotation(term="nucleic acid sequencing"))) + S.assays = [A] ISA.identifier = S.identifier ISA.title = S.title diff --git a/isatools/utils.py b/isatools/utils.py index 8533da24..2ca050e5 100644 --- a/isatools/utils.py +++ b/isatools/utils.py @@ -31,7 +31,8 @@ def detect_graph_process_pooling(G): report = list() for process in [n for n in G.nodes() if isinstance(n, Process)]: if len(G.in_edges(process)) > 1: - print("Possible process pooling detected on: ", process.id) + print("Possible process pooling detected on: {}" + .format(' '.join([process.id, process.executes_protocol.name]))) report.append(process.id) return report diff --git a/tests/test_ax.py b/tests/test_ax.py index 8dfb7db3..8ab297e9 100644 --- a/tests/test_ax.py +++ b/tests/test_ax.py @@ -24,7 +24,7 @@ def test_get_experiment(self, mock_ftp_constructor): mock_ftp.cwd.assert_called_with('/pub/databases/arrayexpress/data/experiment/AFMX/E-AFMX-1') shutil.rmtree(tmp_dir) - """Tries to do actual call on ArrayExpress; uses E-AFMX-1 as not so big""" + """Tries to do actual call on ArrayExpress; uses E-AFMX-1""" def test_get_experiment_as_magetab(self): tmp_dir = AX.get('E-AFMX-1') # gets E-AFMX-1 MAGE-TAB files self.assertEqual(len(os.listdir(tmp_dir)), 2) From 1510072f8a542734a2ae8dab78fd5dfd4aaecdf6 Mon Sep 17 00:00:00 2001 From: David Johnson Date: Thu, 22 Jun 2017 17:12:57 +0100 Subject: [PATCH 13/18] Work towards #219 --- tests/test_magetab2isatab.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/tests/test_magetab2isatab.py b/tests/test_magetab2isatab.py index eb2da96a..c28f7700 100644 --- a/tests/test_magetab2isatab.py +++ b/tests/test_magetab2isatab.py @@ -4,6 +4,8 @@ from isatools.convert import magetab2isatab from tests import utils import tempfile +from isatools.io import ax as AX +from isatools import isatab def setUpModule(): @@ -37,10 +39,25 @@ def test_magetab2isatab_convert_e_mexp_31(self): def test_magetab2isatab_convert_e_geod_59671(self): with open(os.path.join(self._magetab_data_dir, 'E-GEOD-59671.idf.txt')) as idf_fp: - magetab2isatab.convert(idf_fp, self._tmp_dir, 'DNA microarray', 'expression profiling') - self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir, 'i_investigation.txt'))) - self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir, 's_E-GEOD-59671.sdrf.txt'))) - self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir, 'a_E-GEOD-59671.sdrf.txt'))) + magetab2isatab.convert(idf_fp, self._tmp_dir2) + self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir2, 'i_investigation.txt'))) + self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir2, 's_E-GEOD-59671.sdrf.txt'))) + self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir2, 'a_E-GEOD-59671.sdrf.txt'))) from isatools import isatab with open(os.path.join(self._tmp_dir, 'i_investigation.txt')) as i_fp: - isatab.validate(i_fp) \ No newline at end of file + isatab.validate(i_fp) + + def test_get_experiment_as_isatab_afmx_1(self): + AX.get_isatab('E-AFMX-1', self._tmp_dir) # gets E-AFMX-1 MAGE-TAB files + with open(os.path.join(self._tmp_dir, 'i_investigation.txt')) as i_fp: + isatab.validate(i_fp) + + def test_get_experiment_as_isatab_afmx_2(self): + AX.get_isatab('E-AFMX-2', self._tmp_dir) # gets E-AFMX-2 MAGE-TAB files + with open(os.path.join(self._tmp_dir2, 'i_investigation.txt')) as i_fp: + isatab.validate(i_fp) + + def test_get_experiment_as_isatab_afmx_3(self): + AX.get_isatab('E-AFMX-3', self._tmp_dir) # gets E-AFMX-3 MAGE-TAB files + with open(os.path.join(self._tmp_dir, 'i_investigation.txt')) as i_fp: + isatab.validate(i_fp) From 0311dd0174ca5c94bde4358d442fa57a198d3507 Mon Sep 17 00:00:00 2001 From: David Johnson Date: Thu, 22 Jun 2017 17:13:29 +0100 Subject: [PATCH 14/18] Work towards #219 --- tests/test_magetab2isatab.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_magetab2isatab.py b/tests/test_magetab2isatab.py index c28f7700..f9fd146e 100644 --- a/tests/test_magetab2isatab.py +++ b/tests/test_magetab2isatab.py @@ -39,10 +39,10 @@ def test_magetab2isatab_convert_e_mexp_31(self): def test_magetab2isatab_convert_e_geod_59671(self): with open(os.path.join(self._magetab_data_dir, 'E-GEOD-59671.idf.txt')) as idf_fp: - magetab2isatab.convert(idf_fp, self._tmp_dir2) - self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir2, 'i_investigation.txt'))) - self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir2, 's_E-GEOD-59671.sdrf.txt'))) - self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir2, 'a_E-GEOD-59671.sdrf.txt'))) + magetab2isatab.convert(idf_fp, self._tmp_dir) + self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir, 'i_investigation.txt'))) + self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir, 's_E-GEOD-59671.sdrf.txt'))) + self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir, 'a_E-GEOD-59671.sdrf.txt'))) from isatools import isatab with open(os.path.join(self._tmp_dir, 'i_investigation.txt')) as i_fp: isatab.validate(i_fp) @@ -54,7 +54,7 @@ def test_get_experiment_as_isatab_afmx_1(self): def test_get_experiment_as_isatab_afmx_2(self): AX.get_isatab('E-AFMX-2', self._tmp_dir) # gets E-AFMX-2 MAGE-TAB files - with open(os.path.join(self._tmp_dir2, 'i_investigation.txt')) as i_fp: + with open(os.path.join(self._tmp_dir, 'i_investigation.txt')) as i_fp: isatab.validate(i_fp) def test_get_experiment_as_isatab_afmx_3(self): From 8e7e2620e75a058677c43859af7cfe5c0215d8c3 Mon Sep 17 00:00:00 2001 From: David Johnson Date: Thu, 22 Jun 2017 17:34:58 +0100 Subject: [PATCH 15/18] Work towards #219 --- isatools/convert/magetab2isatab.py | 3 ++- isatools/magetab.py | 15 +++++++-------- tests/test_magetab2isatab.py | 7 ++++++- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/isatools/convert/magetab2isatab.py b/isatools/convert/magetab2isatab.py index b5fcacfb..9223fef8 100644 --- a/isatools/convert/magetab2isatab.py +++ b/isatools/convert/magetab2isatab.py @@ -22,7 +22,8 @@ def convert(source_idf_fp, output_path, technology_type=None, measurement_type=N for _, row in df.iterrows(): sdrf_file = row["SDRF File"] if isinstance(sdrf_file, str): - study_df, assay_df = magetab.split_tables(sdrf_path=os.path.join(os.path.dirname(source_idf_fp.name), sdrf_file)) + study_df, assay_df = magetab.split_tables(sdrf_path=os.path.join(os.path.dirname(source_idf_fp.name), + sdrf_file)) study_df.columns = study_df.isatab_header assay_df.columns = assay_df.isatab_header # write out ISA table files diff --git a/isatools/magetab.py b/isatools/magetab.py index 478188d7..071f87a8 100644 --- a/isatools/magetab.py +++ b/isatools/magetab.py @@ -500,16 +500,13 @@ def get_first_node_index(header): def split_tables(sdrf_path): def split_on_sample(sdrf_df): + sdrf_df_isatab_header = sdrf_df.isatab_header sdrf_df_cols = list(sdrf_df.columns) - sample_name_index = sdrf_df_cols.index("Sample Name") - study_df = sdrf_df[sdrf_df.columns[0:sample_name_index + 1]].drop_duplicates() - study_df.isatab_header = sdrf_df.isatab_header[0:sample_name_index + 1] - + study_df.isatab_header = sdrf_df_isatab_header[0:sample_name_index + 1] assay_df = sdrf_df[sdrf_df.columns[sample_name_index:]] - assay_df.isatab_header = sdrf_df.isatab_header[sample_name_index:] - + assay_df.isatab_header = sdrf_df_isatab_header[sample_name_index:] return study_df, assay_df sdrf_df = isatab.read_tfile(sdrf_path) @@ -518,16 +515,18 @@ def split_on_sample(sdrf_df): if "Hybridization Name" in sdrf_columns: sdrf_df.columns = [x.replace("Hybridization Name", "Hybridization Assay Name") for x in sdrf_columns] - if "Sample Name" in list(sdrf_df.columns): + if "Sample Name" in sdrf_df.columns: return split_on_sample(sdrf_df) else: # insert Sample Name sdrf_df_columns = list(sdrf_df.columns) sdrf_df["Sample Name"] = sdrf_df[sdrf_df_columns[get_first_node_index(sdrf_df_columns)]] - sdrf_df.isatab_header.insert(get_first_node_index(sdrf_df_columns), "Sample Name") + sdrf_df_isatab_header = sdrf_df.isatab_header + sdrf_df_isatab_header.insert(get_first_node_index(sdrf_df_columns), "Sample Name") sdrf_df_columns.insert(get_first_node_index(sdrf_df_columns), "Sample Name") sdrf_df = sdrf_df[sdrf_df_columns] + sdrf_df.isatab_header = sdrf_df_isatab_header return split_on_sample(sdrf_df) diff --git a/tests/test_magetab2isatab.py b/tests/test_magetab2isatab.py index f9fd146e..db7f2fe8 100644 --- a/tests/test_magetab2isatab.py +++ b/tests/test_magetab2isatab.py @@ -52,7 +52,7 @@ def test_get_experiment_as_isatab_afmx_1(self): with open(os.path.join(self._tmp_dir, 'i_investigation.txt')) as i_fp: isatab.validate(i_fp) - def test_get_experiment_as_isatab_afmx_2(self): + def test_get_experiment_as_isatab_afmx_2(self): # FIXME -> output ISA-Tab has many missing cells! WHY!?!? AX.get_isatab('E-AFMX-2', self._tmp_dir) # gets E-AFMX-2 MAGE-TAB files with open(os.path.join(self._tmp_dir, 'i_investigation.txt')) as i_fp: isatab.validate(i_fp) @@ -61,3 +61,8 @@ def test_get_experiment_as_isatab_afmx_3(self): AX.get_isatab('E-AFMX-3', self._tmp_dir) # gets E-AFMX-3 MAGE-TAB files with open(os.path.join(self._tmp_dir, 'i_investigation.txt')) as i_fp: isatab.validate(i_fp) + + def test_get_experiment_as_isatab_afmx_5(self): + AX.get_isatab('E-AFMX-5', self._tmp_dir) # gets E-AFMX-5 MAGE-TAB files + with open(os.path.join(self._tmp_dir, 'i_investigation.txt')) as i_fp: + isatab.validate(i_fp) From d45be02f988c6a04ce5089f07b4a784e70ce32ea Mon Sep 17 00:00:00 2001 From: David Johnson Date: Fri, 23 Jun 2017 09:39:37 +0100 Subject: [PATCH 16/18] Fixes #221 --- isatools/isatab.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/isatools/isatab.py b/isatools/isatab.py index cb0b04d1..df76affc 100644 --- a/isatools/isatab.py +++ b/isatools/isatab.py @@ -2941,12 +2941,12 @@ def process_keygen(protocol_ref, column_group, object_label_index, all_columns, output_node_index = find_gt(node_cols, object_label_index) if output_node_index > -1: output_node_label = all_columns[output_node_index] - output_node_value = series[output_node_label] + output_node_value = str(series[output_node_label]) input_node_index = find_lt(node_cols, object_label_index) if input_node_index > -1: input_node_label = all_columns[input_node_index] - input_node_value = series[input_node_label] + input_node_value = str(series[input_node_label]) input_nodes_with_prot_keys = DF[[all_columns[object_label_index], all_columns[input_node_index]]].drop_duplicates() output_nodes_with_prot_keys = DF[[all_columns[object_label_index], all_columns[output_node_index]]].drop_duplicates() @@ -3203,7 +3203,7 @@ def create_from_df(self, DF): # from DF of a table file if self.samples is not None: sample_map = dict(map(lambda x: ('Sample Name:' + x.name, x), self.samples)) sample_keys = list(map(lambda x: 'Sample Name:' + x, - [x for x in DF['Sample Name'].drop_duplicates() if x != ''])) + [str(x) for x in DF['Sample Name'].drop_duplicates() if x != ''])) for k in sample_keys: try: samples[k] = sample_map[k] @@ -3211,7 +3211,7 @@ def create_from_df(self, DF): # from DF of a table file print('warning! Did not find sample referenced at assay level in study samples') else: samples = dict(map(lambda x: ('Sample Name:' + x, Sample(name=x)), - [x for x in DF['Sample Name'].drop_duplicates() if x != ''])) + [str(x) for x in DF['Sample Name'].drop_duplicates() if x != ''])) except KeyError: pass @@ -3280,7 +3280,7 @@ def get_node_by_label_and_key(l, k): ETA()]).start() for _, object_series in pbar(DF[column_group].drop_duplicates().iterrows()): - node_name = object_series[object_label] + node_name = str(object_series[object_label]) node_key = ":".join([object_label, node_name]) material = None if object_label == "Source Name": @@ -3357,7 +3357,7 @@ def get_node_by_label_and_key(l, k): for _, object_series in pbar(DF[column_group].drop_duplicates().iterrows()): try: - data_file = get_node_by_label_and_key(object_label, object_series[object_label]) + data_file = get_node_by_label_and_key(object_label, str(object_series[object_label])) for comment_column in [c for c in column_group if c.startswith('Comment[')]: if comment_column[8:-1] not in [x.name for x in data_file.comments]: data_file.comments.append(Comment(name=comment_column[8:-1], value=str(object_series[comment_column]))) @@ -3375,7 +3375,7 @@ def get_node_by_label_and_key(l, k): for _, object_series in pbar(DF.iterrows()): # don't drop duplicates # if _ == 0: # print('processing: ', object_series[object_label]) - protocol_ref = object_series[object_label] + protocol_ref = str(object_series[object_label]) process_key = process_keygen(protocol_ref, column_group, _cg, DF.columns, object_series, _, DF) # TODO: Keep process key sequence here to reduce number of passes on Protocol REF columns? @@ -3392,7 +3392,7 @@ def get_node_by_label_and_key(l, k): if output_proc_index < output_node_index > -1: output_node_label = DF.columns[output_node_index] - output_node_value = object_series[output_node_label] + output_node_value = str(object_series[output_node_label]) node_key = output_node_value @@ -3413,7 +3413,7 @@ def get_node_by_label_and_key(l, k): if input_proc_index < input_node_index > -1: input_node_label = DF.columns[input_node_index] - input_node_value = object_series[input_node_label] + input_node_value = str(object_series[input_node_label]) node_key = input_node_value @@ -3481,13 +3481,13 @@ def get_node_by_label_and_key(l, k): if object_label.startswith('Source Name'): try: - source_node_context = get_node_by_label_and_key(object_label, object_series[object_label]) + source_node_context = get_node_by_label_and_key(object_label, str(object_series[object_label])) except KeyError: pass # skip if object not found if object_label.startswith('Sample Name'): try: - sample_node_context = get_node_by_label_and_key(object_label, object_series[object_label]) + sample_node_context = get_node_by_label_and_key(object_label, str(object_series[object_label])) except KeyError: pass # skip if object not found if source_node_context is not None: @@ -3495,14 +3495,14 @@ def get_node_by_label_and_key(l, k): sample_node_context.derives_from.append(source_node_context) if object_label.startswith('Protocol REF'): - protocol_ref = object_series[object_label] + protocol_ref = str(object_series[object_label]) process_key = process_keygen(protocol_ref, column_group, _cg, DF.columns, object_series, _, DF) process_key_sequence.append(process_key) if object_label.endswith(' File'): data_node = None try: - data_node = get_node_by_label_and_key(object_label, object_series[object_label]) + data_node = get_node_by_label_and_key(object_label, str(object_series[object_label])) except KeyError: pass # skip if object not found if sample_node_context is not None and data_node is not None: From 1f16e42b2002cf2ff7bd46fa5a358ab0e87a645d Mon Sep 17 00:00:00 2001 From: David Johnson Date: Fri, 23 Jun 2017 09:43:30 +0100 Subject: [PATCH 17/18] Tests #221; closes #221 --- tests/test_mtbls.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_mtbls.py b/tests/test_mtbls.py index 085e330f..79dc4579 100644 --- a/tests/test_mtbls.py +++ b/tests/test_mtbls.py @@ -57,4 +57,9 @@ def test_get_datafiles(self): factor_selection = {"genotype": "Col-0"} results = MTBLS.get_data_files('MTBLS2', factor_selection) self.assertEqual(len(results), 8) - self.assertEqual(len(results[0]['data_files']), 1) \ No newline at end of file + self.assertEqual(len(results[0]['data_files']), 1) + + def test_get_factors_summary(self): # Test for issue #221 + factors_summary = MTBLS.get_factors_summary('MTBLS26') + self.assertIsInstance(factors_summary, list) + self.assertEqual(len(factors_summary), 18) \ No newline at end of file From bb777f7995063410506faa814cad6eee9359a680 Mon Sep 17 00:00:00 2001 From: David Johnson Date: Fri, 23 Jun 2017 10:17:03 +0100 Subject: [PATCH 18/18] Update version number for release; update known issues in docs --- docs/knownissues.rst | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/knownissues.rst b/docs/knownissues.rst index a855994f..9f9e8d6c 100644 --- a/docs/knownissues.rst +++ b/docs/knownissues.rst @@ -4,7 +4,7 @@ Known issues isatools v0.8 package --------------------- -- Issues #153 is still outstanding, as per below; new issues #205 (json2isatab conversion issue), #208 (ISA-Tab validation issue) and #218 (MAGE-TAB conversion issue) +- Issues #153 is still outstanding, as per below; new issue #208 (ISA-Tab validation issue) - SRA/ENA importer and Biocrates importer relies on XSLT2 processing only available with SAXON and requires .jar file to run isatools v0.7 package diff --git a/setup.py b/setup.py index 3df50c14..40f472d5 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ setup( name='isatools', - version='0.8.1', + version='0.8.2', packages=['isatools', 'isatools.convert', 'isatools.io', 'isatools.model'], package_data={'isatools': ['schemas/cedar/*.json', 'schemas/isa_model_version_1_0_schemas/core/*.json',