From aecd43de10338f1914094f140efe74dc878a2386 Mon Sep 17 00:00:00 2001
From: David Johnson <david.johnson@oerc.ox.ac.uk>
Date: Mon, 19 Jun 2017 20:23:49 +0100
Subject: [PATCH 01/18] Start reimplementing getMeasurementAndTech() from Java
 code, towards #218

---
 isatools/magetab.py | 43 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 37 insertions(+), 6 deletions(-)

diff --git a/isatools/magetab.py b/isatools/magetab.py
index 56665f19..135b7676 100644
--- a/isatools/magetab.py
+++ b/isatools/magetab.py
@@ -8,6 +8,7 @@
 import pandas as pd
 from io import StringIO
 from itertools import zip_longest
+import re
 
 
 logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO)
@@ -1153,20 +1154,31 @@ def get_single(values):
 
     # Comments in IDF
 
-    comment_keys = [x for x in squashed_table_dict.keys() if x.startswith("comment")]
+    comments_dict = dict(map(lambda x: (x[0][8:-1], get_single(x[1])), [x for x in squashed_table_dict.items()
+                                                                        if x[0].startswith("comment")]))
 
-    for key in comment_keys:
-        c = Comment(name=key[8:-1], value=get_single(squashed_table_dict[key]))
-        if c.name == "ArrayExpressAccession":
-            S.identifier = c.value  # ArrayExpress adds this comment, so use it as the study ID if it's available
+    for key in comments_dict.keys():
+        c = Comment(name=key, value=comments_dict[key])
         S.comments.append(c)
 
+    if "ArrayExpressAccession" in comments_dict.keys():
+        S.identifier = comments_dict["ArrayExpressAccession"]  # ArrayExpress adds this, so use it as the study ID
+
+
+    design_type = None
+
+    if "AEExperimentType" in comments_dict.keys():
+        design_type = comments_dict["AEExperimentType"]
+
     protocol_types = [x.protocol_type for x in S.protocols]
     hyb_prots_used = {"nucleic acid hybridization",
                       "hybridization"}.intersection({squashstr(x.term) for x in protocol_types})
     if sdrf_file is not None:
         S.filename = "s_{}".format(sdrf_file)
         a_filename = "a_{}".format(sdrf_file)
+
+
+
         ttoa = None
         if technology_type is not None:
             ttoa = OntologyAnnotation(term=technology_type)
@@ -1183,4 +1195,23 @@ def get_single(values):
     ISA.identifier = S.identifier
     ISA.title = S.title
     ISA.studies = [S]
-    return ISA
\ No newline at end of file
+    return ISA
+
+
+def get_measurement_and_type(design_type):
+
+    if re.match("(?i).*ChIP-Chip.*", design_type):
+        return "protein-DNA binding site identification", "DNA microarray", "ChIP-Chip"
+    if re.match("(?i).*RNA-seq.*", design_type) or re.match("(?i).*RNA-Seq.*", design_type) or re.match(
+            "(?i).*transcription profiling by high throughput sequencing.*", design_type):
+        return "transcription profiling", "nucleotide sequencing", "RNA-Seq"
+    if re.match(".*transcription profiling by array.*", design_type) or re.match("dye_swap_design", design_type):
+        return "transcription profiling", "DNA microarray", "GeneChip"
+    if re.match("(?i).*methylation profiling by array.*", design_type):
+        return "DNA methylation profiling", "DNA microarray", "Me-Chip"
+    if re.match("(?i).*comparative genomic hybridization by array.*", design_type):
+        return "comparative genomic hybridization", "DNA microarray", "CGH-Chip"
+    if re.match(".*genotyping by array.*", design_type):
+        return "SNP analysis", "DNA microarray", "SNPChip"
+    if re.match("(?i).*ChIP-Seq.*", design_type) or re.match("(?i).*chip-seq.*", design_type):
+        return "protein-DNA binding site identification", "nucleotide sequencing", "ChIP-Seq"
\ No newline at end of file

From 830d4313f4fc85c2b46319e3bd86275dd2c89cf3 Mon Sep 17 00:00:00 2001
From: David Johnson <david.johnson@oerc.ox.ac.uk>
Date: Tue, 20 Jun 2017 16:14:39 +0100
Subject: [PATCH 02/18] Fixes for #218

---
 isatools/convert/magetab2isatab.py | 27 ++++------
 isatools/magetab.py                | 80 +++++++++++++++++++++++-------
 2 files changed, 71 insertions(+), 36 deletions(-)

diff --git a/isatools/convert/magetab2isatab.py b/isatools/convert/magetab2isatab.py
index 85bedd98..41c68d3f 100644
--- a/isatools/convert/magetab2isatab.py
+++ b/isatools/convert/magetab2isatab.py
@@ -22,7 +22,7 @@ def convert(source_idf_fp, output_path, technology_type, measurement_type):
     for _, row in df.iterrows():
         sdrf_file = row["SDRF File"]
         if isinstance(sdrf_file, str):
-            study_df, assay_df = split_tables(sdrf_path=os.path.join(os.path.dirname(source_idf_fp.name), sdrf_file))
+            study_df, assay_df = magetab.split_tables(sdrf_path=os.path.join(os.path.dirname(source_idf_fp.name), sdrf_file))
             study_df.columns = study_df.isatab_header
             assay_df.columns = assay_df.isatab_header
             # write out ISA table files
@@ -45,19 +45,12 @@ def get_investigation_title(line, ISA):
         ISA.title = value
 
 
-def split_tables(sdrf_path):
-    sdrf_df = isatab.read_tfile(sdrf_path)
-    sdrf_df_isatab_header = sdrf_df.isatab_header
-    if "Sample Name" in sdrf_df.columns:
-        sample_name_index = list(sdrf_df.columns).index("Sample Name")
-    elif "Extract Name" in sdrf_df.columns:
-        sample_name_index = list(sdrf_df.columns).index("Extract Name")
-    elif "Labeled Extract Name" in sdrf_df.columns:
-        sample_name_index = list(sdrf_df.columns).index("Labeled Extract Name")
-    else:
-        raise magetab.MageTabParserException("Could not split SDRF table as could not find suitable column to split on")
-    study_df = sdrf_df[sdrf_df.columns[0:sample_name_index+1]].drop_duplicates()
-    study_df.isatab_header = sdrf_df_isatab_header[0:sample_name_index+1]
-    assay_df = sdrf_df[sdrf_df.columns[sample_name_index:]]
-    assay_df.isatab_header = sdrf_df_isatab_header[sample_name_index:]
-    return study_df, assay_df
\ No newline at end of file
+def get_first_node_index(header):
+    sqaushed_header = map(lambda x: magetab.squashstr(x), header)
+    nodes = ["samplename", "extractname", "labeledextractname", "hybridizationname", "assayname"]
+    for node in nodes:
+        try:
+            index = sqaushed_header.index(node)
+            return index
+        except ValueError:
+            pass
diff --git a/isatools/magetab.py b/isatools/magetab.py
index 135b7676..94295888 100644
--- a/isatools/magetab.py
+++ b/isatools/magetab.py
@@ -486,15 +486,45 @@ def export_to_isatab(FP, output_dir):
         assay_df.to_csv(assay_fp, sep='\t', index=False, header=assay_df.isatab_header)
 
 
+def get_first_node_index(header):
+    squashed_header = list(map(lambda x: squashstr(x), header))
+    nodes = ["samplename", "extractname", "labeledextractname", "hybridizationname", "assayname"]
+    for node in nodes:
+        try:
+            index = squashed_header.index(node)
+            return index
+        except ValueError:
+            pass
+
+
 def split_tables(sdrf_path):
+
+    def split_on_sample(sdrf_df):
+        sdrf_df_isatab_header = sdrf_df.isatab_header
+        sdrf_df_cols = list(sdrf_df.columns)
+        sample_name_index = sdrf_df_cols.index("Sample Name")
+        study_df = sdrf_df[sdrf_df.columns[0:sample_name_index + 1]].drop_duplicates()
+        study_df.isatab_header = sdrf_df_isatab_header[0:sample_name_index + 1]
+        assay_df = sdrf_df[sdrf_df.columns[sample_name_index:]]
+        assay_df.isatab_header = sdrf_df_isatab_header[sample_name_index:]
+        return study_df, assay_df
+
     sdrf_df = isatab.read_tfile(sdrf_path)
-    sdrf_df_isatab_header = sdrf_df.isatab_header
-    sample_name_index = list(sdrf_df.columns).index("Sample Name")
-    study_df = sdrf_df[sdrf_df.columns[0:sample_name_index+1]].drop_duplicates()
-    study_df.isatab_header = sdrf_df_isatab_header[0:sample_name_index+1]
-    assay_df = sdrf_df[sdrf_df.columns[sample_name_index:]]
-    assay_df.isatab_header = sdrf_df_isatab_header[sample_name_index:]
-    return study_df, assay_df
+
+    if "Sample Name" in sdrf_df.columns:
+        return split_on_sample(sdrf_df)
+    else:  # insert Sample Name
+        sdrf_df_columns = list(sdrf_df.columns)
+        sdrf_df["Sample Name"] = sdrf_df[sdrf_df_columns[get_first_node_index(sdrf_df_columns)]]
+        sdrf_df_isatab_header = sdrf_df.isatab_header
+        sdrf_df_isatab_header.insert(get_first_node_index(sdrf_df_columns), "Sample Name")
+
+        sdrf_df_columns.insert(get_first_node_index(sdrf_df_columns), "Sample Name")
+
+        sdrf_df = sdrf_df[sdrf_df_columns]
+        sdrf_df.isatab_header = sdrf_df_isatab_header
+
+        return split_on_sample(sdrf_df)
 
 
 idf_map = {
@@ -731,7 +761,7 @@ def get_squashed(key):  # for MAGE-TAB spec 2.1.7, deal with variants on labels
         return squashstr(key)
 
 
-def parse_idf(file_path, technology_type=None, measurement_type=None):
+def parse_idf(file_path, technology_type=None, measurement_type=None, technology_platform=None):
 
     def get_single(values):
         stripped_values = [x for x in values if x != '']
@@ -1164,32 +1194,44 @@ def get_single(values):
     if "ArrayExpressAccession" in comments_dict.keys():
         S.identifier = comments_dict["ArrayExpressAccession"]  # ArrayExpress adds this, so use it as the study ID
 
-
     design_type = None
 
     if "AEExperimentType" in comments_dict.keys():
         design_type = comments_dict["AEExperimentType"]
 
-    protocol_types = [x.protocol_type for x in S.protocols]
-    hyb_prots_used = {"nucleic acid hybridization",
-                      "hybridization"}.intersection({squashstr(x.term) for x in protocol_types})
+    inferred_t_type = None
+    inferred_m_type = None
+    inferred_t_plat = None
+    if design_type is not None:
+        inferred_t_type, inferred_m_type, inferred_t_plat = get_measurement_and_type(design_type=design_type)
+
     if sdrf_file is not None:
         S.filename = "s_{}".format(sdrf_file)
         a_filename = "a_{}".format(sdrf_file)
 
-
-
         ttoa = None
         if technology_type is not None:
             ttoa = OntologyAnnotation(term=technology_type)
-        elif technology_type is None and len(hyb_prots_used) > 0:
-            print("Detected probable DNA microarray technology type")
-            ttoa = OntologyAnnotation(term="DNA microarray")
+        elif technology_type is None and inferred_t_type is not None:
+            print("Detected probable '{}' technology type".format(inferred_t_type))
+            ttoa = OntologyAnnotation(term=inferred_t_type)
+
         mtoa = None
         if measurement_type is not None:
             mtoa = OntologyAnnotation(term=measurement_type)
+        elif measurement_type is None and inferred_m_type is not None:
+            print("Detected probable '{}' measurement type".format(inferred_m_type))
+            mtoa = OntologyAnnotation(term=inferred_m_type)
+
+        tp = ''
+        if technology_platform is not None:
+            tp = technology_platform
+        elif technology_platform is None and inferred_t_plat is not None:
+            print("Detected probable '{}' technology platform".format(inferred_t_plat))
+            tp = inferred_t_plat
+
         S.assays = [
-            Assay(filename=a_filename, technology_type=ttoa, measurement_type=mtoa)
+            Assay(filename=a_filename, technology_type=ttoa, measurement_type=mtoa, technology_platform=tp)
         ]
 
     ISA.identifier = S.identifier
@@ -1214,4 +1256,4 @@ def get_measurement_and_type(design_type):
     if re.match(".*genotyping by array.*", design_type):
         return "SNP analysis", "DNA microarray", "SNPChip"
     if re.match("(?i).*ChIP-Seq.*", design_type) or re.match("(?i).*chip-seq.*", design_type):
-        return "protein-DNA binding site identification", "nucleotide sequencing", "ChIP-Seq"
\ No newline at end of file
+        return "protein-DNA binding site identification", "nucleotide sequencing", "ChIP-Seq"

From 6403fb54d68237a5e2e6c0948b859aeab23bf50d Mon Sep 17 00:00:00 2001
From: David Johnson <david.johnson@oerc.ox.ac.uk>
Date: Thu, 22 Jun 2017 12:45:39 +0100
Subject: [PATCH 03/18] Implement get experiment MAGE TABs via FTP #220

---
 isatools/io/ax.py | 58 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 isatools/io/ax.py

diff --git a/isatools/io/ax.py b/isatools/io/ax.py
new file mode 100644
index 00000000..950588db
--- /dev/null
+++ b/isatools/io/ax.py
@@ -0,0 +1,58 @@
+import ftplib
+import logging
+import os
+import tempfile
+
+EBI_FTP_SERVER = 'ftp.ebi.ac.uk'
+AX_EXPERIMENT_BASE_DIR = '/pub/databases/arrayexpress/data/experiment/'
+
+logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def get(arrayexpress_id, target_dir=None):
+    """
+    This function downloads ISA content from the ArrayExpress FTP site.
+
+    :param ax_experiment_id: Study identifier for ArrayExpress study to get, as a str (e.g. E-GEOD-59671)
+    :param target_dir: Path to write files to. If None, writes to temporary directory (generated on the fly)
+    :return: Path where the files were written to
+
+    Example usage:
+        AX.get_study('E-GEOD-59671', '/tmp/ax')
+    """
+
+    idbits = arrayexpress_id.split('-')
+    exp_type = idbits[1]
+
+    logging.info("Setting up ftp with {}".format(EBI_FTP_SERVER))
+    ftp = ftplib.FTP(EBI_FTP_SERVER)
+    logging.info("Logging in as anonymous user...")
+    response = ftp.login()
+    if '230' in response:  # 230 means Login successful
+        logging.info("Log in successful!")
+        try:
+            logging.info("Looking for experiment '{}'".format(arrayexpress_id))
+            ftp.cwd('{base_dir}/{exp_type}/{arrayexpress_id}'.format(base_dir=AX_EXPERIMENT_BASE_DIR, exp_type=exp_type,
+                                                                     arrayexpress_id=arrayexpress_id))
+            if target_dir is None:
+                target_dir = tempfile.mkdtemp()
+            logging.info("Using directory '{}'".format(target_dir))
+            idf_filename = "{}.idf.txt".format(arrayexpress_id)
+            with open(os.path.join(target_dir, idf_filename), 'wb') as out_file:
+                logging.info("Retrieving file '{}'".format(EBI_FTP_SERVER + AX_EXPERIMENT_BASE_DIR + '/' + exp_type + '/' +
+                                                           arrayexpress_id + '/' + idf_filename))
+                ftp.retrbinary('RETR ' + idf_filename, out_file.write)
+            sdrf_filename = "{}.sdrf.txt".format(arrayexpress_id)
+            with open(os.path.join(target_dir, sdrf_filename), 'wb') as out_file:
+                logging.info("Retrieving file '{}'".format(
+                    EBI_FTP_SERVER + AX_EXPERIMENT_BASE_DIR + '/' + exp_type + '/' + arrayexpress_id + '/'
+                    + sdrf_filename))
+                ftp.retrbinary('RETR ' + sdrf_filename, out_file.write)
+        except ftplib.error_perm as ftperr:
+            logger.fatal("Could not retrieve ArrayExpress study '{study}': {error}".format(study=arrayexpress_id,
+                                                                                           error=ftperr))
+        finally:
+            return target_dir
+    else:
+        raise ConnectionError("There was a problem connecting to ArrayExpress: " + response)

From b3f7e5032672261ae90d503da309d026e933a128 Mon Sep 17 00:00:00 2001
From: David Johnson <david.johnson@oerc.ox.ac.uk>
Date: Thu, 22 Jun 2017 12:46:05 +0100
Subject: [PATCH 04/18] Update some naming in mtbls io package

---
 isatools/io/mtbls.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/isatools/io/mtbls.py b/isatools/io/mtbls.py
index b191d0d7..d51ca2cc 100644
--- a/isatools/io/mtbls.py
+++ b/isatools/io/mtbls.py
@@ -7,11 +7,10 @@
 import glob
 from isatools.convert import isatab2json
 from isatools import isatab
-from isatools.model.v1 import OntologyAnnotation, Process, ParameterValue
-import networkx as nx
+from isatools.model.v1 import OntologyAnnotation
 import pandas as pd
 
-MTBLS_FTP_SERVER = 'ftp.ebi.ac.uk'
+EBI_FTP_SERVER = 'ftp.ebi.ac.uk'
 MTBLS_BASE_DIR = '/pub/databases/metabolights/studies/public'
 INVESTIGATION_FILENAME = 'i_Investigation.txt'
 
@@ -33,8 +32,8 @@ def get(mtbls_study_id, target_dir=None):
     Example usage:
         isa_json = MTBLS.get_study('MTBLS1', '/tmp/mtbls')
     """
-    logging.info("Setting up ftp with {}".format(MTBLS_FTP_SERVER))
-    ftp = ftplib.FTP(MTBLS_FTP_SERVER)
+    logging.info("Setting up ftp with {}".format(EBI_FTP_SERVER))
+    ftp = ftplib.FTP(EBI_FTP_SERVER)
     logging.info("Logging in as anonymous user...")
     response = ftp.login()
     if '230' in response:  # 230 means Login successful
@@ -46,7 +45,7 @@ def get(mtbls_study_id, target_dir=None):
                 target_dir = tempfile.mkdtemp()
             logging.info("Using directory '{}'".format(target_dir))
             out_file = open(os.path.join(target_dir, INVESTIGATION_FILENAME), 'wb')
-            logging.info("Retrieving file '{}'".format(MTBLS_FTP_SERVER + MTBLS_BASE_DIR + '/' + mtbls_study_id + '/' + INVESTIGATION_FILENAME))
+            logging.info("Retrieving file '{}'".format(EBI_FTP_SERVER + MTBLS_BASE_DIR + '/' + mtbls_study_id + '/' + INVESTIGATION_FILENAME))
             ftp.retrbinary('RETR ' + INVESTIGATION_FILENAME, out_file.write)
             with open(out_file.name, encoding='utf-8') as i_fp:
                 i_bytes = i_fp.read()
@@ -55,14 +54,14 @@ def get(mtbls_study_id, target_dir=None):
                 for s_filename in s_filenames:
                     out_file = open(os.path.join(target_dir, s_filename), 'wb')
                     logging.info("Retrieving file '{}'".format(
-                        MTBLS_FTP_SERVER + MTBLS_BASE_DIR + '/' + mtbls_study_id + '/' + s_filename))
+                        EBI_FTP_SERVER + MTBLS_BASE_DIR + '/' + mtbls_study_id + '/' + s_filename))
                     ftp.retrbinary('RETR ' + s_filename, out_file.write)
                 a_filenames_lines = [l.split('\t') for l in lines if 'Study Assay File Name' in l]
                 for a_filename_line in a_filenames_lines:
                     for a_filename in [f[1:-1] for f in a_filename_line[1:]]:
                         out_file = open(os.path.join(target_dir, a_filename), 'wb')
                         logging.info("Retrieving file '{}'".format(
-                            MTBLS_FTP_SERVER + MTBLS_BASE_DIR + '/' + mtbls_study_id + '/' + a_filename))
+                            EBI_FTP_SERVER + MTBLS_BASE_DIR + '/' + mtbls_study_id + '/' + a_filename))
                         ftp.retrbinary('RETR ' + a_filename, out_file.write)
         except ftplib.error_perm as ftperr:
             logger.fatal("Could not retrieve MetaboLights study '{study}': {error}".format(study=mtbls_study_id, error=ftperr))

From 4b8bb35c6a7eae0e85850f054ff7386c4ab304e8 Mon Sep 17 00:00:00 2001
From: David Johnson <david.johnson@oerc.ox.ac.uk>
Date: Thu, 22 Jun 2017 14:17:39 +0100
Subject: [PATCH 05/18] Add function to grab MAGE-TAB and use magetab2isatab to
 convert to ISA-Tab #220

---
 isatools/io/ax.py | 38 ++++++++++++++++++++++++++++++++++----
 1 file changed, 34 insertions(+), 4 deletions(-)

diff --git a/isatools/io/ax.py b/isatools/io/ax.py
index 950588db..bb448761 100644
--- a/isatools/io/ax.py
+++ b/isatools/io/ax.py
@@ -2,6 +2,8 @@
 import logging
 import os
 import tempfile
+import shutil
+from isatools.convert import magetab2isatab
 
 EBI_FTP_SERVER = 'ftp.ebi.ac.uk'
 AX_EXPERIMENT_BASE_DIR = '/pub/databases/arrayexpress/data/experiment/'
@@ -12,14 +14,15 @@
 
 def get(arrayexpress_id, target_dir=None):
     """
-    This function downloads ISA content from the ArrayExpress FTP site.
+    This function downloads MAGE-TAB content from the ArrayExpress FTP site.
 
-    :param ax_experiment_id: Study identifier for ArrayExpress study to get, as a str (e.g. E-GEOD-59671)
-    :param target_dir: Path to write files to. If None, writes to temporary directory (generated on the fly)
+    :param arrayexpress_id: Experiment identifier for ArrayExpress study to get, as a str (e.g. E-GEOD-59671)
+    :param target_dir: Path to write MAGE-TAB files to. If None, writes to temporary directory (generated on the fly)
     :return: Path where the files were written to
 
     Example usage:
-        AX.get_study('E-GEOD-59671', '/tmp/ax')
+        from isatools.io import ax as AX
+        AX.get('E-GEOD-59671', '/tmp/ax')
     """
 
     idbits = arrayexpress_id.split('-')
@@ -56,3 +59,30 @@ def get(arrayexpress_id, target_dir=None):
             return target_dir
     else:
         raise ConnectionError("There was a problem connecting to ArrayExpress: " + response)
+
+
+def get_isatab(arrayexpress_id, target_dir=None):
+    """
+    This function downloads MAGE-TAB content as ISA-Tab from the ArrayExpress FTP site.
+
+    :param arrayexpress_id: Experiment identifier for ArrayExpress study to get, as a str (e.g. E-GEOD-59671)
+    :param target_dir: Path to write ISA-Tab files to. If None, writes to temporary directory (generated on the fly)
+    :return: Path where the files were written to
+
+    Example usage:
+        from isatools.io import ax as AX
+        AX.get_isatab('E-GEOD-59671', '/tmp/ax')
+    """
+    tmp_dir = tempfile.mkdtemp()
+    try:
+        get(arrayexpress_id=arrayexpress_id, target_dir=tmp_dir)
+        if target_dir is None:
+            target_dir = tempfile.mkdtemp()
+        logging.info("Using directory '{}'".format(target_dir))
+        with open(os.path.join(tmp_dir, "{}.idf.txt".format(arrayexpress_id))) as idf_fp:
+            magetab2isatab.convert(source_idf_fp=idf_fp, output_path=target_dir)
+    except Exception as e:
+        logger.fatal("Something went wrong: {}".format(e))
+    finally:
+        shutil.rmtree(tmp_dir)
+        return target_dir

From bb50680e1d5de3ef67273dbb1d8f3c67d8a2d195 Mon Sep 17 00:00:00 2001
From: David Johnson <david.johnson@oerc.ox.ac.uk>
Date: Thu, 22 Jun 2017 14:24:04 +0100
Subject: [PATCH 06/18] Implement get MAGE as JSON from ArrayExpress #220

---
 isatools/io/ax.py | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/isatools/io/ax.py b/isatools/io/ax.py
index bb448761..4ff51815 100644
--- a/isatools/io/ax.py
+++ b/isatools/io/ax.py
@@ -3,7 +3,7 @@
 import os
 import tempfile
 import shutil
-from isatools.convert import magetab2isatab
+from isatools.convert import magetab2isatab, magetab2json
 
 EBI_FTP_SERVER = 'ftp.ebi.ac.uk'
 AX_EXPERIMENT_BASE_DIR = '/pub/databases/arrayexpress/data/experiment/'
@@ -86,3 +86,27 @@ def get_isatab(arrayexpress_id, target_dir=None):
     finally:
         shutil.rmtree(tmp_dir)
         return target_dir
+
+
+def getj(arrayexpress_id):
+    """
+    This function downloads MAGE-TAB content as ISA-JSON from the ArrayExpress FTP site.
+
+    :param arrayexpress_id: Experiment identifier for ArrayExpress study to get, as a str (e.g. E-GEOD-59671)
+    :return: ISA-JSON representation of the MAGE-TAB content
+
+    Example usage:
+        from isatools.io import ax as AX
+        my_json = AX.getj('E-GEOD-59671')
+    """
+    tmp_dir = tempfile.mkdtemp()
+    mage_json = None
+    try:
+        get(arrayexpress_id=arrayexpress_id, target_dir=tmp_dir)
+        with open(os.path.join(tmp_dir, "{}.idf.txt".format(arrayexpress_id))) as idf_fp:
+            mage_json = magetab2json.convert(source_idf_fp=idf_fp)
+    except Exception as e:
+        logger.fatal("Something went wrong: {}".format(e))
+    finally:
+        shutil.rmtree(tmp_dir)
+        return mage_json

From 8b4f6f5eccbfe8b135f4e3db62633956dcfffe10 Mon Sep 17 00:00:00 2001
From: David Johnson <david.johnson@oerc.ox.ac.uk>
Date: Thu, 22 Jun 2017 14:32:57 +0100
Subject: [PATCH 07/18] Fixes for ax.getj() #220; updates to tests

---
 isatools/convert/magetab2isatab.py |  6 +++---
 isatools/convert/magetab2json.py   | 16 ++++++++++------
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/isatools/convert/magetab2isatab.py b/isatools/convert/magetab2isatab.py
index 41c68d3f..b5fcacfb 100644
--- a/isatools/convert/magetab2isatab.py
+++ b/isatools/convert/magetab2isatab.py
@@ -8,10 +8,10 @@
 logger = logging.getLogger(__name__)
 
 
-def convert(source_idf_fp, output_path, technology_type, measurement_type):
+def convert(source_idf_fp, output_path, technology_type=None, measurement_type=None):
     """ Converter for MAGE-TAB to ISA-Tab
     :param source_idf_fp: File descriptor of input IDF file
-    :param output_dir: Path to directory to write output ISA-Tab files to
+    :param output_path: Path to directory to write output ISA-Tab files to
     """
     df = pd.read_csv(source_idf_fp, names=range(0, 128), sep='\t', engine='python', encoding='utf-8', comment='#').dropna(axis=1, how='all')
     df = df.T  # transpose
@@ -46,7 +46,7 @@ def get_investigation_title(line, ISA):
 
 
 def get_first_node_index(header):
-    sqaushed_header = map(lambda x: magetab.squashstr(x), header)
+    sqaushed_header = list(map(lambda x: magetab.squashstr(x), header))
     nodes = ["samplename", "extractname", "labeledextractname", "hybridizationname", "assayname"]
     for node in nodes:
         try:
diff --git a/isatools/convert/magetab2json.py b/isatools/convert/magetab2json.py
index 7f14342e..8386b255 100644
--- a/isatools/convert/magetab2json.py
+++ b/isatools/convert/magetab2json.py
@@ -7,11 +7,15 @@
 import shutil
 
 
-def convert(source_idf_fp, technology_type, measurement_type):
+def convert(source_idf_fp, technology_type=None, measurement_type=None):
     tmp = tempfile.mkdtemp()
-    magetab2isatab.convert(source_idf_fp=source_idf_fp, output_path=tmp, technology_type=technology_type,
-                           measurement_type=measurement_type)
-    with open(os.path.join(tmp, "i_investigation.txt")) as isa_inv_fp:
-        ISA = isatab.load(isa_inv_fp)
+    ISA = None
+    try:
+        magetab2isatab.convert(source_idf_fp=source_idf_fp, output_path=tmp, technology_type=technology_type,
+                               measurement_type=measurement_type)
+        with open(os.path.join(tmp, "i_investigation.txt")) as isa_inv_fp:
+            ISA = isatab.load(isa_inv_fp)
+    finally:
         shutil.rmtree(tmp)
-        return json.loads(json.dumps(ISA, cls=ISAJSONEncoder))
+        if ISA is not None:
+            return json.loads(json.dumps(ISA, cls=ISAJSONEncoder))

From 5b9e71c71992cc58503a1a1f83caded6007a829f Mon Sep 17 00:00:00 2001
From: David Johnson <david.johnson@oerc.ox.ac.uk>
Date: Thu, 22 Jun 2017 15:19:48 +0100
Subject: [PATCH 08/18] Add logging if can't find config

---
 isatools/isatab.py | 77 ++++++++++++++++++++++++++--------------------
 1 file changed, 43 insertions(+), 34 deletions(-)

diff --git a/isatools/isatab.py b/isatools/isatab.py
index 78069ea6..cb0b04d1 100644
--- a/isatools/isatab.py
+++ b/isatools/isatab.py
@@ -2527,40 +2527,49 @@ def validate(fp, config_dir=default_config_dir, log_level=logging.INFO):
                     technology_type = assay_df['Study Assay Technology Type'].tolist()[x]
                     if assay_filename is not '':
                         try:
-                            logger.info("Loading... {}".format(assay_filename))
-                            with open(os.path.join(os.path.dirname(fp.name), assay_filename), encoding='utf-8') as a_fp:
-                                assay_table = load_table(a_fp)
-                                assay_table.filename = assay_filename
-                                assay_tables.append(assay_table)
-                                config = configs[(measurement_type, technology_type)]
-                                logger.info(
-                                    "Validating {} against assay table configuration ({}, {})...".format(
-                                        assay_filename, measurement_type, technology_type))
-                                logger.info("Checking Factor Value presence...")
-                                check_factor_value_presence(assay_table)  # Rule 4007
-                                logger.info("Checking required fields...")
-                                check_required_fields(assay_table, config)  # Rule 4003-8, 4010
-                                logger.info("Checking generic fields...")
-                                if not check_field_values(assay_table, config):  # Rule 4011
-                                    logger.warn(
-                                        "(W) There are some field value inconsistencies in {} against {} configuration".format(
-                                            assay_table.filename, (measurement_type, technology_type)))
-                                logger.info("Checking unit fields...")
-                                if not check_unit_field(assay_table, config):
-                                    logger.warn(
-                                        "(W) There are some unit value inconsistencies in {} against {} configuration".format(
-                                            assay_table.filename, (measurement_type, technology_type)))
-                                logger.info("Checking protocol fields...")
-                                if not check_protocol_fields(assay_table, config, protocol_names_and_types):  # Rule 4009
-                                    logger.warn("(W) There are some protocol inconsistencies in {} against {} "
-                                                "configuration".format(assay_table.filename, (measurement_type, technology_type)))
-                                logger.info("Checking ontology fields...")
-                                if not check_ontology_fields(assay_table, config):  # Rule 3010
-                                    logger.warn("(W) There are some ontology annotation inconsistencies in {} against {} "
-                                                "configuration".format(assay_table.filename, (measurement_type, technology_type)))
-                                logger.info("Finished validation on {}".format(assay_filename))
-                        except FileNotFoundError:
-                            pass
+                            config = configs[(measurement_type, technology_type)]
+                        except KeyError:
+                            config = None  # reset so the check below never sees an unbound or stale config
+                            logger.error("Could not load config matching ({}, {})".format(measurement_type, technology_type))
+                            logger.error("Only have configs matching:")
+                            for k in configs.keys(): logger.error(k)  # kept on one line so the hunk's line count is unchanged
+                        if config is None:
+                            logger.warn("Skipping configuration validation as could not load config...")
+                        else:
+                            try:
+                                logger.info("Loading... {}".format(assay_filename))
+                                with open(os.path.join(os.path.dirname(fp.name), assay_filename), encoding='utf-8') as a_fp:
+                                    assay_table = load_table(a_fp)
+                                    assay_table.filename = assay_filename
+                                    assay_tables.append(assay_table)
+                                    logger.info(
+                                        "Validating {} against assay table configuration ({}, {})...".format(
+                                            assay_filename, measurement_type, technology_type))
+                                    logger.info("Checking Factor Value presence...")
+                                    check_factor_value_presence(assay_table)  # Rule 4007
+                                    logger.info("Checking required fields...")
+                                    check_required_fields(assay_table, config)  # Rule 4003-8, 4010
+                                    logger.info("Checking generic fields...")
+                                    if not check_field_values(assay_table, config):  # Rule 4011
+                                        logger.warn(
+                                            "(W) There are some field value inconsistencies in {} against {} configuration".format(
+                                                assay_table.filename, (measurement_type, technology_type)))
+                                    logger.info("Checking unit fields...")
+                                    if not check_unit_field(assay_table, config):
+                                        logger.warn(
+                                            "(W) There are some unit value inconsistencies in {} against {} configuration".format(
+                                                assay_table.filename, (measurement_type, technology_type)))
+                                    logger.info("Checking protocol fields...")
+                                    if not check_protocol_fields(assay_table, config, protocol_names_and_types):  # Rule 4009
+                                        logger.warn("(W) There are some protocol inconsistencies in {} against {} "
+                                                    "configuration".format(assay_table.filename, (measurement_type, technology_type)))
+                                    logger.info("Checking ontology fields...")
+                                    if not check_ontology_fields(assay_table, config):  # Rule 3010
+                                        logger.warn("(W) There are some ontology annotation inconsistencies in {} against {} "
+                                                    "configuration".format(assay_table.filename, (measurement_type, technology_type)))
+                                    logger.info("Finished validation on {}".format(assay_filename))
+                            except FileNotFoundError:
+                                pass
             if study_sample_table is not None:
                 logger.info("Checking consistencies between study sample table and assay tables...")
                 check_sample_names(study_sample_table, assay_tables)

From cd8aa7ec717bb78b24eb05073eea360fd9901ee8 Mon Sep 17 00:00:00 2001
From: David Johnson <david.johnson@oerc.ox.ac.uk>
Date: Thu, 22 Jun 2017 15:20:37 +0100
Subject: [PATCH 09/18] Add detection of m/t types; ensure Experimental Design
 cast to Study Design Type #219

---
 isatools/magetab.py | 62 ++++++++++++++++++++++++---------------------
 1 file changed, 33 insertions(+), 29 deletions(-)

diff --git a/isatools/magetab.py b/isatools/magetab.py
index 94295888..da957254 100644
--- a/isatools/magetab.py
+++ b/isatools/magetab.py
@@ -847,12 +847,12 @@ def get_single(values):
     except KeyError:
         pass
 
-    if len(experimental_designs) > 0:
-        S.comments.append(Comment(name="Experimental Design", value=';'.join(experimental_designs)))
-    if len(experimental_design_tsrs) > 0:
-        S.comments.append(Comment(name="Experimental Design Term Source REF", value=';'.join(experimental_design_tsrs)))
-    if len(experimental_design_tans) > 0:
-        S.comments.append(Comment(name="Experimental Design Term Accession Number", value=';'.join(experimental_design_tans)))
+    for design, tsr, tan in zip_longest(experimental_designs, experimental_design_tsrs, experimental_design_tans):
+        try:
+            ts = ts_dict[tsr]
+        except KeyError:
+            ts = None
+        S.design_descriptors.append(OntologyAnnotation(term=design, term_source=ts, term_accession=tan))
 
     # Experimental Factor section of IDF
 
@@ -1194,16 +1194,19 @@ def get_single(values):
     if "ArrayExpressAccession" in comments_dict.keys():
         S.identifier = comments_dict["ArrayExpressAccession"]  # ArrayExpress adds this, so use it as the study ID
 
-    design_type = None
+    design_types = None
 
-    if "AEExperimentType" in comments_dict.keys():
-        design_type = comments_dict["AEExperimentType"]
+    if "experimentaldesign" in squashed_table_dict.keys():
+        design_types = experimental_designs
+
+    elif "AEExperimentType" in comments_dict.keys():
+        design_types = [comments_dict["AEExperimentType"]]
 
-    inferred_t_type = None
     inferred_m_type = None
+    inferred_t_type = None
     inferred_t_plat = None
-    if design_type is not None:
-        inferred_t_type, inferred_m_type, inferred_t_plat = get_measurement_and_type(design_type=design_type)
+    if design_types is not None:
+        inferred_m_type, inferred_t_type, inferred_t_plat = get_measurement_and_tech(design_types=design_types)
 
     if sdrf_file is not None:
         S.filename = "s_{}".format(sdrf_file)
@@ -1240,20 +1243,21 @@ def get_single(values):
     return ISA
 
 
-def get_measurement_and_type(design_type):
-
-    if re.match("(?i).*ChIP-Chip.*", design_type):
-        return "protein-DNA binding site identification", "DNA microarray", "ChIP-Chip"
-    if re.match("(?i).*RNA-seq.*", design_type) or re.match("(?i).*RNA-Seq.*", design_type) or re.match(
-            "(?i).*transcription profiling by high throughput sequencing.*", design_type):
-        return "transcription profiling", "nucleotide sequencing", "RNA-Seq"
-    if re.match(".*transcription profiling by array.*", design_type) or re.match("dye_swap_design", design_type):
-        return "transcription profiling", "DNA microarray", "GeneChip"
-    if re.match("(?i).*methylation profiling by array.*", design_type):
-        return "DNA methylation profiling", "DNA microarray", "Me-Chip"
-    if re.match("(?i).*comparative genomic hybridization by array.*", design_type):
-        return "comparative genomic hybridization", "DNA microarray", "CGH-Chip"
-    if re.match(".*genotyping by array.*", design_type):
-        return "SNP analysis", "DNA microarray", "SNPChip"
-    if re.match("(?i).*ChIP-Seq.*", design_type) or re.match("(?i).*chip-seq.*", design_type):
-        return "protein-DNA binding site identification", "nucleotide sequencing", "ChIP-Seq"
+def get_measurement_and_tech(design_types):
+    """Map design type strings to (measurement type, technology type, technology platform).
+
+    design_types: iterable of str, checked in order; within each, rules apply top to bottom.
+    Returns the first matching 3-tuple, or (None, None, None) so callers can always unpack it.
+    """
+    array, seq = "DNA microarray", "nucleotide sequencing"
+    rules = (  # (regex, measurement type, technology type, technology platform)
+        ("(?i).*ChIP-Chip.*", "protein-DNA binding site identification", array, "ChIP-Chip"),
+        ("(?i).*(RNA-seq|transcription profiling by high throughput sequencing).*",
+         "transcription profiling", seq, "RNA-Seq"),
+        (".*transcription profiling by array.*|dye_swap_design", "transcription profiling", array, "GeneChip"),
+        ("(?i).*methylation profiling by array.*", "DNA methylation profiling", array, "Me-Chip"),
+        ("(?i).*comparative genomic hybridization by array.*", "comparative genomic hybridization", array, "CGH-Chip"),
+        (".*genotyping by array.*", "SNP analysis", array, "SNPChip"),
+        ("(?i).*ChIP-Seq.*", "protein-DNA binding site identification", seq, "ChIP-Seq"),
+    )
+    return next((r[1:] for d in design_types for r in rules if re.match(r[0], d)), (None, None, None))

From 3af8da2e8b134e327d8fbefa27fbbc07771c0633 Mon Sep 17 00:00:00 2001
From: David Johnson <david.johnson@oerc.ox.ac.uk>
Date: Thu, 22 Jun 2017 15:32:44 +0100
Subject: [PATCH 10/18] Implement tests for ArrayExpress IO; closes  #220

---
 tests/test_ax.py | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 tests/test_ax.py

diff --git a/tests/test_ax.py b/tests/test_ax.py
new file mode 100644
index 00000000..8dfb7db3
--- /dev/null
+++ b/tests/test_ax.py
@@ -0,0 +1,45 @@
+import unittest
+from unittest.mock import patch, mock_open
+from isatools.io import ax as AX
+import shutil
+import os
+
+
+class TestArrayExpressIO(unittest.TestCase):
+
+    def setUp(self):
+        pass  # TODO: detect if ArrayExpress is reachable. If so, run test of real server, otherwise run Mocks only?
+
+    def tearDown(self):
+        pass
+
+    """Mock-only test on E-AFMX1"""
+    @patch('ftplib.FTP', autospec=True)
+    def test_get_experiment(self, mock_ftp_constructor):
+        mock_ftp = mock_ftp_constructor.return_value
+        mock_ftp.login.return_value = '230'  # means login OK
+        tmp_dir = AX.get('E-AFMX-1')  # ftplib.FTP is patched, so this should talk to the mock, not the real server
+        self.assertTrue(mock_ftp.login.called)
+        mock_ftp_constructor.assert_called_with('ftp.ebi.ac.uk')
+        mock_ftp.cwd.assert_called_with('/pub/databases/arrayexpress/data/experiment/AFMX/E-AFMX-1')
+        shutil.rmtree(tmp_dir)
+
+    """Tries to do actual call on ArrayExpress; uses E-AFMX-1 as not so big"""
+    def test_get_experiment_as_magetab(self):
+        tmp_dir = AX.get('E-AFMX-1')  # gets E-AFMX-1 MAGE-TAB files
+        self.assertEqual(len(os.listdir(tmp_dir)), 2)
+        self.assertSetEqual(set(os.listdir(tmp_dir)), {'E-AFMX-1.sdrf.txt', 'E-AFMX-1.idf.txt'})
+        shutil.rmtree(tmp_dir)
+
+    def test_get_experiment_as_isatab(self):
+        tmp_dir = AX.get_isatab('E-AFMX-1')  # gets E-AFMX-1 as ISA-Tab files (investigation, study, assay)
+        self.assertEqual(len(os.listdir(tmp_dir)), 3)
+        self.assertSetEqual(set(os.listdir(tmp_dir)), {'i_investigation.txt', 'a_E-AFMX-1.sdrf.txt',
+                                                       's_E-AFMX-1.sdrf.txt'})
+        shutil.rmtree(tmp_dir)
+
+    def test_get_experiment_as_json(self):
+        isa_json = AX.getj('E-AFMX-1')  # loads E-AFMX-1 study into ISA-JSON
+        self.assertIsInstance(isa_json, dict)
+        self.assertEqual(isa_json['identifier'], 'E-AFMX-1')
+        self.assertEqual(isa_json['studies'][0]['people'][0]['email'], 'khaitovich@eva.mpg.de')

From ef5520f1ac345be7863f3483c49779cc2eca8d17 Mon Sep 17 00:00:00 2001
From: David Johnson <david.johnson@oerc.ox.ac.uk>
Date: Thu, 22 Jun 2017 16:17:23 +0100
Subject: [PATCH 11/18] Add missing file from last commit to close #220

---
 isatools/io/ax.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/isatools/io/ax.py b/isatools/io/ax.py
index 4ff51815..2d7aaf9f 100644
--- a/isatools/io/ax.py
+++ b/isatools/io/ax.py
@@ -6,7 +6,7 @@
 from isatools.convert import magetab2isatab, magetab2json
 
 EBI_FTP_SERVER = 'ftp.ebi.ac.uk'
-AX_EXPERIMENT_BASE_DIR = '/pub/databases/arrayexpress/data/experiment/'
+AX_EXPERIMENT_BASE_DIR = '/pub/databases/arrayexpress/data/experiment'
 
 logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -43,8 +43,8 @@ def get(arrayexpress_id, target_dir=None):
             logging.info("Using directory '{}'".format(target_dir))
             idf_filename = "{}.idf.txt".format(arrayexpress_id)
             with open(os.path.join(target_dir, idf_filename), 'wb') as out_file:
-                logging.info("Retrieving file '{}'".format(EBI_FTP_SERVER + AX_EXPERIMENT_BASE_DIR + '/' + exp_type + '/' +
-                                                           arrayexpress_id + '/' + idf_filename))
+                logging.info("Retrieving file '{}'".format(EBI_FTP_SERVER + AX_EXPERIMENT_BASE_DIR + '/' + exp_type +
+                                                           '/' + arrayexpress_id + '/' + idf_filename))
                 ftp.retrbinary('RETR ' + idf_filename, out_file.write)
             sdrf_filename = "{}.sdrf.txt".format(arrayexpress_id)
             with open(os.path.join(target_dir, sdrf_filename), 'wb') as out_file:

From 588a91974ee093c42d13d648ffe72d651131c8ad Mon Sep 17 00:00:00 2001
From: David Johnson <david.johnson@oerc.ox.ac.uk>
Date: Thu, 22 Jun 2017 17:12:36 +0100
Subject: [PATCH 12/18] Work towards #219

---
 isatools/magetab.py | 37 +++++++++++++++++++++++++++----------
 isatools/utils.py   |  3 ++-
 tests/test_ax.py    |  2 +-
 3 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/isatools/magetab.py b/isatools/magetab.py
index da957254..478188d7 100644
--- a/isatools/magetab.py
+++ b/isatools/magetab.py
@@ -500,29 +500,34 @@ def get_first_node_index(header):
 def split_tables(sdrf_path):
 
     def split_on_sample(sdrf_df):
-        sdrf_df_isatab_header = sdrf_df.isatab_header
         sdrf_df_cols = list(sdrf_df.columns)
+
         sample_name_index = sdrf_df_cols.index("Sample Name")
+
         study_df = sdrf_df[sdrf_df.columns[0:sample_name_index + 1]].drop_duplicates()
-        study_df.isatab_header = sdrf_df_isatab_header[0:sample_name_index + 1]
+        study_df.isatab_header = sdrf_df.isatab_header[0:sample_name_index + 1]
+
         assay_df = sdrf_df[sdrf_df.columns[sample_name_index:]]
-        assay_df.isatab_header = sdrf_df_isatab_header[sample_name_index:]
+        assay_df.isatab_header = sdrf_df.isatab_header[sample_name_index:]
+
         return study_df, assay_df
 
     sdrf_df = isatab.read_tfile(sdrf_path)
 
-    if "Sample Name" in sdrf_df.columns:
+    sdrf_columns = list(sdrf_df.columns)
+    if "Hybridization Name" in sdrf_columns:
+        sdrf_df.columns = [x.replace("Hybridization Name", "Hybridization Assay Name") for x in sdrf_columns]
+
+    if "Sample Name" in list(sdrf_df.columns):
         return split_on_sample(sdrf_df)
     else:  # insert Sample Name
         sdrf_df_columns = list(sdrf_df.columns)
         sdrf_df["Sample Name"] = sdrf_df[sdrf_df_columns[get_first_node_index(sdrf_df_columns)]]
-        sdrf_df_isatab_header = sdrf_df.isatab_header
-        sdrf_df_isatab_header.insert(get_first_node_index(sdrf_df_columns), "Sample Name")
+        sdrf_df.isatab_header.insert(get_first_node_index(sdrf_df_columns), "Sample Name")
 
         sdrf_df_columns.insert(get_first_node_index(sdrf_df_columns), "Sample Name")
 
         sdrf_df = sdrf_df[sdrf_df_columns]
-        sdrf_df.isatab_header = sdrf_df_isatab_header
 
         return split_on_sample(sdrf_df)
 
@@ -1233,9 +1238,21 @@ def get_single(values):
             print("Detected probable '{}' technology platform".format(inferred_t_plat))
             tp = inferred_t_plat
 
-        S.assays = [
-            Assay(filename=a_filename, technology_type=ttoa, measurement_type=mtoa, technology_platform=tp)
-        ]
+        A = Assay(filename=a_filename, technology_type=ttoa, measurement_type=mtoa, technology_platform=tp)
+
+        if (A.measurement_type, A.technology_type) in [
+            ("transcription profiling", "nucleotide sequencing"),
+            ("protein-DNA binding site identification", "nucleotide sequencing")
+        ]:
+            if "library construction" not in [x.name for x in S.protocols]:
+                logger.info("PROTOCOL INSERTION: {}, library construction".format(a_filename))
+                S.protocols.append(Protocol(name="library construction",
+                                            protocol_type=OntologyAnnotation(term="library construction")))
+            if "nucleic acid sequencing" not in [x.name for x in S.protocols]:
+                logger.info("PROTOCOL INSERTION: {}, nucleic acid sequencing".format(a_filename))
+                S.protocols.append(Protocol(name="nucleic acid sequencing",
+                                            protocol_type=OntologyAnnotation(term="nucleic acid sequencing")))
+        S.assays = [A]
 
     ISA.identifier = S.identifier
     ISA.title = S.title
diff --git a/isatools/utils.py b/isatools/utils.py
index 8533da24..2ca050e5 100644
--- a/isatools/utils.py
+++ b/isatools/utils.py
@@ -31,7 +31,8 @@ def detect_graph_process_pooling(G):
     report = list()
     for process in [n for n in G.nodes() if isinstance(n, Process)]:
         if len(G.in_edges(process)) > 1:
-            print("Possible process pooling detected on: ", process.id)
+            print("Possible process pooling detected on: {}"
+                  .format(' '.join([process.id, process.executes_protocol.name])))
             report.append(process.id)
     return report
 
diff --git a/tests/test_ax.py b/tests/test_ax.py
index 8dfb7db3..8ab297e9 100644
--- a/tests/test_ax.py
+++ b/tests/test_ax.py
@@ -24,7 +24,7 @@ def test_get_experiment(self, mock_ftp_constructor):
         mock_ftp.cwd.assert_called_with('/pub/databases/arrayexpress/data/experiment/AFMX/E-AFMX-1')
         shutil.rmtree(tmp_dir)
 
-    """Tries to do actual call on ArrayExpress; uses E-AFMX-1 as not so big"""
+    """Tries to do actual call on ArrayExpress; uses E-AFMX-1"""
     def test_get_experiment_as_magetab(self):
         tmp_dir = AX.get('E-AFMX-1')  # gets E-AFMX-1 MAGE-TAB files
         self.assertEqual(len(os.listdir(tmp_dir)), 2)

From 1510072f8a542734a2ae8dab78fd5dfd4aaecdf6 Mon Sep 17 00:00:00 2001
From: David Johnson <david.johnson@oerc.ox.ac.uk>
Date: Thu, 22 Jun 2017 17:12:57 +0100
Subject: [PATCH 13/18] Work towards #219

---
 tests/test_magetab2isatab.py | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/tests/test_magetab2isatab.py b/tests/test_magetab2isatab.py
index eb2da96a..c28f7700 100644
--- a/tests/test_magetab2isatab.py
+++ b/tests/test_magetab2isatab.py
@@ -4,6 +4,8 @@
 from isatools.convert import magetab2isatab
 from tests import utils
 import tempfile
+from isatools.io import ax as AX
+from isatools import isatab
 
 
 def setUpModule():
@@ -37,10 +39,25 @@ def test_magetab2isatab_convert_e_mexp_31(self):
 
     def test_magetab2isatab_convert_e_geod_59671(self):
         with open(os.path.join(self._magetab_data_dir, 'E-GEOD-59671.idf.txt')) as idf_fp:
-            magetab2isatab.convert(idf_fp, self._tmp_dir, 'DNA microarray', 'expression profiling')
-            self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir, 'i_investigation.txt')))
-            self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir, 's_E-GEOD-59671.sdrf.txt')))
-            self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir, 'a_E-GEOD-59671.sdrf.txt')))
+            magetab2isatab.convert(idf_fp, self._tmp_dir2)
+            self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir2, 'i_investigation.txt')))
+            self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir2, 's_E-GEOD-59671.sdrf.txt')))
+            self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir2, 'a_E-GEOD-59671.sdrf.txt')))
             from isatools import isatab
             with open(os.path.join(self._tmp_dir, 'i_investigation.txt')) as i_fp:
-                isatab.validate(i_fp)
\ No newline at end of file
+                isatab.validate(i_fp)
+
+    def test_get_experiment_as_isatab_afmx_1(self):
+        AX.get_isatab('E-AFMX-1', self._tmp_dir)  # gets E-AFMX-1 MAGE-TAB files
+        with open(os.path.join(self._tmp_dir, 'i_investigation.txt')) as i_fp:
+            isatab.validate(i_fp)
+
+    def test_get_experiment_as_isatab_afmx_2(self):
+        AX.get_isatab('E-AFMX-2', self._tmp_dir)  # gets E-AFMX-2 MAGE-TAB files
+        with open(os.path.join(self._tmp_dir2, 'i_investigation.txt')) as i_fp:
+            isatab.validate(i_fp)
+
+    def test_get_experiment_as_isatab_afmx_3(self):
+        AX.get_isatab('E-AFMX-3', self._tmp_dir)  # gets E-AFMX-3 MAGE-TAB files
+        with open(os.path.join(self._tmp_dir, 'i_investigation.txt')) as i_fp:
+            isatab.validate(i_fp)

From 0311dd0174ca5c94bde4358d442fa57a198d3507 Mon Sep 17 00:00:00 2001
From: David Johnson <david.johnson@oerc.ox.ac.uk>
Date: Thu, 22 Jun 2017 17:13:29 +0100
Subject: [PATCH 14/18] Work towards #219

---
 tests/test_magetab2isatab.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test_magetab2isatab.py b/tests/test_magetab2isatab.py
index c28f7700..f9fd146e 100644
--- a/tests/test_magetab2isatab.py
+++ b/tests/test_magetab2isatab.py
@@ -39,10 +39,10 @@ def test_magetab2isatab_convert_e_mexp_31(self):
 
     def test_magetab2isatab_convert_e_geod_59671(self):
         with open(os.path.join(self._magetab_data_dir, 'E-GEOD-59671.idf.txt')) as idf_fp:
-            magetab2isatab.convert(idf_fp, self._tmp_dir2)
-            self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir2, 'i_investigation.txt')))
-            self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir2, 's_E-GEOD-59671.sdrf.txt')))
-            self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir2, 'a_E-GEOD-59671.sdrf.txt')))
+            magetab2isatab.convert(idf_fp, self._tmp_dir)
+            self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir, 'i_investigation.txt')))
+            self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir, 's_E-GEOD-59671.sdrf.txt')))
+            self.assertTrue(os.path.isfile(os.path.join(self._tmp_dir, 'a_E-GEOD-59671.sdrf.txt')))
             from isatools import isatab
             with open(os.path.join(self._tmp_dir, 'i_investigation.txt')) as i_fp:
                 isatab.validate(i_fp)
@@ -54,7 +54,7 @@ def test_get_experiment_as_isatab_afmx_1(self):
 
     def test_get_experiment_as_isatab_afmx_2(self):
         AX.get_isatab('E-AFMX-2', self._tmp_dir)  # gets E-AFMX-2 MAGE-TAB files
-        with open(os.path.join(self._tmp_dir2, 'i_investigation.txt')) as i_fp:
+        with open(os.path.join(self._tmp_dir, 'i_investigation.txt')) as i_fp:
             isatab.validate(i_fp)
 
     def test_get_experiment_as_isatab_afmx_3(self):

From 8e7e2620e75a058677c43859af7cfe5c0215d8c3 Mon Sep 17 00:00:00 2001
From: David Johnson <david.johnson@oerc.ox.ac.uk>
Date: Thu, 22 Jun 2017 17:34:58 +0100
Subject: [PATCH 15/18] Work towards #219

---
 isatools/convert/magetab2isatab.py |  3 ++-
 isatools/magetab.py                | 15 +++++++--------
 tests/test_magetab2isatab.py       |  7 ++++++-
 3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/isatools/convert/magetab2isatab.py b/isatools/convert/magetab2isatab.py
index b5fcacfb..9223fef8 100644
--- a/isatools/convert/magetab2isatab.py
+++ b/isatools/convert/magetab2isatab.py
@@ -22,7 +22,8 @@ def convert(source_idf_fp, output_path, technology_type=None, measurement_type=N
     for _, row in df.iterrows():
         sdrf_file = row["SDRF File"]
         if isinstance(sdrf_file, str):
-            study_df, assay_df = magetab.split_tables(sdrf_path=os.path.join(os.path.dirname(source_idf_fp.name), sdrf_file))
+            study_df, assay_df = magetab.split_tables(sdrf_path=os.path.join(os.path.dirname(source_idf_fp.name),
+                                                                             sdrf_file))
             study_df.columns = study_df.isatab_header
             assay_df.columns = assay_df.isatab_header
             # write out ISA table files
diff --git a/isatools/magetab.py b/isatools/magetab.py
index 478188d7..071f87a8 100644
--- a/isatools/magetab.py
+++ b/isatools/magetab.py
@@ -500,16 +500,13 @@ def get_first_node_index(header):
 def split_tables(sdrf_path):
 
     def split_on_sample(sdrf_df):
+        sdrf_df_isatab_header = sdrf_df.isatab_header
         sdrf_df_cols = list(sdrf_df.columns)
-
         sample_name_index = sdrf_df_cols.index("Sample Name")
-
         study_df = sdrf_df[sdrf_df.columns[0:sample_name_index + 1]].drop_duplicates()
-        study_df.isatab_header = sdrf_df.isatab_header[0:sample_name_index + 1]
-
+        study_df.isatab_header = sdrf_df_isatab_header[0:sample_name_index + 1]
         assay_df = sdrf_df[sdrf_df.columns[sample_name_index:]]
-        assay_df.isatab_header = sdrf_df.isatab_header[sample_name_index:]
-
+        assay_df.isatab_header = sdrf_df_isatab_header[sample_name_index:]
         return study_df, assay_df
 
     sdrf_df = isatab.read_tfile(sdrf_path)
@@ -518,16 +515,18 @@ def split_on_sample(sdrf_df):
     if "Hybridization Name" in sdrf_columns:
         sdrf_df.columns = [x.replace("Hybridization Name", "Hybridization Assay Name") for x in sdrf_columns]
 
-    if "Sample Name" in list(sdrf_df.columns):
+    if "Sample Name" in sdrf_df.columns:
         return split_on_sample(sdrf_df)
     else:  # insert Sample Name
         sdrf_df_columns = list(sdrf_df.columns)
         sdrf_df["Sample Name"] = sdrf_df[sdrf_df_columns[get_first_node_index(sdrf_df_columns)]]
-        sdrf_df.isatab_header.insert(get_first_node_index(sdrf_df_columns), "Sample Name")
+        sdrf_df_isatab_header = sdrf_df.isatab_header
+        sdrf_df_isatab_header.insert(get_first_node_index(sdrf_df_columns), "Sample Name")
 
         sdrf_df_columns.insert(get_first_node_index(sdrf_df_columns), "Sample Name")
 
         sdrf_df = sdrf_df[sdrf_df_columns]
+        sdrf_df.isatab_header = sdrf_df_isatab_header
 
         return split_on_sample(sdrf_df)
 
diff --git a/tests/test_magetab2isatab.py b/tests/test_magetab2isatab.py
index f9fd146e..db7f2fe8 100644
--- a/tests/test_magetab2isatab.py
+++ b/tests/test_magetab2isatab.py
@@ -52,7 +52,7 @@ def test_get_experiment_as_isatab_afmx_1(self):
         with open(os.path.join(self._tmp_dir, 'i_investigation.txt')) as i_fp:
             isatab.validate(i_fp)
 
-    def test_get_experiment_as_isatab_afmx_2(self):
+    def test_get_experiment_as_isatab_afmx_2(self):  # FIXME -> output ISA-Tab has many missing cells! WHY!?!?
         AX.get_isatab('E-AFMX-2', self._tmp_dir)  # gets E-AFMX-2 MAGE-TAB files
         with open(os.path.join(self._tmp_dir, 'i_investigation.txt')) as i_fp:
             isatab.validate(i_fp)
@@ -61,3 +61,8 @@ def test_get_experiment_as_isatab_afmx_3(self):
         AX.get_isatab('E-AFMX-3', self._tmp_dir)  # gets E-AFMX-3 MAGE-TAB files
         with open(os.path.join(self._tmp_dir, 'i_investigation.txt')) as i_fp:
             isatab.validate(i_fp)
+
+    def test_get_experiment_as_isatab_afmx_5(self):
+        AX.get_isatab('E-AFMX-5', self._tmp_dir)  # expected to write E-AFMX-5 as ISA-Tab files into _tmp_dir
+        with open(os.path.join(self._tmp_dir, 'i_investigation.txt')) as i_fp:
+            isatab.validate(i_fp)

From d45be02f988c6a04ce5089f07b4a784e70ce32ea Mon Sep 17 00:00:00 2001
From: David Johnson <david.johnson@oerc.ox.ac.uk>
Date: Fri, 23 Jun 2017 09:39:37 +0100
Subject: [PATCH 16/18] Fixes #221

---
 isatools/isatab.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/isatools/isatab.py b/isatools/isatab.py
index cb0b04d1..df76affc 100644
--- a/isatools/isatab.py
+++ b/isatools/isatab.py
@@ -2941,12 +2941,12 @@ def process_keygen(protocol_ref, column_group, object_label_index, all_columns,
     output_node_index = find_gt(node_cols, object_label_index)
     if output_node_index > -1:
         output_node_label = all_columns[output_node_index]
-        output_node_value = series[output_node_label]
+        output_node_value = str(series[output_node_label])
 
     input_node_index = find_lt(node_cols, object_label_index)
     if input_node_index > -1:
         input_node_label = all_columns[input_node_index]
-        input_node_value = series[input_node_label]
+        input_node_value = str(series[input_node_label])
 
     input_nodes_with_prot_keys = DF[[all_columns[object_label_index], all_columns[input_node_index]]].drop_duplicates()
     output_nodes_with_prot_keys = DF[[all_columns[object_label_index], all_columns[output_node_index]]].drop_duplicates()
@@ -3203,7 +3203,7 @@ def create_from_df(self, DF):  # from DF of a table file
             if self.samples is not None:
                 sample_map = dict(map(lambda x: ('Sample Name:' + x.name, x), self.samples))
                 sample_keys = list(map(lambda x: 'Sample Name:' + x,
-                                   [x for x in DF['Sample Name'].drop_duplicates() if x != '']))
+                                   [str(x) for x in DF['Sample Name'].drop_duplicates() if x != '']))
                 for k in sample_keys:
                     try:
                         samples[k] = sample_map[k]
@@ -3211,7 +3211,7 @@ def create_from_df(self, DF):  # from DF of a table file
                         print('warning! Did not find sample referenced at assay level in study samples')
             else:
                 samples = dict(map(lambda x: ('Sample Name:' + x, Sample(name=x)),
-                               [x for x in DF['Sample Name'].drop_duplicates() if x != '']))
+                               [str(x) for x in DF['Sample Name'].drop_duplicates() if x != '']))
         except KeyError:
             pass
 
@@ -3280,7 +3280,7 @@ def get_node_by_label_and_key(l, k):
                                                                                   ETA()]).start()
 
                 for _, object_series in pbar(DF[column_group].drop_duplicates().iterrows()):
-                    node_name = object_series[object_label]
+                    node_name = str(object_series[object_label])
                     node_key = ":".join([object_label, node_name])
                     material = None
                     if object_label == "Source Name":
@@ -3357,7 +3357,7 @@ def get_node_by_label_and_key(l, k):
 
                 for _, object_series in pbar(DF[column_group].drop_duplicates().iterrows()):
                     try:
-                        data_file = get_node_by_label_and_key(object_label, object_series[object_label])
+                        data_file = get_node_by_label_and_key(object_label, str(object_series[object_label]))
                         for comment_column in [c for c in column_group if c.startswith('Comment[')]:
                             if comment_column[8:-1] not in [x.name for x in data_file.comments]:
                                 data_file.comments.append(Comment(name=comment_column[8:-1], value=str(object_series[comment_column])))
@@ -3375,7 +3375,7 @@ def get_node_by_label_and_key(l, k):
                 for _, object_series in pbar(DF.iterrows()):  # don't drop duplicates
                     # if _ == 0:
                     #     print('processing: ', object_series[object_label])
-                    protocol_ref = object_series[object_label]
+                    protocol_ref = str(object_series[object_label])
                     process_key = process_keygen(protocol_ref, column_group, _cg, DF.columns, object_series, _, DF)
 
                     # TODO: Keep process key sequence here to reduce number of passes on Protocol REF columns?
@@ -3392,7 +3392,7 @@ def get_node_by_label_and_key(l, k):
                     if output_proc_index < output_node_index > -1:
 
                         output_node_label = DF.columns[output_node_index]
-                        output_node_value = object_series[output_node_label]
+                        output_node_value = str(object_series[output_node_label])
 
                         node_key = output_node_value
 
@@ -3413,7 +3413,7 @@ def get_node_by_label_and_key(l, k):
                     if input_proc_index < input_node_index > -1:
 
                         input_node_label = DF.columns[input_node_index]
-                        input_node_value = object_series[input_node_label]
+                        input_node_value = str(object_series[input_node_label])
 
                         node_key = input_node_value
 
@@ -3481,13 +3481,13 @@ def get_node_by_label_and_key(l, k):
 
                 if object_label.startswith('Source Name'):
                     try:
-                        source_node_context = get_node_by_label_and_key(object_label, object_series[object_label])
+                        source_node_context = get_node_by_label_and_key(object_label, str(object_series[object_label]))
                     except KeyError:
                         pass  # skip if object not found
 
                 if object_label.startswith('Sample Name'):
                     try:
-                        sample_node_context = get_node_by_label_and_key(object_label, object_series[object_label])
+                        sample_node_context = get_node_by_label_and_key(object_label, str(object_series[object_label]))
                     except KeyError:
                         pass  # skip if object not found
                     if source_node_context is not None:
@@ -3495,14 +3495,14 @@ def get_node_by_label_and_key(l, k):
                             sample_node_context.derives_from.append(source_node_context)
 
                 if object_label.startswith('Protocol REF'):
-                    protocol_ref = object_series[object_label]
+                    protocol_ref = str(object_series[object_label])
                     process_key = process_keygen(protocol_ref, column_group, _cg, DF.columns, object_series, _, DF)
                     process_key_sequence.append(process_key)
 
                 if object_label.endswith(' File'):
                     data_node = None
                     try:
-                        data_node = get_node_by_label_and_key(object_label, object_series[object_label])
+                        data_node = get_node_by_label_and_key(object_label, str(object_series[object_label]))
                     except KeyError:
                         pass  # skip if object not found
                     if sample_node_context is not None and data_node is not None:

From 1f16e42b2002cf2ff7bd46fa5a358ab0e87a645d Mon Sep 17 00:00:00 2001
From: David Johnson <david.johnson@oerc.ox.ac.uk>
Date: Fri, 23 Jun 2017 09:43:30 +0100
Subject: [PATCH 17/18] Tests #221; closes #221

---
 tests/test_mtbls.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/test_mtbls.py b/tests/test_mtbls.py
index 085e330f..79dc4579 100644
--- a/tests/test_mtbls.py
+++ b/tests/test_mtbls.py
@@ -57,4 +57,9 @@ def test_get_datafiles(self):
         factor_selection = {"genotype": "Col-0"}
         results = MTBLS.get_data_files('MTBLS2', factor_selection)
         self.assertEqual(len(results), 8)
-        self.assertEqual(len(results[0]['data_files']), 1)
\ No newline at end of file
+        self.assertEqual(len(results[0]['data_files']), 1)
+
+    def test_get_factors_summary(self):  # Test for issue #221
+        factors_summary = MTBLS.get_factors_summary('MTBLS26')
+        self.assertIsInstance(factors_summary, list)
+        self.assertEqual(len(factors_summary), 18)
\ No newline at end of file

From bb777f7995063410506faa814cad6eee9359a680 Mon Sep 17 00:00:00 2001
From: David Johnson <david.johnson@oerc.ox.ac.uk>
Date: Fri, 23 Jun 2017 10:17:03 +0100
Subject: [PATCH 18/18] Update version number for release; update known issues
 in docs

---
 docs/knownissues.rst | 2 +-
 setup.py             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/knownissues.rst b/docs/knownissues.rst
index a855994f..9f9e8d6c 100644
--- a/docs/knownissues.rst
+++ b/docs/knownissues.rst
@@ -4,7 +4,7 @@ Known issues
 
 isatools v0.8 package
 ---------------------
-- Issues #153 is still outstanding, as per below; new issues #205 (json2isatab conversion issue), #208 (ISA-Tab validation issue) and #218 (MAGE-TAB conversion issue)
+- Issue #153 is still outstanding, as per below; new issue #208 (ISA-Tab validation issue)
 - SRA/ENA importer and Biocrates importer relies on XSLT2 processing only available with SAXON and requires .jar file to run
 
 isatools v0.7 package
diff --git a/setup.py b/setup.py
index 3df50c14..40f472d5 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@
 
 setup(
     name='isatools',
-    version='0.8.1',
+    version='0.8.2',
     packages=['isatools', 'isatools.convert', 'isatools.io', 'isatools.model'],
     package_data={'isatools': ['schemas/cedar/*.json',
                                'schemas/isa_model_version_1_0_schemas/core/*.json',