Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
djcomlab committed Jun 23, 2017
2 parents 4634886 + bb777f7 commit fa286b6
Show file tree
Hide file tree
Showing 12 changed files with 399 additions and 115 deletions.
2 changes: 1 addition & 1 deletion docs/knownissues.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Known issues

isatools v0.8 package
---------------------
- Issue #153 is still outstanding, as per below; new issues #205 (json2isatab conversion issue), #208 (ISA-Tab validation issue) and #218 (MAGE-TAB conversion issue)
- Issue #153 is still outstanding, as per below; new issue #208 (ISA-Tab validation issue)
- The SRA/ENA importer and the Biocrates importer rely on XSLT2 processing, which is only available with SAXON, and require a .jar file to run

isatools v0.7 package
Expand Down
32 changes: 13 additions & 19 deletions isatools/convert/magetab2isatab.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
logger = logging.getLogger(__name__)


def convert(source_idf_fp, output_path, technology_type, measurement_type):
def convert(source_idf_fp, output_path, technology_type=None, measurement_type=None):
""" Converter for MAGE-TAB to ISA-Tab
:param source_idf_fp: File descriptor of input IDF file
:param output_dir: Path to directory to write output ISA-Tab files to
:param output_path: Path to directory to write output ISA-Tab files to
"""
df = pd.read_csv(source_idf_fp, names=range(0, 128), sep='\t', engine='python', encoding='utf-8', comment='#').dropna(axis=1, how='all')
df = df.T # transpose
Expand All @@ -22,7 +22,8 @@ def convert(source_idf_fp, output_path, technology_type, measurement_type):
for _, row in df.iterrows():
sdrf_file = row["SDRF File"]
if isinstance(sdrf_file, str):
study_df, assay_df = split_tables(sdrf_path=os.path.join(os.path.dirname(source_idf_fp.name), sdrf_file))
study_df, assay_df = magetab.split_tables(sdrf_path=os.path.join(os.path.dirname(source_idf_fp.name),
sdrf_file))
study_df.columns = study_df.isatab_header
assay_df.columns = assay_df.isatab_header
# write out ISA table files
Expand All @@ -45,19 +46,12 @@ def get_investigation_title(line, ISA):
ISA.title = value


def split_tables(sdrf_path):
    """
    Read an SDRF table and cut it into a study-level and an assay-level table.

    The cut is made at the first of the columns "Sample Name", "Extract Name" or
    "Labeled Extract Name" (tried in that order) that is present. That column is the
    last column of the study table and the first column of the assay table.

    :param sdrf_path: Path of the SDRF file; handed to isatab.read_tfile
    :return: Tuple (study_df, assay_df). study_df has duplicate rows dropped; each table
        carries the matching slice of the source table's isatab_header
    :raises magetab.MageTabParserException: if none of the three columns is present
    """
    sdrf_df = isatab.read_tfile(sdrf_path)
    full_header = sdrf_df.isatab_header
    column_names = list(sdrf_df.columns)
    for split_column in ("Sample Name", "Extract Name", "Labeled Extract Name"):
        if split_column in sdrf_df.columns:
            split_at = column_names.index(split_column)
            break
    else:
        # No candidate column matched: there is nothing to split on.
        raise magetab.MageTabParserException(
            "Could not split SDRF table as could not find suitable column to split on")
    study_end = split_at + 1  # the split column itself belongs to both tables
    study_df = sdrf_df[sdrf_df.columns[0:study_end]].drop_duplicates()
    study_df.isatab_header = full_header[0:study_end]
    assay_df = sdrf_df[sdrf_df.columns[split_at:]]
    assay_df.isatab_header = full_header[split_at:]
    return study_df, assay_df
def get_first_node_index(header):
    """
    Find the position of the first recognised node column in a table header.

    Every header entry is passed through magetab.squashstr and compared against the
    node names below, which are tried in the listed order (not in header order).

    :param header: Iterable of column header strings
    :return: Index into header of the first matching node name, or None if no node
        name is found
    """
    squashed = [magetab.squashstr(column) for column in header]
    node_names = ("samplename", "extractname", "labeledextractname", "hybridizationname", "assayname")
    for node_name in node_names:
        if node_name in squashed:
            return squashed.index(node_name)
    return None
16 changes: 10 additions & 6 deletions isatools/convert/magetab2json.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,15 @@
import shutil


def convert(source_idf_fp, technology_type, measurement_type):
def convert(source_idf_fp, technology_type=None, measurement_type=None):
tmp = tempfile.mkdtemp()
magetab2isatab.convert(source_idf_fp=source_idf_fp, output_path=tmp, technology_type=technology_type,
measurement_type=measurement_type)
with open(os.path.join(tmp, "i_investigation.txt")) as isa_inv_fp:
ISA = isatab.load(isa_inv_fp)
ISA = None
try:
magetab2isatab.convert(source_idf_fp=source_idf_fp, output_path=tmp, technology_type=technology_type,
measurement_type=measurement_type)
with open(os.path.join(tmp, "i_investigation.txt")) as isa_inv_fp:
ISA = isatab.load(isa_inv_fp)
finally:
shutil.rmtree(tmp)
return json.loads(json.dumps(ISA, cls=ISAJSONEncoder))
if ISA is not None:
return json.loads(json.dumps(ISA, cls=ISAJSONEncoder))
112 changes: 112 additions & 0 deletions isatools/io/ax.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import ftplib
import logging
import os
import tempfile
import shutil
from isatools.convert import magetab2isatab, magetab2json

# Host name of the FTP server that get() connects to.
EBI_FTP_SERVER = 'ftp.ebi.ac.uk'
# Server-side base directory; get() changes into <base>/<experiment type>/<experiment id> below it.
AX_EXPERIMENT_BASE_DIR = '/pub/databases/arrayexpress/data/experiment'

# NOTE: this configures the root logger as a side effect of importing this module
# (timestamp + level + message format, INFO threshold).
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


def get(arrayexpress_id, target_dir=None):
    """
    This function downloads MAGE-TAB content (the IDF and SDRF files) from the ArrayExpress FTP site.

    :param arrayexpress_id: Experiment identifier for ArrayExpress study to get, as a str (e.g. E-GEOD-59671).
        The second dash-separated token (e.g. GEOD) selects the sub-directory on the server.
    :param target_dir: Path to write MAGE-TAB files to. If None, writes to temporary directory (generated on the fly)
    :return: Path where the files were written to. When the server rejects a request (ftplib.error_perm) the
        error is logged and target_dir is still returned; it is None if no directory was given and the
        experiment directory could not be entered.
    :raises ConnectionError: if the anonymous login is not answered with a 230 reply
    :raises IndexError: if arrayexpress_id contains no '-' separator

    Example usage:
        from isatools.io import ax as AX
        AX.get('E-GEOD-59671', '/tmp/ax')
    """
    exp_type = arrayexpress_id.split('-')[1]
    remote_dir = '{base_dir}/{exp_type}/{arrayexpress_id}'.format(
        base_dir=AX_EXPERIMENT_BASE_DIR, exp_type=exp_type, arrayexpress_id=arrayexpress_id)

    logger.info("Setting up ftp with %s", EBI_FTP_SERVER)
    # The context manager closes the FTP connection on every exit path.
    with ftplib.FTP(EBI_FTP_SERVER) as ftp:
        logger.info("Logging in as anonymous user...")
        response = ftp.login()
        if '230' not in response:  # 230 means Login successful
            raise ConnectionError("There was a problem connecting to ArrayExpress: " + response)
        logger.info("Log in successful!")
        try:
            logger.info("Looking for experiment '%s'", arrayexpress_id)
            ftp.cwd(remote_dir)
            if target_dir is None:
                target_dir = tempfile.mkdtemp()
            logger.info("Using directory '%s'", target_dir)
            # The IDF is fetched first, then the SDRF.
            for suffix in ('idf', 'sdrf'):
                filename = "{}.{}.txt".format(arrayexpress_id, suffix)
                with open(os.path.join(target_dir, filename), 'wb') as out_file:
                    logger.info("Retrieving file '%s'", EBI_FTP_SERVER + remote_dir + '/' + filename)
                    ftp.retrbinary('RETR ' + filename, out_file.write)
        except ftplib.error_perm as ftperr:
            # Best effort: a permanent FTP error (e.g. unknown experiment) is reported, not raised.
            logger.critical("Could not retrieve ArrayExpress study '{study}': {error}".format(
                study=arrayexpress_id, error=ftperr))
    # Returned after the try block rather than from a `finally`, so that unexpected
    # errors (network failures, unwritable target directory, ...) reach the caller
    # instead of being silently discarded.
    return target_dir


def get_isatab(arrayexpress_id, target_dir=None):
    """
    Download an ArrayExpress experiment and write it out as ISA-Tab.

    The MAGE-TAB files are first fetched with get() into a scratch directory, then the
    IDF file is handed to magetab2isatab.convert. The scratch directory is always removed.
    Any exception raised along the way is logged and not re-raised.

    :param arrayexpress_id: Experiment identifier for ArrayExpress study to get, as a str (e.g. E-GEOD-59671)
    :param target_dir: Path to write ISA-Tab files to. If None, writes to temporary directory (generated on the fly)
    :return: Path where the files were written to (None if target_dir was None and the download failed
        before a directory was created)

    Example usage:
        from isatools.io import ax as AX
        AX.get_isatab('E-GEOD-59671', '/tmp/ax')
    """
    staging_dir = tempfile.mkdtemp()
    try:
        get(arrayexpress_id=arrayexpress_id, target_dir=staging_dir)
        if target_dir is None:
            target_dir = tempfile.mkdtemp()
        logging.info("Using directory '{}'".format(target_dir))
        idf_path = os.path.join(staging_dir, "{}.idf.txt".format(arrayexpress_id))
        with open(idf_path) as idf_fp:
            magetab2isatab.convert(source_idf_fp=idf_fp, output_path=target_dir)
    except Exception as err:
        logger.fatal("Something went wrong: {}".format(err))
    finally:
        # The downloaded MAGE-TAB files are only an intermediate product.
        shutil.rmtree(staging_dir)
    return target_dir


def getj(arrayexpress_id):
    """
    Download an ArrayExpress experiment and return it as ISA-JSON.

    The MAGE-TAB files are fetched with get() into a scratch directory and the IDF file
    is handed to magetab2json.convert. The scratch directory is always removed. Any
    exception raised along the way is logged and not re-raised.

    :param arrayexpress_id: Experiment identifier for ArrayExpress study to get, as a str (e.g. E-GEOD-59671)
    :return: ISA-JSON representation of the MAGE-TAB content, or None if something went wrong

    Example usage:
        from isatools.io import ax as AX
        my_json = AX.getj('E-GEOD-59671')
    """
    staging_dir = tempfile.mkdtemp()
    result = None
    try:
        get(arrayexpress_id=arrayexpress_id, target_dir=staging_dir)
        idf_path = os.path.join(staging_dir, "{}.idf.txt".format(arrayexpress_id))
        with open(idf_path) as idf_fp:
            result = magetab2json.convert(source_idf_fp=idf_fp)
    except Exception as err:
        logger.fatal("Something went wrong: {}".format(err))
    finally:
        # The downloaded MAGE-TAB files are only an intermediate product.
        shutil.rmtree(staging_dir)
    return result
15 changes: 7 additions & 8 deletions isatools/io/mtbls.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,10 @@
import glob
from isatools.convert import isatab2json
from isatools import isatab
from isatools.model.v1 import OntologyAnnotation, Process, ParameterValue
import networkx as nx
from isatools.model.v1 import OntologyAnnotation
import pandas as pd

MTBLS_FTP_SERVER = 'ftp.ebi.ac.uk'
EBI_FTP_SERVER = 'ftp.ebi.ac.uk'
MTBLS_BASE_DIR = '/pub/databases/metabolights/studies/public'
INVESTIGATION_FILENAME = 'i_Investigation.txt'

Expand All @@ -33,8 +32,8 @@ def get(mtbls_study_id, target_dir=None):
Example usage:
isa_json = MTBLS.get_study('MTBLS1', '/tmp/mtbls')
"""
logging.info("Setting up ftp with {}".format(MTBLS_FTP_SERVER))
ftp = ftplib.FTP(MTBLS_FTP_SERVER)
logging.info("Setting up ftp with {}".format(EBI_FTP_SERVER))
ftp = ftplib.FTP(EBI_FTP_SERVER)
logging.info("Logging in as anonymous user...")
response = ftp.login()
if '230' in response: # 230 means Login successful
Expand All @@ -46,7 +45,7 @@ def get(mtbls_study_id, target_dir=None):
target_dir = tempfile.mkdtemp()
logging.info("Using directory '{}'".format(target_dir))
out_file = open(os.path.join(target_dir, INVESTIGATION_FILENAME), 'wb')
logging.info("Retrieving file '{}'".format(MTBLS_FTP_SERVER + MTBLS_BASE_DIR + '/' + mtbls_study_id + '/' + INVESTIGATION_FILENAME))
logging.info("Retrieving file '{}'".format(EBI_FTP_SERVER + MTBLS_BASE_DIR + '/' + mtbls_study_id + '/' + INVESTIGATION_FILENAME))
ftp.retrbinary('RETR ' + INVESTIGATION_FILENAME, out_file.write)
with open(out_file.name, encoding='utf-8') as i_fp:
i_bytes = i_fp.read()
Expand All @@ -55,14 +54,14 @@ def get(mtbls_study_id, target_dir=None):
for s_filename in s_filenames:
out_file = open(os.path.join(target_dir, s_filename), 'wb')
logging.info("Retrieving file '{}'".format(
MTBLS_FTP_SERVER + MTBLS_BASE_DIR + '/' + mtbls_study_id + '/' + s_filename))
EBI_FTP_SERVER + MTBLS_BASE_DIR + '/' + mtbls_study_id + '/' + s_filename))
ftp.retrbinary('RETR ' + s_filename, out_file.write)
a_filenames_lines = [l.split('\t') for l in lines if 'Study Assay File Name' in l]
for a_filename_line in a_filenames_lines:
for a_filename in [f[1:-1] for f in a_filename_line[1:]]:
out_file = open(os.path.join(target_dir, a_filename), 'wb')
logging.info("Retrieving file '{}'".format(
MTBLS_FTP_SERVER + MTBLS_BASE_DIR + '/' + mtbls_study_id + '/' + a_filename))
EBI_FTP_SERVER + MTBLS_BASE_DIR + '/' + mtbls_study_id + '/' + a_filename))
ftp.retrbinary('RETR ' + a_filename, out_file.write)
except ftplib.error_perm as ftperr:
logger.fatal("Could not retrieve MetaboLights study '{study}': {error}".format(study=mtbls_study_id, error=ftperr))
Expand Down
Loading

0 comments on commit fa286b6

Please sign in to comment.