diff --git a/bin/create_metadata_table.py b/bin/create_metadata_table.py index 26191c8d..85a94386 100755 --- a/bin/create_metadata_table.py +++ b/bin/create_metadata_table.py @@ -27,7 +27,7 @@ from retry import retry from assembly_stats import run_assembly_stats -from get_ENA_metadata import get_location, load_xml, load_gca_json, get_gca_location +from get_ENA_metadata import get_location, load_xml, get_gca_location logging.basicConfig(level=logging.INFO) @@ -115,8 +115,8 @@ def load_geography(geofile): def get_metadata(acc): location = None - project = None - biosample = None + project = "N/A" + biosample = "N/A" if acc.startswith("ERZ"): json_data_erz = load_xml(acc) biosample = json_data_erz["ANALYSIS_SET"]["ANALYSIS"]["SAMPLE_REF"][ @@ -128,44 +128,29 @@ def get_metadata(acc): elif acc.startswith("GUT"): pass elif acc.startswith("GCA"): - json_data_gca = load_xml(acc) - biosample = json_data_gca["ASSEMBLY_SET"]["ASSEMBLY"]["SAMPLE_REF"]["IDENTIFIERS"]["PRIMARY_ID"] - project = json_data_gca["ASSEMBLY_SET"]["ASSEMBLY"]["STUDY_REF"]["IDENTIFIERS"]["PRIMARY_ID"] + try: + json_data_gca = load_xml(acc) + biosample = json_data_gca["ASSEMBLY_SET"]["ASSEMBLY"]["SAMPLE_REF"]["IDENTIFIERS"]["PRIMARY_ID"] + project = json_data_gca["ASSEMBLY_SET"]["ASSEMBLY"]["STUDY_REF"]["IDENTIFIERS"]["PRIMARY_ID"] + except: + logging.info("Missing metadata in ENA XML for sample {}. Using API instead.".format(acc)) + try: + biosample, project = ena_api_request(acc) + except: + logging.exception("Could not obtain biosample and project information for {}".format(acc)) else: - if acc.startswith("CA"): - acc = acc + "0" * 7 - r = run_request(acc, "https://www.ebi.ac.uk/ena/browser/api/embl") - if r.ok: - match_pr = re.findall("PR +Project: *(PRJ[A-Z0-9]+)", r.text) - if match_pr: - project = match_pr[0] - else: - project = "" - match_samp = re.findall("DR +BioSample; ([A-Z0-9]+)", r.text) - if match_samp: - biosample = match_samp[0] - else: - biosample = "" - else: - logging.error("Cannot obtain metadata from ENA") - sys.exit() + biosample, project = ena_api_request(acc) if not acc.startswith("GUT"): if acc.startswith("GCA"): location = get_gca_location(biosample) else: location = get_location(biosample) if not location: + logging.warning("Unable to obtain location for sample {}".format(biosample)) location = "not provided" if not acc.startswith("GUT"): if acc.startswith("GCA"): - json_data_sample = load_xml(acc) converted_sample = biosample - try: - project = json_data_sample["ASSEMBLY_SET"]["ASSEMBLY"]["STUDY_REF"][ - "IDENTIFIERS" - ]["PRIMARY_ID"] - except: - project = "N/A" else: json_data_sample = load_xml(biosample) try: @@ -191,6 +176,24 @@ def get_metadata(acc): return converted_sample, converted_project, location +def ena_api_request(acc): + biosample = project = "" + if acc.startswith("CA"): + acc = acc + "0" * 7 + r = run_request(acc, "https://www.ebi.ac.uk/ena/browser/api/embl") + if r.ok: + match_pr = re.findall("PR +Project: *(PRJ[A-Z0-9]+)", r.text) + if match_pr: + project = match_pr[0] + match_samp = re.findall("DR +BioSample; ([A-Z0-9]+)", r.text) + if match_samp: + biosample = match_samp[0] + else: + logging.error("Cannot obtain metadata from ENA") + sys.exit() + return biosample, project + + @retry(tries=5, delay=10, backoff=1.5) def run_request(acc, url): r = requests.get("{}/{}".format(url, acc)) diff --git a/bin/get_ENA_metadata.py b/bin/get_ENA_metadata.py index 0769d9dd..c0ffdbd5 100755 --- a/bin/get_ENA_metadata.py +++ b/bin/get_ENA_metadata.py @@ -59,12 +59,23 @@ def get_location(sample_id): def get_gca_location(sample_id): - location = "" + location = None + geo_data_list = list() json_data = load_gca_json(sample_id) - try: - geo_data_list = json_data["characteristics"]["geo loc name"] - except: + if not json_data: + return None + + possible_keys = ["characteristics.geo loc name", "characteristics.geo_loc_name", "description.geo_loc_name"] + for key_pair in possible_keys: + try: + attr1, attr2 = key_pair.split('.') + geo_data_list = json_data[attr1][attr2] + break + except KeyError: + pass + if not geo_data_list: return None + for item in geo_data_list: if "text" in item: location = item["text"].strip().split(":")[0]