Merge pull request #52 from EBI-Metagenomics/dev

Dev
EBI-Metagenomics · Jul 4, 2023 · fc2698f · fc2698f
2 parents 467b47b + d413eea
commit fc2698f
Show file tree

Hide file tree

Showing 2 changed files with 48 additions and 34 deletions.
diff --git a/bin/create_metadata_table.py b/bin/create_metadata_table.py
@@ -27,7 +27,7 @@
 from retry import retry
 
 from assembly_stats import run_assembly_stats
-from get_ENA_metadata import get_location, load_xml, load_gca_json, get_gca_location
+from get_ENA_metadata import get_location, load_xml, get_gca_location
 
 logging.basicConfig(level=logging.INFO)
 
@@ -115,8 +115,8 @@ def load_geography(geofile):
 
 def get_metadata(acc):
     location = None
-    project = None
-    biosample = None
+    project = "N/A"
+    biosample = "N/A"
     if acc.startswith("ERZ"):
         json_data_erz = load_xml(acc)
         biosample = json_data_erz["ANALYSIS_SET"]["ANALYSIS"]["SAMPLE_REF"][
@@ -128,44 +128,29 @@ def get_metadata(acc):
     elif acc.startswith("GUT"):
         pass
     elif acc.startswith("GCA"):
-        json_data_gca = load_xml(acc)
-        biosample = json_data_gca["ASSEMBLY_SET"]["ASSEMBLY"]["SAMPLE_REF"]["IDENTIFIERS"]["PRIMARY_ID"]
-        project = json_data_gca["ASSEMBLY_SET"]["ASSEMBLY"]["STUDY_REF"]["IDENTIFIERS"]["PRIMARY_ID"]
+        try:
+            json_data_gca = load_xml(acc)
+            biosample = json_data_gca["ASSEMBLY_SET"]["ASSEMBLY"]["SAMPLE_REF"]["IDENTIFIERS"]["PRIMARY_ID"]
+            project = json_data_gca["ASSEMBLY_SET"]["ASSEMBLY"]["STUDY_REF"]["IDENTIFIERS"]["PRIMARY_ID"]
+        except:
+            logging.info("Missing metadata in ENA XML for sample {}. Using API instead.".format(acc))
+            try:
+                biosample, project = ena_api_request(acc)
+            except:
+                logging.exception("Could not obtain biosample and project information for {}".format(acc))
     else:
-        if acc.startswith("CA"):
-            acc = acc + "0" * 7
-        r = run_request(acc, "https://www.ebi.ac.uk/ena/browser/api/embl")
-        if r.ok:
-            match_pr = re.findall("PR +Project: *(PRJ[A-Z0-9]+)", r.text)
-            if match_pr:
-                project = match_pr[0]
-            else:
-                project = ""
-            match_samp = re.findall("DR +BioSample; ([A-Z0-9]+)", r.text)
-            if match_samp:
-                biosample = match_samp[0]
-            else:
-                biosample = ""
-        else:
-            logging.error("Cannot obtain metadata from ENA")
-            sys.exit()
+        biosample, project = ena_api_request(acc)
     if not acc.startswith("GUT"):
         if acc.startswith("GCA"):
             location = get_gca_location(biosample)
         else:
             location = get_location(biosample)
     if not location:
+        logging.warning("Unable to obtain location for sample {}".format(biosample))
         location = "not provided"
     if not acc.startswith("GUT"):
         if acc.startswith("GCA"):
-            json_data_sample = load_xml(acc)
             converted_sample = biosample
-            try:
-                project = json_data_sample["ASSEMBLY_SET"]["ASSEMBLY"]["STUDY_REF"][
-                    "IDENTIFIERS"
-                ]["PRIMARY_ID"]
-            except:
-                project = "N/A"
         else:
             json_data_sample = load_xml(biosample)
             try:
@@ -191,6 +176,24 @@ def get_metadata(acc):
     return converted_sample, converted_project, location
 
 
+def ena_api_request(acc):
+    """Look up the BioSample and project accessions for *acc* via the ENA browser API.
+
+    Accessions starting with "CA" are padded with seven zeros before the
+    request is made. The response text is searched for the first project
+    accession (``PRJ...``) and the first BioSample accession.
+
+    Args:
+        acc: Accession to query.
+
+    Returns:
+        A ``(biosample, project)`` tuple of strings. Either element is an
+        empty string when no matching line is found in the response text.
+
+    Raises:
+        SystemExit: With exit status 1 when the ENA response is not OK.
+    """
+    if acc.startswith("CA"):
+        acc = acc + "0" * 7
+    r = run_request(acc, "https://www.ebi.ac.uk/ena/browser/api/embl")
+    if not r.ok:
+        logging.error("Cannot obtain metadata from ENA")
+        # A bare sys.exit() exits with status 0 (success); use a non-zero
+        # status so the failure is visible to whatever launched the script.
+        sys.exit(1)
+    # re.search returns the first match, equivalent to findall(...)[0] here.
+    match_pr = re.search("PR +Project: *(PRJ[A-Z0-9]+)", r.text)
+    project = match_pr.group(1) if match_pr else ""
+    match_samp = re.search("DR +BioSample; ([A-Z0-9]+)", r.text)
+    biosample = match_samp.group(1) if match_samp else ""
+    return biosample, project
+
+
 @retry(tries=5, delay=10, backoff=1.5)
 def run_request(acc, url):
     r = requests.get("{}/{}".format(url, acc))

diff --git a/bin/get_ENA_metadata.py b/bin/get_ENA_metadata.py
@@ -59,12 +59,23 @@ def get_location(sample_id):
 
 
 def get_gca_location(sample_id):
-    location = ""
+    location = None
+    geo_data_list = list()
     json_data = load_gca_json(sample_id)
-    try:
-        geo_data_list = json_data["characteristics"]["geo loc name"]
-    except:
+    if not json_data:
+        return None
+
+    possible_keys = ["characteristics.geo loc name", "characteristics.geo_loc_name", "description.geo_loc_name"]
+    for key_pair in possible_keys:
+        try:
+            attr1, attr2 = key_pair.split('.')
+            geo_data_list = json_data[attr1][attr2]
+            break
+        except KeyError:
+            pass
+    if not geo_data_list:
         return None
+
     for item in geo_data_list:
         if "text" in item:
             location = item["text"].strip().split(":")[0]