Skip to content

Commit

Permalink
Merge pull request #51 from EBI-Metagenomics/bugfix/ENA_metadata
Browse files Browse the repository at this point in the history
Bugfix/ena metadata
  • Loading branch information
mberacochea authored Jul 4, 2023
2 parents dffd772 + 731888b commit d413eea
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 34 deletions.
63 changes: 33 additions & 30 deletions bin/create_metadata_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from retry import retry

from assembly_stats import run_assembly_stats
from get_ENA_metadata import get_location, load_xml, load_gca_json, get_gca_location
from get_ENA_metadata import get_location, load_xml, get_gca_location

logging.basicConfig(level=logging.INFO)

Expand Down Expand Up @@ -115,8 +115,8 @@ def load_geography(geofile):

def get_metadata(acc):
location = None
project = None
biosample = None
project = "N/A"
biosample = "N/A"
if acc.startswith("ERZ"):
json_data_erz = load_xml(acc)
biosample = json_data_erz["ANALYSIS_SET"]["ANALYSIS"]["SAMPLE_REF"][
Expand All @@ -128,44 +128,29 @@ def get_metadata(acc):
elif acc.startswith("GUT"):
pass
elif acc.startswith("GCA"):
json_data_gca = load_xml(acc)
biosample = json_data_gca["ASSEMBLY_SET"]["ASSEMBLY"]["SAMPLE_REF"]["IDENTIFIERS"]["PRIMARY_ID"]
project = json_data_gca["ASSEMBLY_SET"]["ASSEMBLY"]["STUDY_REF"]["IDENTIFIERS"]["PRIMARY_ID"]
try:
json_data_gca = load_xml(acc)
biosample = json_data_gca["ASSEMBLY_SET"]["ASSEMBLY"]["SAMPLE_REF"]["IDENTIFIERS"]["PRIMARY_ID"]
project = json_data_gca["ASSEMBLY_SET"]["ASSEMBLY"]["STUDY_REF"]["IDENTIFIERS"]["PRIMARY_ID"]
except:
logging.info("Missing metadata in ENA XML for sample {}. Using API instead.".format(acc))
try:
biosample, project = ena_api_request(acc)
except:
logging.exception("Could not obtain biosample and project information for {}".format(acc))
else:
if acc.startswith("CA"):
acc = acc + "0" * 7
r = run_request(acc, "https://www.ebi.ac.uk/ena/browser/api/embl")
if r.ok:
match_pr = re.findall("PR +Project: *(PRJ[A-Z0-9]+)", r.text)
if match_pr:
project = match_pr[0]
else:
project = ""
match_samp = re.findall("DR +BioSample; ([A-Z0-9]+)", r.text)
if match_samp:
biosample = match_samp[0]
else:
biosample = ""
else:
logging.error("Cannot obtain metadata from ENA")
sys.exit()
biosample, project = ena_api_request(acc)
if not acc.startswith("GUT"):
if acc.startswith("GCA"):
location = get_gca_location(biosample)
else:
location = get_location(biosample)
if not location:
logging.warning("Unable to obtain location for sample {}".format(biosample))
location = "not provided"
if not acc.startswith("GUT"):
if acc.startswith("GCA"):
json_data_sample = load_xml(acc)
converted_sample = biosample
try:
project = json_data_sample["ASSEMBLY_SET"]["ASSEMBLY"]["STUDY_REF"][
"IDENTIFIERS"
]["PRIMARY_ID"]
except:
project = "N/A"
else:
json_data_sample = load_xml(biosample)
try:
Expand All @@ -191,6 +176,24 @@ def get_metadata(acc):
return converted_sample, converted_project, location


def ena_api_request(acc):
    """Look up the BioSample and project accessions for *acc* via the ENA browser API.

    The EMBL flat-file record returned by the API is scanned for the
    ``PR   Project:`` and ``DR   BioSample;`` lines.

    Args:
        acc: Accession to query. Accessions starting with ``CA`` are
            right-padded with seven zeros before the request is made.

    Returns:
        A ``(biosample, project)`` tuple of strings. Either element is an
        empty string when the corresponding line is not found in the record.

    Raises:
        SystemExit: with exit status 1 when the API response is not OK.
    """
    biosample = project = ""
    if acc.startswith("CA"):
        acc = acc + "0" * 7
    # NOTE(review): run_request is presumably returning a requests.Response
    # (it is used via .ok and .text below) - confirm against its definition.
    r = run_request(acc, "https://www.ebi.ac.uk/ena/browser/api/embl")
    if not r.ok:
        logging.error("Cannot obtain metadata from ENA")
        # A bare sys.exit() terminates with status 0, which makes the failure
        # look like success to whatever launched the script; exit non-zero.
        sys.exit(1)

    # Only the first occurrence of each line is used.
    match_pr = re.search(r"PR +Project: *(PRJ[A-Z0-9]+)", r.text)
    if match_pr:
        project = match_pr.group(1)
    match_samp = re.search(r"DR +BioSample; ([A-Z0-9]+)", r.text)
    if match_samp:
        biosample = match_samp.group(1)
    return biosample, project


@retry(tries=5, delay=10, backoff=1.5)
def run_request(acc, url):
r = requests.get("{}/{}".format(url, acc))
Expand Down
19 changes: 15 additions & 4 deletions bin/get_ENA_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,23 @@ def get_location(sample_id):


def get_gca_location(sample_id):
location = ""
location = None
geo_data_list = list()
json_data = load_gca_json(sample_id)
try:
geo_data_list = json_data["characteristics"]["geo loc name"]
except:
if not json_data:
return None

possible_keys = ["characteristics.geo loc name", "characteristics.geo_loc_name", "description.geo_loc_name"]
for key_pair in possible_keys:
try:
attr1, attr2 = key_pair.split('.')
geo_data_list = json_data[attr1][attr2]
break
except KeyError:
pass
if not geo_data_list:
return None

for item in geo_data_list:
if "text" in item:
location = item["text"].strip().split(":")[0]
Expand Down

0 comments on commit d413eea

Please sign in to comment.