diff --git a/helpers/database-import-scripts/rnacentral/generate_rnacentral_json.py b/helpers/database-import-scripts/rnacentral/generate_rnacentral_json.py index c2e29b19..c4115958 100644 --- a/helpers/database-import-scripts/rnacentral/generate_rnacentral_json.py +++ b/helpers/database-import-scripts/rnacentral/generate_rnacentral_json.py @@ -248,6 +248,7 @@ def get_publications(genome_sample_accession, reported_project, insdc_accession) """ publications = list() biosamples = list() + ena_format_issue = False samples_to_check = [genome_sample_accession] # Check if there are read files associated with the sample (meaning sample accession points to raw data already) @@ -275,6 +276,16 @@ def get_publications(genome_sample_accession, reported_project, insdc_accession) for attribute in sample_attributes: if all([x in attribute["TAG"] for x in ["derived", "from"]]): derived_from_list = re.findall("SAM[A-Z]+\d+|ERS\d+|SRS\d+|DRS\d+", attribute["VALUE"]) + if len(derived_from_list) == 0: + logging.warning("Found an issue with derived samples format for sample {}. " + "Attempting to resolve.".format(genome_sample_accession)) + derived_from_list = identify_derived_sample_issue(attribute["VALUE"]) + if len(derived_from_list) == 0: + ena_format_issue = True + else: + logging.warning("Resolved the format issue for sample {} successfully.".format( + genome_sample_accession)) + sample_to_check = derived_from_list samples_for_next_iteration = samples_for_next_iteration + derived_from_list sample_is_derived = True break @@ -296,17 +307,44 @@ def get_publications(genome_sample_accession, reported_project, insdc_accession) publications_to_add.extend(list(extracted_publications)) publications.extend(list(publications_to_add)) else: - logging.error("Could not obtain project accessions for sample {}". + logging.warning("Could not obtain project accessions for sample {}". format(genome_sample_accession)) if not biosamples: - logging.error("Biosample couldn't be obtained for sample {}".format(genome_sample_accession)) - if insdc_accession.startswith("CA"): - logging.error("Biosample could not be obtained for an ENA genome {}". - format(genome_sample_accession)) - sys.exit("ERROR: cannot proceed because ENA samples are not found in ENA.") + if ena_format_issue is True: + logging.warning("Unable to obtain citation information for sample {} due to format errors in ENA. " + "No citations will be reported to RNAcentral for this sample.".format( + genome_sample_accession)) + else: + logging.error("Biosample couldn't be obtained for sample {}".format(genome_sample_accession)) + if insdc_accession.startswith("CA"): + logging.error("Biosample could not be obtained for an ENA genome {}". + format(insdc_accession)) + sys.exit("ERROR: cannot proceed because at least one ENA sample is not found in ENA.") return list(filter(None, list(set(publications)))) +def identify_derived_sample_issue(ena_data): + derived_from_list = list() + # check if runs were reported instead of sampples + runs = re.findall("ERR\d+|SRR\d+|DRR\d+", ena_data) + if len(runs) > 0: + for run in runs: + xml_data = load_xml(run) + try: + run_links = xml_data["RUN_SET"]["RUN"]["RUN_LINKS"]["RUN_LINK"] + for entry in run_links: + # Check if 'DB' is 'ENA-SAMPLE' inside the inner dictionary + if entry.get('XREF_LINK', {}).get('DB') == 'ENA-SAMPLE': + # If present, print the 'ID' value + sample_acc = entry.get('XREF_LINK', {}).get('ID') + if not sample_acc.startswith("SAM"): + sample_acc = convert_bin_sample(sample_acc) + derived_from_list.append(sample_acc) + except: + logging.warning("Unable to obtain sample accession for run {} from ENA".format(run)) + return derived_from_list + + def check_sample_level(genome_sample_accession): api_endpoint = "https://www.ebi.ac.uk/ena/portal/api/filereport" query = { @@ -372,9 +410,15 @@ def get_project_accession(biosample): if r.ok: projects = list() elements = r.text.split("\n") - for e in elements: - if not e.startswith("run_accession") and not e == "": - projects.append(e.strip().split("\t")[1]) + header = elements[0] + col_names = header.strip().split("\t") + study_index = col_names.index("secondary_study_accession") + for e in elements[1:]: + if not e == "": + project_to_add = e.strip().split("\t")[study_index] + assert any(project_to_add.startswith(prefix) for prefix in + ["SRP", "ERP", "DRP", "PR"]), "Incorrect project format: {}".format(project_to_add) + projects.append(project_to_add) return set(projects) else: logging.error("Error when requesting study accession for biosample {}".format(biosample)) diff --git a/helpers/file_organiser.sh b/helpers/file_organiser.sh index 6da5cbc2..c7863cd0 100644 --- a/helpers/file_organiser.sh +++ b/helpers/file_organiser.sh @@ -40,6 +40,8 @@ function GenerateRNACentralJSON { echo "Copying GFFs" for R in $REPS do + mkdir -p ${RESULTS_PATH}/all_genomes/${R::-2}/${R}/genomes1/ + mv ${RESULTS_PATH}/all_genomes/${R::-2}/${R}/*.gff* ${RESULTS_PATH}/all_genomes/${R::-2}/${R}/genomes1/ cp ${RESULTS_PATH}/all_genomes/${R::-2}/${R}/genomes1/${R}.gff* ${RESULTS_PATH}/additional_data/rnacentral/GFFs/ done @@ -190,7 +192,7 @@ function ZipAllGenomes { SUBFOLDERS=$(ls -d MGYG*) for S in $SUBFOLDERS do - bsub -q production -M 1G -n 1 -o /dev/null "gzip ${S}/MGYG*gff" + bsub -q production -M 1G -n 1 -o /dev/null "gzip ${S}/genomes1/MGYG*gff" done cd .. done