diff --git a/README.md b/README.md index 09c0d5c7..75c5b397 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ cd containers && bash build.sh ## Data preparation -1. You need to pre-download your data to directories and make sure that genomes are uncompressed. Scripts to fetch genomes from ENA ([fetch_ena.py](https://github.com/EBI-Metagenomics/genomes-pipeline/blob/master/containers/genomes-catalog-update/scripts/fetch_ena.py)) and NCBI ([fetch_ncbi.py](https://github.com/EBI-Metagenomics/genomes-pipeline/blob/master/containers/genomes-catalog-update/scripts/fetch_ncbi.py)) are provided and need to be executed separately from the pipeline. If you have downloaded genomes from both ENA and NCBI, put them into separate folders. +1. You need to pre-download your data to directories and make sure that genomes are uncompressed. Scripts to fetch genomes from ENA ([fetch_ena.py](https://github.com/EBI-Metagenomics/genomes-pipeline/blob/master/bin/fetch_ena.py)) and NCBI ([fetch_ncbi.py](https://github.com/EBI-Metagenomics/genomes-pipeline/blob/master/bin/fetch_ncbi.py)) are provided and need to be executed separately from the pipeline. If you have downloaded genomes from both ENA and NCBI, put them into separate folders. 2. When genomes are fetched from ENA using the `fetch_ena.py` script, a CSV file with contamination and completeness statistics is also created in the same directory where genomes are saved to. If you are downloading genomes using a different approach, a CSV file needs to be created manually (each line should be genome accession, % completeness, % contamination). The ENA fetching script also pre-filters genomes to satisfy the QS50 cut-off (QS = % completeness - 5 * % contamination). diff --git a/helpers/database-import-scripts/rnacentral/generate_rnacentral_json.py b/helpers/database-import-scripts/rnacentral/generate_rnacentral_json.py index c2e29b19..c4115958 100644 --- a/helpers/database-import-scripts/rnacentral/generate_rnacentral_json.py +++ b/helpers/database-import-scripts/rnacentral/generate_rnacentral_json.py @@ -248,6 +248,7 @@ def get_publications(genome_sample_accession, reported_project, insdc_accession) """ publications = list() biosamples = list() + ena_format_issue = False samples_to_check = [genome_sample_accession] # Check if there are read files associated with the sample (meaning sample accession points to raw data already) @@ -275,6 +276,16 @@ def get_publications(genome_sample_accession, reported_project, insdc_accession) for attribute in sample_attributes: if all([x in attribute["TAG"] for x in ["derived", "from"]]): derived_from_list = re.findall("SAM[A-Z]+\d+|ERS\d+|SRS\d+|DRS\d+", attribute["VALUE"]) + if len(derived_from_list) == 0: + logging.warning("Found an issue with derived samples format for sample {}. " + "Attempting to resolve.".format(genome_sample_accession)) + derived_from_list = identify_derived_sample_issue(attribute["VALUE"]) + if len(derived_from_list) == 0: + ena_format_issue = True + else: + logging.warning("Resolved the format issue for sample {} successfully.".format( + genome_sample_accession)) + sample_to_check = derived_from_list samples_for_next_iteration = samples_for_next_iteration + derived_from_list sample_is_derived = True break @@ -296,17 +307,44 @@ def get_publications(genome_sample_accession, reported_project, insdc_accession) publications_to_add.extend(list(extracted_publications)) publications.extend(list(publications_to_add)) else: - logging.error("Could not obtain project accessions for sample {}". + logging.warning("Could not obtain project accessions for sample {}". format(genome_sample_accession)) if not biosamples: - logging.error("Biosample couldn't be obtained for sample {}".format(genome_sample_accession)) - if insdc_accession.startswith("CA"): - logging.error("Biosample could not be obtained for an ENA genome {}". - format(genome_sample_accession)) - sys.exit("ERROR: cannot proceed because ENA samples are not found in ENA.") + if ena_format_issue is True: + logging.warning("Unable to obtain citation information for sample {} due to format errors in ENA. " + "No citations will be reported to RNAcentral for this sample.".format( + genome_sample_accession)) + else: + logging.error("Biosample couldn't be obtained for sample {}".format(genome_sample_accession)) + if insdc_accession.startswith("CA"): + logging.error("Biosample could not be obtained for an ENA genome {}". + format(insdc_accession)) + sys.exit("ERROR: cannot proceed because at least one ENA sample is not found in ENA.") return list(filter(None, list(set(publications)))) +def identify_derived_sample_issue(ena_data): + derived_from_list = list() + # check if runs were reported instead of sampples + runs = re.findall("ERR\d+|SRR\d+|DRR\d+", ena_data) + if len(runs) > 0: + for run in runs: + xml_data = load_xml(run) + try: + run_links = xml_data["RUN_SET"]["RUN"]["RUN_LINKS"]["RUN_LINK"] + for entry in run_links: + # Check if 'DB' is 'ENA-SAMPLE' inside the inner dictionary + if entry.get('XREF_LINK', {}).get('DB') == 'ENA-SAMPLE': + # If present, print the 'ID' value + sample_acc = entry.get('XREF_LINK', {}).get('ID') + if not sample_acc.startswith("SAM"): + sample_acc = convert_bin_sample(sample_acc) + derived_from_list.append(sample_acc) + except: + logging.warning("Unable to obtain sample accession for run {} from ENA".format(run)) + return derived_from_list + + def check_sample_level(genome_sample_accession): api_endpoint = "https://www.ebi.ac.uk/ena/portal/api/filereport" query = { @@ -372,9 +410,15 @@ def get_project_accession(biosample): if r.ok: projects = list() elements = r.text.split("\n") - for e in elements: - if not e.startswith("run_accession") and not e == "": - projects.append(e.strip().split("\t")[1]) + header = elements[0] + col_names = header.strip().split("\t") + study_index = col_names.index("secondary_study_accession") + for e in elements[1:]: + if not e == "": + project_to_add = e.strip().split("\t")[study_index] + assert any(project_to_add.startswith(prefix) for prefix in + ["SRP", "ERP", "DRP", "PR"]), "Incorrect project format: {}".format(project_to_add) + projects.append(project_to_add) return set(projects) else: logging.error("Error when requesting study accession for biosample {}".format(biosample)) diff --git a/helpers/file_organiser.sh b/helpers/file_organiser.sh index 6da5cbc2..c7863cd0 100644 --- a/helpers/file_organiser.sh +++ b/helpers/file_organiser.sh @@ -40,6 +40,8 @@ function GenerateRNACentralJSON { echo "Copying GFFs" for R in $REPS do + mkdir -p ${RESULTS_PATH}/all_genomes/${R::-2}/${R}/genomes1/ + mv ${RESULTS_PATH}/all_genomes/${R::-2}/${R}/*.gff* ${RESULTS_PATH}/all_genomes/${R::-2}/${R}/genomes1/ cp ${RESULTS_PATH}/all_genomes/${R::-2}/${R}/genomes1/${R}.gff* ${RESULTS_PATH}/additional_data/rnacentral/GFFs/ done @@ -190,7 +192,7 @@ function ZipAllGenomes { SUBFOLDERS=$(ls -d MGYG*) for S in $SUBFOLDERS do - bsub -q production -M 1G -n 1 -o /dev/null "gzip ${S}/MGYG*gff" + bsub -q production -M 1G -n 1 -o /dev/null "gzip ${S}/genomes1/MGYG*gff" done cd .. done