Merge pull request #78 from EBI-Metagenomics/dev

Merge dev to a new release
EBI-Metagenomics · Dec 13, 2023 · 853487f · 853487f
2 parents ac2e8f1 + 0714ac3
commit 853487f
Show file tree

Hide file tree

Showing 3 changed files with 57 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -75,7 +75,7 @@ cd containers && bash build.sh
 
 ## Data preparation
 
-1. You need to pre-download your data to directories and make sure that genomes are uncompressed. Scripts to fetch genomes from ENA ([fetch_ena.py](https://github.com/EBI-Metagenomics/genomes-pipeline/blob/master/containers/genomes-catalog-update/scripts/fetch_ena.py)) and NCBI ([fetch_ncbi.py](https://github.com/EBI-Metagenomics/genomes-pipeline/blob/master/containers/genomes-catalog-update/scripts/fetch_ncbi.py)) are provided and need to be executed separately from the pipeline. If you have downloaded genomes from both ENA and NCBI, put them into separate folders.
+1. You need to pre-download your data to directories and make sure that genomes are uncompressed. Scripts to fetch genomes from ENA ([fetch_ena.py](https://github.com/EBI-Metagenomics/genomes-pipeline/blob/master/bin/fetch_ena.py)) and NCBI ([fetch_ncbi.py](https://github.com/EBI-Metagenomics/genomes-pipeline/blob/master/bin/fetch_ncbi.py)) are provided and need to be executed separately from the pipeline. If you have downloaded genomes from both ENA and NCBI, put them into separate folders.
 
 2. When genomes are fetched from ENA using the `fetch_ena.py` script, a CSV file with contamination and completeness statistics is also created in the same directory where genomes are saved to. If you are downloading genomes using a different approach, a CSV file needs to be created manually (each line should be genome accession, % completeness, % contamination). The ENA fetching script also pre-filters genomes to satisfy the QS50 cut-off (QS = % completeness - 5 * % contamination).
 

diff --git a/helpers/database-import-scripts/rnacentral/generate_rnacentral_json.py b/helpers/database-import-scripts/rnacentral/generate_rnacentral_json.py
@@ -248,6 +248,7 @@ def get_publications(genome_sample_accession, reported_project, insdc_accession)
     """
     publications = list()
     biosamples = list()
+    ena_format_issue = False
 
     samples_to_check = [genome_sample_accession]
     # Check if there are read files associated with the sample (meaning sample accession points to raw data already)
@@ -275,6 +276,16 @@ def get_publications(genome_sample_accession, reported_project, insdc_accession)
                 for attribute in sample_attributes:
                     if all([x in attribute["TAG"] for x in ["derived", "from"]]):
                         derived_from_list = re.findall("SAM[A-Z]+\d+|ERS\d+|SRS\d+|DRS\d+", attribute["VALUE"])
+                        if len(derived_from_list) == 0:
+                            logging.warning("Found an issue with derived samples format for sample {}. "
+                                            "Attempting to resolve.".format(genome_sample_accession))
+                            derived_from_list = identify_derived_sample_issue(attribute["VALUE"])
+                            if len(derived_from_list) == 0:
+                                ena_format_issue = True
+                            else:
+                                logging.warning("Resolved the format issue for sample {} successfully.".format(
+                                    genome_sample_accession))
+                                sample_to_check = derived_from_list
                         samples_for_next_iteration = samples_for_next_iteration + derived_from_list
                         sample_is_derived = True
                         break
@@ -296,17 +307,44 @@ def get_publications(genome_sample_accession, reported_project, insdc_accession)
                     publications_to_add.extend(list(extracted_publications))
             publications.extend(list(publications_to_add))
         else:
-            logging.error("Could not obtain project accessions for sample {}".
+            logging.warning("Could not obtain project accessions for sample {}".
                           format(genome_sample_accession))
     if not biosamples:
-        logging.error("Biosample couldn't be obtained for sample {}".format(genome_sample_accession))
-        if insdc_accession.startswith("CA"):
-            logging.error("Biosample could not be obtained for an ENA genome {}".
-                          format(genome_sample_accession))
-            sys.exit("ERROR: cannot proceed because ENA samples are not found in ENA.")
+        if ena_format_issue is True:
+            logging.warning("Unable to obtain citation information for sample {} due to format errors in ENA. "
+                            "No citations will be reported to RNAcentral for this sample.".format(
+                genome_sample_accession))
+        else:
+            logging.error("Biosample couldn't be obtained for sample {}".format(genome_sample_accession))
+            if insdc_accession.startswith("CA"):
+                logging.error("Biosample could not be obtained for an ENA genome {}".
+                              format(insdc_accession))
+                sys.exit("ERROR: cannot proceed because at least one ENA sample is not found in ENA.")
     return list(filter(None, list(set(publications))))
 
 
+def identify_derived_sample_issue(ena_data):
+    derived_from_list = list()
+    # check if runs were reported instead of sampples
+    runs = re.findall("ERR\d+|SRR\d+|DRR\d+", ena_data)
+    if len(runs) > 0:
+        for run in runs:
+            xml_data = load_xml(run)
+            try:
+                run_links = xml_data["RUN_SET"]["RUN"]["RUN_LINKS"]["RUN_LINK"]
+                for entry in run_links:
+                    # Check if 'DB' is 'ENA-SAMPLE' inside the inner dictionary
+                    if entry.get('XREF_LINK', {}).get('DB') == 'ENA-SAMPLE':
+                        # If present, print the 'ID' value
+                        sample_acc = entry.get('XREF_LINK', {}).get('ID')
+                        if not sample_acc.startswith("SAM"):
+                            sample_acc = convert_bin_sample(sample_acc)
+                            derived_from_list.append(sample_acc)
+            except:
+                logging.warning("Unable to obtain sample accession for  run {} from ENA".format(run))
+    return derived_from_list
+
+
 def check_sample_level(genome_sample_accession):
     api_endpoint = "https://www.ebi.ac.uk/ena/portal/api/filereport"
     query = {
@@ -372,9 +410,15 @@ def get_project_accession(biosample):
     if r.ok:
         projects = list()
         elements = r.text.split("\n")
-        for e in elements:
-            if not e.startswith("run_accession") and not e == "":
-                projects.append(e.strip().split("\t")[1])
+        header = elements[0]
+        col_names = header.strip().split("\t")
+        study_index = col_names.index("secondary_study_accession")
+        for e in elements[1:]:
+            if not e == "":
+                project_to_add = e.strip().split("\t")[study_index]
+                assert any(project_to_add.startswith(prefix) for prefix in
+                           ["SRP", "ERP", "DRP", "PR"]), "Incorrect project format: {}".format(project_to_add)
+                projects.append(project_to_add)
         return set(projects)
     else:
         logging.error("Error when requesting study accession for biosample {}".format(biosample))

diff --git a/helpers/file_organiser.sh b/helpers/file_organiser.sh
@@ -40,6 +40,8 @@ function GenerateRNACentralJSON {
     echo "Copying GFFs"
     for R in $REPS
     do
+        mkdir -p ${RESULTS_PATH}/all_genomes/${R::-2}/${R}/genomes1/
+        mv ${RESULTS_PATH}/all_genomes/${R::-2}/${R}/*.gff* ${RESULTS_PATH}/all_genomes/${R::-2}/${R}/genomes1/
         cp ${RESULTS_PATH}/all_genomes/${R::-2}/${R}/genomes1/${R}.gff* ${RESULTS_PATH}/additional_data/rnacentral/GFFs/
     done
 
@@ -190,7 +192,7 @@ function ZipAllGenomes {
         SUBFOLDERS=$(ls -d MGYG*)
         for S in $SUBFOLDERS
         do
-            bsub -q production -M 1G -n 1 -o /dev/null "gzip ${S}/MGYG*gff"
+            bsub -q production -M 1G -n 1 -o /dev/null "gzip ${S}/genomes1/MGYG*gff"
         done
         cd ..
     done