Skip to content

Commit

Permalink
Merge pull request #78 from EBI-Metagenomics/dev
Browse files Browse the repository at this point in the history
Merge dev to a new release
  • Loading branch information
mberacochea authored Dec 13, 2023
2 parents ac2e8f1 + 0714ac3 commit 853487f
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 11 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ cd containers && bash build.sh

## Data preparation

1. You need to pre-download your data to directories and make sure that genomes are uncompressed. Scripts to fetch genomes from ENA ([fetch_ena.py](https://github.com/EBI-Metagenomics/genomes-pipeline/blob/master/containers/genomes-catalog-update/scripts/fetch_ena.py)) and NCBI ([fetch_ncbi.py](https://github.com/EBI-Metagenomics/genomes-pipeline/blob/master/containers/genomes-catalog-update/scripts/fetch_ncbi.py)) are provided and need to be executed separately from the pipeline. If you have downloaded genomes from both ENA and NCBI, put them into separate folders.
1. You need to pre-download your data to directories and make sure that genomes are uncompressed. Scripts to fetch genomes from ENA ([fetch_ena.py](https://github.com/EBI-Metagenomics/genomes-pipeline/blob/master/bin/fetch_ena.py)) and NCBI ([fetch_ncbi.py](https://github.com/EBI-Metagenomics/genomes-pipeline/blob/master/bin/fetch_ncbi.py)) are provided and need to be executed separately from the pipeline. If you have downloaded genomes from both ENA and NCBI, put them into separate folders.

2. When genomes are fetched from ENA using the `fetch_ena.py` script, a CSV file with contamination and completeness statistics is also created in the same directory where genomes are saved to. If you are downloading genomes using a different approach, a CSV file needs to be created manually (each line should be genome accession, % completeness, % contamination). The ENA fetching script also pre-filters genomes to satisfy the QS50 cut-off (QS = % completeness - 5 * % contamination).

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@ def get_publications(genome_sample_accession, reported_project, insdc_accession)
"""
publications = list()
biosamples = list()
ena_format_issue = False

samples_to_check = [genome_sample_accession]
# Check if there are read files associated with the sample (meaning sample accession points to raw data already)
Expand Down Expand Up @@ -275,6 +276,16 @@ def get_publications(genome_sample_accession, reported_project, insdc_accession)
for attribute in sample_attributes:
if all([x in attribute["TAG"] for x in ["derived", "from"]]):
derived_from_list = re.findall("SAM[A-Z]+\d+|ERS\d+|SRS\d+|DRS\d+", attribute["VALUE"])
if len(derived_from_list) == 0:
logging.warning("Found an issue with derived samples format for sample {}. "
"Attempting to resolve.".format(genome_sample_accession))
derived_from_list = identify_derived_sample_issue(attribute["VALUE"])
if len(derived_from_list) == 0:
ena_format_issue = True
else:
logging.warning("Resolved the format issue for sample {} successfully.".format(
genome_sample_accession))
sample_to_check = derived_from_list
samples_for_next_iteration = samples_for_next_iteration + derived_from_list
sample_is_derived = True
break
Expand All @@ -296,17 +307,44 @@ def get_publications(genome_sample_accession, reported_project, insdc_accession)
publications_to_add.extend(list(extracted_publications))
publications.extend(list(publications_to_add))
else:
logging.error("Could not obtain project accessions for sample {}".
logging.warning("Could not obtain project accessions for sample {}".
format(genome_sample_accession))
if not biosamples:
logging.error("Biosample couldn't be obtained for sample {}".format(genome_sample_accession))
if insdc_accession.startswith("CA"):
logging.error("Biosample could not be obtained for an ENA genome {}".
format(genome_sample_accession))
sys.exit("ERROR: cannot proceed because ENA samples are not found in ENA.")
if ena_format_issue is True:
logging.warning("Unable to obtain citation information for sample {} due to format errors in ENA. "
"No citations will be reported to RNAcentral for this sample.".format(
genome_sample_accession))
else:
logging.error("Biosample couldn't be obtained for sample {}".format(genome_sample_accession))
if insdc_accession.startswith("CA"):
logging.error("Biosample could not be obtained for an ENA genome {}".
format(insdc_accession))
sys.exit("ERROR: cannot proceed because at least one ENA sample is not found in ENA.")
return list(filter(None, list(set(publications))))


def identify_derived_sample_issue(ena_data):
derived_from_list = list()
# check if runs were reported instead of sampples
runs = re.findall("ERR\d+|SRR\d+|DRR\d+", ena_data)
if len(runs) > 0:
for run in runs:
xml_data = load_xml(run)
try:
run_links = xml_data["RUN_SET"]["RUN"]["RUN_LINKS"]["RUN_LINK"]
for entry in run_links:
# Check if 'DB' is 'ENA-SAMPLE' inside the inner dictionary
if entry.get('XREF_LINK', {}).get('DB') == 'ENA-SAMPLE':
# If present, print the 'ID' value
sample_acc = entry.get('XREF_LINK', {}).get('ID')
if not sample_acc.startswith("SAM"):
sample_acc = convert_bin_sample(sample_acc)
derived_from_list.append(sample_acc)
except:
logging.warning("Unable to obtain sample accession for run {} from ENA".format(run))
return derived_from_list


def check_sample_level(genome_sample_accession):
api_endpoint = "https://www.ebi.ac.uk/ena/portal/api/filereport"
query = {
Expand Down Expand Up @@ -372,9 +410,15 @@ def get_project_accession(biosample):
if r.ok:
projects = list()
elements = r.text.split("\n")
for e in elements:
if not e.startswith("run_accession") and not e == "":
projects.append(e.strip().split("\t")[1])
header = elements[0]
col_names = header.strip().split("\t")
study_index = col_names.index("secondary_study_accession")
for e in elements[1:]:
if not e == "":
project_to_add = e.strip().split("\t")[study_index]
assert any(project_to_add.startswith(prefix) for prefix in
["SRP", "ERP", "DRP", "PR"]), "Incorrect project format: {}".format(project_to_add)
projects.append(project_to_add)
return set(projects)
else:
logging.error("Error when requesting study accession for biosample {}".format(biosample))
Expand Down
4 changes: 3 additions & 1 deletion helpers/file_organiser.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ function GenerateRNACentralJSON {
echo "Copying GFFs"
for R in $REPS
do
mkdir -p ${RESULTS_PATH}/all_genomes/${R::-2}/${R}/genomes1/
mv ${RESULTS_PATH}/all_genomes/${R::-2}/${R}/*.gff* ${RESULTS_PATH}/all_genomes/${R::-2}/${R}/genomes1/
cp ${RESULTS_PATH}/all_genomes/${R::-2}/${R}/genomes1/${R}.gff* ${RESULTS_PATH}/additional_data/rnacentral/GFFs/
done

Expand Down Expand Up @@ -190,7 +192,7 @@ function ZipAllGenomes {
SUBFOLDERS=$(ls -d MGYG*)
for S in $SUBFOLDERS
do
bsub -q production -M 1G -n 1 -o /dev/null "gzip ${S}/MGYG*gff"
bsub -q production -M 1G -n 1 -o /dev/null "gzip ${S}/genomes1/MGYG*gff"
done
cd ..
done
Expand Down

0 comments on commit 853487f

Please sign in to comment.