From b069a0a80a14522f754250645fd4acf8c3227bcb Mon Sep 17 00:00:00 2001 From: bahill Date: Tue, 23 Jul 2024 14:18:22 -0400 Subject: [PATCH 01/14] pushing changes to remote --- .../from_bash_copy_from_tdr.py | 145 ++++++++++++------ .../tdr/copy_from_tdr_to_gcs/requirements.txt | 4 - 2 files changed, 102 insertions(+), 47 deletions(-) diff --git a/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py b/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py index eeeacd7a..f6bfc563 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py +++ b/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py @@ -1,69 +1,128 @@ import os import sys +import csv import json import requests import subprocess from google.auth import compute_engine from google.auth.transport.requests import Request +STAGING_AREA_BUCKETS = { + "prod": { + "EBI": "gs://broad-dsp-monster-hca-prod-ebi-storage/prod", + "UCSC": "gs://broad-dsp-monster-hca-prod-ebi-storage/prod", + "LANTERN": "gs://broad-dsp-monster-hca-prod-lantern", + "LATTICE": "gs://broad-dsp-monster-hca-prod-lattice/staging", + "TEST": "gs://broad-dsp-monster-hca-prod-ebi-storage/broad_test_dataset" + } +} + # input should be a manifest csv of those projects that need data copied back -# Check if a filename is provided as an argument +# Check if n csv_path is provided as an argument if len(sys.argv) != 2: - print("Usage: python3 script.py ") + print("Usage: python3 copy_from_tdr.py ") sys.exit(1) -filename = sys.argv[1] +csv_path = sys.argv[1] # Check if the file exists -if not os.path.isfile(filename): - print(f"File {filename} not found") +if not os.path.isfile(csv_path): + print(f"{csv_path} not found") + sys.exit(1) + +# Check if the file is a csv +if not csv_path.endswith('.csv'): + print(f"{csv_path} is not a csv file") sys.exit(1) +def _parse_csv(csv_path:str): + keys = set() + with open(csv_path, "r") as f: + reader = csv.reader(f) + for row in reader: + if not row: + logging.debug("Empty path detected, skipping") + continue + + assert len(row) == 2 + institution = row[0] + project_id = find_project_id_in_str(row[1]) + + key = None + if project_id_only: + project_id = row[1] + key = project_id + else: + + if institution not in STAGING_AREA_BUCKETS[env]: + raise Exception(f"Unknown institution {institution} found") + + institution_bucket = STAGING_AREA_BUCKETS[env][institution] + path = institution_bucket + "/" + project_id + + # sanitize and dedupe + path = _sanitize_gs_path(path) + assert path.startswith("gs://"), "Staging area path must start with gs:// scheme" + key = path + + if include_release_tag: + key = key + f",{release_tag}" + keys.add(key) + + chunked_paths = chunked(keys, MAX_STAGING_AREAS_PER_PARTITION_SET) + return [chunk for chunk in chunked_paths] + # Read the file line by line -with open(filename, 'r') as file: +with open csv_path, 'r') as file: lines = file.read().splitlines() + # + +print(lines) # TODO create the list of snapshot IDs from the list of UUIDs # use manifest.csv to get the UUIDs that need data copied back # for each UUID, construct the snapshot name - which will be latest snapshot with a dataset id like # "hca_prod_*" -# use those dataset ids to get the latest snapshot id for each dataset -# this then becomes the lines list +# use those snapshot ids to get the latest snapshot id for each dataset +# this then becomes the snapshot list + +snapshots = [] +for line in lines: # TODO - is this needed? or can we just run locally as Monster members? 
# Get access token -credentials = compute_engine.Credentials() -credentials.refresh(Request()) -access_token = credentials.token - -for snapshot in lines: - # Make request to the API with the current snapshot - response = requests.get(f"https://data.terra.bio/api/repository/v1/snapshots/{snapshot}/files?offset=0&limit=10000", - headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'}) - - # Write the response to a JSON file - with open(f"response_{snapshot}.json", 'w') as outfile: - json.dump(response.json(), outfile) - - # Extract file details from the JSON file and append them to a text file - with open(f"response_{snapshot}.json", 'r') as json_file: - data = json.load(json_file) - with open("list_of_access_urls.txt", 'a') as outfile: - for item in data: - outfile.write(item['fileDetail']['accessUrl'] + '\n') - - -# Read the list of files from list_of_filepaths.txt and copy them using gcloud storage cp -with open("list_of_access_urls.txt", 'r') as file: - access_urls = file.read().splitlines() - -# TODO -# copy command will look something like\ -# gcloud storage cp gs://datarepo-4bcb4408-bucket/2e2aac27-3bf5-4a89-b466-e563cf99aef2/07a78be1-c75f-4463-a1a4-d4f7f9771ca5/SRR3562314_2.fastq.gz gs://broad-dsp-monster-hca-prod-ebi-storage/broad_test_dataset/07e5ebc0-1386-4a33-8ce4-3007705adad8/data/. -# Also need to construct the staging/data gs:// path from the manifest.csv -# "EBI": "gs://broad-dsp-monster-hca-prod-ebi-storage/prod", -# "UCSC": "gs://broad-dsp-monster-hca-prod-ebi-storage/prod", -# "LANTERN": "gs://broad-dsp-monster-hca-prod-lantern", -# "LATTICE": "gs://broad-dsp-monster-hca-prod-lattice/staging", -for access_url in access_urls: - subprocess.run(['gcloud storage', 'cp', access_url, ""]) \ No newline at end of file +# credentials = compute_engine.Credentials() +# credentials.refresh(Request()) +# access_token = credentials.token + +# for snapshot in lines: +# # Make request to the API with the current snapshot +# response = requests.get(f"https://data.terra.bio/api/repository/v1/snapshots/{snapshot}/files?offset=0&limit=10000", +# headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'}) +# +# # Write the response to a JSON file +# with open(f"response_{snapshot}.json", 'w') as outfile: +# json.dump(response.json(), outfile) +# +# # Extract file details from the JSON file and append them to a text file +# with open(f"response_{snapshot}.json", 'r') as json_file: +# data = json.load(json_file) +# with open("list_of_access_urls.txt", 'a') as outfile: +# for item in data: +# outfile.write(item['fileDetail']['accessUrl'] + '\n') +# +# +# # Read the list of files from list_of_filepaths.txt and copy them using gcloud storage cp +# with open("list_of_access_urls.txt", 'r') as file: +# access_urls = file.read().splitlines() + +# # TODO +# # copy command will look something like\ +# # gcloud storage cp gs://datarepo-4bcb4408-bucket/2e2aac27-3bf5-4a89-b466-e563cf99aef2/07a78be1-c75f-4463-a1a4-d4f7f9771ca5/SRR3562314_2.fastq.gz gs://broad-dsp-monster-hca-prod-ebi-storage/broad_test_dataset/07e5ebc0-1386-4a33-8ce4-3007705adad8/data/. 
+# # Also need to construct the staging/data gs:// path from the manifest.csv +# # "EBI": "gs://broad-dsp-monster-hca-prod-ebi-storage/prod", +# # "UCSC": "gs://broad-dsp-monster-hca-prod-ebi-storage/prod", +# # "LANTERN": "gs://broad-dsp-monster-hca-prod-lantern", +# # "LATTICE": "gs://broad-dsp-monster-hca-prod-lattice/staging", +# for access_url in access_urls: +# subprocess.run(['gcloud storage', 'cp', access_url, ""]) diff --git a/scripts/tdr/copy_from_tdr_to_gcs/requirements.txt b/scripts/tdr/copy_from_tdr_to_gcs/requirements.txt index 190fd236..fa3af485 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs/requirements.txt +++ b/scripts/tdr/copy_from_tdr_to_gcs/requirements.txt @@ -3,9 +3,5 @@ # source venv/bin/activate # pip install -r requirements.txt -os -sys -json requests -subprocess google-auth \ No newline at end of file From 48ddc849a26221a595eda1f302c7651a2e81276e Mon Sep 17 00:00:00 2001 From: bahill Date: Wed, 31 Jul 2024 17:52:34 -0400 Subject: [PATCH 02/14] continuing to convert pseudo code to functional code --- .../from_bash_copy_from_tdr.py | 206 +++++++++++------- .../tdr/copy_from_tdr_to_gcs/requirements.txt | 3 +- 2 files changed, 131 insertions(+), 78 deletions(-) diff --git a/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py b/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py index f6bfc563..4e943b37 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py +++ b/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py @@ -1,42 +1,71 @@ import os import sys import csv +import logging import json +import re import requests +import google.auth +import google.auth.transport.requests import subprocess -from google.auth import compute_engine -from google.auth.transport.requests import Request STAGING_AREA_BUCKETS = { - "prod": { "EBI": "gs://broad-dsp-monster-hca-prod-ebi-storage/prod", "UCSC": "gs://broad-dsp-monster-hca-prod-ebi-storage/prod", "LANTERN": "gs://broad-dsp-monster-hca-prod-lantern", "LATTICE": "gs://broad-dsp-monster-hca-prod-lattice/staging", "TEST": "gs://broad-dsp-monster-hca-prod-ebi-storage/broad_test_dataset" - } } -# input should be a manifest csv of those projects that need data copied back -# Check if n csv_path is provided as an argument -if len(sys.argv) != 2: - print("Usage: python3 copy_from_tdr.py ") - sys.exit(1) - -csv_path = sys.argv[1] - -# Check if the file exists -if not os.path.isfile(csv_path): - print(f"{csv_path} not found") - sys.exit(1) - -# Check if the file is a csv -if not csv_path.endswith('.csv'): - print(f"{csv_path} is not a csv file") - sys.exit(1) +def setup_cli_logging_format() -> None: + logging.basicConfig(level=logging.INFO, format='%(message)s', stream=sys.stdout) + +def validate_input(csv_path:str): + """ + input should be a manifest csv of those projects that need data copied back + format is , + """ + if not os.path.isfile(csv_path): + logging.debug(f"{csv_path} not found") + sys.exit(1) + + if not csv_path.endswith('.csv'): + logging.debug(f"{csv_path} is not a csv file") + sys.exit(1) + + else: + return csv_path + +def find_project_id_in_str(s: str) -> str: + """ + The selected function find_project_id_in_str(s: str) -> str: + is used to extract a UUID (Universally Unique Identifier) from a given string s. 
+ :param s: + :return: + Attribution: + https://github.com/DataBiosphere/hca-ingest/blob/main/orchestration/hca_orchestration/support/matchers.py + """ + uuid_matcher = re.compile('[a-f0-9]{8}-?[a-f0-9]{4}-?4[a-f0-9]{3}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}', re.I) + project_ids = uuid_matcher.findall(s) + + if len(project_ids) != 1: + raise Exception(f"Found more than one or zero project UUIDs in {s}") + + return str(project_ids[0]) + +def _sanitize_gs_path(path: str) -> str: + return path.strip().strip("/") def _parse_csv(csv_path:str): - keys = set() + """ + Parses the csv file and returns a list of staging areas + :param csv_path: + :return: + Attribution: + https://github.com/DataBiosphere/hca-ingest/blob/main/orchestration/hca_manage/manifest.py + """ + gs_paths = set() + project_ids = set() with open(csv_path, "r") as f: reader = csv.reader(f) for row in reader: @@ -48,61 +77,60 @@ def _parse_csv(csv_path:str): institution = row[0] project_id = find_project_id_in_str(row[1]) - key = None - if project_id_only: - project_id = row[1] - key = project_id - else: - - if institution not in STAGING_AREA_BUCKETS[env]: - raise Exception(f"Unknown institution {institution} found") - - institution_bucket = STAGING_AREA_BUCKETS[env][institution] - path = institution_bucket + "/" + project_id - - # sanitize and dedupe - path = _sanitize_gs_path(path) - assert path.startswith("gs://"), "Staging area path must start with gs:// scheme" - key = path - - if include_release_tag: - key = key + f",{release_tag}" - keys.add(key) - - chunked_paths = chunked(keys, MAX_STAGING_AREAS_PER_PARTITION_SET) - return [chunk for chunk in chunked_paths] - -# Read the file line by line -with open csv_path, 'r') as file: - lines = file.read().splitlines() - # - -print(lines) - -# TODO create the list of snapshot IDs from the list of UUIDs -# use manifest.csv to get the UUIDs that need data copied back -# for each UUID, construct the snapshot name - which will be latest snapshot with a dataset id like -# "hca_prod_*" -# use those snapshot ids to get the latest snapshot id for each dataset -# this then becomes the snapshot list - -snapshots = [] -for line in lines: - -# TODO - is this needed? or can we just run locally as Monster members? 
-# Get access token -# credentials = compute_engine.Credentials() -# credentials.refresh(Request()) -# access_token = credentials.token - -# for snapshot in lines: -# # Make request to the API with the current snapshot -# response = requests.get(f"https://data.terra.bio/api/repository/v1/snapshots/{snapshot}/files?offset=0&limit=10000", -# headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'}) -# -# # Write the response to a JSON file -# with open(f"response_{snapshot}.json", 'w') as outfile: -# json.dump(response.json(), outfile) + project_ids.add(project_id) + + gs_path = None + if institution not in STAGING_AREA_BUCKETS: + raise Exception(f"Unknown institution {institution} found") + + institution_bucket = STAGING_AREA_BUCKETS[institution] + path = institution_bucket + "/" + project_id + + # sanitize and dedupe + path = _sanitize_gs_path(path) + assert path.startswith("gs://"), "Staging area path must start with gs:// scheme" + gs_path = path + + gs_paths.add(gs_path) + + # print(f"These are the parsed gs_paths {gs_paths}") + # print(f"These are the parsed project_ids {project_ids}") + return gs_paths, project_ids + + +def _get_target_snapshot_ids(project_ids: set[str]) -> set[str]: + """ + This function gets the target snapshot name filters for the given project ids + :param project_ids: + :return: + """ + target_snapshots = set() + for project in project_ids: + target_snapshot = f"hca_prod_{project.replace('-', '')}" + target_snapshots.add(target_snapshot) + return target_snapshots + + +# TODO: make this work in a Docker image +def get_access_token(): + creds, project = google.auth.default() + auth_req = google.auth.transport.requests.Request() + creds.refresh(auth_req) + access_token = creds.token + return access_token + + +def _get_latest_snapshot(target_snapshots: set[str], access_token: str): + for snapshot_name in target_snapshots: + response = requests.get(f'https://data.terra.bio/api/repository/v1/snapshots?sort=createdDate,' + f'desc&limit=1&filter={snapshot_name}', + headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'}) + response.raise_for_status() + with open(f"response_{snapshot_name}.json", 'w') as outfile: + json.dump(response.json(), outfile) + +# TODO 8/1/24 - got a 400 for invalid filter - yay! +# need to add note that you need to gcloud auth to run this. # # # Extract file details from the JSON file and append them to a text file # with open(f"response_{snapshot}.json", 'r') as json_file: @@ -126,3 +154,27 @@ def _parse_csv(csv_path:str): # # "LATTICE": "gs://broad-dsp-monster-hca-prod-lattice/staging", # for access_url in access_urls: # subprocess.run(['gcloud storage', 'cp', access_url, ""]) + + +def main(): + """Parse command-line arguments and run specified tool. + + Note: Does not take explicit input arguments, but uses sys.argv inputs + from the command line. 
+ + """ + setup_cli_logging_format() + access_token = get_access_token() + csv_path = sys.argv[1] + validate_input(csv_path) + gs_paths = _parse_csv(csv_path)[0] + print(f"gs_paths are {gs_paths}") + project_ids = _parse_csv(csv_path)[1] + print(f"project_ids are {project_ids}") + target_snapshots = _get_target_snapshot_ids(project_ids) + print(f"target snapshot ids are {target_snapshots}") + tdr_data_path = _get_latest_snapshot(target_snapshots, access_token) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/scripts/tdr/copy_from_tdr_to_gcs/requirements.txt b/scripts/tdr/copy_from_tdr_to_gcs/requirements.txt index fa3af485..44950ab8 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs/requirements.txt +++ b/scripts/tdr/copy_from_tdr_to_gcs/requirements.txt @@ -4,4 +4,5 @@ # pip install -r requirements.txt requests -google-auth \ No newline at end of file +google-cloud +google-auth From ab4298aa4bf86d238622b451164847ac38bc1b22 Mon Sep 17 00:00:00 2001 From: bahill Date: Wed, 7 Aug 2024 19:30:30 -0400 Subject: [PATCH 03/14] continuing to convert pseudo code to functional code --- scripts/tdr/copy_from_tdr_to_gcs/README.md | 3 +- .../from_bash_copy_from_tdr.py | 113 +++++++++++++----- 2 files changed, 85 insertions(+), 31 deletions(-) diff --git a/scripts/tdr/copy_from_tdr_to_gcs/README.md b/scripts/tdr/copy_from_tdr_to_gcs/README.md index 938e541d..2b64aeb1 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs/README.md +++ b/scripts/tdr/copy_from_tdr_to_gcs/README.md @@ -4,8 +4,9 @@ This was originally a bash script written by Samantha Velasquez\ which was written to copy files from a TDR snapshot to an Azure bucket.\ Bobbie then translated to python using CoPilot.\ [copy_from_tdr_to_gcs.py](copy_from_tdr_to_gcs.py) \ -**This script is not yet tested.** +Set up: +gcloud auth login ## TODO - [ ] fix requirements.txt as needed diff --git a/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py b/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py index 4e943b37..ae277f58 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py +++ b/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py @@ -17,6 +17,8 @@ "TEST": "gs://broad-dsp-monster-hca-prod-ebi-storage/broad_test_dataset" } +# TODO change prints to logging + def setup_cli_logging_format() -> None: logging.basicConfig(level=logging.INFO, format='%(message)s', stream=sys.stdout) @@ -53,7 +55,7 @@ def find_project_id_in_str(s: str) -> str: return str(project_ids[0]) -def _sanitize_gs_path(path: str) -> str: +def _sanitize_staging_gs_path(path: str) -> str: return path.strip().strip("/") def _parse_csv(csv_path:str): @@ -64,7 +66,7 @@ def _parse_csv(csv_path:str): Attribution: https://github.com/DataBiosphere/hca-ingest/blob/main/orchestration/hca_manage/manifest.py """ - gs_paths = set() + staging_gs_paths = set() project_ids = set() with open(csv_path, "r") as f: reader = csv.reader(f) @@ -79,7 +81,7 @@ def _parse_csv(csv_path:str): project_ids.add(project_id) - gs_path = None + staging_gs_path = None if institution not in STAGING_AREA_BUCKETS: raise Exception(f"Unknown institution {institution} found") @@ -87,15 +89,15 @@ def _parse_csv(csv_path:str): path = institution_bucket + "/" + project_id # sanitize and dedupe - path = _sanitize_gs_path(path) + path = _sanitize_staging_gs_path(path) assert path.startswith("gs://"), "Staging area path must start with gs:// scheme" - gs_path = path + staging_gs_path = path - gs_paths.add(gs_path) + staging_gs_paths.add(staging_gs_path) - # print(f"These are the parsed 
gs_paths {gs_paths}") + # print(f"These are the parsed staging_gs_paths {staging_gs_paths}") # print(f"These are the parsed project_ids {project_ids}") - return gs_paths, project_ids + return staging_gs_paths, project_ids def _get_target_snapshot_ids(project_ids: set[str]) -> set[str]: @@ -120,27 +122,65 @@ def get_access_token(): return access_token -def _get_latest_snapshot(target_snapshots: set[str], access_token: str): +def _get_latest_snapshots(target_snapshots: set[str], access_token: str): + latest_snapshots = [] for snapshot_name in target_snapshots: - response = requests.get(f'https://data.terra.bio/api/repository/v1/snapshots?sort=createdDate,' - f'desc&limit=1&filter={snapshot_name}', - headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'}) - response.raise_for_status() - with open(f"response_{snapshot_name}.json", 'w') as outfile: - json.dump(response.json(), outfile) - -# TODO 8/1/24 - got a 400 for invalid filter - yay! -# need to add note that you need to gcloud auth to run this. -# -# # Extract file details from the JSON file and append them to a text file -# with open(f"response_{snapshot}.json", 'r') as json_file: -# data = json.load(json_file) -# with open("list_of_access_urls.txt", 'a') as outfile: -# for item in data: -# outfile.write(item['fileDetail']['accessUrl'] + '\n') -# -# + snapshot_response = requests.get( + f'https://data.terra.bio/api/repository/v1/snapshots?offset=0&limit=10&sort=created_date&direction=desc&filter={snapshot_name}', + headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'} + ) + snapshot_response.raise_for_status() + # with open(f"response_{snapshot_name}.json", 'w') as outfile: + # # not sure that I need to dump this? - not sure I need to write an output file at all actually + # json.dump(snapshot_response.json(), outfile) + latest_snapshot_id = snapshot_response.json()['items'][0]['id'] + latest_snapshots.append(latest_snapshot_id) + return latest_snapshots + +# then for each snapshot get access url and add to a list of access urls for that snapshot +def get_access_urls(latest_snapshot_ids: list[str], access_token: str): + for snapshot in latest_snapshot_ids: + files_response = requests.get( + f'https://data.terra.bio/api/repository/v1/snapshots/{snapshot}/files?offset=0', + headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'} + ) + files_response.raise_for_status() + # TODO - don't need? + # with open(f"response_{snapshot}.json", 'w') as outfile: + # json.dump(files_response.json(), outfile) + + # Extract file details from the JSON file and append them to a text file + list_of_access_urls = [] + data = files_response.json() + for item in data: + list_of_access_urls.append(item['fileDetail']['accessUrl']) + return list_of_access_urls + + + # with open(f"response_{snapshot}.json", 'r') as json_file: + # data = json.load(json_file) + # with open("list_of_access_urls.txt", 'a') as outfile: + # for item in data: + # outfile.write(item['fileDetail']['accessUrl'] + '\n') + +def copy_tdr_to_staging(access_urls: list[str], staging_gs_paths: set[str]): + # TODO this will need to be modified to work one by one - see nesting below + for staging_dir in staging_gs_paths: + staging_data_dir = staging_dir + '/data/' + # output = subprocess.run(['gcloud', 'storage', 'ls', staging_data_dir], capture_output=True, text=True) + # TODO FIX THIS - it's not actually stopping if it's not empty - and it won't be entirely empty... 
need another method + # if output.stdout.strip() != '': + # print(f"Staging area {staging_data_dir} is not empty") + # print(output.stdout.strip()) + # else: + # print(f"Staging area {staging_data_dir} is empty - copying files now") + for access_url in access_urls: + print(f"Copying {access_url} to {staging_data_dir}") + subprocess.run(['gcloud', 'storage', 'cp', access_url, staging_data_dir]) + # HM... why is copy hard? does it need /. at the end? or is it the access url that's wrong? + # # Read the list of files from list_of_filepaths.txt and copy them using gcloud storage cp +# we should make sure the staging/data dir is empty before running this # with open("list_of_access_urls.txt", 'r') as file: # access_urls = file.read().splitlines() @@ -167,13 +207,26 @@ def main(): access_token = get_access_token() csv_path = sys.argv[1] validate_input(csv_path) - gs_paths = _parse_csv(csv_path)[0] - print(f"gs_paths are {gs_paths}") + staging_gs_paths = _parse_csv(csv_path)[0] + print(f"staging_gs_paths are {staging_gs_paths}") project_ids = _parse_csv(csv_path)[1] print(f"project_ids are {project_ids}") target_snapshots = _get_target_snapshot_ids(project_ids) print(f"target snapshot ids are {target_snapshots}") - tdr_data_path = _get_latest_snapshot(target_snapshots, access_token) + latest_snapshot_ids = _get_latest_snapshots(target_snapshots, access_token) + print(f"latest_snapshots_ids are {latest_snapshot_ids}") + access_urls = get_access_urls(latest_snapshot_ids, access_token) + print(f"access_urls are {access_urls}") + copy_tdr_to_staging(access_urls, staging_gs_paths) + + # ultimately + # for each project_id in project_ids + # get the staging gs path + # get the snapshot name + # get the latest snapshot id + # get the access url for each file in the snapshot + # for each file in the snapshot + # copy the file from the access to the staging area if __name__ == '__main__': From efe1b04a8373d61483ecb5b8bad189bb5d637565 Mon Sep 17 00:00:00 2001 From: bahill Date: Mon, 12 Aug 2024 11:34:04 -0400 Subject: [PATCH 04/14] cleaning up code and updated script names --- .../README.md | 1 - .../from_bash_copy_from_tdr_hca.py} | 105 ++++++++---------- .../get_snapshot_files_and_transfer.sh | 0 .../requirements.txt | 0 4 files changed, 44 insertions(+), 62 deletions(-) rename scripts/tdr/{copy_from_tdr_to_gcs => copy_from_tdr_to_gcs_hca}/README.md (99%) rename scripts/tdr/{copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py => copy_from_tdr_to_gcs_hca/from_bash_copy_from_tdr_hca.py} (62%) rename scripts/tdr/{copy_from_tdr_to_gcs => copy_from_tdr_to_gcs_hca}/get_snapshot_files_and_transfer.sh (100%) rename scripts/tdr/{copy_from_tdr_to_gcs => copy_from_tdr_to_gcs_hca}/requirements.txt (100%) diff --git a/scripts/tdr/copy_from_tdr_to_gcs/README.md b/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md similarity index 99% rename from scripts/tdr/copy_from_tdr_to_gcs/README.md rename to scripts/tdr/copy_from_tdr_to_gcs_hca/README.md index 2b64aeb1..ff96eb3b 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs/README.md +++ b/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md @@ -7,7 +7,6 @@ Bobbie then translated to python using CoPilot.\ Set up: gcloud auth login - ## TODO - [ ] fix requirements.txt as needed - [ ] update the script to copy to staging /data bucket diff --git a/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py b/scripts/tdr/copy_from_tdr_to_gcs_hca/from_bash_copy_from_tdr_hca.py similarity index 62% rename from scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py rename to 
scripts/tdr/copy_from_tdr_to_gcs_hca/from_bash_copy_from_tdr_hca.py index ae277f58..61b8228e 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py +++ b/scripts/tdr/copy_from_tdr_to_gcs_hca/from_bash_copy_from_tdr_hca.py @@ -2,7 +2,6 @@ import sys import csv import logging -import json import re import requests import google.auth @@ -19,10 +18,12 @@ # TODO change prints to logging + def setup_cli_logging_format() -> None: logging.basicConfig(level=logging.INFO, format='%(message)s', stream=sys.stdout) -def validate_input(csv_path:str): + +def validate_input(csv_path: str): """ input should be a manifest csv of those projects that need data copied back format is , @@ -38,10 +39,11 @@ def validate_input(csv_path:str): else: return csv_path + def find_project_id_in_str(s: str) -> str: """ The selected function find_project_id_in_str(s: str) -> str: - is used to extract a UUID (Universally Unique Identifier) from a given string s. + is used to extract a valid UUID (Universally Unique Identifier) from a given string s. :param s: :return: Attribution: @@ -55,9 +57,11 @@ def find_project_id_in_str(s: str) -> str: return str(project_ids[0]) + def _sanitize_staging_gs_path(path: str) -> str: return path.strip().strip("/") + def _parse_csv(csv_path:str): """ Parses the csv file and returns a list of staging areas @@ -81,9 +85,9 @@ def _parse_csv(csv_path:str): project_ids.add(project_id) - staging_gs_path = None if institution not in STAGING_AREA_BUCKETS: - raise Exception(f"Unknown institution {institution} found") + raise Exception(f"Unknown institution {institution} found. " + f"Make sure the institution is in the list of staging area buckets and is in all caps") institution_bucket = STAGING_AREA_BUCKETS[institution] path = institution_bucket + "/" + project_id @@ -95,8 +99,6 @@ def _parse_csv(csv_path:str): staging_gs_paths.add(staging_gs_path) - # print(f"These are the parsed staging_gs_paths {staging_gs_paths}") - # print(f"These are the parsed project_ids {project_ids}") return staging_gs_paths, project_ids @@ -130,14 +132,12 @@ def _get_latest_snapshots(target_snapshots: set[str], access_token: str): headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'} ) snapshot_response.raise_for_status() - # with open(f"response_{snapshot_name}.json", 'w') as outfile: - # # not sure that I need to dump this? - not sure I need to write an output file at all actually - # json.dump(snapshot_response.json(), outfile) latest_snapshot_id = snapshot_response.json()['items'][0]['id'] latest_snapshots.append(latest_snapshot_id) return latest_snapshots -# then for each snapshot get access url and add to a list of access urls for that snapshot + +# for each snapshot get access url and add to a list of access urls for that snapshot def get_access_urls(latest_snapshot_ids: list[str], access_token: str): for snapshot in latest_snapshot_ids: files_response = requests.get( @@ -145,11 +145,8 @@ def get_access_urls(latest_snapshot_ids: list[str], access_token: str): headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'} ) files_response.raise_for_status() - # TODO - don't need? 
- # with open(f"response_{snapshot}.json", 'w') as outfile: - # json.dump(files_response.json(), outfile) - # Extract file details from the JSON file and append them to a text file + # Extract file details from the JSON file and add them to a list list_of_access_urls = [] data = files_response.json() for item in data: @@ -157,43 +154,28 @@ def get_access_urls(latest_snapshot_ids: list[str], access_token: str): return list_of_access_urls - # with open(f"response_{snapshot}.json", 'r') as json_file: - # data = json.load(json_file) - # with open("list_of_access_urls.txt", 'a') as outfile: - # for item in data: - # outfile.write(item['fileDetail']['accessUrl'] + '\n') - def copy_tdr_to_staging(access_urls: list[str], staging_gs_paths: set[str]): - # TODO this will need to be modified to work one by one - see nesting below for staging_dir in staging_gs_paths: staging_data_dir = staging_dir + '/data/' - # output = subprocess.run(['gcloud', 'storage', 'ls', staging_data_dir], capture_output=True, text=True) - # TODO FIX THIS - it's not actually stopping if it's not empty - and it won't be entirely empty... need another method - # if output.stdout.strip() != '': - # print(f"Staging area {staging_data_dir} is not empty") - # print(output.stdout.strip()) - # else: - # print(f"Staging area {staging_data_dir} is empty - copying files now") + logging.info(f'staging_data_dir is {staging_data_dir}') + # using gsutil as output is cleaner & faster + output = subprocess.run(['gsutil', 'ls', staging_data_dir], capture_output=True) + stdout = output.stdout.strip() + files = stdout.decode('utf-8').split('\n') + if len(files) > 1: + logging.error(f"Staging area {staging_data_dir} is not empty") + logging.info(f"files in staging area are: {files}") + continue + else: + logging.info(f"Staging area {staging_data_dir} is empty - copying files now") for access_url in access_urls: - print(f"Copying {access_url} to {staging_data_dir}") - subprocess.run(['gcloud', 'storage', 'cp', access_url, staging_data_dir]) - # HM... why is copy hard? does it need /. at the end? or is it the access url that's wrong? - -# # Read the list of files from list_of_filepaths.txt and copy them using gcloud storage cp -# we should make sure the staging/data dir is empty before running this -# with open("list_of_access_urls.txt", 'r') as file: -# access_urls = file.read().splitlines() - -# # TODO -# # copy command will look something like\ -# # gcloud storage cp gs://datarepo-4bcb4408-bucket/2e2aac27-3bf5-4a89-b466-e563cf99aef2/07a78be1-c75f-4463-a1a4-d4f7f9771ca5/SRR3562314_2.fastq.gz gs://broad-dsp-monster-hca-prod-ebi-storage/broad_test_dataset/07e5ebc0-1386-4a33-8ce4-3007705adad8/data/. 
-# # Also need to construct the staging/data gs:// path from the manifest.csv -# # "EBI": "gs://broad-dsp-monster-hca-prod-ebi-storage/prod", -# # "UCSC": "gs://broad-dsp-monster-hca-prod-ebi-storage/prod", -# # "LANTERN": "gs://broad-dsp-monster-hca-prod-lantern", -# # "LATTICE": "gs://broad-dsp-monster-hca-prod-lattice/staging", -# for access_url in access_urls: -# subprocess.run(['gcloud storage', 'cp', access_url, ""]) + try: + # strip the filename from the access url because gcp is not a file system - it's all objects + filename = access_url.split('/')[-1] + print(f"Copying {access_url} to {staging_data_dir}{filename}") + subprocess.run(['gcloud', 'storage', 'cp', access_url, staging_data_dir + filename]) + except Exception as e: + logging.error(f"Error copying {access_url} to {staging_data_dir}{filename}: {e}") def main(): @@ -205,28 +187,29 @@ def main(): """ setup_cli_logging_format() access_token = get_access_token() + + # read in the manifest and parse out the staging gs paths and project ids csv_path = sys.argv[1] validate_input(csv_path) staging_gs_paths = _parse_csv(csv_path)[0] - print(f"staging_gs_paths are {staging_gs_paths}") + logging.info(f"staging_gs_paths are {staging_gs_paths}") project_ids = _parse_csv(csv_path)[1] - print(f"project_ids are {project_ids}") + logging.info(f"project_ids are {project_ids}") + + # get the target snapshot ids, based on standard HCA ingest naming conventions target_snapshots = _get_target_snapshot_ids(project_ids) - print(f"target snapshot ids are {target_snapshots}") + logging.info(f"target snapshot ids are {target_snapshots}") + + # get the latest snapshot ids for each target snapshot latest_snapshot_ids = _get_latest_snapshots(target_snapshots, access_token) - print(f"latest_snapshots_ids are {latest_snapshot_ids}") + logging.info(f"latest_snapshots_ids are {latest_snapshot_ids}") + + # get the access urls for each file in the snapshot access_urls = get_access_urls(latest_snapshot_ids, access_token) print(f"access_urls are {access_urls}") - copy_tdr_to_staging(access_urls, staging_gs_paths) - # ultimately - # for each project_id in project_ids - # get the staging gs path - # get the snapshot name - # get the latest snapshot id - # get the access url for each file in the snapshot - # for each file in the snapshot - # copy the file from the access to the staging area + # copy the files from the TDR project bucket to the staging area bucket + copy_tdr_to_staging(access_urls, staging_gs_paths) if __name__ == '__main__': diff --git a/scripts/tdr/copy_from_tdr_to_gcs/get_snapshot_files_and_transfer.sh b/scripts/tdr/copy_from_tdr_to_gcs_hca/get_snapshot_files_and_transfer.sh similarity index 100% rename from scripts/tdr/copy_from_tdr_to_gcs/get_snapshot_files_and_transfer.sh rename to scripts/tdr/copy_from_tdr_to_gcs_hca/get_snapshot_files_and_transfer.sh diff --git a/scripts/tdr/copy_from_tdr_to_gcs/requirements.txt b/scripts/tdr/copy_from_tdr_to_gcs_hca/requirements.txt similarity index 100% rename from scripts/tdr/copy_from_tdr_to_gcs/requirements.txt rename to scripts/tdr/copy_from_tdr_to_gcs_hca/requirements.txt From a4d7f2388b402abc4e9f57683673fa628cca1b5d Mon Sep 17 00:00:00 2001 From: bahill Date: Tue, 13 Aug 2024 16:24:54 -0400 Subject: [PATCH 05/14] adding docker build and push GitHub actions for copy_from_tdr_to_gcs_hca --- ...h_docker_copy_from_tdr_to_gcs_hca_dev.yaml | 40 +++++++++++++++++ ..._docker_copy_from_tdr_to_gcs_hca_main.yaml | 43 +++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 
.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml create mode 100644 .github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_main.yaml diff --git a/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml new file mode 100644 index 00000000..cee6d26f --- /dev/null +++ b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml @@ -0,0 +1,40 @@ +name: Build and Publish Dev Images for scripts/tdr/copy_from_tdr_to_gcs_hca +on: + push: + branches-ignore: [main] + paths: + - scripts/copy_from_tdr_to_gcs_hca/** + - .github/workflows/** +env: + GCP_PROJECT_ID: dsp-fieldeng-dev + GCP_REPOSITORY: horsefish + GITHUB_SHA: ${{ github.sha }} + +jobs: + build-and-push-dev-images: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + + - name: Login to GCP + uses: google-github-actions/auth@v1 + with: + credentials_json: ${{ secrets.BASE64_SAKEY_DSPFIELDENG_GARPUSHER }} + + - name: Configure Docker to use the Google Artifact Registry + run: gcloud auth configure-docker us-east4-docker.pkg.dev + + - name: Build and Push General Docker Image + run: | + docker build -t us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA -f scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile scripts/copy_from_tdr_to_gcs_hca + docker push us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA + + - name: Set image tag to 'dev' + run: | + docker tag us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:dev + docker push us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:dev \ No newline at end of file diff --git a/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_main.yaml b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_main.yaml new file mode 100644 index 00000000..4c987247 --- /dev/null +++ b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_main.yaml @@ -0,0 +1,43 @@ +name: Build and Publish Latest Images for scripts/tdr/copy_from_tdr_to_gcs_hca +on: + pull_request_target: + types: + - closed + branches: + - main + paths: + - scripts/tdr/copy_from_tdr_to_gcs_hca/** + - .github/workflows/** +env: + GCP_PROJECT_ID: dsp-fieldeng-dev + GCP_REPOSITORY: horsefish + GITHUB_SHA: ${{ github.sha }} + +jobs: + build-and-push-dev-images: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + + - name: Login to GCP + uses: google-github-actions/auth@v1 + with: + credentials_json: ${{ secrets.BASE64_SAKEY_DSPFIELDENG_GARPUSHER }} + + - name: Configure Docker to use the Google Artifact Registry + run: gcloud auth configure-docker us-east4-docker.pkg.dev + + - name: Build and Push copy_from_tdr_to_gcs_hca Docker Image + run: | + docker build -t us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA -f scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile scripts/tdr/copy_from_tdr_to_gcs_hca + docker push us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA + + - name: Set image tag to 'latest' + run: | + docker tag 
us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:latest + docker push us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:latest \ No newline at end of file From cfc2cded38e60eb5a082f273f839580b28effb4f Mon Sep 17 00:00:00 2001 From: bahill Date: Tue, 13 Aug 2024 16:32:57 -0400 Subject: [PATCH 06/14] A little more code re-org an optimization to be sure that we never mix up lists --- ...tdr_hca.py => copy_from_tdr_to_gcs_hca.py} | 150 +++++++++--------- 1 file changed, 75 insertions(+), 75 deletions(-) rename scripts/tdr/copy_from_tdr_to_gcs_hca/{from_bash_copy_from_tdr_hca.py => copy_from_tdr_to_gcs_hca.py} (51%) diff --git a/scripts/tdr/copy_from_tdr_to_gcs_hca/from_bash_copy_from_tdr_hca.py b/scripts/tdr/copy_from_tdr_to_gcs_hca/copy_from_tdr_to_gcs_hca.py similarity index 51% rename from scripts/tdr/copy_from_tdr_to_gcs_hca/from_bash_copy_from_tdr_hca.py rename to scripts/tdr/copy_from_tdr_to_gcs_hca/copy_from_tdr_to_gcs_hca.py index 61b8228e..058c859c 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs_hca/from_bash_copy_from_tdr_hca.py +++ b/scripts/tdr/copy_from_tdr_to_gcs_hca/copy_from_tdr_to_gcs_hca.py @@ -1,3 +1,6 @@ +# Description: This script copies data from Terra Data Repository (TDR) HCA project buckets to HCA staging area buckets. +# It is based on the bash script get_snapshot_files_and_transfer.sh, written by Samantha Velasquez. + import os import sys import csv @@ -16,8 +19,6 @@ "TEST": "gs://broad-dsp-monster-hca-prod-ebi-storage/broad_test_dataset" } -# TODO change prints to logging - def setup_cli_logging_format() -> None: logging.basicConfig(level=logging.INFO, format='%(message)s', stream=sys.stdout) @@ -70,8 +71,7 @@ def _parse_csv(csv_path:str): Attribution: https://github.com/DataBiosphere/hca-ingest/blob/main/orchestration/hca_manage/manifest.py """ - staging_gs_paths = set() - project_ids = set() + tuple_list = [] with open(csv_path, "r") as f: reader = csv.reader(f) for row in reader: @@ -83,8 +83,6 @@ def _parse_csv(csv_path:str): institution = row[0] project_id = find_project_id_in_str(row[1]) - project_ids.add(project_id) - if institution not in STAGING_AREA_BUCKETS: raise Exception(f"Unknown institution {institution} found. 
" f"Make sure the institution is in the list of staging area buckets and is in all caps") @@ -97,25 +95,11 @@ def _parse_csv(csv_path:str): assert path.startswith("gs://"), "Staging area path must start with gs:// scheme" staging_gs_path = path - staging_gs_paths.add(staging_gs_path) - - return staging_gs_paths, project_ids + tuple_list.append((staging_gs_path, project_id)) - -def _get_target_snapshot_ids(project_ids: set[str]) -> set[str]: - """ - This function gets the target snapshot name filters for the given project ids - :param project_ids: - :return: - """ - target_snapshots = set() - for project in project_ids: - target_snapshot = f"hca_prod_{project.replace('-', '')}" - target_snapshots.add(target_snapshot) - return target_snapshots + return tuple_list -# TODO: make this work in a Docker image def get_access_token(): creds, project = google.auth.default() auth_req = google.auth.transport.requests.Request() @@ -124,40 +108,40 @@ def get_access_token(): return access_token -def _get_latest_snapshots(target_snapshots: set[str], access_token: str): - latest_snapshots = [] - for snapshot_name in target_snapshots: - snapshot_response = requests.get( - f'https://data.terra.bio/api/repository/v1/snapshots?offset=0&limit=10&sort=created_date&direction=desc&filter={snapshot_name}', - headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'} - ) - snapshot_response.raise_for_status() - latest_snapshot_id = snapshot_response.json()['items'][0]['id'] - latest_snapshots.append(latest_snapshot_id) - return latest_snapshots +def get_latest_snapshot(target_snapshot: str, access_token: str): + snapshot_response = requests.get( + f'https://data.terra.bio/api/repository/v1/snapshots?offset=0&limit=10&sort=created_date&direction=desc&filter=' + f'{target_snapshot}', + headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'} + ) + snapshot_response.raise_for_status() + latest_snapshot_id = snapshot_response.json()['items'][0]['id'] + return latest_snapshot_id # for each snapshot get access url and add to a list of access urls for that snapshot -def get_access_urls(latest_snapshot_ids: list[str], access_token: str): - for snapshot in latest_snapshot_ids: - files_response = requests.get( - f'https://data.terra.bio/api/repository/v1/snapshots/{snapshot}/files?offset=0', - headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'} - ) - files_response.raise_for_status() - - # Extract file details from the JSON file and add them to a list - list_of_access_urls = [] - data = files_response.json() - for item in data: - list_of_access_urls.append(item['fileDetail']['accessUrl']) - return list_of_access_urls - - -def copy_tdr_to_staging(access_urls: list[str], staging_gs_paths: set[str]): +def get_access_urls(snapshot: str, access_token: str): + list_of_access_urls = [] + # for snapshot in latest_snapshot_ids: + logging.info(f"getting access urls for snapshot {snapshot}") + files_response = requests.get( + f'https://data.terra.bio/api/repository/v1/snapshots/{snapshot}/files?offset=0', + headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'} + ) + files_response.raise_for_status() + + # Extract file details from the JSON file and add them to a list + data = files_response.json() + for item in data: + list_of_access_urls.append(item['fileDetail']['accessUrl']) + return list_of_access_urls + + +def check_staging_is_empty(staging_gs_paths: set[str]): + nonempty_staging_areas = [] for staging_dir in staging_gs_paths: 
staging_data_dir = staging_dir + '/data/' - logging.info(f'staging_data_dir is {staging_data_dir}') + logging.info(f'checking contents of staging_data dir: {staging_data_dir}') # using gsutil as output is cleaner & faster output = subprocess.run(['gsutil', 'ls', staging_data_dir], capture_output=True) stdout = output.stdout.strip() @@ -165,17 +149,42 @@ def copy_tdr_to_staging(access_urls: list[str], staging_gs_paths: set[str]): if len(files) > 1: logging.error(f"Staging area {staging_data_dir} is not empty") logging.info(f"files in staging area are: {files}") - continue - else: - logging.info(f"Staging area {staging_data_dir} is empty - copying files now") + nonempty_staging_areas.append(staging_data_dir) + else: + logging.info(f"Staging area {staging_data_dir} is empty") + + if len(nonempty_staging_areas) > 0: + logging.error("One or more staging areas are not empty. Exiting.") + logging.info(f"Non-empty staging areas are: {nonempty_staging_areas}") + sys.exit(1) + + +def copy_tdr_to_staging(tuple_list: list[tuple[str, str]], access_token: str): + for project_id in set([x[1] for x in tuple_list]): + target_snapshot = f"hca_prod_{project_id.replace('-', '')}" + latest_snapshot_id = get_latest_snapshot(target_snapshot, access_token) + logging.info(f'latest snapshot id for project {project_id} is {latest_snapshot_id}') + access_urls = get_access_urls(latest_snapshot_id, access_token) + num_access_urls = len(access_urls) + staging_gs_path = [x[0] for x in tuple_list if x[1] == project_id][0] + staging_data_dir = staging_gs_path + '/data/' + logging.info(f'Copying {num_access_urls} files from snapshot {latest_snapshot_id} to staging area {staging_data_dir}') for access_url in access_urls: + # strip the filename from the access url because gcp is not a file system - it's all objects + filename = access_url.split('/')[-1] + logging.info(f'access_url for snapshot {latest_snapshot_id} is {access_url}') try: - # strip the filename from the access url because gcp is not a file system - it's all objects - filename = access_url.split('/')[-1] - print(f"Copying {access_url} to {staging_data_dir}{filename}") subprocess.run(['gcloud', 'storage', 'cp', access_url, staging_data_dir + filename]) except Exception as e: - logging.error(f"Error copying {access_url} to {staging_data_dir}{filename}: {e}") + logging.error(f'Error copying {access_url} to {staging_gs_path}{filename}: {e}') + continue + # visual summary of number of files copied + files_copied = subprocess.run(['gsutil', 'ls', staging_data_dir], + capture_output=True).stdout.decode('utf-8').split('\n') + # gsutil outputs the dir and a blank line, so we need to remove the blank line and the dir to count files + files_in_dir = [x.split('/')[-1] for x in files_copied if x and x.split('/')[-1]] + number_files_copied = len(files_in_dir) + logging.info(f'{number_files_copied} out of {num_access_urls} files copied to {staging_data_dir}') def main(): @@ -188,28 +197,19 @@ def main(): setup_cli_logging_format() access_token = get_access_token() - # read in the manifest and parse out the staging gs paths and project ids + # read in the manifest and get a tuple list of staging gs paths and project ids csv_path = sys.argv[1] validate_input(csv_path) - staging_gs_paths = _parse_csv(csv_path)[0] - logging.info(f"staging_gs_paths are {staging_gs_paths}") - project_ids = _parse_csv(csv_path)[1] - logging.info(f"project_ids are {project_ids}") - - # get the target snapshot ids, based on standard HCA ingest naming conventions - target_snapshots = 
_get_target_snapshot_ids(project_ids) - logging.info(f"target snapshot ids are {target_snapshots}") - - # get the latest snapshot ids for each target snapshot - latest_snapshot_ids = _get_latest_snapshots(target_snapshots, access_token) - logging.info(f"latest_snapshots_ids are {latest_snapshot_ids}") + tuple_list = _parse_csv(csv_path) + logging.info(f"staging_gs_paths and project id tuple list is {tuple_list}") - # get the access urls for each file in the snapshot - access_urls = get_access_urls(latest_snapshot_ids, access_token) - print(f"access_urls are {access_urls}") + # staging dir is the first element in each tuple + staging_gs_paths = set([x[0] for x in tuple_list]) + # check if the staging area is empty + check_staging_is_empty(staging_gs_paths) # copy the files from the TDR project bucket to the staging area bucket - copy_tdr_to_staging(access_urls, staging_gs_paths) + copy_tdr_to_staging(tuple_list, access_token) if __name__ == '__main__': From 25a5570290c04b7fd1b2689c4581f9f4b77055a7 Mon Sep 17 00:00:00 2001 From: bahill Date: Tue, 13 Aug 2024 16:33:22 -0400 Subject: [PATCH 07/14] update readme and add example file --- .../tdr/copy_from_tdr_to_gcs_hca/README.md | 48 +++++++++++++++---- .../dcpTEST_manifest.csv | 2 + 2 files changed, 41 insertions(+), 9 deletions(-) create mode 100644 scripts/tdr/copy_from_tdr_to_gcs_hca/dcpTEST_manifest.csv diff --git a/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md b/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md index ff96eb3b..5d1f7f7d 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md +++ b/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md @@ -2,16 +2,46 @@ This was originally a bash script written by Samantha Velasquez\ [get_snapshot_files_and_transfer.sh](get_snapshot_files_and_transfer.sh) \ which was written to copy files from a TDR snapshot to an Azure bucket.\ -Bobbie then translated to python using CoPilot.\ +Bobbie then translated to python using CoPilot and it ballooned from there.\ [copy_from_tdr_to_gcs.py](copy_from_tdr_to_gcs.py) \ +The bash script is now just here for posterity as it previously only lived in Slack. +It has not been tested in the Docker image created for the Python script. -Set up: -gcloud auth login -## TODO -- [ ] fix requirements.txt as needed -- [ ] update the script to copy to staging /data bucket -- [ ] update the script to take in a csv containing the institution & project UUID for HCA +## Running the Script +You will want to clone the whole horsefish repo, if you have not done so already.\ + +You will also need a manifest file to run the script.\ +The format of this manifest is identical to the one use for [HCA ingest](https://docs.google.com/document/d/1NQCDlvLgmkkveD4twX5KGv6SZUl8yaIBgz_l1EcrHdA/edit#heading=h.cg8d8o5kklql). 
+A sample manifest is provided in the project directory - dcpTEST_manifest.csv.\ +(Note that this is a test manifest and you will have to first load the data into TDR to use it - see the HCA ingest Ops manual linked above).\ +It's probably easiest to copy out the rows from the original ingest manifest into a new manifest, +then move that file into this project directory, so that it is picked up by compose.\ + +If you are not already logged in to gcloud/docker, you will need to do so before running the Docker compose command.\ +`gcloud auth application-default login` \ +`gcloud auth configure-docker us-east4-docker.pkg.dev` + +To start up the run/dev Docker compose env \ +`docker compose run app bash`\ +This will pull the latest image from Artifact Registry, start up the container, and mount the project dir, +so changes in your local project dir will be reflected in the container. + +Next you will want to authenticate with gcloud using your Broad credentials.\ +`gcloud auth login`\ +`gcloud config set project dsp-fieldeng-dev`* \ +`gcloud auth application-default login` \ +If you are not in dsp-fieldeng-dev +Then run the script using the following command syntax:\ +`python3 copy_from_tdr_to_gcs.py '` + +Contact Field Eng for any issues that arise. + + +## Possible improvements* - [ ] optional - update the script with conditional logic to accept a snapshot ID and destination instead - [ ] take care of any remaining TODOs in the script -- [ ] test/debug/update script -- [ ] add Dockerfile > push Docker image to artifact registry (check with Field Eng as to where to push) \ No newline at end of file +- [ ] update the script check lower case institution against lower case institution keys - see ~line 86 + +*this is likely to be used only rarely and mostly by the author, as a stop gap until partial updates have been implemented. +As such, we are attempting to keep this as light as possible, so as not to introduce unnecessary complexity. 
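For the lower-case institution check called out in the improvements list above, here is a minimal sketch of what that lookup could look like. It is only an illustration: the helper name is hypothetical, and the bucket mapping is the STAGING_AREA_BUCKETS dict already defined in the script.

```python
# Sketch only: case-insensitive institution lookup (hypothetical helper, not in the committed script).
def lookup_staging_bucket(institution: str, staging_area_buckets: dict[str, str]) -> str:
    """Return the staging bucket for an institution, ignoring case and surrounding whitespace."""
    buckets = {name.lower(): bucket for name, bucket in staging_area_buckets.items()}
    key = institution.strip().lower()
    if key not in buckets:
        raise Exception(f"Unknown institution {institution} found. "
                        f"Make sure the institution is in the list of staging area buckets")
    return buckets[key]

# Example, using the TEST entry from STAGING_AREA_BUCKETS:
# lookup_staging_bucket("test", STAGING_AREA_BUCKETS)
# -> "gs://broad-dsp-monster-hca-prod-ebi-storage/broad_test_dataset"
```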
+ diff --git a/scripts/tdr/copy_from_tdr_to_gcs_hca/dcpTEST_manifest.csv b/scripts/tdr/copy_from_tdr_to_gcs_hca/dcpTEST_manifest.csv new file mode 100644 index 00000000..fc2d672d --- /dev/null +++ b/scripts/tdr/copy_from_tdr_to_gcs_hca/dcpTEST_manifest.csv @@ -0,0 +1,2 @@ +TEST,003d5674-9bf6-4e51-ab1b-8fed80c308b9 +TEST,07e5ebc0-1386-4a33-8ce4-3007705adad8 \ No newline at end of file From df2dd223d7a7bd98d523ab036320d3e32895e607 Mon Sep 17 00:00:00 2001 From: bahill Date: Tue, 13 Aug 2024 16:33:48 -0400 Subject: [PATCH 08/14] add a Docker compose dev env --- .../tdr/copy_from_tdr_to_gcs_hca/Dockerfile | 49 +++++++++++++++++++ .../docker-compose.yaml | 10 ++++ 2 files changed, 59 insertions(+) create mode 100644 scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile create mode 100644 scripts/tdr/copy_from_tdr_to_gcs_hca/docker-compose.yaml diff --git a/scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile b/scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile new file mode 100644 index 00000000..932e5580 --- /dev/null +++ b/scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile @@ -0,0 +1,49 @@ +FROM us.gcr.io/broad-dsp-gcr-public/base/python:3.12-alpine + +ENV PATH /google-cloud-sdk/bin:$PATH +RUN if [ `uname -m` = 'x86_64' ]; then echo -n "x86_64" > /tmp/arch; else echo -n "arm" > /tmp/arch; fi; +RUN ARCH=`cat /tmp/arch` && apk --no-cache upgrade && apk --no-cache add \ + bash \ + curl \ + python3 \ + py3-crcmod \ + py3-openssl \ + bash \ + libc6-compat \ + openssh-client \ + git \ + gnupg \ + && curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-linux-x86_64.tar.gz && \ + tar xzf google-cloud-cli-linux-x86_64.tar.gz && \ + rm google-cloud-cli-linux-x86_64.tar.gz && \ + gcloud config set core/disable_usage_reporting true && \ + gcloud config set component_manager/disable_update_check true && \ + gcloud config set metrics/environment docker_image_alpine && \ + gcloud --version +RUN git config --system credential.'https://source.developers.google.com'.helper gcloud.sh +VOLUME ["/root/.config"] + +WORKDIR /scripts/tdr/copy_from_tdr_to_gcs_hca + +# copy the contents of /scripts/tdr/copy_from_tdr_to_gcs_hca to the WORKDIR +COPY * . + +RUN pip install -r requirements.txt + +ENV PYTHONPATH "/scripts:${PYTHONPATH}" +CMD ["/bin/bash"] + +# builds with GitHub Action "Main Validation and Release" ../.github/workflows/build-and-push_docker_main.yaml and ../.github/workflows/build-and-push_docker_dev.yaml +# tags = us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA, us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:latest + +# To manually build and run locally +# docker build -t us-east4-docker.pkg.dev/dsp-fieldeng-dev/horsefish/copy_from_tdr_to_gcs_hca: . 
+# docker run --rm -it us-east4-docker.pkg.dev/dsp-fieldeng-dev/horsefish/copy_from_tdr_to_gcs_hca: + +# to build and push to Artifact Registry +# make sure you are logged in to gcloud and that application default credentials are set +# gcloud auth login +# gcloud config set project dsp-fieldeng-dev +# gcloud auth application-default login +# set the before building and pushing +# docker push us-east4-docker.pkg.dev/dsp-fieldeng-dev/horsefish/copy_from_tdr_to_gcs_hca: \ No newline at end of file diff --git a/scripts/tdr/copy_from_tdr_to_gcs_hca/docker-compose.yaml b/scripts/tdr/copy_from_tdr_to_gcs_hca/docker-compose.yaml new file mode 100644 index 00000000..11951927 --- /dev/null +++ b/scripts/tdr/copy_from_tdr_to_gcs_hca/docker-compose.yaml @@ -0,0 +1,10 @@ +services: + app: + # for dev + # build: . + # or specify your dev sha or local image + image: us-east4-docker.pkg.dev/dsp-fieldeng-dev/horsefish/copy_from_tdr_to_gcs_hca:latest + container_name: copy_from_tdr_to_gcs_hca + command: bin/bash --reload + volumes: + - .:/scripts/tdr/copy_from_tdr_to_gcs_hca \ No newline at end of file From af9f6c67ee5aede7122802a1b258bed156d52ca5 Mon Sep 17 00:00:00 2001 From: bahill Date: Tue, 13 Aug 2024 16:41:00 -0400 Subject: [PATCH 09/14] fixed name of build step --- .../build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml index cee6d26f..1212ef9e 100644 --- a/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml +++ b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml @@ -29,7 +29,7 @@ jobs: - name: Configure Docker to use the Google Artifact Registry run: gcloud auth configure-docker us-east4-docker.pkg.dev - - name: Build and Push General Docker Image + - name: Build and Push copy_from_tdr_to_gcs_hca Docker Image run: | docker build -t us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA -f scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile scripts/copy_from_tdr_to_gcs_hca docker push us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA From aec4ef5e82ba9851d4a724339317d29c4b9c7826 Mon Sep 17 00:00:00 2001 From: bahill Date: Tue, 13 Aug 2024 16:41:26 -0400 Subject: [PATCH 10/14] got more specific about which file to watch --- .github/workflows/build_and_push_docker_gen_dev.yaml | 2 +- .github/workflows/build_and_push_docker_gen_main.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_push_docker_gen_dev.yaml b/.github/workflows/build_and_push_docker_gen_dev.yaml index 9f4d7ee2..3395052a 100644 --- a/.github/workflows/build_and_push_docker_gen_dev.yaml +++ b/.github/workflows/build_and_push_docker_gen_dev.yaml @@ -4,7 +4,7 @@ on: branches-ignore: [main] paths: - scripts/general/** - - .github/workflows/** + - .github/workflows/build_and_push_docker_gen_dev.yaml env: GCP_PROJECT_ID: dsp-fieldeng-dev GCP_REPOSITORY_GENERAL: horsefish diff --git a/.github/workflows/build_and_push_docker_gen_main.yaml b/.github/workflows/build_and_push_docker_gen_main.yaml index 151e6ef6..5de917a6 100644 --- a/.github/workflows/build_and_push_docker_gen_main.yaml +++ b/.github/workflows/build_and_push_docker_gen_main.yaml @@ -7,7 +7,7 @@ on: - main paths: - scripts/general/** - - .github/workflows/** + - 
.github/workflows/build_and_push_docker_gen_main.yaml env: GCP_PROJECT_ID: dsp-fieldeng-dev GCP_REPOSITORY_GENERAL: horsefish From c07af4a470d11b10d7d3c82dbb3d2cd0449db3da Mon Sep 17 00:00:00 2001 From: bahill Date: Tue, 13 Aug 2024 16:42:01 -0400 Subject: [PATCH 11/14] got more specific about which file to watch --- .../build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml | 2 +- .../build_and_push_docker_copy_from_tdr_to_gcs_hca_main.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml index 1212ef9e..714931e2 100644 --- a/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml +++ b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml @@ -4,7 +4,7 @@ on: branches-ignore: [main] paths: - scripts/copy_from_tdr_to_gcs_hca/** - - .github/workflows/** + - .github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml env: GCP_PROJECT_ID: dsp-fieldeng-dev GCP_REPOSITORY: horsefish diff --git a/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_main.yaml b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_main.yaml index 4c987247..fa0c95c3 100644 --- a/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_main.yaml +++ b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_main.yaml @@ -7,7 +7,7 @@ on: - main paths: - scripts/tdr/copy_from_tdr_to_gcs_hca/** - - .github/workflows/** + - .github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_main.yaml env: GCP_PROJECT_ID: dsp-fieldeng-dev GCP_REPOSITORY: horsefish From 69532e98b2140dea2c2625af656c21638c23e23c Mon Sep 17 00:00:00 2001 From: bahill Date: Tue, 13 Aug 2024 16:44:04 -0400 Subject: [PATCH 12/14] fix path for context --- ...h_docker_copy_from_tdr_to_gcs_hca_dev.yaml | 2 +- scripts/general/test.py | 22 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 scripts/general/test.py diff --git a/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml index 714931e2..eecad0eb 100644 --- a/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml +++ b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml @@ -31,7 +31,7 @@ jobs: - name: Build and Push copy_from_tdr_to_gcs_hca Docker Image run: | - docker build -t us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA -f scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile scripts/copy_from_tdr_to_gcs_hca + docker build -t us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA -f scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile scripts/tdr/copy_from_tdr_to_gcs_hca docker push us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA - name: Set image tag to 'dev' diff --git a/scripts/general/test.py b/scripts/general/test.py new file mode 100644 index 00000000..9cadcb2c --- /dev/null +++ b/scripts/general/test.py @@ -0,0 +1,22 @@ +# Usage: python test.py file1 file2 +# file1: file to loop through +# file2: file to search for matching string +# if no matching string found, print the string +# if found, do nothing +# output: print the string if no matching string found, print "found" if all strings are found + +import sys + +with open(sys.argv[1]) as 
f:
+    lines = [line.strip() for line in f.readlines()]
+
+with open(sys.argv[2], 'r') as f2:
+    data = f2.read()
+
+counter = 0
+for line in lines:
+    if line not in data:
+        print(f'Not Found: {line}')
+        counter += 1
+if counter == 0:
+    print("found")
\ No newline at end of file

From 4079e7f641dbd9a532cd9f7f1fd630b2618d8b52 Mon Sep 17 00:00:00 2001
From: bahill
Date: Tue, 13 Aug 2024 17:13:30 -0400
Subject: [PATCH 13/14] readme updates for accuracy and clarity

---
 scripts/tdr/copy_from_tdr_to_gcs_hca/README.md | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md b/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md
index 5d1f7f7d..5c6b6844 100644
--- a/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md
+++ b/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md
@@ -1,21 +1,25 @@
 # Copy from TDR to GCS
 This was originally a bash script written by Samantha Velasquez\
 [get_snapshot_files_and_transfer.sh](get_snapshot_files_and_transfer.sh) \
-which was written to copy files from a TDR snapshot to an Azure bucket.\
-Bobbie then translated to python using CoPilot and it ballooned from there.\
+which was written to copy files from a TDR snapshot to an Azure bucket. \
+Bobbie then translated to python using CoPilot and it ballooned from there. \
 [copy_from_tdr_to_gcs.py](copy_from_tdr_to_gcs.py) \
 The bash script is now just here for posterity as it previously only lived in Slack.
 It has not been tested in the Docker image created for the Python script.
 
 ## Running the Script
-You will want to clone the whole horsefish repo, if you have not done so already.\
+**IMPORTANT**\
+You will need to be in either the [Monster Group](https://groups.google.com/a/broadinstitute.org/g/monster)
+or the [Field Eng group](https://groups.google.com/a/broadinstitute.org/g/dsp-fieldeng) to run this script.
+
+You will want to clone the whole horsefish repo, if you have not done so already.
 You will also need a manifest file to run the script.\
 The format of this manifest is identical to the one use for [HCA ingest](https://docs.google.com/document/d/1NQCDlvLgmkkveD4twX5KGv6SZUl8yaIBgz_l1EcrHdA/edit#heading=h.cg8d8o5kklql).
 A sample manifest is provided in the project directory - dcpTEST_manifest.csv.\
 (Note that this is a test manifest and you will have to first load the data into TDR to use it - see the HCA ingest Ops manual linked above).\
 It's probably easiest to copy out the rows from the original ingest manifest into a new manifest,
-then move that file into this project directory, so that it is picked up by compose.\
+then move that file into this project directory, so that it is picked up by compose.
 
 If you are not already logged in to gcloud/docker, you will need to do so before running the Docker compose command.\
 `gcloud auth application-default login` \
@@ -32,9 +36,10 @@ Next you will want to authenticate with gcloud using your Broad credentials.\
 `gcloud auth application-default login` \
+If you are not in dsp-fieldeng-dev
 Then run the script using the following command syntax:\
-`python3 copy_from_tdr_to_gcs.py '`
+`python3 copy_from_tdr_to_gcs_hca.py '`
 
-Contact Field Eng for any issues that arise.
+Contact Field Eng for any issues that arise. 
\ +_*or the monster hca prod project - mystical-slate-284720_ ## Possible improvements* From d647b61331b79f9e871bf638c2e4131b40bc8c10 Mon Sep 17 00:00:00 2001 From: bahill Date: Wed, 14 Aug 2024 10:44:42 -0400 Subject: [PATCH 14/14] moved comments in Dockerfile to readme and updated readme with possible future improvements --- .../tdr/copy_from_tdr_to_gcs_hca/Dockerfile | 15 ----------- .../tdr/copy_from_tdr_to_gcs_hca/README.md | 25 ++++++++++++++++--- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile b/scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile index 932e5580..db91a1e2 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile +++ b/scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile @@ -32,18 +32,3 @@ RUN pip install -r requirements.txt ENV PYTHONPATH "/scripts:${PYTHONPATH}" CMD ["/bin/bash"] - -# builds with GitHub Action "Main Validation and Release" ../.github/workflows/build-and-push_docker_main.yaml and ../.github/workflows/build-and-push_docker_dev.yaml -# tags = us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA, us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:latest - -# To manually build and run locally -# docker build -t us-east4-docker.pkg.dev/dsp-fieldeng-dev/horsefish/copy_from_tdr_to_gcs_hca: . -# docker run --rm -it us-east4-docker.pkg.dev/dsp-fieldeng-dev/horsefish/copy_from_tdr_to_gcs_hca: - -# to build and push to Artifact Registry -# make sure you are logged in to gcloud and that application default credentials are set -# gcloud auth login -# gcloud config set project dsp-fieldeng-dev -# gcloud auth application-default login -# set the before building and pushing -# docker push us-east4-docker.pkg.dev/dsp-fieldeng-dev/horsefish/copy_from_tdr_to_gcs_hca: \ No newline at end of file diff --git a/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md b/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md index 5c6b6844..eacbb524 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md +++ b/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md @@ -41,11 +41,30 @@ Then run the script using the following command syntax:\ Contact Field Eng for any issues that arise. 
\
_*or the monster hca prod project - mystical-slate-284720_

## Building the Docker Image
The image is built and pushed by the GitHub Actions workflows
../.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_main.yaml and
../.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml. \
Tags: us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA and
us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:latest

### To manually build and run locally
`docker build -t us-east4-docker.pkg.dev/dsp-fieldeng-dev/horsefish/copy_from_tdr_to_gcs_hca: .` \
`docker run --rm -it us-east4-docker.pkg.dev/dsp-fieldeng-dev/horsefish/copy_from_tdr_to_gcs_hca:`

### To build and push to Artifact Registry
- Make sure you are logged in to gcloud and that application default credentials are set: \
`gcloud auth login` \
`gcloud config set project dsp-fieldeng-dev` \
`gcloud auth application-default login`
- Set the image tag before building and pushing: \
`docker push us-east4-docker.pkg.dev/dsp-fieldeng-dev/horsefish/copy_from_tdr_to_gcs_hca:`

## Possible improvements*

- update the script with conditional logic to accept a snapshot ID and destination instead
- update the script to check the lower-cased institution against the lower-cased institution keys - see ~line 86 and the sketch below
- update the script to merge `validate_input()` and `_parse_csv()` into one function
- Consider adding a copy manifest to this command, so that instead of just validating the number of files copied (line 187), you can specifically highlight the files that were not copied successfully.

*this is likely to be used only rarely and mostly by the author, as a stop gap until partial updates have been implemented. As such, we are attempting to keep this as light as possible, so as not to introduce unnecessary complexity.
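For the case-insensitive institution lookup suggested in the improvements list, a minimal sketch of one way it could work is shown below. This is illustrative only: the helper name `resolve_staging_bucket` and the placeholder bucket values are hypothetical, not part of the existing script, which keeps its own mapping of institution names to staging-area buckets.

```python
# Minimal sketch (hypothetical helper, not the script's actual code):
# resolve the staging bucket for an institution while ignoring the casing
# used in the manifest's institution column.

def resolve_staging_bucket(buckets: dict, env: str, institution: str) -> str:
    """Return the staging bucket for an institution, matching keys case-insensitively."""
    # Lower-cased view of the keys so "test", "TEST", and "Test" all resolve the same way.
    by_lower = {name.lower(): path for name, path in buckets[env].items()}
    try:
        return by_lower[institution.strip().lower()]
    except KeyError:
        raise ValueError(f"Unknown institution {institution!r} for env {env!r}")


if __name__ == "__main__":
    # Placeholder values for illustration only.
    example_buckets = {"prod": {"TEST": "gs://example-test-staging-bucket/broad_test_dataset"}}
    print(resolve_staging_bucket(example_buckets, "prod", "test"))
```

The same lower-casing could be applied where the manifest rows are parsed, so the rest of the script never has to care how the institution column is capitalized.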