From b069a0a80a14522f754250645fd4acf8c3227bcb Mon Sep 17 00:00:00 2001 From: bahill Date: Tue, 23 Jul 2024 14:18:22 -0400 Subject: [PATCH 01/14] pushing changes to remote --- .../from_bash_copy_from_tdr.py | 145 ++++++++++++------ .../tdr/copy_from_tdr_to_gcs/requirements.txt | 4 - 2 files changed, 102 insertions(+), 47 deletions(-) diff --git a/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py b/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py index eeeacd7a..f6bfc563 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py +++ b/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py @@ -1,69 +1,128 @@ import os import sys +import csv import json import requests import subprocess from google.auth import compute_engine from google.auth.transport.requests import Request +STAGING_AREA_BUCKETS = { + "prod": { + "EBI": "gs://broad-dsp-monster-hca-prod-ebi-storage/prod", + "UCSC": "gs://broad-dsp-monster-hca-prod-ebi-storage/prod", + "LANTERN": "gs://broad-dsp-monster-hca-prod-lantern", + "LATTICE": "gs://broad-dsp-monster-hca-prod-lattice/staging", + "TEST": "gs://broad-dsp-monster-hca-prod-ebi-storage/broad_test_dataset" + } +} + # input should be a manifest csv of those projects that need data copied back -# Check if a filename is provided as an argument +# Check if n csv_path is provided as an argument if len(sys.argv) != 2: - print("Usage: python3 script.py ") + print("Usage: python3 copy_from_tdr.py ") sys.exit(1) -filename = sys.argv[1] +csv_path = sys.argv[1] # Check if the file exists -if not os.path.isfile(filename): - print(f"File {filename} not found") +if not os.path.isfile(csv_path): + print(f"{csv_path} not found") + sys.exit(1) + +# Check if the file is a csv +if not csv_path.endswith('.csv'): + print(f"{csv_path} is not a csv file") sys.exit(1) +def _parse_csv(csv_path:str): + keys = set() + with open(csv_path, "r") as f: + reader = csv.reader(f) + for row in reader: + if not row: + logging.debug("Empty path detected, skipping") + continue + + assert len(row) == 2 + institution = row[0] + project_id = find_project_id_in_str(row[1]) + + key = None + if project_id_only: + project_id = row[1] + key = project_id + else: + + if institution not in STAGING_AREA_BUCKETS[env]: + raise Exception(f"Unknown institution {institution} found") + + institution_bucket = STAGING_AREA_BUCKETS[env][institution] + path = institution_bucket + "/" + project_id + + # sanitize and dedupe + path = _sanitize_gs_path(path) + assert path.startswith("gs://"), "Staging area path must start with gs:// scheme" + key = path + + if include_release_tag: + key = key + f",{release_tag}" + keys.add(key) + + chunked_paths = chunked(keys, MAX_STAGING_AREAS_PER_PARTITION_SET) + return [chunk for chunk in chunked_paths] + # Read the file line by line -with open(filename, 'r') as file: +with open csv_path, 'r') as file: lines = file.read().splitlines() + # + +print(lines) # TODO create the list of snapshot IDs from the list of UUIDs # use manifest.csv to get the UUIDs that need data copied back # for each UUID, construct the snapshot name - which will be latest snapshot with a dataset id like # "hca_prod_*" -# use those dataset ids to get the latest snapshot id for each dataset -# this then becomes the lines list +# use those snapshot ids to get the latest snapshot id for each dataset +# this then becomes the snapshot list + +snapshots = [] +for line in lines: # TODO - is this needed? or can we just run locally as Monster members? 
# Get access token -credentials = compute_engine.Credentials() -credentials.refresh(Request()) -access_token = credentials.token - -for snapshot in lines: - # Make request to the API with the current snapshot - response = requests.get(f"https://data.terra.bio/api/repository/v1/snapshots/{snapshot}/files?offset=0&limit=10000", - headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'}) - - # Write the response to a JSON file - with open(f"response_{snapshot}.json", 'w') as outfile: - json.dump(response.json(), outfile) - - # Extract file details from the JSON file and append them to a text file - with open(f"response_{snapshot}.json", 'r') as json_file: - data = json.load(json_file) - with open("list_of_access_urls.txt", 'a') as outfile: - for item in data: - outfile.write(item['fileDetail']['accessUrl'] + '\n') - - -# Read the list of files from list_of_filepaths.txt and copy them using gcloud storage cp -with open("list_of_access_urls.txt", 'r') as file: - access_urls = file.read().splitlines() - -# TODO -# copy command will look something like\ -# gcloud storage cp gs://datarepo-4bcb4408-bucket/2e2aac27-3bf5-4a89-b466-e563cf99aef2/07a78be1-c75f-4463-a1a4-d4f7f9771ca5/SRR3562314_2.fastq.gz gs://broad-dsp-monster-hca-prod-ebi-storage/broad_test_dataset/07e5ebc0-1386-4a33-8ce4-3007705adad8/data/. -# Also need to construct the staging/data gs:// path from the manifest.csv -# "EBI": "gs://broad-dsp-monster-hca-prod-ebi-storage/prod", -# "UCSC": "gs://broad-dsp-monster-hca-prod-ebi-storage/prod", -# "LANTERN": "gs://broad-dsp-monster-hca-prod-lantern", -# "LATTICE": "gs://broad-dsp-monster-hca-prod-lattice/staging", -for access_url in access_urls: - subprocess.run(['gcloud storage', 'cp', access_url, ""]) \ No newline at end of file +# credentials = compute_engine.Credentials() +# credentials.refresh(Request()) +# access_token = credentials.token + +# for snapshot in lines: +# # Make request to the API with the current snapshot +# response = requests.get(f"https://data.terra.bio/api/repository/v1/snapshots/{snapshot}/files?offset=0&limit=10000", +# headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'}) +# +# # Write the response to a JSON file +# with open(f"response_{snapshot}.json", 'w') as outfile: +# json.dump(response.json(), outfile) +# +# # Extract file details from the JSON file and append them to a text file +# with open(f"response_{snapshot}.json", 'r') as json_file: +# data = json.load(json_file) +# with open("list_of_access_urls.txt", 'a') as outfile: +# for item in data: +# outfile.write(item['fileDetail']['accessUrl'] + '\n') +# +# +# # Read the list of files from list_of_filepaths.txt and copy them using gcloud storage cp +# with open("list_of_access_urls.txt", 'r') as file: +# access_urls = file.read().splitlines() + +# # TODO +# # copy command will look something like\ +# # gcloud storage cp gs://datarepo-4bcb4408-bucket/2e2aac27-3bf5-4a89-b466-e563cf99aef2/07a78be1-c75f-4463-a1a4-d4f7f9771ca5/SRR3562314_2.fastq.gz gs://broad-dsp-monster-hca-prod-ebi-storage/broad_test_dataset/07e5ebc0-1386-4a33-8ce4-3007705adad8/data/. 
+# # Also need to construct the staging/data gs:// path from the manifest.csv +# # "EBI": "gs://broad-dsp-monster-hca-prod-ebi-storage/prod", +# # "UCSC": "gs://broad-dsp-monster-hca-prod-ebi-storage/prod", +# # "LANTERN": "gs://broad-dsp-monster-hca-prod-lantern", +# # "LATTICE": "gs://broad-dsp-monster-hca-prod-lattice/staging", +# for access_url in access_urls: +# subprocess.run(['gcloud storage', 'cp', access_url, ""]) diff --git a/scripts/tdr/copy_from_tdr_to_gcs/requirements.txt b/scripts/tdr/copy_from_tdr_to_gcs/requirements.txt index 190fd236..fa3af485 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs/requirements.txt +++ b/scripts/tdr/copy_from_tdr_to_gcs/requirements.txt @@ -3,9 +3,5 @@ # source venv/bin/activate # pip install -r requirements.txt -os -sys -json requests -subprocess google-auth \ No newline at end of file From 48ddc849a26221a595eda1f302c7651a2e81276e Mon Sep 17 00:00:00 2001 From: bahill Date: Wed, 31 Jul 2024 17:52:34 -0400 Subject: [PATCH 02/14] continuing to convert pseudo code to functional code --- .../from_bash_copy_from_tdr.py | 206 +++++++++++------- .../tdr/copy_from_tdr_to_gcs/requirements.txt | 3 +- 2 files changed, 131 insertions(+), 78 deletions(-) diff --git a/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py b/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py index f6bfc563..4e943b37 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py +++ b/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py @@ -1,42 +1,71 @@ import os import sys import csv +import logging import json +import re import requests +import google.auth +import google.auth.transport.requests import subprocess -from google.auth import compute_engine -from google.auth.transport.requests import Request STAGING_AREA_BUCKETS = { - "prod": { "EBI": "gs://broad-dsp-monster-hca-prod-ebi-storage/prod", "UCSC": "gs://broad-dsp-monster-hca-prod-ebi-storage/prod", "LANTERN": "gs://broad-dsp-monster-hca-prod-lantern", "LATTICE": "gs://broad-dsp-monster-hca-prod-lattice/staging", "TEST": "gs://broad-dsp-monster-hca-prod-ebi-storage/broad_test_dataset" - } } -# input should be a manifest csv of those projects that need data copied back -# Check if n csv_path is provided as an argument -if len(sys.argv) != 2: - print("Usage: python3 copy_from_tdr.py ") - sys.exit(1) - -csv_path = sys.argv[1] - -# Check if the file exists -if not os.path.isfile(csv_path): - print(f"{csv_path} not found") - sys.exit(1) - -# Check if the file is a csv -if not csv_path.endswith('.csv'): - print(f"{csv_path} is not a csv file") - sys.exit(1) +def setup_cli_logging_format() -> None: + logging.basicConfig(level=logging.INFO, format='%(message)s', stream=sys.stdout) + +def validate_input(csv_path:str): + """ + input should be a manifest csv of those projects that need data copied back + format is , + """ + if not os.path.isfile(csv_path): + logging.debug(f"{csv_path} not found") + sys.exit(1) + + if not csv_path.endswith('.csv'): + logging.debug(f"{csv_path} is not a csv file") + sys.exit(1) + + else: + return csv_path + +def find_project_id_in_str(s: str) -> str: + """ + The selected function find_project_id_in_str(s: str) -> str: + is used to extract a UUID (Universally Unique Identifier) from a given string s. 
+ :param s: + :return: + Attribution: + https://github.com/DataBiosphere/hca-ingest/blob/main/orchestration/hca_orchestration/support/matchers.py + """ + uuid_matcher = re.compile('[a-f0-9]{8}-?[a-f0-9]{4}-?4[a-f0-9]{3}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}', re.I) + project_ids = uuid_matcher.findall(s) + + if len(project_ids) != 1: + raise Exception(f"Found more than one or zero project UUIDs in {s}") + + return str(project_ids[0]) + +def _sanitize_gs_path(path: str) -> str: + return path.strip().strip("/") def _parse_csv(csv_path:str): - keys = set() + """ + Parses the csv file and returns a list of staging areas + :param csv_path: + :return: + Attribution: + https://github.com/DataBiosphere/hca-ingest/blob/main/orchestration/hca_manage/manifest.py + """ + gs_paths = set() + project_ids = set() with open(csv_path, "r") as f: reader = csv.reader(f) for row in reader: @@ -48,61 +77,60 @@ def _parse_csv(csv_path:str): institution = row[0] project_id = find_project_id_in_str(row[1]) - key = None - if project_id_only: - project_id = row[1] - key = project_id - else: - - if institution not in STAGING_AREA_BUCKETS[env]: - raise Exception(f"Unknown institution {institution} found") - - institution_bucket = STAGING_AREA_BUCKETS[env][institution] - path = institution_bucket + "/" + project_id - - # sanitize and dedupe - path = _sanitize_gs_path(path) - assert path.startswith("gs://"), "Staging area path must start with gs:// scheme" - key = path - - if include_release_tag: - key = key + f",{release_tag}" - keys.add(key) - - chunked_paths = chunked(keys, MAX_STAGING_AREAS_PER_PARTITION_SET) - return [chunk for chunk in chunked_paths] - -# Read the file line by line -with open csv_path, 'r') as file: - lines = file.read().splitlines() - # - -print(lines) - -# TODO create the list of snapshot IDs from the list of UUIDs -# use manifest.csv to get the UUIDs that need data copied back -# for each UUID, construct the snapshot name - which will be latest snapshot with a dataset id like -# "hca_prod_*" -# use those snapshot ids to get the latest snapshot id for each dataset -# this then becomes the snapshot list - -snapshots = [] -for line in lines: - -# TODO - is this needed? or can we just run locally as Monster members? 
-# Get access token -# credentials = compute_engine.Credentials() -# credentials.refresh(Request()) -# access_token = credentials.token - -# for snapshot in lines: -# # Make request to the API with the current snapshot -# response = requests.get(f"https://data.terra.bio/api/repository/v1/snapshots/{snapshot}/files?offset=0&limit=10000", -# headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'}) -# -# # Write the response to a JSON file -# with open(f"response_{snapshot}.json", 'w') as outfile: -# json.dump(response.json(), outfile) + project_ids.add(project_id) + + gs_path = None + if institution not in STAGING_AREA_BUCKETS: + raise Exception(f"Unknown institution {institution} found") + + institution_bucket = STAGING_AREA_BUCKETS[institution] + path = institution_bucket + "/" + project_id + + # sanitize and dedupe + path = _sanitize_gs_path(path) + assert path.startswith("gs://"), "Staging area path must start with gs:// scheme" + gs_path = path + + gs_paths.add(gs_path) + + # print(f"These are the parsed gs_paths {gs_paths}") + # print(f"These are the parsed project_ids {project_ids}") + return gs_paths, project_ids + + +def _get_target_snapshot_ids(project_ids: set[str]) -> set[str]: + """ + This function gets the target snapshot name filters for the given project ids + :param project_ids: + :return: + """ + target_snapshots = set() + for project in project_ids: + target_snapshot = f"hca_prod_{project.replace('-', '')}" + target_snapshots.add(target_snapshot) + return target_snapshots + + +# TODO: make this work in a Docker image +def get_access_token(): + creds, project = google.auth.default() + auth_req = google.auth.transport.requests.Request() + creds.refresh(auth_req) + access_token = creds.token + return access_token + + +def _get_latest_snapshot(target_snapshots: set[str], access_token: str): + for snapshot_name in target_snapshots: + response = requests.get(f'https://data.terra.bio/api/repository/v1/snapshots?sort=createdDate,' + f'desc&limit=1&filter={snapshot_name}', + headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'}) + response.raise_for_status() + with open(f"response_{snapshot_name}.json", 'w') as outfile: + json.dump(response.json(), outfile) + +# TODO 8/1/24 - got a 400 for invalid filter - yay! +# need to add note that you need to gcloud auth to run this. # # # Extract file details from the JSON file and append them to a text file # with open(f"response_{snapshot}.json", 'r') as json_file: @@ -126,3 +154,27 @@ def _parse_csv(csv_path:str): # # "LATTICE": "gs://broad-dsp-monster-hca-prod-lattice/staging", # for access_url in access_urls: # subprocess.run(['gcloud storage', 'cp', access_url, ""]) + + +def main(): + """Parse command-line arguments and run specified tool. + + Note: Does not take explicit input arguments, but uses sys.argv inputs + from the command line. 
+ + """ + setup_cli_logging_format() + access_token = get_access_token() + csv_path = sys.argv[1] + validate_input(csv_path) + gs_paths = _parse_csv(csv_path)[0] + print(f"gs_paths are {gs_paths}") + project_ids = _parse_csv(csv_path)[1] + print(f"project_ids are {project_ids}") + target_snapshots = _get_target_snapshot_ids(project_ids) + print(f"target snapshot ids are {target_snapshots}") + tdr_data_path = _get_latest_snapshot(target_snapshots, access_token) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/scripts/tdr/copy_from_tdr_to_gcs/requirements.txt b/scripts/tdr/copy_from_tdr_to_gcs/requirements.txt index fa3af485..44950ab8 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs/requirements.txt +++ b/scripts/tdr/copy_from_tdr_to_gcs/requirements.txt @@ -4,4 +4,5 @@ # pip install -r requirements.txt requests -google-auth \ No newline at end of file +google-cloud +google-auth From ab4298aa4bf86d238622b451164847ac38bc1b22 Mon Sep 17 00:00:00 2001 From: bahill Date: Wed, 7 Aug 2024 19:30:30 -0400 Subject: [PATCH 03/14] continuing to convert pseudo code to functional code --- scripts/tdr/copy_from_tdr_to_gcs/README.md | 3 +- .../from_bash_copy_from_tdr.py | 113 +++++++++++++----- 2 files changed, 85 insertions(+), 31 deletions(-) diff --git a/scripts/tdr/copy_from_tdr_to_gcs/README.md b/scripts/tdr/copy_from_tdr_to_gcs/README.md index 938e541d..2b64aeb1 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs/README.md +++ b/scripts/tdr/copy_from_tdr_to_gcs/README.md @@ -4,8 +4,9 @@ This was originally a bash script written by Samantha Velasquez\ which was written to copy files from a TDR snapshot to an Azure bucket.\ Bobbie then translated to python using CoPilot.\ [copy_from_tdr_to_gcs.py](copy_from_tdr_to_gcs.py) \ -**This script is not yet tested.** +Set up: +gcloud auth login ## TODO - [ ] fix requirements.txt as needed diff --git a/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py b/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py index 4e943b37..ae277f58 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py +++ b/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py @@ -17,6 +17,8 @@ "TEST": "gs://broad-dsp-monster-hca-prod-ebi-storage/broad_test_dataset" } +# TODO change prints to logging + def setup_cli_logging_format() -> None: logging.basicConfig(level=logging.INFO, format='%(message)s', stream=sys.stdout) @@ -53,7 +55,7 @@ def find_project_id_in_str(s: str) -> str: return str(project_ids[0]) -def _sanitize_gs_path(path: str) -> str: +def _sanitize_staging_gs_path(path: str) -> str: return path.strip().strip("/") def _parse_csv(csv_path:str): @@ -64,7 +66,7 @@ def _parse_csv(csv_path:str): Attribution: https://github.com/DataBiosphere/hca-ingest/blob/main/orchestration/hca_manage/manifest.py """ - gs_paths = set() + staging_gs_paths = set() project_ids = set() with open(csv_path, "r") as f: reader = csv.reader(f) @@ -79,7 +81,7 @@ def _parse_csv(csv_path:str): project_ids.add(project_id) - gs_path = None + staging_gs_path = None if institution not in STAGING_AREA_BUCKETS: raise Exception(f"Unknown institution {institution} found") @@ -87,15 +89,15 @@ def _parse_csv(csv_path:str): path = institution_bucket + "/" + project_id # sanitize and dedupe - path = _sanitize_gs_path(path) + path = _sanitize_staging_gs_path(path) assert path.startswith("gs://"), "Staging area path must start with gs:// scheme" - gs_path = path + staging_gs_path = path - gs_paths.add(gs_path) + staging_gs_paths.add(staging_gs_path) - # print(f"These are the parsed 
gs_paths {gs_paths}") + # print(f"These are the parsed staging_gs_paths {staging_gs_paths}") # print(f"These are the parsed project_ids {project_ids}") - return gs_paths, project_ids + return staging_gs_paths, project_ids def _get_target_snapshot_ids(project_ids: set[str]) -> set[str]: @@ -120,27 +122,65 @@ def get_access_token(): return access_token -def _get_latest_snapshot(target_snapshots: set[str], access_token: str): +def _get_latest_snapshots(target_snapshots: set[str], access_token: str): + latest_snapshots = [] for snapshot_name in target_snapshots: - response = requests.get(f'https://data.terra.bio/api/repository/v1/snapshots?sort=createdDate,' - f'desc&limit=1&filter={snapshot_name}', - headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'}) - response.raise_for_status() - with open(f"response_{snapshot_name}.json", 'w') as outfile: - json.dump(response.json(), outfile) - -# TODO 8/1/24 - got a 400 for invalid filter - yay! -# need to add note that you need to gcloud auth to run this. -# -# # Extract file details from the JSON file and append them to a text file -# with open(f"response_{snapshot}.json", 'r') as json_file: -# data = json.load(json_file) -# with open("list_of_access_urls.txt", 'a') as outfile: -# for item in data: -# outfile.write(item['fileDetail']['accessUrl'] + '\n') -# -# + snapshot_response = requests.get( + f'https://data.terra.bio/api/repository/v1/snapshots?offset=0&limit=10&sort=created_date&direction=desc&filter={snapshot_name}', + headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'} + ) + snapshot_response.raise_for_status() + # with open(f"response_{snapshot_name}.json", 'w') as outfile: + # # not sure that I need to dump this? - not sure I need to write an output file at all actually + # json.dump(snapshot_response.json(), outfile) + latest_snapshot_id = snapshot_response.json()['items'][0]['id'] + latest_snapshots.append(latest_snapshot_id) + return latest_snapshots + +# then for each snapshot get access url and add to a list of access urls for that snapshot +def get_access_urls(latest_snapshot_ids: list[str], access_token: str): + for snapshot in latest_snapshot_ids: + files_response = requests.get( + f'https://data.terra.bio/api/repository/v1/snapshots/{snapshot}/files?offset=0', + headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'} + ) + files_response.raise_for_status() + # TODO - don't need? + # with open(f"response_{snapshot}.json", 'w') as outfile: + # json.dump(files_response.json(), outfile) + + # Extract file details from the JSON file and append them to a text file + list_of_access_urls = [] + data = files_response.json() + for item in data: + list_of_access_urls.append(item['fileDetail']['accessUrl']) + return list_of_access_urls + + + # with open(f"response_{snapshot}.json", 'r') as json_file: + # data = json.load(json_file) + # with open("list_of_access_urls.txt", 'a') as outfile: + # for item in data: + # outfile.write(item['fileDetail']['accessUrl'] + '\n') + +def copy_tdr_to_staging(access_urls: list[str], staging_gs_paths: set[str]): + # TODO this will need to be modified to work one by one - see nesting below + for staging_dir in staging_gs_paths: + staging_data_dir = staging_dir + '/data/' + # output = subprocess.run(['gcloud', 'storage', 'ls', staging_data_dir], capture_output=True, text=True) + # TODO FIX THIS - it's not actually stopping if it's not empty - and it won't be entirely empty... 
need another method + # if output.stdout.strip() != '': + # print(f"Staging area {staging_data_dir} is not empty") + # print(output.stdout.strip()) + # else: + # print(f"Staging area {staging_data_dir} is empty - copying files now") + for access_url in access_urls: + print(f"Copying {access_url} to {staging_data_dir}") + subprocess.run(['gcloud', 'storage', 'cp', access_url, staging_data_dir]) + # HM... why is copy hard? does it need /. at the end? or is it the access url that's wrong? + # # Read the list of files from list_of_filepaths.txt and copy them using gcloud storage cp +# we should make sure the staging/data dir is empty before running this # with open("list_of_access_urls.txt", 'r') as file: # access_urls = file.read().splitlines() @@ -167,13 +207,26 @@ def main(): access_token = get_access_token() csv_path = sys.argv[1] validate_input(csv_path) - gs_paths = _parse_csv(csv_path)[0] - print(f"gs_paths are {gs_paths}") + staging_gs_paths = _parse_csv(csv_path)[0] + print(f"staging_gs_paths are {staging_gs_paths}") project_ids = _parse_csv(csv_path)[1] print(f"project_ids are {project_ids}") target_snapshots = _get_target_snapshot_ids(project_ids) print(f"target snapshot ids are {target_snapshots}") - tdr_data_path = _get_latest_snapshot(target_snapshots, access_token) + latest_snapshot_ids = _get_latest_snapshots(target_snapshots, access_token) + print(f"latest_snapshots_ids are {latest_snapshot_ids}") + access_urls = get_access_urls(latest_snapshot_ids, access_token) + print(f"access_urls are {access_urls}") + copy_tdr_to_staging(access_urls, staging_gs_paths) + + # ultimately + # for each project_id in project_ids + # get the staging gs path + # get the snapshot name + # get the latest snapshot id + # get the access url for each file in the snapshot + # for each file in the snapshot + # copy the file from the access to the staging area if __name__ == '__main__': From efe1b04a8373d61483ecb5b8bad189bb5d637565 Mon Sep 17 00:00:00 2001 From: bahill Date: Mon, 12 Aug 2024 11:34:04 -0400 Subject: [PATCH 04/14] cleaning up code and updated script names --- .../README.md | 1 - .../from_bash_copy_from_tdr_hca.py} | 105 ++++++++---------- .../get_snapshot_files_and_transfer.sh | 0 .../requirements.txt | 0 4 files changed, 44 insertions(+), 62 deletions(-) rename scripts/tdr/{copy_from_tdr_to_gcs => copy_from_tdr_to_gcs_hca}/README.md (99%) rename scripts/tdr/{copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py => copy_from_tdr_to_gcs_hca/from_bash_copy_from_tdr_hca.py} (62%) rename scripts/tdr/{copy_from_tdr_to_gcs => copy_from_tdr_to_gcs_hca}/get_snapshot_files_and_transfer.sh (100%) rename scripts/tdr/{copy_from_tdr_to_gcs => copy_from_tdr_to_gcs_hca}/requirements.txt (100%) diff --git a/scripts/tdr/copy_from_tdr_to_gcs/README.md b/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md similarity index 99% rename from scripts/tdr/copy_from_tdr_to_gcs/README.md rename to scripts/tdr/copy_from_tdr_to_gcs_hca/README.md index 2b64aeb1..ff96eb3b 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs/README.md +++ b/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md @@ -7,7 +7,6 @@ Bobbie then translated to python using CoPilot.\ Set up: gcloud auth login - ## TODO - [ ] fix requirements.txt as needed - [ ] update the script to copy to staging /data bucket diff --git a/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py b/scripts/tdr/copy_from_tdr_to_gcs_hca/from_bash_copy_from_tdr_hca.py similarity index 62% rename from scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py rename to 
scripts/tdr/copy_from_tdr_to_gcs_hca/from_bash_copy_from_tdr_hca.py index ae277f58..61b8228e 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs/from_bash_copy_from_tdr.py +++ b/scripts/tdr/copy_from_tdr_to_gcs_hca/from_bash_copy_from_tdr_hca.py @@ -2,7 +2,6 @@ import sys import csv import logging -import json import re import requests import google.auth @@ -19,10 +18,12 @@ # TODO change prints to logging + def setup_cli_logging_format() -> None: logging.basicConfig(level=logging.INFO, format='%(message)s', stream=sys.stdout) -def validate_input(csv_path:str): + +def validate_input(csv_path: str): """ input should be a manifest csv of those projects that need data copied back format is , @@ -38,10 +39,11 @@ def validate_input(csv_path:str): else: return csv_path + def find_project_id_in_str(s: str) -> str: """ The selected function find_project_id_in_str(s: str) -> str: - is used to extract a UUID (Universally Unique Identifier) from a given string s. + is used to extract a valid UUID (Universally Unique Identifier) from a given string s. :param s: :return: Attribution: @@ -55,9 +57,11 @@ def find_project_id_in_str(s: str) -> str: return str(project_ids[0]) + def _sanitize_staging_gs_path(path: str) -> str: return path.strip().strip("/") + def _parse_csv(csv_path:str): """ Parses the csv file and returns a list of staging areas @@ -81,9 +85,9 @@ def _parse_csv(csv_path:str): project_ids.add(project_id) - staging_gs_path = None if institution not in STAGING_AREA_BUCKETS: - raise Exception(f"Unknown institution {institution} found") + raise Exception(f"Unknown institution {institution} found. " + f"Make sure the institution is in the list of staging area buckets and is in all caps") institution_bucket = STAGING_AREA_BUCKETS[institution] path = institution_bucket + "/" + project_id @@ -95,8 +99,6 @@ def _parse_csv(csv_path:str): staging_gs_paths.add(staging_gs_path) - # print(f"These are the parsed staging_gs_paths {staging_gs_paths}") - # print(f"These are the parsed project_ids {project_ids}") return staging_gs_paths, project_ids @@ -130,14 +132,12 @@ def _get_latest_snapshots(target_snapshots: set[str], access_token: str): headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'} ) snapshot_response.raise_for_status() - # with open(f"response_{snapshot_name}.json", 'w') as outfile: - # # not sure that I need to dump this? - not sure I need to write an output file at all actually - # json.dump(snapshot_response.json(), outfile) latest_snapshot_id = snapshot_response.json()['items'][0]['id'] latest_snapshots.append(latest_snapshot_id) return latest_snapshots -# then for each snapshot get access url and add to a list of access urls for that snapshot + +# for each snapshot get access url and add to a list of access urls for that snapshot def get_access_urls(latest_snapshot_ids: list[str], access_token: str): for snapshot in latest_snapshot_ids: files_response = requests.get( @@ -145,11 +145,8 @@ def get_access_urls(latest_snapshot_ids: list[str], access_token: str): headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'} ) files_response.raise_for_status() - # TODO - don't need? 
- # with open(f"response_{snapshot}.json", 'w') as outfile: - # json.dump(files_response.json(), outfile) - # Extract file details from the JSON file and append them to a text file + # Extract file details from the JSON file and add them to a list list_of_access_urls = [] data = files_response.json() for item in data: @@ -157,43 +154,28 @@ def get_access_urls(latest_snapshot_ids: list[str], access_token: str): return list_of_access_urls - # with open(f"response_{snapshot}.json", 'r') as json_file: - # data = json.load(json_file) - # with open("list_of_access_urls.txt", 'a') as outfile: - # for item in data: - # outfile.write(item['fileDetail']['accessUrl'] + '\n') - def copy_tdr_to_staging(access_urls: list[str], staging_gs_paths: set[str]): - # TODO this will need to be modified to work one by one - see nesting below for staging_dir in staging_gs_paths: staging_data_dir = staging_dir + '/data/' - # output = subprocess.run(['gcloud', 'storage', 'ls', staging_data_dir], capture_output=True, text=True) - # TODO FIX THIS - it's not actually stopping if it's not empty - and it won't be entirely empty... need another method - # if output.stdout.strip() != '': - # print(f"Staging area {staging_data_dir} is not empty") - # print(output.stdout.strip()) - # else: - # print(f"Staging area {staging_data_dir} is empty - copying files now") + logging.info(f'staging_data_dir is {staging_data_dir}') + # using gsutil as output is cleaner & faster + output = subprocess.run(['gsutil', 'ls', staging_data_dir], capture_output=True) + stdout = output.stdout.strip() + files = stdout.decode('utf-8').split('\n') + if len(files) > 1: + logging.error(f"Staging area {staging_data_dir} is not empty") + logging.info(f"files in staging area are: {files}") + continue + else: + logging.info(f"Staging area {staging_data_dir} is empty - copying files now") for access_url in access_urls: - print(f"Copying {access_url} to {staging_data_dir}") - subprocess.run(['gcloud', 'storage', 'cp', access_url, staging_data_dir]) - # HM... why is copy hard? does it need /. at the end? or is it the access url that's wrong? - -# # Read the list of files from list_of_filepaths.txt and copy them using gcloud storage cp -# we should make sure the staging/data dir is empty before running this -# with open("list_of_access_urls.txt", 'r') as file: -# access_urls = file.read().splitlines() - -# # TODO -# # copy command will look something like\ -# # gcloud storage cp gs://datarepo-4bcb4408-bucket/2e2aac27-3bf5-4a89-b466-e563cf99aef2/07a78be1-c75f-4463-a1a4-d4f7f9771ca5/SRR3562314_2.fastq.gz gs://broad-dsp-monster-hca-prod-ebi-storage/broad_test_dataset/07e5ebc0-1386-4a33-8ce4-3007705adad8/data/. 
-# # Also need to construct the staging/data gs:// path from the manifest.csv -# # "EBI": "gs://broad-dsp-monster-hca-prod-ebi-storage/prod", -# # "UCSC": "gs://broad-dsp-monster-hca-prod-ebi-storage/prod", -# # "LANTERN": "gs://broad-dsp-monster-hca-prod-lantern", -# # "LATTICE": "gs://broad-dsp-monster-hca-prod-lattice/staging", -# for access_url in access_urls: -# subprocess.run(['gcloud storage', 'cp', access_url, ""]) + try: + # strip the filename from the access url because gcp is not a file system - it's all objects + filename = access_url.split('/')[-1] + print(f"Copying {access_url} to {staging_data_dir}{filename}") + subprocess.run(['gcloud', 'storage', 'cp', access_url, staging_data_dir + filename]) + except Exception as e: + logging.error(f"Error copying {access_url} to {staging_data_dir}{filename}: {e}") def main(): @@ -205,28 +187,29 @@ def main(): """ setup_cli_logging_format() access_token = get_access_token() + + # read in the manifest and parse out the staging gs paths and project ids csv_path = sys.argv[1] validate_input(csv_path) staging_gs_paths = _parse_csv(csv_path)[0] - print(f"staging_gs_paths are {staging_gs_paths}") + logging.info(f"staging_gs_paths are {staging_gs_paths}") project_ids = _parse_csv(csv_path)[1] - print(f"project_ids are {project_ids}") + logging.info(f"project_ids are {project_ids}") + + # get the target snapshot ids, based on standard HCA ingest naming conventions target_snapshots = _get_target_snapshot_ids(project_ids) - print(f"target snapshot ids are {target_snapshots}") + logging.info(f"target snapshot ids are {target_snapshots}") + + # get the latest snapshot ids for each target snapshot latest_snapshot_ids = _get_latest_snapshots(target_snapshots, access_token) - print(f"latest_snapshots_ids are {latest_snapshot_ids}") + logging.info(f"latest_snapshots_ids are {latest_snapshot_ids}") + + # get the access urls for each file in the snapshot access_urls = get_access_urls(latest_snapshot_ids, access_token) print(f"access_urls are {access_urls}") - copy_tdr_to_staging(access_urls, staging_gs_paths) - # ultimately - # for each project_id in project_ids - # get the staging gs path - # get the snapshot name - # get the latest snapshot id - # get the access url for each file in the snapshot - # for each file in the snapshot - # copy the file from the access to the staging area + # copy the files from the TDR project bucket to the staging area bucket + copy_tdr_to_staging(access_urls, staging_gs_paths) if __name__ == '__main__': diff --git a/scripts/tdr/copy_from_tdr_to_gcs/get_snapshot_files_and_transfer.sh b/scripts/tdr/copy_from_tdr_to_gcs_hca/get_snapshot_files_and_transfer.sh similarity index 100% rename from scripts/tdr/copy_from_tdr_to_gcs/get_snapshot_files_and_transfer.sh rename to scripts/tdr/copy_from_tdr_to_gcs_hca/get_snapshot_files_and_transfer.sh diff --git a/scripts/tdr/copy_from_tdr_to_gcs/requirements.txt b/scripts/tdr/copy_from_tdr_to_gcs_hca/requirements.txt similarity index 100% rename from scripts/tdr/copy_from_tdr_to_gcs/requirements.txt rename to scripts/tdr/copy_from_tdr_to_gcs_hca/requirements.txt From a4d7f2388b402abc4e9f57683673fa628cca1b5d Mon Sep 17 00:00:00 2001 From: bahill Date: Tue, 13 Aug 2024 16:24:54 -0400 Subject: [PATCH 05/14] adding docker build and push GitHub actions for copy_from_tdr_to_gcs_hca --- ...h_docker_copy_from_tdr_to_gcs_hca_dev.yaml | 40 +++++++++++++++++ ..._docker_copy_from_tdr_to_gcs_hca_main.yaml | 43 +++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 
.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml create mode 100644 .github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_main.yaml diff --git a/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml new file mode 100644 index 00000000..cee6d26f --- /dev/null +++ b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml @@ -0,0 +1,40 @@ +name: Build and Publish Dev Images for scripts/tdr/copy_from_tdr_to_gcs_hca +on: + push: + branches-ignore: [main] + paths: + - scripts/copy_from_tdr_to_gcs_hca/** + - .github/workflows/** +env: + GCP_PROJECT_ID: dsp-fieldeng-dev + GCP_REPOSITORY: horsefish + GITHUB_SHA: ${{ github.sha }} + +jobs: + build-and-push-dev-images: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + + - name: Login to GCP + uses: google-github-actions/auth@v1 + with: + credentials_json: ${{ secrets.BASE64_SAKEY_DSPFIELDENG_GARPUSHER }} + + - name: Configure Docker to use the Google Artifact Registry + run: gcloud auth configure-docker us-east4-docker.pkg.dev + + - name: Build and Push General Docker Image + run: | + docker build -t us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA -f scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile scripts/copy_from_tdr_to_gcs_hca + docker push us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA + + - name: Set image tag to 'dev' + run: | + docker tag us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:dev + docker push us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:dev \ No newline at end of file diff --git a/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_main.yaml b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_main.yaml new file mode 100644 index 00000000..4c987247 --- /dev/null +++ b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_main.yaml @@ -0,0 +1,43 @@ +name: Build and Publish Latest Images for scripts/tdr/copy_from_tdr_to_gcs_hca +on: + pull_request_target: + types: + - closed + branches: + - main + paths: + - scripts/tdr/copy_from_tdr_to_gcs_hca/** + - .github/workflows/** +env: + GCP_PROJECT_ID: dsp-fieldeng-dev + GCP_REPOSITORY: horsefish + GITHUB_SHA: ${{ github.sha }} + +jobs: + build-and-push-dev-images: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + + - name: Login to GCP + uses: google-github-actions/auth@v1 + with: + credentials_json: ${{ secrets.BASE64_SAKEY_DSPFIELDENG_GARPUSHER }} + + - name: Configure Docker to use the Google Artifact Registry + run: gcloud auth configure-docker us-east4-docker.pkg.dev + + - name: Build and Push copy_from_tdr_to_gcs_hca Docker Image + run: | + docker build -t us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA -f scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile scripts/tdr/copy_from_tdr_to_gcs_hca + docker push us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA + + - name: Set image tag to 'latest' + run: | + docker tag 
us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:latest + docker push us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:latest \ No newline at end of file From cfc2cded38e60eb5a082f273f839580b28effb4f Mon Sep 17 00:00:00 2001 From: bahill Date: Tue, 13 Aug 2024 16:32:57 -0400 Subject: [PATCH 06/14] A little more code re-org an optimization to be sure that we never mix up lists --- ...tdr_hca.py => copy_from_tdr_to_gcs_hca.py} | 150 +++++++++--------- 1 file changed, 75 insertions(+), 75 deletions(-) rename scripts/tdr/copy_from_tdr_to_gcs_hca/{from_bash_copy_from_tdr_hca.py => copy_from_tdr_to_gcs_hca.py} (51%) diff --git a/scripts/tdr/copy_from_tdr_to_gcs_hca/from_bash_copy_from_tdr_hca.py b/scripts/tdr/copy_from_tdr_to_gcs_hca/copy_from_tdr_to_gcs_hca.py similarity index 51% rename from scripts/tdr/copy_from_tdr_to_gcs_hca/from_bash_copy_from_tdr_hca.py rename to scripts/tdr/copy_from_tdr_to_gcs_hca/copy_from_tdr_to_gcs_hca.py index 61b8228e..058c859c 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs_hca/from_bash_copy_from_tdr_hca.py +++ b/scripts/tdr/copy_from_tdr_to_gcs_hca/copy_from_tdr_to_gcs_hca.py @@ -1,3 +1,6 @@ +# Description: This script copies data from Terra Data Repository (TDR) HCA project buckets to HCA staging area buckets. +# It is based on the bash script get_snapshot_files_and_transfer.sh, written by Samantha Velasquez. + import os import sys import csv @@ -16,8 +19,6 @@ "TEST": "gs://broad-dsp-monster-hca-prod-ebi-storage/broad_test_dataset" } -# TODO change prints to logging - def setup_cli_logging_format() -> None: logging.basicConfig(level=logging.INFO, format='%(message)s', stream=sys.stdout) @@ -70,8 +71,7 @@ def _parse_csv(csv_path:str): Attribution: https://github.com/DataBiosphere/hca-ingest/blob/main/orchestration/hca_manage/manifest.py """ - staging_gs_paths = set() - project_ids = set() + tuple_list = [] with open(csv_path, "r") as f: reader = csv.reader(f) for row in reader: @@ -83,8 +83,6 @@ def _parse_csv(csv_path:str): institution = row[0] project_id = find_project_id_in_str(row[1]) - project_ids.add(project_id) - if institution not in STAGING_AREA_BUCKETS: raise Exception(f"Unknown institution {institution} found. 
" f"Make sure the institution is in the list of staging area buckets and is in all caps") @@ -97,25 +95,11 @@ def _parse_csv(csv_path:str): assert path.startswith("gs://"), "Staging area path must start with gs:// scheme" staging_gs_path = path - staging_gs_paths.add(staging_gs_path) - - return staging_gs_paths, project_ids + tuple_list.append((staging_gs_path, project_id)) - -def _get_target_snapshot_ids(project_ids: set[str]) -> set[str]: - """ - This function gets the target snapshot name filters for the given project ids - :param project_ids: - :return: - """ - target_snapshots = set() - for project in project_ids: - target_snapshot = f"hca_prod_{project.replace('-', '')}" - target_snapshots.add(target_snapshot) - return target_snapshots + return tuple_list -# TODO: make this work in a Docker image def get_access_token(): creds, project = google.auth.default() auth_req = google.auth.transport.requests.Request() @@ -124,40 +108,40 @@ def get_access_token(): return access_token -def _get_latest_snapshots(target_snapshots: set[str], access_token: str): - latest_snapshots = [] - for snapshot_name in target_snapshots: - snapshot_response = requests.get( - f'https://data.terra.bio/api/repository/v1/snapshots?offset=0&limit=10&sort=created_date&direction=desc&filter={snapshot_name}', - headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'} - ) - snapshot_response.raise_for_status() - latest_snapshot_id = snapshot_response.json()['items'][0]['id'] - latest_snapshots.append(latest_snapshot_id) - return latest_snapshots +def get_latest_snapshot(target_snapshot: str, access_token: str): + snapshot_response = requests.get( + f'https://data.terra.bio/api/repository/v1/snapshots?offset=0&limit=10&sort=created_date&direction=desc&filter=' + f'{target_snapshot}', + headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'} + ) + snapshot_response.raise_for_status() + latest_snapshot_id = snapshot_response.json()['items'][0]['id'] + return latest_snapshot_id # for each snapshot get access url and add to a list of access urls for that snapshot -def get_access_urls(latest_snapshot_ids: list[str], access_token: str): - for snapshot in latest_snapshot_ids: - files_response = requests.get( - f'https://data.terra.bio/api/repository/v1/snapshots/{snapshot}/files?offset=0', - headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'} - ) - files_response.raise_for_status() - - # Extract file details from the JSON file and add them to a list - list_of_access_urls = [] - data = files_response.json() - for item in data: - list_of_access_urls.append(item['fileDetail']['accessUrl']) - return list_of_access_urls - - -def copy_tdr_to_staging(access_urls: list[str], staging_gs_paths: set[str]): +def get_access_urls(snapshot: str, access_token: str): + list_of_access_urls = [] + # for snapshot in latest_snapshot_ids: + logging.info(f"getting access urls for snapshot {snapshot}") + files_response = requests.get( + f'https://data.terra.bio/api/repository/v1/snapshots/{snapshot}/files?offset=0', + headers={'accept': 'application/json', 'Authorization': f'Bearer {access_token}'} + ) + files_response.raise_for_status() + + # Extract file details from the JSON file and add them to a list + data = files_response.json() + for item in data: + list_of_access_urls.append(item['fileDetail']['accessUrl']) + return list_of_access_urls + + +def check_staging_is_empty(staging_gs_paths: set[str]): + nonempty_staging_areas = [] for staging_dir in staging_gs_paths: 
staging_data_dir = staging_dir + '/data/' - logging.info(f'staging_data_dir is {staging_data_dir}') + logging.info(f'checking contents of staging_data dir: {staging_data_dir}') # using gsutil as output is cleaner & faster output = subprocess.run(['gsutil', 'ls', staging_data_dir], capture_output=True) stdout = output.stdout.strip() @@ -165,17 +149,42 @@ def copy_tdr_to_staging(access_urls: list[str], staging_gs_paths: set[str]): if len(files) > 1: logging.error(f"Staging area {staging_data_dir} is not empty") logging.info(f"files in staging area are: {files}") - continue - else: - logging.info(f"Staging area {staging_data_dir} is empty - copying files now") + nonempty_staging_areas.append(staging_data_dir) + else: + logging.info(f"Staging area {staging_data_dir} is empty") + + if len(nonempty_staging_areas) > 0: + logging.error("One or more staging areas are not empty. Exiting.") + logging.info(f"Non-empty staging areas are: {nonempty_staging_areas}") + sys.exit(1) + + +def copy_tdr_to_staging(tuple_list: list[tuple[str, str]], access_token: str): + for project_id in set([x[1] for x in tuple_list]): + target_snapshot = f"hca_prod_{project_id.replace('-', '')}" + latest_snapshot_id = get_latest_snapshot(target_snapshot, access_token) + logging.info(f'latest snapshot id for project {project_id} is {latest_snapshot_id}') + access_urls = get_access_urls(latest_snapshot_id, access_token) + num_access_urls = len(access_urls) + staging_gs_path = [x[0] for x in tuple_list if x[1] == project_id][0] + staging_data_dir = staging_gs_path + '/data/' + logging.info(f'Copying {num_access_urls} files from snapshot {latest_snapshot_id} to staging area {staging_data_dir}') for access_url in access_urls: + # strip the filename from the access url because gcp is not a file system - it's all objects + filename = access_url.split('/')[-1] + logging.info(f'access_url for snapshot {latest_snapshot_id} is {access_url}') try: - # strip the filename from the access url because gcp is not a file system - it's all objects - filename = access_url.split('/')[-1] - print(f"Copying {access_url} to {staging_data_dir}{filename}") subprocess.run(['gcloud', 'storage', 'cp', access_url, staging_data_dir + filename]) except Exception as e: - logging.error(f"Error copying {access_url} to {staging_data_dir}{filename}: {e}") + logging.error(f'Error copying {access_url} to {staging_gs_path}{filename}: {e}') + continue + # visual summary of number of files copied + files_copied = subprocess.run(['gsutil', 'ls', staging_data_dir], + capture_output=True).stdout.decode('utf-8').split('\n') + # gsutil outputs the dir and a blank line, so we need to remove the blank line and the dir to count files + files_in_dir = [x.split('/')[-1] for x in files_copied if x and x.split('/')[-1]] + number_files_copied = len(files_in_dir) + logging.info(f'{number_files_copied} out of {num_access_urls} files copied to {staging_data_dir}') def main(): @@ -188,28 +197,19 @@ def main(): setup_cli_logging_format() access_token = get_access_token() - # read in the manifest and parse out the staging gs paths and project ids + # read in the manifest and get a tuple list of staging gs paths and project ids csv_path = sys.argv[1] validate_input(csv_path) - staging_gs_paths = _parse_csv(csv_path)[0] - logging.info(f"staging_gs_paths are {staging_gs_paths}") - project_ids = _parse_csv(csv_path)[1] - logging.info(f"project_ids are {project_ids}") - - # get the target snapshot ids, based on standard HCA ingest naming conventions - target_snapshots = 
_get_target_snapshot_ids(project_ids) - logging.info(f"target snapshot ids are {target_snapshots}") - - # get the latest snapshot ids for each target snapshot - latest_snapshot_ids = _get_latest_snapshots(target_snapshots, access_token) - logging.info(f"latest_snapshots_ids are {latest_snapshot_ids}") + tuple_list = _parse_csv(csv_path) + logging.info(f"staging_gs_paths and project id tuple list is {tuple_list}") - # get the access urls for each file in the snapshot - access_urls = get_access_urls(latest_snapshot_ids, access_token) - print(f"access_urls are {access_urls}") + # staging dir is the first element in each tuple + staging_gs_paths = set([x[0] for x in tuple_list]) + # check if the staging area is empty + check_staging_is_empty(staging_gs_paths) # copy the files from the TDR project bucket to the staging area bucket - copy_tdr_to_staging(access_urls, staging_gs_paths) + copy_tdr_to_staging(tuple_list, access_token) if __name__ == '__main__': From 25a5570290c04b7fd1b2689c4581f9f4b77055a7 Mon Sep 17 00:00:00 2001 From: bahill Date: Tue, 13 Aug 2024 16:33:22 -0400 Subject: [PATCH 07/14] update readme and add example file --- .../tdr/copy_from_tdr_to_gcs_hca/README.md | 48 +++++++++++++++---- .../dcpTEST_manifest.csv | 2 + 2 files changed, 41 insertions(+), 9 deletions(-) create mode 100644 scripts/tdr/copy_from_tdr_to_gcs_hca/dcpTEST_manifest.csv diff --git a/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md b/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md index ff96eb3b..5d1f7f7d 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md +++ b/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md @@ -2,16 +2,46 @@ This was originally a bash script written by Samantha Velasquez\ [get_snapshot_files_and_transfer.sh](get_snapshot_files_and_transfer.sh) \ which was written to copy files from a TDR snapshot to an Azure bucket.\ -Bobbie then translated to python using CoPilot.\ +Bobbie then translated to python using CoPilot and it ballooned from there.\ [copy_from_tdr_to_gcs.py](copy_from_tdr_to_gcs.py) \ +The bash script is now just here for posterity as it previously only lived in Slack. +It has not been tested in the Docker image created for the Python script. -Set up: -gcloud auth login -## TODO -- [ ] fix requirements.txt as needed -- [ ] update the script to copy to staging /data bucket -- [ ] update the script to take in a csv containing the institution & project UUID for HCA +## Running the Script +You will want to clone the whole horsefish repo, if you have not done so already.\ + +You will also need a manifest file to run the script.\ +The format of this manifest is identical to the one use for [HCA ingest](https://docs.google.com/document/d/1NQCDlvLgmkkveD4twX5KGv6SZUl8yaIBgz_l1EcrHdA/edit#heading=h.cg8d8o5kklql). 
+A sample manifest is provided in the project directory - dcpTEST_manifest.csv.\ +(Note that this is a test manifest and you will have to first load the data into TDR to use it - see the HCA ingest Ops manual linked above).\ +It's probably easiest to copy out the rows from the original ingest manifest into a new manifest, +then move that file into this project directory, so that it is picked up by compose.\ + +If you are not already logged in to gcloud/docker, you will need to do so before running the Docker compose command.\ +`gcloud auth application-default login` \ +`gcloud auth configure-docker us-east4-docker.pkg.dev` + +To start up the run/dev Docker compose env \ +`docker compose run app bash`\ +This will pull the latest image from Artifact Registry, start up the container, and mount the project dir, +so changes in your local project dir will be reflected in the container. + +Next you will want to authenticate with gcloud using your Broad credentials.\ +`gcloud auth login`\ +`gcloud config set project dsp-fieldeng-dev`* \ +`gcloud auth application-default login` \ +If you are not in dsp-fieldeng-dev +Then run the script using the following command syntax:\ +`python3 copy_from_tdr_to_gcs.py '` + +Contact Field Eng for any issues that arise. + + +## Possible improvements* - [ ] optional - update the script with conditional logic to accept a snapshot ID and destination instead - [ ] take care of any remaining TODOs in the script -- [ ] test/debug/update script -- [ ] add Dockerfile > push Docker image to artifact registry (check with Field Eng as to where to push) \ No newline at end of file +- [ ] update the script check lower case institution against lower case institution keys - see ~line 86 + +*this is likely to be used only rarely and mostly by the author, as a stop gap until partial updates have been implemented. +As such, we are attempting to keep this as light as possible, so as not to introduce unnecessary complexity. 
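For the lower-case institution check called out in the improvements list above, here is a minimal sketch of what that lookup could look like. It is only an illustration: the helper name is hypothetical, and the bucket mapping is the STAGING_AREA_BUCKETS dict already defined in the script.

```python
# Sketch only: case-insensitive institution lookup (hypothetical helper, not in the committed script).
def lookup_staging_bucket(institution: str, staging_area_buckets: dict[str, str]) -> str:
    """Return the staging bucket for an institution, ignoring case and surrounding whitespace."""
    buckets = {name.lower(): bucket for name, bucket in staging_area_buckets.items()}
    key = institution.strip().lower()
    if key not in buckets:
        raise Exception(f"Unknown institution {institution} found. "
                        f"Make sure the institution is in the list of staging area buckets")
    return buckets[key]

# Example, using the TEST entry from STAGING_AREA_BUCKETS:
# lookup_staging_bucket("test", STAGING_AREA_BUCKETS)
# -> "gs://broad-dsp-monster-hca-prod-ebi-storage/broad_test_dataset"
```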
+ diff --git a/scripts/tdr/copy_from_tdr_to_gcs_hca/dcpTEST_manifest.csv b/scripts/tdr/copy_from_tdr_to_gcs_hca/dcpTEST_manifest.csv new file mode 100644 index 00000000..fc2d672d --- /dev/null +++ b/scripts/tdr/copy_from_tdr_to_gcs_hca/dcpTEST_manifest.csv @@ -0,0 +1,2 @@ +TEST,003d5674-9bf6-4e51-ab1b-8fed80c308b9 +TEST,07e5ebc0-1386-4a33-8ce4-3007705adad8 \ No newline at end of file From df2dd223d7a7bd98d523ab036320d3e32895e607 Mon Sep 17 00:00:00 2001 From: bahill Date: Tue, 13 Aug 2024 16:33:48 -0400 Subject: [PATCH 08/14] add a Docker compose dev env --- .../tdr/copy_from_tdr_to_gcs_hca/Dockerfile | 49 +++++++++++++++++++ .../docker-compose.yaml | 10 ++++ 2 files changed, 59 insertions(+) create mode 100644 scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile create mode 100644 scripts/tdr/copy_from_tdr_to_gcs_hca/docker-compose.yaml diff --git a/scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile b/scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile new file mode 100644 index 00000000..932e5580 --- /dev/null +++ b/scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile @@ -0,0 +1,49 @@ +FROM us.gcr.io/broad-dsp-gcr-public/base/python:3.12-alpine + +ENV PATH /google-cloud-sdk/bin:$PATH +RUN if [ `uname -m` = 'x86_64' ]; then echo -n "x86_64" > /tmp/arch; else echo -n "arm" > /tmp/arch; fi; +RUN ARCH=`cat /tmp/arch` && apk --no-cache upgrade && apk --no-cache add \ + bash \ + curl \ + python3 \ + py3-crcmod \ + py3-openssl \ + bash \ + libc6-compat \ + openssh-client \ + git \ + gnupg \ + && curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-linux-x86_64.tar.gz && \ + tar xzf google-cloud-cli-linux-x86_64.tar.gz && \ + rm google-cloud-cli-linux-x86_64.tar.gz && \ + gcloud config set core/disable_usage_reporting true && \ + gcloud config set component_manager/disable_update_check true && \ + gcloud config set metrics/environment docker_image_alpine && \ + gcloud --version +RUN git config --system credential.'https://source.developers.google.com'.helper gcloud.sh +VOLUME ["/root/.config"] + +WORKDIR /scripts/tdr/copy_from_tdr_to_gcs_hca + +# copy the contents of /scripts/tdr/copy_from_tdr_to_gcs_hca to the WORKDIR +COPY * . + +RUN pip install -r requirements.txt + +ENV PYTHONPATH "/scripts:${PYTHONPATH}" +CMD ["/bin/bash"] + +# builds with GitHub Action "Main Validation and Release" ../.github/workflows/build-and-push_docker_main.yaml and ../.github/workflows/build-and-push_docker_dev.yaml +# tags = us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA, us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:latest + +# To manually build and run locally +# docker build -t us-east4-docker.pkg.dev/dsp-fieldeng-dev/horsefish/copy_from_tdr_to_gcs_hca: . 
+# docker run --rm -it us-east4-docker.pkg.dev/dsp-fieldeng-dev/horsefish/copy_from_tdr_to_gcs_hca: + +# to build and push to Artifact Registry +# make sure you are logged in to gcloud and that application default credentials are set +# gcloud auth login +# gcloud config set project dsp-fieldeng-dev +# gcloud auth application-default login +# set the before building and pushing +# docker push us-east4-docker.pkg.dev/dsp-fieldeng-dev/horsefish/copy_from_tdr_to_gcs_hca: \ No newline at end of file diff --git a/scripts/tdr/copy_from_tdr_to_gcs_hca/docker-compose.yaml b/scripts/tdr/copy_from_tdr_to_gcs_hca/docker-compose.yaml new file mode 100644 index 00000000..11951927 --- /dev/null +++ b/scripts/tdr/copy_from_tdr_to_gcs_hca/docker-compose.yaml @@ -0,0 +1,10 @@ +services: + app: + # for dev + # build: . + # or specify your dev sha or local image + image: us-east4-docker.pkg.dev/dsp-fieldeng-dev/horsefish/copy_from_tdr_to_gcs_hca:latest + container_name: copy_from_tdr_to_gcs_hca + command: bin/bash --reload + volumes: + - .:/scripts/tdr/copy_from_tdr_to_gcs_hca \ No newline at end of file From af9f6c67ee5aede7122802a1b258bed156d52ca5 Mon Sep 17 00:00:00 2001 From: bahill Date: Tue, 13 Aug 2024 16:41:00 -0400 Subject: [PATCH 09/14] fixed name of build step --- .../build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml index cee6d26f..1212ef9e 100644 --- a/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml +++ b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml @@ -29,7 +29,7 @@ jobs: - name: Configure Docker to use the Google Artifact Registry run: gcloud auth configure-docker us-east4-docker.pkg.dev - - name: Build and Push General Docker Image + - name: Build and Push copy_from_tdr_to_gcs_hca Docker Image run: | docker build -t us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA -f scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile scripts/copy_from_tdr_to_gcs_hca docker push us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA From aec4ef5e82ba9851d4a724339317d29c4b9c7826 Mon Sep 17 00:00:00 2001 From: bahill Date: Tue, 13 Aug 2024 16:41:26 -0400 Subject: [PATCH 10/14] got more specific about which file to watch --- .github/workflows/build_and_push_docker_gen_dev.yaml | 2 +- .github/workflows/build_and_push_docker_gen_main.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_push_docker_gen_dev.yaml b/.github/workflows/build_and_push_docker_gen_dev.yaml index 9f4d7ee2..3395052a 100644 --- a/.github/workflows/build_and_push_docker_gen_dev.yaml +++ b/.github/workflows/build_and_push_docker_gen_dev.yaml @@ -4,7 +4,7 @@ on: branches-ignore: [main] paths: - scripts/general/** - - .github/workflows/** + - .github/workflows/build_and_push_docker_gen_dev.yaml env: GCP_PROJECT_ID: dsp-fieldeng-dev GCP_REPOSITORY_GENERAL: horsefish diff --git a/.github/workflows/build_and_push_docker_gen_main.yaml b/.github/workflows/build_and_push_docker_gen_main.yaml index 151e6ef6..5de917a6 100644 --- a/.github/workflows/build_and_push_docker_gen_main.yaml +++ b/.github/workflows/build_and_push_docker_gen_main.yaml @@ -7,7 +7,7 @@ on: - main paths: - scripts/general/** - - .github/workflows/** + - 
.github/workflows/build_and_push_docker_gen_main.yaml env: GCP_PROJECT_ID: dsp-fieldeng-dev GCP_REPOSITORY_GENERAL: horsefish From c07af4a470d11b10d7d3c82dbb3d2cd0449db3da Mon Sep 17 00:00:00 2001 From: bahill Date: Tue, 13 Aug 2024 16:42:01 -0400 Subject: [PATCH 11/14] got more specific about which file to watch --- .../build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml | 2 +- .../build_and_push_docker_copy_from_tdr_to_gcs_hca_main.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml index 1212ef9e..714931e2 100644 --- a/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml +++ b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml @@ -4,7 +4,7 @@ on: branches-ignore: [main] paths: - scripts/copy_from_tdr_to_gcs_hca/** - - .github/workflows/** + - .github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml env: GCP_PROJECT_ID: dsp-fieldeng-dev GCP_REPOSITORY: horsefish diff --git a/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_main.yaml b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_main.yaml index 4c987247..fa0c95c3 100644 --- a/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_main.yaml +++ b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_main.yaml @@ -7,7 +7,7 @@ on: - main paths: - scripts/tdr/copy_from_tdr_to_gcs_hca/** - - .github/workflows/** + - .github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_main.yaml env: GCP_PROJECT_ID: dsp-fieldeng-dev GCP_REPOSITORY: horsefish From 69532e98b2140dea2c2625af656c21638c23e23c Mon Sep 17 00:00:00 2001 From: bahill Date: Tue, 13 Aug 2024 16:44:04 -0400 Subject: [PATCH 12/14] fix path for context --- ...h_docker_copy_from_tdr_to_gcs_hca_dev.yaml | 2 +- scripts/general/test.py | 22 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 scripts/general/test.py diff --git a/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml index 714931e2..eecad0eb 100644 --- a/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml +++ b/.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml @@ -31,7 +31,7 @@ jobs: - name: Build and Push copy_from_tdr_to_gcs_hca Docker Image run: | - docker build -t us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA -f scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile scripts/copy_from_tdr_to_gcs_hca + docker build -t us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA -f scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile scripts/tdr/copy_from_tdr_to_gcs_hca docker push us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA - name: Set image tag to 'dev' diff --git a/scripts/general/test.py b/scripts/general/test.py new file mode 100644 index 00000000..9cadcb2c --- /dev/null +++ b/scripts/general/test.py @@ -0,0 +1,22 @@ +# Usage: python test.py file1 file2 +# file1: file to loop through +# file2: file to search for matching string +# if no matching string found, print the string +# if found, do nothing +# output: print the string if no matching string found, print "found" if all strings are found + +import sys + +with open(sys.argv[1]) as 
f:
+    lines = [line.strip() for line in f.readlines()]
+
+with open(sys.argv[2], 'r') as f2:
+    data = f2.read()
+
+counter = 0
+for line in lines:
+    if line not in data:
+        print(f'Not Found: {line}')
+        counter += 1
+if counter == 0:
+    print("found")
\ No newline at end of file

From 4079e7f641dbd9a532cd9f7f1fd630b2618d8b52 Mon Sep 17 00:00:00 2001
From: bahill
Date: Tue, 13 Aug 2024 17:13:30 -0400
Subject: [PATCH 13/14] readme updates for accuracy and clarity

---
 scripts/tdr/copy_from_tdr_to_gcs_hca/README.md | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md b/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md
index 5d1f7f7d..5c6b6844 100644
--- a/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md
+++ b/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md
@@ -1,21 +1,25 @@
 # Copy from TDR to GCS
 This was originally a bash script written by Samantha Velasquez\
 [get_snapshot_files_and_transfer.sh](get_snapshot_files_and_transfer.sh) \
-which was written to copy files from a TDR snapshot to an Azure bucket.\
-Bobbie then translated to python using CoPilot and it ballooned from there.\
+which was written to copy files from a TDR snapshot to an Azure bucket. \
+Bobbie then translated to python using CoPilot and it ballooned from there. \
 [copy_from_tdr_to_gcs.py](copy_from_tdr_to_gcs.py) \
 The bash script is now just here for posterity as it previously only lived in Slack.
 It has not been tested in the Docker image created for the Python script.
 
 ## Running the Script
-You will want to clone the whole horsefish repo, if you have not done so already.\
+**IMPORTANT**\
+You will need to be in either the [Monster Group](https://groups.google.com/a/broadinstitute.org/g/monster)
+or the [Field Eng group](https://groups.google.com/a/broadinstitute.org/g/dsp-fieldeng) to run this script.
+
+You will want to clone the whole horsefish repo, if you have not done so already.
 You will also need a manifest file to run the script.\
 The format of this manifest is identical to the one use for [HCA ingest](https://docs.google.com/document/d/1NQCDlvLgmkkveD4twX5KGv6SZUl8yaIBgz_l1EcrHdA/edit#heading=h.cg8d8o5kklql).
 A sample manifest is provided in the project directory - dcpTEST_manifest.csv.\
 (Note that this is a test manifest and you will have to first load the data into TDR to use it - see the HCA ingest Ops manual linked above).\
 It's probably easiest to copy out the rows from the original ingest manifest into a new manifest,
-then move that file into this project directory, so that it is picked up by compose.\
+then move that file into this project directory, so that it is picked up by compose.
 
 If you are not already logged in to gcloud/docker, you will need to do so before running the Docker compose command.\
 `gcloud auth application-default login` \
@@ -32,9 +36,10 @@ Next you will want to authenticate with gcloud using your Broad credentials.\
 `gcloud auth application-default login` \
+If you are not in dsp-fieldeng-dev
 Then run the script using the following command syntax:\
-`python3 copy_from_tdr_to_gcs.py '`
+`python3 copy_from_tdr_to_gcs_hca.py '`
 
-Contact Field Eng for any issues that arise.
+Contact Field Eng for any issues that arise. 
\ +_*or the monster hca prod project - mystical-slate-284720_ ## Possible improvements* From d647b61331b79f9e871bf638c2e4131b40bc8c10 Mon Sep 17 00:00:00 2001 From: bahill Date: Wed, 14 Aug 2024 10:44:42 -0400 Subject: [PATCH 14/14] moved comments in Dockerfile to readme and updated readme with possible future improvements --- .../tdr/copy_from_tdr_to_gcs_hca/Dockerfile | 15 ----------- .../tdr/copy_from_tdr_to_gcs_hca/README.md | 25 ++++++++++++++++--- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile b/scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile index 932e5580..db91a1e2 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile +++ b/scripts/tdr/copy_from_tdr_to_gcs_hca/Dockerfile @@ -32,18 +32,3 @@ RUN pip install -r requirements.txt ENV PYTHONPATH "/scripts:${PYTHONPATH}" CMD ["/bin/bash"] - -# builds with GitHub Action "Main Validation and Release" ../.github/workflows/build-and-push_docker_main.yaml and ../.github/workflows/build-and-push_docker_dev.yaml -# tags = us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA, us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:latest - -# To manually build and run locally -# docker build -t us-east4-docker.pkg.dev/dsp-fieldeng-dev/horsefish/copy_from_tdr_to_gcs_hca: . -# docker run --rm -it us-east4-docker.pkg.dev/dsp-fieldeng-dev/horsefish/copy_from_tdr_to_gcs_hca: - -# to build and push to Artifact Registry -# make sure you are logged in to gcloud and that application default credentials are set -# gcloud auth login -# gcloud config set project dsp-fieldeng-dev -# gcloud auth application-default login -# set the before building and pushing -# docker push us-east4-docker.pkg.dev/dsp-fieldeng-dev/horsefish/copy_from_tdr_to_gcs_hca: \ No newline at end of file diff --git a/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md b/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md index 5c6b6844..eacbb524 100644 --- a/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md +++ b/scripts/tdr/copy_from_tdr_to_gcs_hca/README.md @@ -41,11 +41,30 @@ Then run the script using the following command syntax:\ Contact Field Eng for any issues that arise. 
\
_*or the monster hca prod project - mystical-slate-284720_

## Building the Docker Image
The image is built and pushed by the GitHub Actions workflows
../.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_main.yaml and
../.github/workflows/build_and_push_docker_copy_from_tdr_to_gcs_hca_dev.yaml. \
Tags: us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:$GITHUB_SHA and
us-east4-docker.pkg.dev/$GCP_PROJECT_ID/$GCP_REPOSITORY/copy_from_tdr_to_gcs_hca:latest

### To manually build and run locally
`docker build -t us-east4-docker.pkg.dev/dsp-fieldeng-dev/horsefish/copy_from_tdr_to_gcs_hca: .` \
`docker run --rm -it us-east4-docker.pkg.dev/dsp-fieldeng-dev/horsefish/copy_from_tdr_to_gcs_hca:`

### To build and push to Artifact Registry
- Make sure you are logged in to gcloud and that application default credentials are set: \
`gcloud auth login` \
`gcloud config set project dsp-fieldeng-dev` \
`gcloud auth application-default login`
- Set the image tag before building and pushing: \
`docker push us-east4-docker.pkg.dev/dsp-fieldeng-dev/horsefish/copy_from_tdr_to_gcs_hca:`

## Possible improvements*

- update the script with conditional logic to accept a snapshot ID and destination instead
- update the script to check the lower-cased institution against the lower-cased institution keys - see ~line 86 and the sketch below
- update the script to merge `validate_input()` and `_parse_csv()` into one function
- Consider adding a copy manifest to this command, so that instead of just validating the number of files copied (line 187), you can specifically highlight the files that were not copied successfully.

*this is likely to be used only rarely and mostly by the author, as a stop gap until partial updates have been implemented. As such, we are attempting to keep this as light as possible, so as not to introduce unnecessary complexity.
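For the case-insensitive institution lookup suggested in the improvements list, a minimal sketch of one way it could work is shown below. This is illustrative only: the helper name `resolve_staging_bucket` and the placeholder bucket values are hypothetical, not part of the existing script, which keeps its own mapping of institution names to staging-area buckets.

```python
# Minimal sketch (hypothetical helper, not the script's actual code):
# resolve the staging bucket for an institution while ignoring the casing
# used in the manifest's institution column.

def resolve_staging_bucket(buckets: dict, env: str, institution: str) -> str:
    """Return the staging bucket for an institution, matching keys case-insensitively."""
    # Lower-cased view of the keys so "test", "TEST", and "Test" all resolve the same way.
    by_lower = {name.lower(): path for name, path in buckets[env].items()}
    try:
        return by_lower[institution.strip().lower()]
    except KeyError:
        raise ValueError(f"Unknown institution {institution!r} for env {env!r}")


if __name__ == "__main__":
    # Placeholder values for illustration only.
    example_buckets = {"prod": {"TEST": "gs://example-test-staging-bucket/broad_test_dataset"}}
    print(resolve_staging_bucket(example_buckets, "prod", "test"))
```

The same lower-casing could be applied where the manifest rows are parsed, so the rest of the script never has to care how the institution column is capitalized.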