diff --git a/.env.sample b/.env.sample index 418957b..fafb8f9 100644 --- a/.env.sample +++ b/.env.sample @@ -4,3 +4,5 @@ FLYWHEEL_GROUP="" ORTHANC_CREDENTIALS="" ORTHANC_HOST="" ORTHANC_PORT=80 +PHI_DATA_BUCKET_NAME="phi-data-bucket" +SUBJECT_ID_MAPPING_PATH="s3://phi-data-bucket/subject_id_mapping.csv" diff --git a/README.md b/README.md index b44488b..f41bd7c 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,8 @@ Then, customize its contents with a text editor: - For `FLYWHEEL_GROUP`, specify either `d3b` or an alternative group created for testing (e.g., your name). - For `ORTHANC_CREDENTIALS`, use your Orthanc username and password specified like `username:password`. - For `ORTHANC_HOST`, specify the hostname (minus `http(s)://`) that you use to access Orthanc. +- For `PHI_DATA_BUCKET_NAME`, specify the bucket name where the ETL should back up NIfTI files. +- For `SUBJECT_ID_MAPPING_PATH`, specify the [path](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html) to the CSV file containing subject ID mappings. 
Next, run `update` to build the container image and initialize the database: diff --git a/deployment/terraform/batch.tf b/deployment/terraform/batch.tf index 468fb18..6760208 100644 --- a/deployment/terraform/batch.tf +++ b/deployment/terraform/batch.tf @@ -104,6 +104,9 @@ resource "aws_batch_job_definition" "default" { orthanc_host = var.orthanc_host orthanc_port = var.orthanc_port + phi_data_bucket_name = var.d3b_phi_data_bucket_name + subject_id_mapping_path = var.subject_id_mapping_path + image_deid_etl_log_level = var.image_deid_etl_log_level }) } diff --git a/deployment/terraform/job-definitions/image-deid-etl.json.tmpl b/deployment/terraform/job-definitions/image-deid-etl.json.tmpl index e1ee71a..185f297 100644 --- a/deployment/terraform/job-definitions/image-deid-etl.json.tmpl +++ b/deployment/terraform/job-definitions/image-deid-etl.json.tmpl @@ -27,6 +27,14 @@ "name": "ORTHANC_PORT", "value": "${orthanc_port}" }, + { + "name": "PHI_DATA_BUCKET_NAME", + "value": "${phi_data_bucket_name}" + }, + { + "name": "SUBJECT_ID_MAPPING_PATH", + "value": "${subject_id_mapping_path}" + }, { "name": "IMAGE_DEID_ETL_LOG_LEVEL", "value": "${image_deid_etl_log_level}" diff --git a/deployment/terraform/variables.tf b/deployment/terraform/variables.tf index 6b156e0..67dfbf7 100644 --- a/deployment/terraform/variables.tf +++ b/deployment/terraform/variables.tf @@ -274,6 +274,10 @@ variable "orthanc_port" { type = number } +variable "subject_id_mapping_path" { + type = string +} + variable "image_deid_etl_log_level" { type = string default = "INFO" diff --git a/src/image_deid_etl/image_deid_etl/__main__.py b/src/image_deid_etl/image_deid_etl/__main__.py index 0d86d0d..6a6808e 100644 --- a/src/image_deid_etl/image_deid_etl/__main__.py +++ b/src/image_deid_etl/image_deid_etl/__main__.py @@ -4,7 +4,6 @@ import os import sys import tempfile -import time import boto3 import flywheel @@ -31,6 +30,11 @@ "You must supply a valid Flywheel group in FLYWHEEL_GROUP." 
) +PHI_DATA_BUCKET_NAME = os.getenv("PHI_DATA_BUCKET_NAME") +if PHI_DATA_BUCKET_NAME is None: + raise ImproperlyConfigured( + "You must supply a valid S3 bucket in PHI_DATA_BUCKET_NAME." + ) # Configure Python's logging module. The Django project does a fantastic job explaining how logging works: # https://docs.djangoproject.com/en/4.0/topics/logging/ @@ -229,7 +233,9 @@ def add_fw_metadata(args) -> int: def s3_backup_niftis(args) -> int: local_path = f"{args.program}/{args.site}/" - s3_path = f"s3://d3b-phi-data-prd/imaging/radiology/{args.program}/{args.site}/" + s3_path = ( + f"s3://{PHI_DATA_BUCKET_NAME}/imaging/radiology/{args.program}/{args.site}/" + ) return os.system("aws s3 sync " + local_path + "NIfTIs/ " + s3_path + "NIfTIs/") diff --git a/src/image_deid_etl/image_deid_etl/main_pipeline.py b/src/image_deid_etl/image_deid_etl/main_pipeline.py index 78f6b13..473a471 100644 --- a/src/image_deid_etl/image_deid_etl/main_pipeline.py +++ b/src/image_deid_etl/image_deid_etl/main_pipeline.py @@ -1,9 +1,8 @@ -# NOTE: cbtn-all is HARD-CODED in cbtn_subject_info, will need to change this when ADAPT sets up routine CSV updating import logging -import boto3 -import pandas as pd -import io +import pandas + +from image_deid_etl.exceptions import ImproperlyConfigured from image_deid_etl.external_data_handling import * from image_deid_etl.images_no_save import * @@ -11,12 +10,12 @@ todays_date = datetime.now().strftime('%Y-%m-%d') -# point to cbtn-all CSV file from s3 using boto3 & default AWS profile -table_fn = 'cbtn-all_identified_2022-03-17.csv' -bucket_name = 'd3b-phi-data-prd' -obj_path = f'imaging/{table_fn}' -s3_client = boto3.client('s3') -obj = s3_client.get_object(Bucket=bucket_name, Key=obj_path) +SUBJECT_ID_MAPPING_PATH = os.getenv("SUBJECT_ID_MAPPING_PATH") +if SUBJECT_ID_MAPPING_PATH is None: + raise ImproperlyConfigured( + "You must supply a valid string path in SUBJECT_ID_MAPPING_PATH." 
+ ) + def subject_info(local_path, program, file_dir, validate=0): # site_name = local_path.split('/')[1] @@ -29,7 +28,7 @@ def subject_info(local_path, program, file_dir, validate=0): if program == 'cbtn': # get CBTN Subject IDs try: - cbtn_all_df = pd.read_csv(io.BytesIO(obj['Body'].read())) + cbtn_all_df = pandas.read_csv(SUBJECT_ID_MAPPING_PATH) except IndexError as error: logger.error("Missing CBTN subject ID .csv file from internal EIG database: %r", error) try: diff --git a/src/image_deid_etl/requirements.txt b/src/image_deid_etl/requirements.txt index b326088..2a04b16 100644 --- a/src/image_deid_etl/requirements.txt +++ b/src/image_deid_etl/requirements.txt @@ -1,3 +1,4 @@ +boto3==1.21.34 flywheel-sdk==15.8.0 nibabel==3.2.2 numpy==1.22.3 @@ -8,6 +9,6 @@ psycopg2==2.9.3 pydicom==2.2.2 python-magic==0.4.25 requests==2.27.1 +s3fs>=0.4.0 # Relaxing the version constraint to avoid botocore incompatibilities. sqlalchemy==1.4.32 urllib3==1.26.8 -boto3==1.21.34 diff --git a/src/image_deid_etl/setup.cfg b/src/image_deid_etl/setup.cfg index d1d8467..585ce0e 100644 --- a/src/image_deid_etl/setup.cfg +++ b/src/image_deid_etl/setup.cfg @@ -9,6 +9,7 @@ url = https://github.com/d3b-center/image-deid-etl packages = find: python_requires = >=3.9,<3.10 install_requires = + boto3 flywheel-sdk nibabel numpy @@ -19,9 +20,9 @@ install_requires = pydicom python-magic requests + s3fs SQLAlchemy urllib3 - boto3 [options.entry_points] console_scripts =