From 5c27460d6e128d49da0bf04855cd997f1bc042da Mon Sep 17 00:00:00 2001 From: Rocky Breslow <1774125+rbreslow@users.noreply.github.com> Date: Tue, 17 May 2022 11:54:55 -0400 Subject: [PATCH] Read subject ID mapping via Pandas API This attempts to fix a "Connection reset by peer" error we're receiving that may be a result of the call to io.BytesIO. I'd like to simplify things to use the native Pandas API for reading from S3 to see if the error goes away. Also, we're now reading these paths in from the environment. --- .env.sample | 2 ++ README.md | 2 ++ deployment/terraform/batch.tf | 3 +++ .../job-definitions/image-deid-etl.json.tmpl | 8 +++++++ deployment/terraform/variables.tf | 4 ++++ src/image_deid_etl/image_deid_etl/__main__.py | 10 +++++++-- .../image_deid_etl/main_pipeline.py | 21 +++++++++---------- src/image_deid_etl/requirements.txt | 3 ++- src/image_deid_etl/setup.cfg | 3 ++- 9 files changed, 41 insertions(+), 15 deletions(-) diff --git a/.env.sample b/.env.sample index 418957b..fafb8f9 100644 --- a/.env.sample +++ b/.env.sample @@ -4,3 +4,5 @@ FLYWHEEL_GROUP="" ORTHANC_CREDENTIALS="" ORTHANC_HOST="" ORTHANC_PORT=80 +PHI_DATA_BUCKET_NAME="phi-data-bucket" +SUBJECT_ID_MAPPING_PATH="s3://phi-data-bucket/subject_id_mapping.csv" diff --git a/README.md b/README.md index b44488b..f41bd7c 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,8 @@ Then, customize its contents with a text editor: - For `FLYWHEEL_GROUP`, specify either `d3b` or an alternative group created for testing (e.g., your name). - For `ORTHANC_CREDENTIALS`, use your Orthanc username and password specified like `username:password`. - For `ORTHANC_HOST`, specify the hostname (minus `http(s)://`) that you use to access Orthanc. +- For `PHI_DATA_BUCKET_NAME`, specify the bucket name where the ETL should back up NIfTI files. +- For `SUBJECT_ID_MAPPING_PATH`, specify the [path](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html) to the CSV file containing subject ID mappings. 
Next, run `update` to build the container image and initialize the database: diff --git a/deployment/terraform/batch.tf b/deployment/terraform/batch.tf index 468fb18..6760208 100644 --- a/deployment/terraform/batch.tf +++ b/deployment/terraform/batch.tf @@ -104,6 +104,9 @@ resource "aws_batch_job_definition" "default" { orthanc_host = var.orthanc_host orthanc_port = var.orthanc_port + phi_data_bucket_name = var.d3b_phi_data_bucket_name + subject_id_mapping_path = var.subject_id_mapping_path + image_deid_etl_log_level = var.image_deid_etl_log_level }) } diff --git a/deployment/terraform/job-definitions/image-deid-etl.json.tmpl b/deployment/terraform/job-definitions/image-deid-etl.json.tmpl index e1ee71a..185f297 100644 --- a/deployment/terraform/job-definitions/image-deid-etl.json.tmpl +++ b/deployment/terraform/job-definitions/image-deid-etl.json.tmpl @@ -27,6 +27,14 @@ "name": "ORTHANC_PORT", "value": "${orthanc_port}" }, + { + "name": "PHI_DATA_BUCKET_NAME", + "value": "${phi_data_bucket_name}" + }, + { + "name": "SUBJECT_ID_MAPPING_PATH", + "value": "${subject_id_mapping_path}" + }, { "name": "IMAGE_DEID_ETL_LOG_LEVEL", "value": "${image_deid_etl_log_level}" diff --git a/deployment/terraform/variables.tf b/deployment/terraform/variables.tf index 6b156e0..67dfbf7 100644 --- a/deployment/terraform/variables.tf +++ b/deployment/terraform/variables.tf @@ -274,6 +274,10 @@ variable "orthanc_port" { type = number } +variable "subject_id_mapping_path" { + type = string +} + variable "image_deid_etl_log_level" { type = string default = "INFO" diff --git a/src/image_deid_etl/image_deid_etl/__main__.py b/src/image_deid_etl/image_deid_etl/__main__.py index 0d86d0d..6a6808e 100644 --- a/src/image_deid_etl/image_deid_etl/__main__.py +++ b/src/image_deid_etl/image_deid_etl/__main__.py @@ -4,7 +4,6 @@ import os import sys import tempfile -import time import boto3 import flywheel @@ -31,6 +30,11 @@ "You must supply a valid Flywheel group in FLYWHEEL_GROUP." 
) +PHI_DATA_BUCKET_NAME = os.getenv("PHI_DATA_BUCKET_NAME") +if PHI_DATA_BUCKET_NAME is None: + raise ImproperlyConfigured( + "You must supply a valid S3 bucket in PHI_DATA_BUCKET_NAME." + ) # Configure Python's logging module. The Django project does a fantastic job explaining how logging works: # https://docs.djangoproject.com/en/4.0/topics/logging/ @@ -229,7 +233,9 @@ def add_fw_metadata(args) -> int: def s3_backup_niftis(args) -> int: local_path = f"{args.program}/{args.site}/" - s3_path = f"s3://d3b-phi-data-prd/imaging/radiology/{args.program}/{args.site}/" + s3_path = ( + f"s3://{PHI_DATA_BUCKET_NAME}/imaging/radiology/{args.program}/{args.site}/" + ) return os.system("aws s3 sync " + local_path + "NIfTIs/ " + s3_path + "NIfTIs/") diff --git a/src/image_deid_etl/image_deid_etl/main_pipeline.py b/src/image_deid_etl/image_deid_etl/main_pipeline.py index 78f6b13..473a471 100644 --- a/src/image_deid_etl/image_deid_etl/main_pipeline.py +++ b/src/image_deid_etl/image_deid_etl/main_pipeline.py @@ -1,9 +1,8 @@ -# NOTE: cbtn-all is HARD-CODED in cbtn_subject_info, will need to change this when ADAPT sets up routine CSV updating import logging -import boto3 -import pandas as pd -import io +import pandas + +from image_deid_etl.exceptions import ImproperlyConfigured from image_deid_etl.external_data_handling import * from image_deid_etl.images_no_save import * @@ -11,12 +10,12 @@ todays_date = datetime.now().strftime('%Y-%m-%d') -# point to cbtn-all CSV file from s3 using boto3 & default AWS profile -table_fn = 'cbtn-all_identified_2022-03-17.csv' -bucket_name = 'd3b-phi-data-prd' -obj_path = f'imaging/{table_fn}' -s3_client = boto3.client('s3') -obj = s3_client.get_object(Bucket=bucket_name, Key=obj_path) +SUBJECT_ID_MAPPING_PATH = os.getenv("SUBJECT_ID_MAPPING_PATH") +if SUBJECT_ID_MAPPING_PATH is None: + raise ImproperlyConfigured( + "You must supply a valid string path in SUBJECT_ID_MAPPING_PATH." 
+ ) + def subject_info(local_path, program, file_dir, validate=0): # site_name = local_path.split('/')[1] @@ -29,7 +28,7 @@ def subject_info(local_path, program, file_dir, validate=0): if program == 'cbtn': # get CBTN Subject IDs try: - cbtn_all_df = pd.read_csv(io.BytesIO(obj['Body'].read())) + cbtn_all_df = pandas.read_csv(SUBJECT_ID_MAPPING_PATH) except IndexError as error: logger.error("Missing CBTN subject ID .csv file from internal EIG database: %r", error) try: diff --git a/src/image_deid_etl/requirements.txt b/src/image_deid_etl/requirements.txt index b326088..2a04b16 100644 --- a/src/image_deid_etl/requirements.txt +++ b/src/image_deid_etl/requirements.txt @@ -1,3 +1,4 @@ +boto3==1.21.34 flywheel-sdk==15.8.0 nibabel==3.2.2 numpy==1.22.3 @@ -8,6 +9,6 @@ psycopg2==2.9.3 pydicom==2.2.2 python-magic==0.4.25 requests==2.27.1 +s3fs>=0.4.0 # Relaxing the version constraint to avoid botocore incompatibilities. sqlalchemy==1.4.32 urllib3==1.26.8 -boto3==1.21.34 diff --git a/src/image_deid_etl/setup.cfg b/src/image_deid_etl/setup.cfg index d1d8467..585ce0e 100644 --- a/src/image_deid_etl/setup.cfg +++ b/src/image_deid_etl/setup.cfg @@ -9,6 +9,7 @@ url = https://github.com/d3b-center/image-deid-etl packages = find: python_requires = >=3.9,<3.10 install_requires = + boto3 flywheel-sdk nibabel numpy @@ -19,9 +20,9 @@ install_requires = pydicom python-magic requests + s3fs SQLAlchemy urllib3 - boto3 [options.entry_points] console_scripts =