From 5c27460d6e128d49da0bf04855cd997f1bc042da Mon Sep 17 00:00:00 2001 From: Rocky Breslow <1774125+rbreslow@users.noreply.github.com> Date: Tue, 17 May 2022 11:54:55 -0400 Subject: [PATCH] Read subject ID mapping via Pandas API This attempts to fix a "Connection reset by peer" error we're receiving that may be a result of the call to io.BytesIO. I'd like to simplify things to use the native Pandas API for reading from S3 to see if the error goes away. Also, we're now reading these paths in from the environment. --- .env.sample | 2 ++ README.md | 2 ++ deployment/terraform/batch.tf | 3 +++ .../job-definitions/image-deid-etl.json.tmpl | 8 +++++++ deployment/terraform/variables.tf | 4 ++++ src/image_deid_etl/image_deid_etl/__main__.py | 10 +++++++-- .../image_deid_etl/main_pipeline.py | 21 +++++++++---------- src/image_deid_etl/requirements.txt | 3 ++- src/image_deid_etl/setup.cfg | 3 ++- 9 files changed, 41 insertions(+), 15 deletions(-) diff --git a/.env.sample b/.env.sample index 418957b..fafb8f9 100644 --- a/.env.sample +++ b/.env.sample @@ -4,3 +4,5 @@ FLYWHEEL_GROUP="" ORTHANC_CREDENTIALS="" ORTHANC_HOST="" ORTHANC_PORT=80 +PHI_DATA_BUCKET_NAME="phi-data-bucket" +SUBJECT_ID_MAPPING_PATH="s3://phi-data-bucket/subject_id_mapping.csv" diff --git a/README.md b/README.md index b44488b..f41bd7c 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,8 @@ Then, customize its contents with a text editor: - For `FLYWHEEL_GROUP`, specify either `d3b` or an alternative group created for testing (e.g., your name). - For `ORTHANC_CREDENTIALS`, use your Orthanc username and password specified like `username:password`. - For `ORTHANC_HOST`, specify the hostname (minus `http(s)://`) that you use to access Orthanc. +- For `PHI_DATA_BUCKET_NAME`, specify the bucket name where the ETL should back up NIfTI files. +- For `SUBJECT_ID_MAPPING_PATH`, specify the [path](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html) to the CSV file containing subject ID mappings. 
Next, run `update` to build the container image and initialize the database: diff --git a/deployment/terraform/batch.tf b/deployment/terraform/batch.tf index 468fb18..6760208 100644 --- a/deployment/terraform/batch.tf +++ b/deployment/terraform/batch.tf @@ -104,6 +104,9 @@ resource "aws_batch_job_definition" "default" { orthanc_host = var.orthanc_host orthanc_port = var.orthanc_port + phi_data_bucket_name = var.d3b_phi_data_bucket_name + subject_id_mapping_path = var.subject_id_mapping_path + image_deid_etl_log_level = var.image_deid_etl_log_level }) } diff --git a/deployment/terraform/job-definitions/image-deid-etl.json.tmpl b/deployment/terraform/job-definitions/image-deid-etl.json.tmpl index e1ee71a..185f297 100644 --- a/deployment/terraform/job-definitions/image-deid-etl.json.tmpl +++ b/deployment/terraform/job-definitions/image-deid-etl.json.tmpl @@ -27,6 +27,14 @@ "name": "ORTHANC_PORT", "value": "${orthanc_port}" }, + { + "name": "PHI_DATA_BUCKET_NAME", + "value": "${phi_data_bucket_name}" + }, + { + "name": "SUBJECT_ID_MAPPING_PATH", + "value": "${subject_id_mapping_path}" + }, { "name": "IMAGE_DEID_ETL_LOG_LEVEL", "value": "${image_deid_etl_log_level}" diff --git a/deployment/terraform/variables.tf b/deployment/terraform/variables.tf index 6b156e0..67dfbf7 100644 --- a/deployment/terraform/variables.tf +++ b/deployment/terraform/variables.tf @@ -274,6 +274,10 @@ variable "orthanc_port" { type = number } +variable "subject_id_mapping_path" { + type = string +} + variable "image_deid_etl_log_level" { type = string default = "INFO" diff --git a/src/image_deid_etl/image_deid_etl/__main__.py b/src/image_deid_etl/image_deid_etl/__main__.py index 0d86d0d..6a6808e 100644 --- a/src/image_deid_etl/image_deid_etl/__main__.py +++ b/src/image_deid_etl/image_deid_etl/__main__.py @@ -4,7 +4,6 @@ import os import sys import tempfile -import time import boto3 import flywheel @@ -31,6 +30,11 @@ "You must supply a valid Flywheel group in FLYWHEEL_GROUP." 
) +PHI_DATA_BUCKET_NAME = os.getenv("PHI_DATA_BUCKET_NAME") +if PHI_DATA_BUCKET_NAME is None: + raise ImproperlyConfigured( + "You must supply a valid S3 bucket in PHI_DATA_BUCKET_NAME." + ) # Configure Python's logging module. The Django project does a fantastic job explaining how logging works: # https://docs.djangoproject.com/en/4.0/topics/logging/ @@ -229,7 +233,9 @@ def add_fw_metadata(args) -> int: def s3_backup_niftis(args) -> int: local_path = f"{args.program}/{args.site}/" - s3_path = f"s3://d3b-phi-data-prd/imaging/radiology/{args.program}/{args.site}/" + s3_path = ( + f"s3://{PHI_DATA_BUCKET_NAME}/imaging/radiology/{args.program}/{args.site}/" + ) return os.system("aws s3 sync " + local_path + "NIfTIs/ " + s3_path + "NIfTIs/") diff --git a/src/image_deid_etl/image_deid_etl/main_pipeline.py b/src/image_deid_etl/image_deid_etl/main_pipeline.py index 78f6b13..473a471 100644 --- a/src/image_deid_etl/image_deid_etl/main_pipeline.py +++ b/src/image_deid_etl/image_deid_etl/main_pipeline.py @@ -1,9 +1,8 @@ -# NOTE: cbtn-all is HARD-CODED in cbtn_subject_info, will need to change this when ADAPT sets up routine CSV updating import logging -import boto3 -import pandas as pd -import io +import pandas + +from image_deid_etl.exceptions import ImproperlyConfigured from image_deid_etl.external_data_handling import * from image_deid_etl.images_no_save import * @@ -11,12 +10,12 @@ todays_date = datetime.now().strftime('%Y-%m-%d') -# point to cbtn-all CSV file from s3 using boto3 & default AWS profile -table_fn = 'cbtn-all_identified_2022-03-17.csv' -bucket_name = 'd3b-phi-data-prd' -obj_path = f'imaging/{table_fn}' -s3_client = boto3.client('s3') -obj = s3_client.get_object(Bucket=bucket_name, Key=obj_path) +SUBJECT_ID_MAPPING_PATH = os.getenv("SUBJECT_ID_MAPPING_PATH") +if SUBJECT_ID_MAPPING_PATH is None: + raise ImproperlyConfigured( + "You must supply a valid string path in SUBJECT_ID_MAPPING_PATH." 
+ ) + def subject_info(local_path, program, file_dir, validate=0): # site_name = local_path.split('/')[1] @@ -29,7 +28,7 @@ def subject_info(local_path, program, file_dir, validate=0): if program == 'cbtn': # get CBTN Subject IDs try: - cbtn_all_df = pd.read_csv(io.BytesIO(obj['Body'].read())) + cbtn_all_df = pandas.read_csv(SUBJECT_ID_MAPPING_PATH) except IndexError as error: logger.error("Missing CBTN subject ID .csv file from internal EIG database: %r", error) try: diff --git a/src/image_deid_etl/requirements.txt b/src/image_deid_etl/requirements.txt index b326088..2a04b16 100644 --- a/src/image_deid_etl/requirements.txt +++ b/src/image_deid_etl/requirements.txt @@ -1,3 +1,4 @@ +boto3==1.21.34 flywheel-sdk==15.8.0 nibabel==3.2.2 numpy==1.22.3 @@ -8,6 +9,6 @@ psycopg2==2.9.3 pydicom==2.2.2 python-magic==0.4.25 requests==2.27.1 +s3fs>=0.4.0 # Relaxing the version constraint to avoid botocore incompatibilities. sqlalchemy==1.4.32 urllib3==1.26.8 -boto3==1.21.34 diff --git a/src/image_deid_etl/setup.cfg b/src/image_deid_etl/setup.cfg index d1d8467..585ce0e 100644 --- a/src/image_deid_etl/setup.cfg +++ b/src/image_deid_etl/setup.cfg @@ -9,6 +9,7 @@ url = https://github.com/d3b-center/image-deid-etl packages = find: python_requires = >=3.9,<3.10 install_requires = + boto3 flywheel-sdk nibabel numpy @@ -19,9 +20,9 @@ install_requires = pydicom python-magic requests + s3fs SQLAlchemy urllib3 - boto3 [options.entry_points] console_scripts =