Skip to content

Commit

Permalink
Read subject ID mapping via Pandas API
Browse files Browse the repository at this point in the history
This attempts to fix a "Connection reset by peer" error we're receiving
that may be a result of the call to io.BytesIO. I'd like to simplify
things to use the native Pandas API for reading from S3 to see if the
error goes away. Also, we're now reading these paths in from the
environment.
  • Loading branch information
rbreslow committed May 17, 2022
1 parent 4b002fd commit 5c27460
Show file tree
Hide file tree
Showing 9 changed files with 41 additions and 15 deletions.
2 changes: 2 additions & 0 deletions .env.sample
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@ FLYWHEEL_GROUP=""
ORTHANC_CREDENTIALS=""
ORTHANC_HOST=""
ORTHANC_PORT=80
PHI_DATA_BUCKET_NAME="phi-data-bucket"
SUBJECT_ID_MAPPING_PATH="s3://phi-data-bucket/subject_id_mapping.csv"
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ Then, customize its contents with a text editor:
- For `FLYWHEEL_GROUP`, specify either `d3b` or an alternative group created for testing (e.g., your name).
- For `ORTHANC_CREDENTIALS`, use your Orthanc username and password specified like `username:password`.
- For `ORTHANC_HOST`, specify the hostname (minus `http(s)://`) that you use to access Orthanc.
- For `PHI_DATA_BUCKET_NAME`, specify the bucket name where the ETL should back up NIfTI files.
- For `SUBJECT_ID_MAPPING_PATH`, specify the [path](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html) to the CSV file containing subject ID mappings.

Next, run `update` to build the container image and initialize the database:

Expand Down
3 changes: 3 additions & 0 deletions deployment/terraform/batch.tf
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,9 @@ resource "aws_batch_job_definition" "default" {
orthanc_host = var.orthanc_host
orthanc_port = var.orthanc_port

phi_data_bucket_name = var.d3b_phi_data_bucket_name
subject_id_mapping_path = var.subject_id_mapping_path

image_deid_etl_log_level = var.image_deid_etl_log_level
})
}
8 changes: 8 additions & 0 deletions deployment/terraform/job-definitions/image-deid-etl.json.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,14 @@
"name": "ORTHANC_PORT",
"value": "${orthanc_port}"
},
{
"name": "PHI_DATA_BUCKET_NAME",
"value": "${phi_data_bucket_name}"
},
{
"name": "SUBJECT_ID_MAPPING_PATH",
"value": "${subject_id_mapping_path}"
},
{
"name": "IMAGE_DEID_ETL_LOG_LEVEL",
"value": "${image_deid_etl_log_level}"
Expand Down
4 changes: 4 additions & 0 deletions deployment/terraform/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,10 @@ variable "orthanc_port" {
type = number
}

variable "subject_id_mapping_path" {
type = string
}

variable "image_deid_etl_log_level" {
type = string
default = "INFO"
Expand Down
10 changes: 8 additions & 2 deletions src/image_deid_etl/image_deid_etl/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import os
import sys
import tempfile
import time

import boto3
import flywheel
Expand All @@ -31,6 +30,11 @@
"You must supply a valid Flywheel group in FLYWHEEL_GROUP."
)

PHI_DATA_BUCKET_NAME = os.getenv("PHI_DATA_BUCKET_NAME")
if PHI_DATA_BUCKET_NAME is None:
raise ImproperlyConfigured(
"You must supply a valid S3 bucket in PHI_DATA_BUCKET_NAME."
)

# Configure Python's logging module. The Django project does a fantastic job explaining how logging works:
# https://docs.djangoproject.com/en/4.0/topics/logging/
Expand Down Expand Up @@ -229,7 +233,9 @@ def add_fw_metadata(args) -> int:

def s3_backup_niftis(args) -> int:
local_path = f"{args.program}/{args.site}/"
s3_path = f"s3://d3b-phi-data-prd/imaging/radiology/{args.program}/{args.site}/"
s3_path = (
f"s3://{PHI_DATA_BUCKET_NAME}/imaging/radiology/{args.program}/{args.site}/"
)

return os.system("aws s3 sync " + local_path + "NIfTIs/ " + s3_path + "NIfTIs/")

Expand Down
21 changes: 10 additions & 11 deletions src/image_deid_etl/image_deid_etl/main_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,21 @@
# NOTE: cbtn-all is HARD-CODED in cbtn_subject_info, will need to change this when ADAPT sets up routine CSV updating
import logging
import boto3
import pandas as pd
import io

import pandas

from image_deid_etl.exceptions import ImproperlyConfigured
from image_deid_etl.external_data_handling import *
from image_deid_etl.images_no_save import *

logger = logging.getLogger(__name__)

todays_date = datetime.now().strftime('%Y-%m-%d')

# point to cbtn-all CSV file from s3 using boto3 & default AWS profile
table_fn = 'cbtn-all_identified_2022-03-17.csv'
bucket_name = 'd3b-phi-data-prd'
obj_path = f'imaging/{table_fn}'
s3_client = boto3.client('s3')
obj = s3_client.get_object(Bucket=bucket_name, Key=obj_path)
SUBJECT_ID_MAPPING_PATH = os.getenv("SUBJECT_ID_MAPPING_PATH")
if SUBJECT_ID_MAPPING_PATH is None:
raise ImproperlyConfigured(
"You must supply a valid string path in SUBJECT_ID_MAPPING_PATH."
)


def subject_info(local_path, program, file_dir, validate=0):
# site_name = local_path.split('/')[1]
Expand All @@ -29,7 +28,7 @@ def subject_info(local_path, program, file_dir, validate=0):
if program == 'cbtn':
# get CBTN Subject IDs
try:
cbtn_all_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
cbtn_all_df = pandas.read_csv(SUBJECT_ID_MAPPING_PATH)
except IndexError as error:
logger.error("Missing CBTN subject ID .csv file from internal EIG database: %r", error)
try:
Expand Down
3 changes: 2 additions & 1 deletion src/image_deid_etl/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
boto3==1.21.34
flywheel-sdk==15.8.0
nibabel==3.2.2
numpy==1.22.3
Expand All @@ -8,6 +9,6 @@ psycopg2==2.9.3
pydicom==2.2.2
python-magic==0.4.25
requests==2.27.1
s3fs>=0.4.0 # Relaxing the version constraint to avoid botocore incompatibilities.
sqlalchemy==1.4.32
urllib3==1.26.8
boto3==1.21.34
3 changes: 2 additions & 1 deletion src/image_deid_etl/setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ url = https://github.com/d3b-center/image-deid-etl
packages = find:
python_requires = >=3.9,<3.10
install_requires =
boto3
flywheel-sdk
nibabel
numpy
Expand All @@ -19,9 +20,9 @@ install_requires =
pydicom
python-magic
requests
s3fs
SQLAlchemy
urllib3
boto3

[options.entry_points]
console_scripts =
Expand Down

0 comments on commit 5c27460

Please sign in to comment.