From ee22f5aa715e0657720e929494eb642972cc7865 Mon Sep 17 00:00:00 2001
From: ppmarkus <60943040+ppmarkus@users.noreply.github.com>
Date: Tue, 28 Nov 2023 00:33:46 +0900
Subject: [PATCH] Add get_response_local_file endpoint to API and not use AWS
 S3 (#251)

* Add get_response_local_file endpoint to API and not use AWS S3

* Updated to remove the stand-alone "/api/v1/responses/{response_id}/localfile"
  route and the associated get_response_local_file API method. The behavior is
  now driven solely by config.

---------

Co-authored-by: Paul Markus
---
 .env.example                                  |  1 +
 dataherald/api/__init__.py                    |  1 +
 dataherald/api/fastapi.py                     | 20 +++++++++++++-------
 dataherald/config.py                          |  1 +
 .../sql_generator/create_sql_query_status.py  | 14 ++++++++++----
 docs/envars.rst                               |  4 ++++
 6 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/.env.example b/.env.example
index 2efd2ba8..80e4c800 100644
--- a/.env.example
+++ b/.env.example
@@ -39,3 +39,4 @@ MONGODB_DB_PASSWORD = 'admin'
 # The enncryption key is used to encrypt database connection info before storing in Mongo. Please refer to the README on how to set it.
 S3_AWS_ACCESS_KEY_ID=
 S3_AWS_SECRET_ACCESS_KEY=
+ONLY_STORE_CSV_FILES_LOCALLY = True # Set to True if you only want to save generated CSV files locally instead of S3. Note that if stored locally they should be treated as ephemeral, i.e., they will disappear when the engine is restarted.
diff --git a/dataherald/api/__init__.py b/dataherald/api/__init__.py
index 84140c46..5bc0c736 100644
--- a/dataherald/api/__init__.py
+++ b/dataherald/api/__init__.py
@@ -136,6 +136,7 @@ def get_response_file(
     ) -> FileResponse:
         pass
 
+    @abstractmethod
     def delete_golden_record(self, golden_record_id: str) -> dict:
         pass
 
diff --git a/dataherald/api/fastapi.py b/dataherald/api/fastapi.py
index 4fcfccf0..cedf9deb 100644
--- a/dataherald/api/fastapi.py
+++ b/dataherald/api/fastapi.py
@@ -52,6 +52,7 @@
     TableDescriptionRequest,
     UpdateInstruction,
 )
+from dataherald.config import Settings
 from dataherald.utils.s3 import S3
 
 logger = logging.getLogger(__name__)
@@ -408,14 +409,19 @@ def get_response_file(
             raise HTTPException(status_code=400, detail=str(e)) from e
 
         if not result:
-            raise HTTPException(
-                status_code=404, detail="Question, response, or db_connection not found"
-            )
-
-        s3 = S3()
+            raise HTTPException(status_code=404, detail="Question, response, or db_connection not found")
+
+        # Check if the file is to be returned from server (locally) or from S3
+        if Settings().only_store_csv_files_locally:
+            file_location = result.csv_file_path
+            # check if the file exists
+            if not os.path.exists(file_location):
+                raise HTTPException(status_code=404, detail="CSV file not found. Possibly deleted/removed from server.")
+        else:
+            s3 = S3()
 
-        file_location = s3.download(result.csv_file_path, db_connection.file_storage)
-        background_tasks.add_task(delete_file, file_location)
+            file_location = s3.download(result.csv_file_path, db_connection.file_storage)
+            background_tasks.add_task(delete_file, file_location)
 
         return FileResponse(
             file_location,
diff --git a/dataherald/config.py b/dataherald/config.py
index 2d094569..dec5c86d 100644
--- a/dataherald/config.py
+++ b/dataherald/config.py
@@ -50,6 +50,7 @@ class Settings(BaseSettings):
     encrypt_key: str = os.environ.get("ENCRYPT_KEY")
     s3_aws_access_key_id: str | None = os.environ.get("S3_AWS_ACCESS_KEY_ID")
     s3_aws_secret_access_key: str | None = os.environ.get("S3_AWS_SECRET_ACCESS_KEY")
+    only_store_csv_files_locally: str | None = os.environ.get("ONLY_STORE_CSV_FILES_LOCALLY")
 
     def require(self, key: str) -> Any:
         val = self[key]
diff --git a/dataherald/sql_generator/create_sql_query_status.py b/dataherald/sql_generator/create_sql_query_status.py
index 05b4bc6b..ea26c8ac 100644
--- a/dataherald/sql_generator/create_sql_query_status.py
+++ b/dataherald/sql_generator/create_sql_query_status.py
@@ -10,6 +10,9 @@
 from dataherald.types import Response, SQLQueryResult
 from dataherald.utils.s3 import S3
 
+from dataherald.config import Settings
+
+
 def format_error_message(response: Response, error_message: str) -> Response:
     # Remove the complete query
@@ -40,10 +43,13 @@ def create_csv_file(
         writer.writerow(rows[0].keys())
         for row in rows:
             writer.writerow(row.values())
-    s3 = S3()
-    response.csv_file_path = s3.upload(
-        file_location, database_connection.file_storage
-    )
+    if Settings().only_store_csv_files_locally:
+        response.csv_file_path = file_location
+    else:
+        s3 = S3()
+        response.csv_file_path = s3.upload(
+            file_location, database_connection.file_storage
+        )
 
     response.sql_query_result = SQLQueryResult(columns=columns, rows=rows)
diff --git a/docs/envars.rst b/docs/envars.rst
index 80f8b2d5..e42af4f4 100644
--- a/docs/envars.rst
+++ b/docs/envars.rst
@@ -34,6 +34,8 @@ provided in the .env.example file with the default values.
     S3_AWS_ACCESS_KEY_ID =
     S3_AWS_SECRET_ACCESS_KEY =
+
+    ONLY_STORE_CSV_FILES_LOCALLY =
 
     DH_ENGINE_TIMEOUT =
     UPPER_LIMIT_QUERY_RETURN_ROWS =
@@ -65,3 +67,5 @@ provided in the .env.example file with the default values.
    "S3_AWS_SECRET_ACCESS_KEY", "The key used to access credential files if saved to S3", "None", "No"
    "DH_ENGINE_TIMEOUT", "This is used to set the max seconds the process will wait for the response to be generate. If the specified time limit is exceeded, it will trigger an exception", "None", "No"
    "UPPER_LIMIT_QUERY_RETURN_ROWS", "The upper limit on number of rows returned from the query engine (equivalent to using LIMIT N in PostgreSQL/MySQL/SQlite).", "None", "No"
+   "ONLY_STORE_CSV_FILES_LOCALLY", "Set to True if you only want to save generated CSV files locally instead of S3. Note that if stored locally they should be treated as ephemeral, i.e., they will disappear when the engine is restarted.", "None", "No"
+
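
For reference, a minimal, self-contained sketch (Python) of the storage branch this
patch adds to create_csv_file(). The store_generated_csv helper, the tempfile
location, and the NotImplementedError stub are illustrative assumptions, not part of
the patch; only the env-var name and the local-vs-S3 branching mirror it. Note that
Settings reads the flag as str | None, so any non-empty value, even the string
"False", takes the local branch:

    import csv
    import os
    import tempfile


    def store_generated_csv(rows: list[dict]) -> str:
        """Write rows to a CSV and return the path a Response would record."""
        fd, file_location = tempfile.mkstemp(suffix=".csv")
        with os.fdopen(fd, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(rows[0].keys())
            for row in rows:
                writer.writerow(row.values())
        # The flag is read as a plain string, so any non-empty value
        # (even "False") enables the local-only branch.
        if os.environ.get("ONLY_STORE_CSV_FILES_LOCALLY"):
            # Local paths are ephemeral: the file disappears when the
            # engine restarts, as the .env.example comment warns.
            return file_location
        # Otherwise the engine would upload via S3.upload() and record the
        # S3 path instead; stubbed out here to stay self-contained.
        raise NotImplementedError("S3 upload not exercised in this sketch")


    if __name__ == "__main__":
        os.environ["ONLY_STORE_CSV_FILES_LOCALLY"] = "True"
        print(store_generated_csv([{"id": 1, "name": "alice"}]))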
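
Similarly, a sketch of the retrieval branch added to get_response_file(), shown as a
standalone FastAPI route under assumed names (the /example/csv path, CSV_DIR, and the
name parameter are hypothetical; the real endpoint resolves result.csv_file_path from
the stored response record):

    import os

    from fastapi import FastAPI, HTTPException
    from fastapi.responses import FileResponse

    app = FastAPI()

    CSV_DIR = "/tmp/dataherald_csvs"  # assumed local storage directory


    @app.get("/example/csv/{name}")
    def get_local_csv(name: str) -> FileResponse:
        # basename() guards against path traversal in this toy route.
        file_location = os.path.join(CSV_DIR, os.path.basename(name))
        if not os.path.exists(file_location):
            # Mirrors the patch: a local file may already be gone after an
            # engine restart, hence the explicit 404.
            raise HTTPException(
                status_code=404,
                detail="CSV file not found. Possibly deleted/removed from server.",
            )
        # Unlike the S3 branch, no background delete task is scheduled,
        # so the local copy remains available until the engine restarts.
        return FileResponse(file_location, media_type="text/csv")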