From 5ca38d8f4ffce5e9871f7bb74942ab0d0b76c87e Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Wed, 13 Dec 2023 14:07:06 -0700
Subject: [PATCH] Download benchmark results

---
 docs/articles/benchmarking.rst    |  29 ++++
 docs/scripts/downloadBenchmark.py | 221 ++++++++++++++++++++++++++++
 docs/scripts/uploadTestFiles.py   | 229 ++++++++++++++++++++++++++++++
 3 files changed, 479 insertions(+)
 create mode 100644 docs/scripts/downloadBenchmark.py
 create mode 100644 docs/scripts/uploadTestFiles.py

diff --git a/docs/articles/benchmarking.rst b/docs/articles/benchmarking.rst
index b836b078c..87bc2dd2d 100644
--- a/docs/articles/benchmarking.rst
+++ b/docs/articles/benchmarking.rst
@@ -11,6 +11,35 @@ give us a way to measure the impact of changes to the client.

Results
===================

12/12/2023: Downloading files from Synapse
==========================================
These results were collected on a `t3a.micro` EC2 instance with a 200GB disk, running in us-east-1.
The scripts that were run can be found in `docs/scripts/downloadBenchmark.py` and `docs/scripts/uploadTestFiles.py`.

During this download test I tried various thread counts to see what performance looked like at
different levels. What I found was that going above the default thread count while downloading
large files (10GB and over) led to significantly unstable performance: the client would often
crash or hang during execution. As a result, the general recommendation (applied in the sketch
after this list) is as follows:

- For files over 1GB, use the default number of threads: `multiprocessing.cpu_count() + 4`
- For a large number of files of 1GB and under, 40-50 threads worked best
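
The following is a minimal sketch of how the thread count can be tuned to match these
recommendations. It assumes a client version where the `Synapse` object exposes a
`max_threads` attribute; treat the exact values as illustrative:

.. code-block:: python

    import multiprocessing

    import synapseclient

    syn = synapseclient.login()

    # Large files (over 1GB): stay at the default thread count.
    syn.max_threads = multiprocessing.cpu_count() + 4

    # Many small files (1GB and under): a higher count performed best in these tests.
    syn.max_threads = 40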

+---------------------------+--------------+-------------------+---------------------------+----------+---------------+
| Test                      | Thread Count | Synapseutils Sync | syn.getChildren + syn.get | S3 Sync  | Size per file |
+===========================+==============+===================+===========================+==========+===============+
| 25 Files 1MB total size   | 40           | 1.30s             | 5.48s                     | 1.49s    | 40KB          |
+---------------------------+--------------+-------------------+---------------------------+----------+---------------+
| 775 Files 10MB total size | 40           | 19.17s            | 161.46s                   | 12.02s   | 12.9KB        |
+---------------------------+--------------+-------------------+---------------------------+----------+---------------+
| 10 Files 1GB total size   | 40           | 14.74s            | 21.91s                    | 11.72s   | 100MB         |
+---------------------------+--------------+-------------------+---------------------------+----------+---------------+
| 10 Files 100GB total size | 6            | 3859.66s          | 2006.53s                  | 1023.57s | 10GB          |
+---------------------------+--------------+-------------------+---------------------------+----------+---------------+
| 10 Files 100GB total size | 40           | Wouldn't complete | Wouldn't complete         | N/A      | 10GB          |
+---------------------------+--------------+-------------------+---------------------------+----------+---------------+


12/06/2023: Uploading files to Synapse, Varying thread count, 5 annotations per file
====================================================================================

diff --git a/docs/scripts/downloadBenchmark.py b/docs/scripts/downloadBenchmark.py
new file mode 100644
index 000000000..a45bb674a
--- /dev/null
+++ b/docs/scripts/downloadBenchmark.py
@@ -0,0 +1,221 @@
"""
Handle running a few tests for benchmarking download times from Synapse and S3.

This tests 3 different methods of downloading files from Synapse and S3:

1. `synapseclient.Synapse.getChildren` - This method traverses the entire Synapse
   project and downloads all files and folders recursively.
2. `synapseutils.syncFromSynapse` - This uses the utility method to traverse the
   entire Synapse project and download all files and folders recursively.
3. `aws s3 sync` - This uses the AWS CLI to sync all files and folders from S3.
"""
import os
import shutil
import subprocess  # nosec
from time import perf_counter

import synapseclient
import synapseutils

S3_BUCKET = "s3://$FILL_ME_IN"
S3_PROFILE = "$FILL_ME_IN"

PROJECT_25_FILES_1MB = "download_benchmarking_25_files_1mb"
PROJECT_775_FILES_10MB = "download_benchmarking_775_files_10mb"
PROJECT_10_FILES_1GB = "download_benchmarking_10_files_1gb"
PROJECT_10_FILES_100GB = "download_benchmarking_10_files_100gb"


def execute_get_children_synapse_test(
    path: str, syn: synapseclient.Synapse, project_name: str
) -> None:
    """This test uses the `synapseclient.Synapse.getChildren` method to download files
    for the entire Synapse project. It creates each folder on disk and finds the
    folder's children recursively.

    :param path: The path to download to.
    :param syn: The logged-in synapse instance.
    :param project_name: The name of the project to download.
    """
    document_path = os.path.expanduser("~/")
    with open(
        os.path.join(document_path, "synapse_download_benchmarking.txt"), "a"
    ) as f:
        f.write("Started execute_get_children_synapse_test\n")
    before = perf_counter()
    parent_project_id = syn.store(synapseclient.Project(name=project_name)).id
    children_under_project = syn.getChildren(
        parent=parent_project_id, includeTypes=["file", "folder"]
    )

    def download_or_create_folder(
        entity: synapseclient.Entity, current_resolved_path: str
    ) -> None:
        is_folder = (
            "type" in entity
            and entity["type"] == "org.sagebionetworks.repo.model.Folder"
        )
        is_file = (
            "type" in entity
            and entity["type"] == "org.sagebionetworks.repo.model.FileEntity"
        )
        if is_folder:
            new_resolved_path = os.path.join(current_resolved_path, entity["name"])
            if not os.path.exists(new_resolved_path):
                os.mkdir(new_resolved_path)
            children_for_folder = syn.getChildren(
                parent=entity["id"], includeTypes=["file", "folder"]
            )

            for child_for_folder in children_for_folder:
                download_or_create_folder(
                    entity=child_for_folder,
                    current_resolved_path=new_resolved_path,
                )
        elif is_file:
            syn.get(
                entity=entity["id"],
                downloadFile=True,
                downloadLocation=os.path.join(current_resolved_path, entity["name"]),
            )

    for entity in children_under_project:
        download_or_create_folder(entity=entity, current_resolved_path=path)

    with open(
        os.path.join(document_path, "synapse_download_benchmarking.txt"), "a"
    ) as f:
        f.write(
            f"Time to execute_get_children_synapse_test: {perf_counter() - before}\n"
        )
    print(f"\nTime to execute_get_children_synapse_test: {perf_counter() - before}")


def execute_synapseutils_sync_from_synapse_test(
    path: str, syn: synapseclient.Synapse, project_name: str
) -> None:
    """Use the `synapseutils.syncFromSynapse` method to download files for the entire
    Synapse project.

    :param path: The path to download to.
    :param syn: The logged-in synapse instance.
    :param project_name: The name of the project to download.
    """
    document_path = os.path.expanduser("~/")
    with open(
        os.path.join(document_path, "synapse_download_benchmarking.txt"), "a"
    ) as f:
        f.write("\nStarted syncFromSynapse\n")
    before = perf_counter()
    project = syn.store(synapseclient.Project(name=project_name))
    synapseutils.syncFromSynapse(syn=syn, entity=project, path=path)

    with open(
        os.path.join(document_path, "synapse_download_benchmarking.txt"), "a"
    ) as f:
        f.write(f"\nTime to syncFromSynapse: {perf_counter() - before}\n")
    print(f"\nTime to syncFromSynapse: {perf_counter() - before}")


def execute_sync_from_s3(path: str, key_in_bucket: str) -> None:
    """Executes the AWS CLI sync command to download from S3.

    :param path: The path to the root directory to download to.
    :param key_in_bucket: The key prefix in the bucket to sync from.
    """
    document_path = os.path.expanduser("~/")
    with open(
        os.path.join(document_path, "synapse_download_benchmarking.txt"), "a"
    ) as f:
        f.write("\nStarted S3 Sync\n")

    time_before_sync = perf_counter()
    subprocess.run(  # nosec
        [
            "aws",
            "s3",
            "sync",
            f"{S3_BUCKET}/{key_in_bucket}",
            path,
            "--profile",
            S3_PROFILE,
        ]
    )

    with open(
        os.path.join(document_path, "synapse_download_benchmarking.txt"), "a"
    ) as f:
        f.write(f"\nTime to S3 sync: {perf_counter() - time_before_sync}\n")
    print(f"\nTime to S3 sync: {perf_counter() - time_before_sync}")


def execute_test_suite(
    path: str, project_name: str, syn: synapseclient.Synapse
) -> None:
    """Execute the test suite, deleting the local copy between each method.

    :param path: The path to download to.
    :param project_name: The name of the project to download.
    :param syn: The logged-in synapse instance.
    """
    execute_get_children_synapse_test(path=path, syn=syn, project_name=project_name)
    shutil.rmtree(path)

    execute_synapseutils_sync_from_synapse_test(
        path=path, project_name=project_name, syn=syn
    )
    shutil.rmtree(path)

    execute_sync_from_s3(path=path, key_in_bucket=project_name)
    shutil.rmtree(path)

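
# ---------------------------------------------------------------------------
# Script body: each suite below times the three download methods for one of
# the benchmark projects, appending results to
# ~/synapse_download_benchmarking.txt and re-creating the download directory
# between suites.
# ---------------------------------------------------------------------------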
"synapse_download_benchmarking.txt"), "a") as f: + f.write(f"\nStarted Benchmarking: 10 Files - 100GB\n") + f.close() +print("10 Files - 100GB") +### 10 Files - 100GB --------------------------------------------------------------------- +execute_test_suite(path=root_path, project_name=PROJECT_10_FILES_100GB, syn=synapse) diff --git a/docs/scripts/uploadTestFiles.py b/docs/scripts/uploadTestFiles.py new file mode 100644 index 000000000..326de29e3 --- /dev/null +++ b/docs/scripts/uploadTestFiles.py @@ -0,0 +1,229 @@ +""" +Create some test files and upload them to Synapse and S3. This is used as the first step +for benchmarking downloads. +""" +import os +import shutil +from synapseclient.entity import Project +import synapseclient +import synapseutils +import subprocess # nosec + +PARENT_PROJECT = "syn$FILL_ME_IN" +S3_BUCKET = "s3://$FILL_ME_IN" +S3_PROFILE = "$FILL_ME_IN" + +PROJECT_25_FILES_1MB = "download_benchmarking_25_files_1mb" +PROJECT_775_FILES_10MB = "download_benchmarking_775_files_10mb" +PROJECT_10_FILES_1GB = "download_benchmarking_10_files_1gb" +PROJECT_10_FILES_100GB = "download_benchmarking_10_files_100gb" + + +def create_folder_structure( + path: str, + depth_of_directory_tree: int, + num_sub_directories: int, + num_files_per_directory: int, + total_size_of_files_mbytes: int, +) -> None: + """Create a tree directory structure starting with `root/subdir`. + + Example: + Input: + depth_of_directory_tree = 1 + num_sub_directories = 1 + num_files_per_directory = 2 + Result: + root/subdir1/file1.txt + root/subdir1/file2.txt + + + Input: + depth_of_directory_tree = 1 + num_sub_directories = 2 + num_files_per_directory = 2 + Result: + root/subdir1/file1.txt + root/subdir1/file2.txt + root/subdir2/file1.txt + root/subdir2/file2.txt + + :param path: _description_ + :param depth_of_directory_tree: _description_ + :param num_sub_directories: _description_ + :param num_files_per_directory: _description_ + :param total_size_of_files_mbytes: _description_ + :return: _description_ + """ + # Calculate total number of files and size of each file + total_dirs = sum( + [num_sub_directories**i for i in range(1, depth_of_directory_tree + 1)] + ) + total_files = total_dirs * num_files_per_directory + total_size_of_files_bytes = total_size_of_files_mbytes * 1024 * 1024 + size_of_each_file_bytes = total_size_of_files_bytes // total_files + + print(f"total_directories: {total_dirs}") + print(f"total_files: {total_files}") + print(f"total_size_of_files_bytes: {total_size_of_files_bytes}") + print(f"size_of_each_file_bits: {size_of_each_file_bytes}") + + def create_files_in_current_dir(path_to_create_files): + for i in range(1, num_files_per_directory + 1): + chunk_size = 1024 # size of each chunk in bytes + num_chunks = size_of_each_file_bytes // chunk_size + + with open(f"{path_to_create_files}/file{i}.txt", "wb") as f: + for _ in range(num_chunks): + f.write(os.urandom(chunk_size)) + + def create_directories_in_current_dir(path_to_create_dirs, current_depth): + if current_depth < depth_of_directory_tree: + for i in range(1, num_sub_directories + 1): + path = f"{path_to_create_dirs}/subdir{i}" + os.makedirs(path, exist_ok=True) + create_files_in_current_dir(path) + new_depth = current_depth + 1 + create_directories_in_current_dir(path, new_depth) + + # Start creating directories and files + root_dir = os.path.join(path, "root") + os.makedirs(root_dir, exist_ok=True) + create_directories_in_current_dir(root_dir, 0) + return total_dirs, total_files, size_of_each_file_bytes + + +def 


def execute_synapseutils_sync_from_synapse_test(
    path: str, syn: synapseclient.Synapse
) -> None:
    """Round-trip check: pull PARENT_PROJECT down with `syncFromSynapse`, then push
    the files back up using the manifest that the download produces.

    :param path: The path to sync to and from.
    :param syn: The logged-in synapse instance.
    """
    result = synapseutils.syncFromSynapse(syn=syn, entity=PARENT_PROJECT, path=path)
    print(result)

    synapseutils.syncToSynapse(
        syn,
        manifestFile=f"{path}/SYNAPSE_METADATA_MANIFEST.tsv",
        sendMessages=False,
    )


def sync_to_synapse(path: str, project_id: str, syn: synapseclient.Synapse) -> None:
    """Execute the test that uses synapseutils to sync all files/folders to Synapse,
    attaching 5 annotations to each file.

    :param path: The path to the root directory.
    :param project_id: The ID of the Synapse project to sync to.
    :param syn: The logged-in synapse instance.
    """
    # Create an empty manifest file for generate_sync_manifest to populate
    manifest_path = f"{path}/benchmarking_manifest.tsv"
    with open(manifest_path, "w", encoding="utf-8") as f:
        pass

    synapseutils.generate_sync_manifest(
        syn,
        directory_path=path,
        parent_id=project_id,
        manifest_path=manifest_path,
    )

    # Write annotations to the manifest file -----------------------------------------
    # Open the `manifest_path` tab-delimited file and read its contents
    with open(manifest_path, "r") as file:
        lines = file.readlines()

    # Append 5 columns "annot1" through "annot5" to the header
    lines[0] = lines[0].strip() + "\tannot1\tannot2\tannot3\tannot4\tannot5\n"

    # Append the annotation values to each data row
    for i in range(1, len(lines)):
        lines[i] = lines[i].strip() + "\tvalue1\t1\t1.2\tFalse\t2020-01-01\n"

    # Write the modified contents back to the file
    with open(manifest_path, "w") as file:
        file.writelines(lines)
    # Finish writing annotations to the manifest file --------------------------------
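    # Each data row now carries the five annotation values, tab-separated, e.g.:
    # <path>\t<parent_id>\tvalue1\t1\t1.2\tFalse\t2020-01-01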

    synapseutils.syncToSynapse(
        syn,
        manifestFile=manifest_path,
        sendMessages=False,
    )


def execute_sync_to_s3(path: str, key_in_bucket: str) -> None:
    """Executes the AWS CLI sync command to upload to S3. Expected to run last for
    each project, as the local files are deleted afterwards.

    :param path: The path to the root directory.
    :param key_in_bucket: The key prefix in the bucket to sync to.
    """
    subprocess.run(  # nosec
        [
            "aws",
            "s3",
            "sync",
            path,
            f"{S3_BUCKET}/{key_in_bucket}",
            "--profile",
            S3_PROFILE,
        ]
    )


def set_up_projects_one_time(path: str, syn: synapseclient.Synapse) -> None:
    """Create the four benchmark projects in Synapse and mirror their files to S3.

    :param path: The path to create the test files under.
    :param syn: The logged-in synapse instance.
    """
    create_folder_structure(
        path=path,
        depth_of_directory_tree=1,
        num_sub_directories=5,
        num_files_per_directory=5,
        total_size_of_files_mbytes=1,
    )
    # Set up the project:
    project_25_files_1MB = syn.store(obj=Project(name=PROJECT_25_FILES_1MB))
    sync_to_synapse(path=path, syn=syn, project_id=project_25_files_1MB.id)
    os.remove(f"{path}/benchmarking_manifest.tsv")
    execute_sync_to_s3(path=path, key_in_bucket=PROJECT_25_FILES_1MB)
    shutil.rmtree(path)

    create_folder_structure(
        path=path,
        depth_of_directory_tree=3,
        num_sub_directories=5,
        num_files_per_directory=5,
        total_size_of_files_mbytes=10,
    )
    project_775_files_10MB = syn.store(obj=Project(name=PROJECT_775_FILES_10MB))
    sync_to_synapse(path=path, syn=syn, project_id=project_775_files_10MB.id)
    os.remove(f"{path}/benchmarking_manifest.tsv")
    execute_sync_to_s3(path=path, key_in_bucket=PROJECT_775_FILES_10MB)
    shutil.rmtree(path)

    create_folder_structure(
        path=path,
        depth_of_directory_tree=1,
        num_sub_directories=1,
        num_files_per_directory=10,
        total_size_of_files_mbytes=1000,
    )
    project_10_files_1GB = syn.store(obj=Project(name=PROJECT_10_FILES_1GB))
    sync_to_synapse(path=path, syn=syn, project_id=project_10_files_1GB.id)
    os.remove(f"{path}/benchmarking_manifest.tsv")
    execute_sync_to_s3(path=path, key_in_bucket=PROJECT_10_FILES_1GB)
    shutil.rmtree(path)

    create_folder_structure(
        path=path,
        depth_of_directory_tree=1,
        num_sub_directories=1,
        num_files_per_directory=10,
        total_size_of_files_mbytes=100000,
    )
    project_10_files_100GB = syn.store(obj=Project(name=PROJECT_10_FILES_100GB))
    sync_to_synapse(path=path, syn=syn, project_id=project_10_files_100GB.id)
    os.remove(f"{path}/benchmarking_manifest.tsv")
    execute_sync_to_s3(path=path, key_in_bucket=PROJECT_10_FILES_100GB)
    shutil.rmtree(path)


synapse = synapseclient.Synapse(debug=False)
root_path = os.path.expanduser("~/benchmarkingDownload")
# Log in with the `authToken` from ~/.synapseConfig
synapse.login()

set_up_projects_one_time(path=root_path, syn=synapse)