diff --git a/docs/articles/benchmarking.rst b/docs/articles/benchmarking.rst index b0768f3cb..b836b078c 100644 --- a/docs/articles/benchmarking.rst +++ b/docs/articles/benchmarking.rst @@ -12,11 +12,53 @@ Results =================== -11/14/2023 -========================== +12/06/2023: Uploading files to Synapse, Varying thread count, 5 annotations per file +==================================================================================== The results were created on a `t3a.micro` EC2 instance with a 200GB disk size running in us-east-1. The script that was run can be found in `docs/scripts`. The time to create the files on disk is not included. +This test includes adding 5 annotations to each file, a Text, Integer, Floating Point, Boolean, and Date. + +S3 was not benchmarked again. + +As a result of these tests the sweet spot for thread count is around 50 threads. It is not recommended to +go over 50 threads as it resulted in significant instability in the client. + ++---------------------------+--------------+-------------------+---------------------+---------------+ +| Test | Thread Count | Synapseutils Sync | os.walk + syn.store | Per file size | ++===========================+==============+===================+=====================+===============+ +| 25 Files 1MB total size | 6 | 10.75s | 10.96s | 40KB | ++---------------------------+--------------+-------------------+---------------------+---------------+ +| 25 Files 1MB total size | 25 | 6.79s | 11.31s | 40KB | ++---------------------------+--------------+-------------------+---------------------+---------------+ +| 25 Files 1MB total size | 50 | 6.05s | 10.90s | 40KB | ++---------------------------+--------------+-------------------+---------------------+---------------+ +| 25 Files 1MB total size | 100 | 6.14s | 10.89s | 40KB | ++---------------------------+--------------+-------------------+---------------------+---------------+ +| 775 Files 10MB total size | 6 | 268.33s | 298.12s | 12.9KB | 
++---------------------------+--------------+-------------------+---------------------+---------------+ +| 775 Files 10MB total size | 25 | 162.63s | 305.93s | 12.9KB | ++---------------------------+--------------+-------------------+---------------------+---------------+ +| 775 Files 10MB total size | 50 | 86.46s | 304.40s | 12.9KB | ++---------------------------+--------------+-------------------+---------------------+---------------+ +| 775 Files 10MB total size | 100 | 85.55s | 304.71s | 12.9KB | ++---------------------------+--------------+-------------------+---------------------+---------------+ +| 10 Files 1GB total size | 6 | 27.17s | 36.25s | 100MB | ++---------------------------+--------------+-------------------+---------------------+---------------+ +| 10 Files 1GB total size | 25 | 22.26s | 12.77s | 100MB | ++---------------------------+--------------+-------------------+---------------------+---------------+ +| 10 Files 1GB total size | 50 | 22.24s | 12.26s | 100MB | ++---------------------------+--------------+-------------------+---------------------+---------------+ +| 10 Files 1GB total size | 100 | Wouldn't complete | Wouldn't complete | 100MB | ++---------------------------+--------------+-------------------+---------------------+---------------+ + + +11/14/2023: Uploading files to Synapse, Default thread count +============================================================ +The results were created on a `t3a.micro` EC2 instance with a 200GB disk size running in us-east-1. +The script that was run can be found in `docs/scripts`. The time to create the files on disk is not included. 
+ +This test uses the default number of threads in the client: `multiprocessing.cpu_count() + 4` +---------------------------+-------------------+---------------------+---------+---------------+ | Test | Synapseutils Sync | os.walk + syn.store | S3 Sync | Per file size | diff --git a/docs/getting_started/configuration.rst b/docs/getting_started/configuration.rst new file mode 100644 index 000000000..1bffd5180 --- /dev/null +++ b/docs/getting_started/configuration.rst @@ -0,0 +1,42 @@ +************* +Configuration +************* + +The Synapse Python client can be configured either programmatically or by using a +configuration file. When installing the Synapse Python client, the :code:`.synapseConfig` +is added to your home directory. This configuration file is used to store a number of +configuration options, including your Synapse authtoken, cache, +and multi-threading settings. + +A full example :code:`.synapseConfig` can be found in the +`github repository `_. + +:code:`.synapseConfig` sections +=============================== + + +:code:`[authentication]` +======================== +See details on this section in the :doc:`credentials` document. + +:code:`[cache]` +=============== +Your downloaded files are cached to avoid repeat downloads of the same file. Change 'location' to use a different folder on your computer as the cache location. + +:code:`[endpoints]` +=================== +Configuring these will cause the Python client to use these as Synapse service endpoints instead of the default prod endpoints. + +:code:`[transfer]` +================== +Settings to configure how Synapse uploads/downloads data. + +The current recommended :code:`max_threads` is around 50. This was the best balance in +stability and performance. +See the results of our benchmarking :doc:`../articles/benchmarking`. 
+ +You may also set the :code:`max_threads` programmatically via:: + + import synapseclient + syn = synapseclient.login() + syn.max_threads = 50 diff --git a/docs/index.rst b/docs/index.rst index 4ff6e0045..65c7efa3d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -40,6 +40,7 @@ If you're just getting started with Synapse, have a look at the Getting Started getting_started/installation getting_started/credentials getting_started/basics + getting_started/configuration .. toctree:: :caption: Articles diff --git a/docs/scripts/benchmark.py b/docs/scripts/uploadBenchmark.py similarity index 86% rename from docs/scripts/benchmark.py rename to docs/scripts/uploadBenchmark.py index 3dc9d31ed..ad027ea19 100644 --- a/docs/scripts/benchmark.py +++ b/docs/scripts/uploadBenchmark.py @@ -2,11 +2,14 @@ Handle running a few tests for benchmark upload times to synapse and S3. This has the ability to create a directory and file structure, sync to synapse using synapseutils, sync to synapse using os.walk, and sync to S3 using the AWS CLI. + +For the Synapse tests we are also adding annotations to the uploaded files. 
""" import os import shutil from time import perf_counter from synapseclient.entity import File, Folder +from synapseclient.annotations import Annotations import synapseclient import synapseutils import subprocess # nosec @@ -151,6 +154,23 @@ def execute_synapseutils_test( f"\nTime to generate sync manifest: {perf_counter() - time_before_generate_sync_manifest}" ) + # Write annotations to the manifest file ----------------------------------------- + # Open the `manifest_path` tab-delimited file and read its contents + with open(manifest_path, "r") as file: + lines = file.readlines() + + # Append 3 columns "annot1", "annot2", "annot3" to the header + lines[0] = lines[0].strip() + "\tannot1\tannot2\tannot3\tannot4\tannot5\n" + + # Append the values to each line + for i in range(1, len(lines)): + lines[i] = lines[i].strip() + "\tvalue1\t1\t1.2\ttrue\t2020-01-01\n" + + # Write the modified contents back to the file + with open(manifest_path, "w") as file: + file.writelines(lines) + # Finish writing annotations to the manifest file -------------------------------- + time_before_syncToSynapse = perf_counter() synapseutils.syncToSynapse( syn, @@ -199,7 +219,24 @@ def execute_walk_test( path=filepath, parent=parents[directory_path], ) - saved_files.append(syn.store(file)) + saved_file = syn.store(file) + saved_files.append(saved_file) + + # Store annotations on the file ------------------------------------------ + syn.set_annotations( + annotations=Annotations( + id=saved_file.id, + etag=saved_file.etag, + **{ + "annot1": "value1", + "annot2": 1, + "annot3": 1.2, + "annot4": True, + "annot5": "2020-01-01", + }, + ) + ) + # Finish storing annotations on the file --------------------------------- print( f"\nTime to walk and sync tree: {perf_counter() - time_before_walking_tree}" )