From 23eae106d98317822599affaecc9e0c3b6f2259a Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Mon, 20 Nov 2023 16:53:19 -0700
Subject: [PATCH 1/5] Adding in a comprehensive example

---
 docs/articles/data_storage.rst | 69 +++++++++++++++++++++++++++++++++-
 1 file changed, 68 insertions(+), 1 deletion(-)

diff --git a/docs/articles/data_storage.rst b/docs/articles/data_storage.rst
index f5e07cb27..d16d708fe 100644
--- a/docs/articles/data_storage.rst
+++ b/docs/articles/data_storage.rst
@@ -19,7 +19,7 @@ More information on this feature can be found
 The most common implementation of this is to configure a folder to store data in a user controlled AWS S3 bucket
 rather than Synapse's default internal S3 storage.
 
-The following illustrates creating a new folder backed by a user specified S3 bucket.
+The following illustrates creating a new folder backed by a user specified S3 bucket. Note: An existing folder also works.
 
 #. Ensure that the bucket is `properly configured
    `__.
@@ -30,6 +30,8 @@ The following illustrates creating a new folder backed by a user specified S3 bu
 
        # create a new folder to use with external S3 storage
        folder = syn.store(Folder(name=folder_name, parent=parent))
+       # You may also use an existing folder like:
+       # folder = syn.get("syn123")
        folder, storage_location, project_setting = syn.create_s3_storage_location(
            folder=folder,
            bucket_name='my-external-synapse-bucket',
@@ -256,6 +258,71 @@ Note that above the *force* parameter is necessary if running from a non-interac
 with a migration requires confirmation in the form of user prompt. If running programatically
 this parameter instead confirms your intention to proceed with the migration.
 
+Putting it all together
+ .. code-block::
+
+    import os
+    import synapseutils
+    import synapseclient
+
+    my_synapse_folder_to_migrate = "syn53013644"
+
+    external_bucket_name = "sc-237179673806-pp-ykwqcwr4uh2d2-s3bucket-x4gs5zpkj47k"
+    external_bucket_base_key = "my_external_synapse_folder/"
+
+    # a path on disk where this utility can create a sqlite database to store its index.
+    # nothing needs to exist at this path, but it must be a valid path on a volume with sufficient
+    # disk space to store a meta data listing of all the contents in the indexed entity.
+    # a rough rule of thumb is 100kB per 1000 entities indexed.
+    db_path = os.path.expanduser("~/synapseMigration/my.db")
+
+    syn = synapseclient.Synapse()
+
+    # Log in with the ~/.synapseConfig `authToken`
+    syn.login()
+
+    # The folder I want to migrate everything to this S3 storage location
+    folder = syn.get(my_synapse_folder_to_migrate)
+
+    folder, storage_location, project_setting = syn.create_s3_storage_location(
+        folder=folder,
+        bucket_name=external_bucket_name,
+        base_key=external_bucket_base_key,
+    )
+
+    # The id of the destination storage location being migrated to
+    storage_location_id = storage_location["storageLocationId"]
+    print(
+        f"Indexing: {folder.id} for migration to storage_id: {storage_location_id} at: {db_path}"
+    )
+
+    result = synapseutils.index_files_for_migration(
+        syn,
+        folder.id,
+        storage_location_id,
+        db_path,
+        file_version_strategy="all",
+    )
+
+    print(f"Indexing result: {result.get_counts_by_status()}")
+
+    print("Migrating files...")
+
+    result = synapseutils.migrate_indexed_files(
+        syn,
+        db_path,
+        force=True,
+    )
+
+    print(f"Migration result: {result.get_counts_by_status()}")
+
+The result of running this should look like:
+ .. 
code-block:: + + Indexing: syn123 for migration to storage_id: 11111 at: /home/user/synapseMigration/my.db + Indexing result: {'INDEXED': 100, 'MIGRATED': 0, 'ALREADY_MIGRATED': 0, 'ERRORED': 0} + Migrating files... + Migration result: {'INDEXED': 0, 'MIGRATED': 100, 'ALREADY_MIGRATED': 0, 'ERRORED': 0} Migrating from the command line ------------------------------- From d5c114442e978738ece026fddc7c18eb21f51d08 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 20 Nov 2023 16:59:29 -0700 Subject: [PATCH 2/5] Generic info --- docs/articles/data_storage.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/articles/data_storage.rst b/docs/articles/data_storage.rst index d16d708fe..6b7d471cb 100644 --- a/docs/articles/data_storage.rst +++ b/docs/articles/data_storage.rst @@ -265,10 +265,10 @@ Putting it all together import synapseutils import synapseclient - my_synapse_folder_to_migrate = "syn53013644" + my_synapse_folder_to_migrate = "syn123" - external_bucket_name = "sc-237179673806-pp-ykwqcwr4uh2d2-s3bucket-x4gs5zpkj47k" - external_bucket_base_key = "my_external_synapse_folder/" + external_bucket_name = "my-external-synapse-bucket" + external_bucket_base_key = "path/within/bucket/" # # a path on disk where this utility can create a sqlite database to store its index. # # nothing needs to exist at this path, but it must be a valid path on a volume with sufficient From c5c4f113b39d193f2487f640e6104030f219b508 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 21 Nov 2023 09:46:58 -0700 Subject: [PATCH 3/5] Moving content to group like topics --- docs/articles/data_storage.rst | 277 +++++++++++++++++++-------------- 1 file changed, 158 insertions(+), 119 deletions(-) diff --git a/docs/articles/data_storage.rst b/docs/articles/data_storage.rst index 6b7d471cb..22a715ec2 100644 --- a/docs/articles/data_storage.rst +++ b/docs/articles/data_storage.rst @@ -8,12 +8,17 @@ S3 Storage Features Synapse can use a variety of storage mechanisms to store content, however the most common storage solution is AWS S3. This article illustrates some special features that can be used with S3 storage -and how they interact with the Python client. +and how they interact with the Python client. In particular it covers: + +#. Linking External storage locations to new/existing projects or folders +#. Migration of existing projects or folders to new external storage locations +#. Creating STS enabled storage locations +#. Using SFTP External storage locations ========================== -Synapse folders can be configured to use custom implementations for their underlying data storage. +Synapse projects or folders can be configured to use custom implementations for their underlying data storage. More information on this feature can be found `here `__. The most common implementation of this is to configure a folder to store data in a user controlled AWS S3 bucket @@ -21,6 +26,11 @@ rather than Synapse's default internal S3 storage. The following illustrates creating a new folder backed by a user specified S3 bucket. Note: An existing folder also works. +If you are changing the storage location of an existing folder to a user specified S3 bucket none +of the files will be migrated. In order to migrate the files to the new storage location see the documentation +further down in this article labeled 'Migrating programmatically'. 
When you change the storage location
+for a folder, only NEW files uploaded to the folder are stored in the user specified S3 bucket.
+
 #. Ensure that the bucket is `properly configured
    `__.
 
@@ -41,6 +51,31 @@ The following illustrates creating a new folder backed by a user specified S3 bu
    # if needed the unique storage location identifier can be obtained e.g.
    storage_location_id = storage_location['storageLocationId']
 
+The following illustrates creating a new project backed by a user specified S3 bucket. Note: An existing project also works.
+
+If you are changing the storage location of an existing project to a user specified S3 bucket none
+of the files will be migrated. In order to migrate the files to the new storage location see the documentation
+further down in this article labeled 'Migrating programmatically'. When you change the storage location
+for a project, only NEW files uploaded to the project are stored in the user specified S3 bucket.
+
+#. Ensure that the bucket is `properly configured
+   `__.
+
+#. Create a project and configure it to use external S3 storage:
+
+   .. code-block::
+
+       # create a new project, or retrieve an existing one, to use with external S3 storage
+       project = syn.store(Project(name="my_project_name"))
+       project_storage, storage_location, project_setting = syn.create_s3_storage_location(
+           # Despite the KW argument name, this can be a project or folder
+           folder=project,
+           bucket_name='my-external-synapse-bucket',
+           base_key='path/within/bucket',
+       )
+
+       # if needed the unique storage location identifier can be obtained e.g.
+       storage_location_id = storage_location['storageLocationId']
 
 Once an external S3 storage folder exists, you can interact with it as you would any other folder using
 Synapse tools. If you wish to add an object that is stored within the bucket to Synapse you can do that by adding
@@ -72,115 +107,6 @@ a file handle for that object using the Python client and then storing the file
    file_entity = syn.store(file)
 
 
-.. _sts_storage_locations:
-
-STS Storage Locations
-=====================
-
-Create an STS enabled folder to use
-`AWS Security Token Service `__ credentials
-with S3 storage locations. These credentials can be scoped to access individual Synapse files or folders and can be used
-with external S3 tools such as the awscli and the boto3 library separately from Synapse to read and write files to and
-from Synapse storage. At this time read and write capabilities are supported for external storage locations, while default
-Synapse storage is limited to read only. Please read the linked documentation for a complete understanding of the capabilities
-and restrictions of STS enabled folders.
-
-Creating an STS enabled folder
-------------------------------
-Creating an STS enabled folder is similar to creating an external storage folder as described above, but this
-time passing an additional **sts_enabled=True** keyword parameter. The **bucket_name** and **base_key**
-parameters apply to external storage locations and can be omitted to use Synapse internal storage.
-Note also that STS can only be enabled on an empty folder.
-
- .. 
code-block:: - - # create a new folder to use with STS and external S3 storage - folder = syn.store(Folder(name=folder_name, parent=parent)) - folder, storage_location, project_setting = syn.create_s3_storage_location( - folder=folder, - bucket_name='my-external-synapse-bucket', - base_key='path/within/bucket', - sts_enabled=True, - ) - - -Using credentials with the awscli ---------------------------------- -This example illustrates obtaining STS credentials and using them with the awscli command line tool. -The first command outputs the credentials as shell commands to execute which will then be picked up -by subsequent aws cli commands. Note that the bucket-owner-full-control ACL is required when putting -an object via STS credentials. This ensures that the object ownership will be transferred to the -owner of the AWS bucket. - - .. code-block:: - - $ synapse get-sts-token -o shell syn123 read_write - - export SYNAPSE_STS_S3_LOCATION="s3://my-external-synapse-bucket/path/within/bucket" - export AWS_ACCESS_KEY_ID="" - export AWS_SECRET_ACCESS_KEY="" - export AWS_SESSION_TOKEN=" - - # if the above are executed in the shell, the awscli will automatically apply them - - # e.g. copy a file directly to the bucket using the exported credentials - $ aws s3 cp /path/to/local/file $SYNAPSE_STS_S3_LOCATION --acl bucket-owner-full-control - -Using credentials with boto3 in python --------------------------------------- -This example illustrates retrieving STS credentials and using them with boto3 within python code, -in this case to upload a file. Note that the bucket-owner-full-control ACL is required when putting -an object via STS credentials. This ensures that the object ownership will be transferred to the -owner of the AWS bucket. - - .. code-block:: - - # the boto output_format is compatible with the boto3 session api. - credentials = syn.get_sts_storage_token('syn123', 'read_write', output_format='boto') - - s3_client = boto3.client('s3', **credentials) - s3_client.upload_file( - Filename='/path/to/local/file, - Bucket='my-external-synapse-bucket', - Key='path/within/bucket/file', - ExtraArgs={'ACL': 'bucket-owner-full-control'}, - ) - -Automatic transfers to/from STS storage locations using boto3 with synapseclient --------------------------------------------------------------------------------- - -The Python Synapse client can be configured to automatically use STS tokens to perform uploads and downloads to enabled -storage locations using an installed boto3 library rather than through the traditional Synapse client APIs. -This can improve performance in certain situations, particularly uploads of large files, as the data transfer itself -can be conducted purely against the AWS S3 APIs, only invoking the Synapse APIs to retrieve the necessary token and -to update Synapse metadata in the case of an upload. Once configured to do so, retrieval of STS tokens for supported -operations occurs automatically without any change in synapseclient usage. - -To enable STS/boto3 transfers on all `get` and `store` operations, do the following: - -1. Ensure that boto3 is installed in the same Python installation as synapseclient. - - .. code-block:: - - pip install boto3 - -2. To enable automatic transfers on all uploads and downloads, update your Synapse client configuration file - (typically “.synapseConfig” in your $HOME directory, unless otherwise configured) with the [transfer] section, - if it is not already present. 
To leverage STS/boto3 transfers on a per Synapse client object basis, set - the **use_boto_sts_transfers** property. - - .. code-block:: - - # add to .synapseConfig to automatically apply as default for all synapse client instances - [transfer] - use_boto_sts=true - - # alternatively set on a per instance basis within python code - syn.use_boto_sts_transfers = True - -Note that if boto3 is not installed, then these settings will have no effect. - - Storage location migration ========================== @@ -194,6 +120,10 @@ The Synapse client has utilities for migrating entities to a new storage locatio the content locally and re-uploading it which can be slow, and may alter the meta data associated with the entities in undesirable ways. +During the migration it is reccomended that uploads and downloads are blocked to prevent possible conflicts +or race conditions. This can be done by setting permissions to `Can view` for the project or folder being migrated. +After the migration is complete set the permissions back to their original values. + Migrating programmatically -------------------------- @@ -202,7 +132,7 @@ Migrating a Synapse project or folder programatically is a two step process. First ensure that you know the id of the storage location you want to migrate to. More info on storage locations can be found above and `here `__. -Once the storage location is known, the first step to migrate an entity is create a migratable index +Once the storage location is known, the first step to migrate the project or folder is to create a migratable index of its contents using the `index_files_for_migration `__ function, e.g. @@ -265,7 +195,7 @@ Putting it all together import synapseutils import synapseclient - my_synapse_folder_to_migrate = "syn123" + my_synapse_project_or_folder_to_migrate = "syn123" external_bucket_name = "my-external-synapse-bucket" external_bucket_base_key = "path/within/bucket/" @@ -281,11 +211,12 @@ Putting it all together # # Log-in with ~.synapseConfig `authToken` syn.login() - # The folder I want to migrate everything to this S3 storage location - folder = syn.get(my_synapse_folder_to_migrate) + # The project or folder I want to migrate everything to this S3 storage location + project_or_folder = syn.get(my_synapse_project_or_folder_to_migrate) - folder, storage_location, project_setting = syn.create_s3_storage_location( - folder=folder, + project_or_folder, storage_location, project_setting = syn.create_s3_storage_location( + # Despite the KW argument name, this can be a project or folder + folder=project_or_folder, bucket_name=external_bucket_name, base_key=external_bucket_base_key, ) @@ -293,12 +224,12 @@ Putting it all together # The id of the destination storage location being migrated to storage_location_id = storage_location["storageLocationId"] print( - f"Indexing: {folder.id} for migration to storage_id: {storage_location_id} at: {db_path}" + f"Indexing: {project_or_folder.id} for migration to storage_id: {storage_location_id} at: {db_path}" ) result = synapseutils.index_files_for_migration( syn, - folder.id, + project_or_folder.id, storage_location_id, db_path, file_version_strategy="all", @@ -356,6 +287,114 @@ Sample output: Writing csv log to /tmp/migrate.csv +.. _sts_storage_locations: + +STS Storage Locations +===================== + +Create an STS enabled folder to use +`AWS Security Token Service `__ credentials +with S3 storage locations. 
These credentials can be scoped to access individual Synapse files or folders and can be used
+with external S3 tools such as the awscli and the boto3 library separately from Synapse to read and write files to and
+from Synapse storage. At this time read and write capabilities are supported for external storage locations, while default
+Synapse storage is limited to read only. Please read the linked documentation for a complete understanding of the capabilities
+and restrictions of STS enabled folders.
+
+Creating an STS enabled folder
+------------------------------
+Creating an STS enabled folder is similar to creating an external storage folder as described above, but this
+time passing an additional **sts_enabled=True** keyword parameter. The **bucket_name** and **base_key**
+parameters apply to external storage locations and can be omitted to use Synapse internal storage.
+Note also that STS can only be enabled on an empty folder.
+
+ .. code-block::
+
+    # create a new folder to use with STS and external S3 storage
+    folder = syn.store(Folder(name=folder_name, parent=parent))
+    folder, storage_location, project_setting = syn.create_s3_storage_location(
+        folder=folder,
+        bucket_name='my-external-synapse-bucket',
+        base_key='path/within/bucket',
+        sts_enabled=True,
+    )
+
+
+Using credentials with the awscli
+---------------------------------
+This example illustrates obtaining STS credentials and using them with the awscli command line tool.
+The first command outputs the credentials as shell commands to execute which will then be picked up
+by subsequent aws cli commands. Note that the bucket-owner-full-control ACL is required when putting
+an object via STS credentials. This ensures that the object ownership will be transferred to the
+owner of the AWS bucket.
+
+ .. code-block::
+
+    $ synapse get-sts-token -o shell syn123 read_write
+
+    export SYNAPSE_STS_S3_LOCATION="s3://my-external-synapse-bucket/path/within/bucket"
+    export AWS_ACCESS_KEY_ID=""
+    export AWS_SECRET_ACCESS_KEY=""
+    export AWS_SESSION_TOKEN=""
+
+    # if the above are executed in the shell, the awscli will automatically apply them
+
+    # e.g. copy a file directly to the bucket using the exported credentials
+    $ aws s3 cp /path/to/local/file $SYNAPSE_STS_S3_LOCATION --acl bucket-owner-full-control
+
+Using credentials with boto3 in python
+--------------------------------------
+This example illustrates retrieving STS credentials and using them with boto3 within python code,
+in this case to upload a file. Note that the bucket-owner-full-control ACL is required when putting
+an object via STS credentials. This ensures that the object ownership will be transferred to the
+owner of the AWS bucket.
+
+ .. code-block::
+
+    # the boto output_format is compatible with the boto3 session api.
+    credentials = syn.get_sts_storage_token('syn123', 'read_write', output_format='boto')
+
+    s3_client = boto3.client('s3', **credentials)
+    s3_client.upload_file(
+        Filename='/path/to/local/file',
+        Bucket='my-external-synapse-bucket',
+        Key='path/within/bucket/file',
+        ExtraArgs={'ACL': 'bucket-owner-full-control'},
+    )
+
+Automatic transfers to/from STS storage locations using boto3 with synapseclient
+--------------------------------------------------------------------------------
+
+The Python Synapse client can be configured to automatically use STS tokens to perform uploads and downloads to enabled
+storage locations using an installed boto3 library rather than through the traditional Synapse client APIs.
+This can improve performance in certain situations, particularly uploads of large files, as the data transfer itself +can be conducted purely against the AWS S3 APIs, only invoking the Synapse APIs to retrieve the necessary token and +to update Synapse metadata in the case of an upload. Once configured to do so, retrieval of STS tokens for supported +operations occurs automatically without any change in synapseclient usage. + +To enable STS/boto3 transfers on all `get` and `store` operations, do the following: + +1. Ensure that boto3 is installed in the same Python installation as synapseclient. + + .. code-block:: + + pip install boto3 + +2. To enable automatic transfers on all uploads and downloads, update your Synapse client configuration file + (typically “.synapseConfig” in your $HOME directory, unless otherwise configured) with the [transfer] section, + if it is not already present. To leverage STS/boto3 transfers on a per Synapse client object basis, set + the **use_boto_sts_transfers** property. + + .. code-block:: + + # add to .synapseConfig to automatically apply as default for all synapse client instances + [transfer] + use_boto_sts=true + + # alternatively set on a per instance basis within python code + syn.use_boto_sts_transfers = True + +Note that if boto3 is not installed, then these settings will have no effect. + ==== SFTP ==== From c628373155d71b04ddcf3967fedcc630ffcecbf5 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 21 Nov 2023 09:51:10 -0700 Subject: [PATCH 4/5] Formatting --- docs/articles/data_storage.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/articles/data_storage.rst b/docs/articles/data_storage.rst index 22a715ec2..079e0a813 100644 --- a/docs/articles/data_storage.rst +++ b/docs/articles/data_storage.rst @@ -188,7 +188,8 @@ Note that above the *force* parameter is necessary if running from a non-interac with a migration requires confirmation in the form of user prompt. If running programatically this parameter instead confirms your intention to proceed with the migration. -Putting it all together +Putting all the migration pieces together +----------------------------------------- .. code-block:: import os From 76412deabd33aa233e8b1e95696bb2980dec9818 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 21 Nov 2023 11:59:29 -0700 Subject: [PATCH 5/5] Add time for 100GB --- docs/articles/data_storage.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/articles/data_storage.rst b/docs/articles/data_storage.rst index 079e0a813..81709b7fb 100644 --- a/docs/articles/data_storage.rst +++ b/docs/articles/data_storage.rst @@ -124,6 +124,8 @@ During the migration it is reccomended that uploads and downloads are blocked to or race conditions. This can be done by setting permissions to `Can view` for the project or folder being migrated. After the migration is complete set the permissions back to their original values. +Expected time to migrate data is around 13 minutes per 100Gb as of 11/21/2023. + Migrating programmatically --------------------------
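
A quick way to sanity check a planned migration is to combine the two rules of thumb quoted above: roughly 100kB of index database per 1000 entities indexed, and roughly 13 minutes of migration time per 100 GB of data (as of 11/21/2023). The snippet below is an illustrative sketch only: the helper name and the example numbers are invented for illustration, and real throughput will vary with file sizes, network conditions, and the storage locations involved.

 .. code-block::

    # Rules of thumb taken from the documentation above (planning assumptions, not guarantees):
    #   * index database size: ~100 kB per 1,000 entities indexed
    #   * migration throughput: ~13 minutes per 100 GB (as of 11/21/2023)
    KB_PER_1000_ENTITIES = 100
    MINUTES_PER_100_GB = 13

    def estimate_migration(num_entities: int, total_gb: float) -> None:
        """Print rough planning estimates for an indexing + migration run."""
        index_size_mb = (num_entities / 1000) * KB_PER_1000_ENTITIES / 1000
        duration_minutes = (total_gb / 100) * MINUTES_PER_100_GB
        print(f"Estimated index database size: ~{index_size_mb:.1f} MB")
        print(f"Estimated migration time: ~{duration_minutes:.0f} minutes")

    # e.g. a project with 50,000 files totalling 2 TB (2048 GB)
    estimate_migration(num_entities=50_000, total_gb=2048)

Whatever the estimate says, checking `result.get_counts_by_status()` after `migrate_indexed_files` (as in the "Putting all the migration pieces together" example) remains the way to confirm that no files ended up in the ERRORED state.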