From 4713d12867a31beeb2fbe025e9adf2c0b13e9789 Mon Sep 17 00:00:00 2001
From: tomcrane
Date: Tue, 12 Mar 2024 11:54:44 +0000
Subject: [PATCH] AwsChecksums

---
 .../Controllers/ImportController.cs           | 37 ++++++--
 LeedsExperiment/Preservation/AwsChecksum.cs   | 26 ++++++
 LeedsExperiment/Preservation/FedoraWrapper.cs | 86 ++++++++++---------
 .../Preservation/PreservationApiOptions.cs    |  3 +-
 4 files changed, 103 insertions(+), 49 deletions(-)
 create mode 100644 LeedsExperiment/Preservation/AwsChecksum.cs

diff --git a/LeedsExperiment/Preservation.API/Controllers/ImportController.cs b/LeedsExperiment/Preservation.API/Controllers/ImportController.cs
index 4fbb2fb..58bb918 100644
--- a/LeedsExperiment/Preservation.API/Controllers/ImportController.cs
+++ b/LeedsExperiment/Preservation.API/Controllers/ImportController.cs
@@ -329,16 +329,24 @@ private async Task GetImportSource(string source, Uri intendedPare
         // This will currently break if the source is not an s3 Uri to which we have access
         // but later could be a file path etc, a scratch upload location, whatever
         var s3Uri = new AmazonS3Uri(source);
-        // we assume this is the root. We also assume that we are not going to hit the AWS limit (1000?)
+
+        // FOR THIS DEMO we assume this is the root.
+        // We also assume that we are not going to hit the AWS limit for paging (1000?)
         // https://docs.aws.amazon.com/sdkfornet1/latest/apidocs/html/M_Amazon_S3_AmazonS3_ListObjects.htm
-        // ^^ for paging
         // We can't learn anything about containers this way other than that there are slugs in path
         // We can't learn anything about intended name (dc:title) from this, but that's OK for now
         // That kind of data should be in METS files; we can enhance the ImportJob with it later in a real world application
+        // The code that constructs the import job has access to more information than the code below.
+        // The files have been through a pipeline that will have produced checksums, content types and more, and put them in
+        // metadata such as METS that that code understands.
         var listObjectsReq = new ListObjectsV2Request()
         {
            BucketName = s3Uri.Bucket,
-           Prefix = $"{s3Uri.Key.TrimEnd('/')}/" //,
+           Prefix = $"{s3Uri.Key.TrimEnd('/')}/"
+
+           // The only valid values here are RestoreStatus or null, so this is no good
+           // https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/services/s3/model/OptionalObjectAttributes.html
+           // OptionalObjectAttributes = [ObjectAttributes.Checksum] //,
            // OptionalObjectAttributes = ["Content-Type"] - need to work out how to get content type back here
            // https://stackoverflow.com/a/44179929
            // application/x-directory
@@ -356,18 +364,29 @@ private async Task GetImportSource(string source, Uri intendedPare
                 continue;
             }

+            // how do you get the checksum here without making a further call?
+            // S3 source folders either need SHA-256 hashes in their AWS metadata (preferred for package-building)
             // or they are recorded in things like METS files - in a way that this code here can understand.
             // Different applications have their own logic for storing hashes as part of the object, e.g., in METS.
             // Unless coming from other information, we *require* that S3 source folders have sha256 hashes in their metadata
-            // so we don't have to do this:
+            // (how do we enforce that, we don't want to know about METS here)
-            // TODAY
-            var s3Stream = await s3Client!.GetObjectStreamAsync(obj.BucketName, obj.Key, null);
-            var sha256Digest = Checksum.Sha256FromStream(s3Stream);
-            // (and all our Fedora objects have sha-256)
+            // Get the SHA256 algorithm from AWS directly rather than compute it here
+            // If the S3 file does not already have the SHA-256 in metadata, then it's an error
+            string? sha256 = await AwsChecksum.GetHexChecksumAsync(s3Client, s3Uri.Bucket, obj.Key);
+            if (string.IsNullOrWhiteSpace(sha256))
+            {
+                throw new InvalidOperationException($"S3 Key at {obj.Key} does not have SHA256 Checksum in its attributes");
+            }
+
+
+            // so we don't have to do this:
+            // var s3Stream = await s3Client!.GetObjectStreamAsync(obj.BucketName, obj.Key, null);
+            // var sha256Digest = Checksum.Sha256FromStream(s3Stream);
+            // We can also do an eTag comparison for smaller files
             // We can also do a size comparison as a sanity check - this can't catch all changes obvs
             // but if source and current have same checksum but different sizes then something's up
@@ -386,7 +405,7 @@ private async Task GetImportSource(string source, Uri intendedPare
                 Path = sourcePath,
                 StorageType = StorageTypes.S3,
                 ExternalLocation = $"s3://{obj.BucketName}/{obj.Key}",
-                Digest = sha256Digest,
+                Digest = sha256,
                 ContentType = GetDefaultContentType(nameAndParentPath.Name) // we may overwrite this later, e.g., from PREMIS data
             });
         }
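
Note on the paging assumption above: ListObjectsV2 returns at most 1,000 keys per request, so a source folder larger than that would be silently truncated. A minimal sketch (not part of this change) of how the listing could page, reusing the same request object as the hunk above:

    // Sketch only: accumulate all keys by following continuation tokens.
    var allObjects = new List<S3Object>();
    ListObjectsV2Response listResponse;
    do
    {
        listResponse = await s3Client.ListObjectsV2Async(listObjectsReq);
        allObjects.AddRange(listResponse.S3Objects);
        // NextContinuationToken is only set while the listing is truncated
        listObjectsReq.ContinuationToken = listResponse.NextContinuationToken;
    } while (listResponse.IsTruncated == true);
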
diff --git a/LeedsExperiment/Preservation/AwsChecksum.cs b/LeedsExperiment/Preservation/AwsChecksum.cs
new file mode 100644
index 0000000..e597a4a
--- /dev/null
+++ b/LeedsExperiment/Preservation/AwsChecksum.cs
@@ -0,0 +1,26 @@
+using Amazon.S3;
+using Amazon.S3.Model;
+
+namespace Preservation;
+
+public class AwsChecksum
+{
+    public static async Task<string?> GetHexChecksumAsync(IAmazonS3 s3Client, string bucket, string key)
+    {
+        var objAttrsRequest = new GetObjectAttributesRequest()
+        {
+            BucketName = bucket,
+            Key = key,
+            ObjectAttributes = [ObjectAttributes.Checksum]
+        };
+        var objAttrsResponse = await s3Client!.GetObjectAttributesAsync(objAttrsRequest);
+        string? base64Sha256 = objAttrsResponse?.Checksum?.ChecksumSHA256;
+        if (!string.IsNullOrWhiteSpace(base64Sha256))
+        {
+            byte[] bytes = Convert.FromBase64String(base64Sha256);
+            return BitConverter.ToString(bytes).Replace("-", "").ToLowerInvariant();
+
+        }
+        return null;
+    }
+}
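
GetObjectAttributes only returns a ChecksumSHA256 value if the object was uploaded with a SHA-256 checksum requested (and the caller needs the s3:GetObjectAttributes permission). A sketch of what the upload side might look like so that GetHexChecksumAsync can succeed later; the bucket, key and path here are illustrative, not taken from this codebase:

    // Sketch only: ask S3 to compute and store a SHA-256 checksum at upload time.
    var localPath = "/tmp/file.tiff";              // illustrative
    var putReq = new PutObjectRequest
    {
        BucketName = "example-staging-bucket",     // illustrative
        Key = "package/objects/file.tiff",         // illustrative
        FilePath = localPath,
        ChecksumAlgorithm = ChecksumAlgorithm.SHA256
    };
    await s3Client.PutObjectAsync(putReq);

    // then, later:
    var hexSha256 = await AwsChecksum.GetHexChecksumAsync(
        s3Client, "example-staging-bucket", "package/objects/file.tiff");

One wrinkle, in the same spirit as the eTag comments elsewhere in this patch: for multipart uploads S3 by default stores a composite checksum-of-parts, which will not equal the SHA-256 of the whole file.
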
diff --git a/LeedsExperiment/Preservation/FedoraWrapper.cs b/LeedsExperiment/Preservation/FedoraWrapper.cs
index e3bf673..7087ca9 100644
--- a/LeedsExperiment/Preservation/FedoraWrapper.cs
+++ b/LeedsExperiment/Preservation/FedoraWrapper.cs
@@ -207,48 +207,56 @@ public async Task PutBinary(BinaryFile binaryFile, Transaction? transact
         return await PutOrPostBinary(HttpMethod.Put, binaryFile, transaction);
     }

-    private async void EnsureChecksum(BinaryFile binaryFile)
+    private async Task EnsureChecksum(BinaryFile binaryFile, bool validate)
     {
-        if(apiOptions.RequireDigestOnBinaryFile && string.IsNullOrWhiteSpace(binaryFile.Digest))
+        bool isMissing = string.IsNullOrWhiteSpace(binaryFile.Digest);
+        if (isMissing || validate)
         {
-            throw new InvalidOperationException($"Missing digest on incoming BinaryFile {binaryFile.Path}");
-        }
-        string? expected;
-        switch (binaryFile.StorageType)
-        {
-            case StorageTypes.FileSystem:
-                var fi = new FileInfo(binaryFile.ExternalLocation);
-                expected = Checksum.Sha256FromFile(fi);
-                break;
-            case StorageTypes.S3:
-                // TODO - get the SHA256 algorithm from AWS directly rather than compute it here
-                // If the S3 file does not already have the SHA-256 in metadata, then it's an error
-                // Our prep service
-                // GetObjectAttributesAsync
-                // Need to switch Fedora and OCFL to SHA256
-                // What does it mean if you switch the default algorithm in Fedora? It's used for OCFL...
-
-                var s3Uri = new AmazonS3Uri(binaryFile.ExternalLocation);
-
-                // This would be an efficient way of doing this - but with this naive implementation
-                // we're going to read the object twice
-                var s3Stream = await s3Client!.GetObjectStreamAsync(s3Uri.Bucket, s3Uri.Key, null);
-                expected = Checksum.Sha256FromStream(s3Stream);
-                // could get a byte array here and then pass it along eventually to MakeBinaryPutOrPost
-                // for now just read it twice.
-                // Later we'll get the sha256 checksum from metadata
-                // Or the MD5 from eTag?
-                // BEWARE that multipart uploads will not have the MD5 as the eTag.
-                break;
-            default:
-                throw new InvalidOperationException("Unkown storage type " + binaryFile.StorageType);
-        }
+            string? expected = null;
+            switch (binaryFile.StorageType)
+            {
+                case StorageTypes.FileSystem:
+                    if (isMissing && apiOptions.RequireDigestOnBinaryFileInfo)
+                    {
+                        throw new InvalidOperationException($"Missing digest on incoming BinaryFile FileInfo {binaryFile.Path}");
+                    }
+                    var fi = new FileInfo(binaryFile.ExternalLocation);
+                    expected = Checksum.Sha256FromFile(fi);
+                    break;
+                case StorageTypes.S3:
+                    if (isMissing && apiOptions.RequireDigestOnBinaryS3)
+                    {
+                        throw new InvalidOperationException($"Missing digest on incoming BinaryFile in S3 {binaryFile.Path}");
+                    }
+                    var s3Uri = new AmazonS3Uri(binaryFile.ExternalLocation);
+                    // Get the SHA256 algorithm from AWS directly rather than compute it here
+                    // If the S3 file does not already have the SHA-256 in metadata, then it's an error
+                    expected = await AwsChecksum.GetHexChecksumAsync(s3Client, s3Uri.Bucket, s3Uri.Key);
+                    if (string.IsNullOrWhiteSpace(expected))
+                    {
+                        throw new InvalidOperationException($"S3 Key at {s3Uri} does not have SHA256 Checksum in its attributes");
+                    }
-        if (binaryFile.Digest != null && binaryFile.Digest != expected)
-        {
-            throw new InvalidOperationException("Initial checksum doesn't match");
+                    // This would be an efficient way of doing this - but with this naive implementation
+                    // we're going to read the object twice
+                    // var s3Stream = await s3Client!.GetObjectStreamAsync(s3Uri.Bucket, s3Uri.Key, null);
+                    // expected = Checksum.Sha256FromStream(s3Stream);
+                    // could get a byte array here and then pass it along eventually to MakeBinaryPutOrPost
+                    // for now just read it twice.
+                    // Later we'll get the sha256 checksum from metadata
+                    // Or the MD5 from eTag?
+                    // BEWARE that multipart uploads will not have the MD5 as the eTag.
+                    break;
+                default:
+                    throw new InvalidOperationException("Unknown storage type " + binaryFile.StorageType);
+            }
+            // validation
+            if (!string.IsNullOrWhiteSpace(binaryFile.Digest) && binaryFile.Digest != expected)
+            {
+                throw new InvalidOperationException("Initial checksum doesn't match");
+            }
+            binaryFile.Digest = expected;
         }
-        binaryFile.Digest = expected;
     }

 private Uri GetFedoraUriWithinArchivalGroup(Uri archivalGroupUri, string path)
@@ -268,7 +276,7 @@ private Uri GetFedoraUriWithinArchivalGroup(Uri archivalGroupUri, string path)
     private async Task PutOrPostBinary(HttpMethod httpMethod, BinaryFile binaryFile, Transaction? transaction = null)
     {
         // verify that parent is a container first?
-        EnsureChecksum(binaryFile);
+        await EnsureChecksum(binaryFile, false);
         var fedoraLocation = GetFedoraUriWithinArchivalGroup(binaryFile.Parent, binaryFile.Path);
         var req = await MakeBinaryPutOrPost(httpMethod, fedoraLocation, binaryFile, transaction);
         var response = await httpClient.SendAsync(req);
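
The digest comparison in EnsureChecksum only works if both sides use the same encoding: AwsChecksum.GetHexChecksumAsync returns lowercase hex, so the incoming BinaryFile.Digest and Checksum.Sha256FromFile must produce the same shape. The Checksum class itself is not part of this patch; a sketch of the kind of helper the comparison assumes:

    // Sketch only: SHA-256 of a file as lowercase hex with no separators,
    // matching the output shape of AwsChecksum.GetHexChecksumAsync.
    using System.Security.Cryptography;

    public static string Sha256FromFile(FileInfo fi)
    {
        using var sha256 = SHA256.Create();
        using var stream = fi.OpenRead();
        byte[] hash = sha256.ComputeHash(stream);
        return Convert.ToHexString(hash).ToLowerInvariant();
    }
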
diff --git a/LeedsExperiment/Preservation/PreservationApiOptions.cs b/LeedsExperiment/Preservation/PreservationApiOptions.cs
index fc2b4e6..1a9c54e 100644
--- a/LeedsExperiment/Preservation/PreservationApiOptions.cs
+++ b/LeedsExperiment/Preservation/PreservationApiOptions.cs
@@ -5,5 +5,6 @@ public class PreservationApiOptions
     public required string Prefix { get; set; }
     public required string StagingBucket { get; set; }
     public required int StorageMapCacheTimeSeconds { get; set; } = 5;
-    public bool RequireDigestOnBinaryFile { get; set; } = true;
+    public bool RequireDigestOnBinaryFileInfo { get; set; } = false;
+    public bool RequireDigestOnBinaryS3 { get; set; } = true;
 }
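
With these defaults, a BinaryFile arriving without a digest now fails fast when it lives in S3 (RequireDigestOnBinaryS3 = true) but is tolerated for filesystem sources, where EnsureChecksum computes the digest from disk instead. A sketch of binding the options at startup; the section name "Preservation" is an assumption for illustration, not taken from this codebase:

    // Sketch only: typical IOptions binding in Program.cs (section name illustrative).
    builder.Services.Configure<PreservationApiOptions>(
        builder.Configuration.GetSection("Preservation"));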