
Commit

AwsChecksums
tomcrane committed Mar 12, 2024
1 parent 65705ce commit 4713d12
Showing 4 changed files with 103 additions and 49 deletions.
37 changes: 28 additions & 9 deletions LeedsExperiment/Preservation.API/Controllers/ImportController.cs
@@ -329,16 +329,24 @@ private async Task<ImportSource> GetImportSource(string source, Uri intendedPare
         // This will currently break if the source is not an s3 Uri to which we have access
         // but later could be a file path etc, a scratch upload location, whatever
         var s3Uri = new AmazonS3Uri(source);
-        // we assume this is the root. We also assume that we are not going to hit the AWS limit (1000?)
+
+        // FOR THIS DEMO we assume this is the root.
+        // We also assume that we are not going to hit the AWS limit for paging (1000?)
         // https://docs.aws.amazon.com/sdkfornet1/latest/apidocs/html/M_Amazon_S3_AmazonS3_ListObjects.htm
-        // ^^ for paging
         // We can't learn anything about containers this way other than that there are slugs in path
         // We can't learn anything about intended name (dc:title) from this, but that's OK for now
+        // That kind of data should be in METS files; we can enhance the ImportJob with it later in a real world application
+        // The code that constructs the import job has access to more information than the code below.
+        // The files have been through a pipeline that will have produced checksums, content types and more, and put them in
+        // metadata such as METS that that code understands.
         var listObjectsReq = new ListObjectsV2Request()
         {
            BucketName = s3Uri.Bucket,
-           Prefix = $"{s3Uri.Key.TrimEnd('/')}/" //,
+           Prefix = $"{s3Uri.Key.TrimEnd('/')}/"
 
+           // The only valid values here are RestoreStatus or null, so this is no good
+           // https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/services/s3/model/OptionalObjectAttributes.html
+           // OptionalObjectAttributes = [ObjectAttributes.Checksum] //,
            // OptionalObjectAttributes = ["Content-Type"] - need to work out how to get content type back here
            // https://stackoverflow.com/a/44179929
            // application/x-directory
Expand All @@ -356,18 +364,29 @@ private async Task<ImportSource> GetImportSource(string source, Uri intendedPare
continue;
}

// how do you get the checksum here without making a further call?

// S3 source folders either need SHA-256 hashes in their AWS metadata (preferred for package-building)
// or they are recorded in things like METS files - in a way that this code here can understand.

// Different applications have their own logic for storing hashes as part of the object, e.g., in METS.

// Unless coming from other information, we *require* that S3 source folders have sha256 hashes in their metadata
// so we don't have to do this:
// (how do we enforce that, we don't want to know about METS here)

// TODAY
var s3Stream = await s3Client!.GetObjectStreamAsync(obj.BucketName, obj.Key, null);
var sha256Digest = Checksum.Sha256FromStream(s3Stream);
// (and all our Fedora objects have sha-256)
// Get the SHA256 algorithm from AWS directly rather than compute it here
// If the S3 file does not already have the SHA-256 in metadata, then it's an error
string? sha256 = await AwsChecksum.GetHexChecksumAsync(s3Client, s3Uri.Bucket, obj.Key);
if (string.IsNullOrWhiteSpace(sha256))
{
throw new InvalidOperationException($"S3 Key at {obj.Key} does not have SHA256 Checksum in its attributes");
}


// so we don't have to do this:
// var s3Stream = await s3Client!.GetObjectStreamAsync(obj.BucketName, obj.Key, null);
// var sha256Digest = Checksum.Sha256FromStream(s3Stream);

// We can also do an eTag comparison for smaller files
// We can also do a size comparison as a sanity check - this can't catch all changes obvs
// but if source and current have same checksum but different sizes then something's up
@@ -386,7 +405,7 @@ private async Task<ImportSource> GetImportSource(string source, Uri intendedPare
                 Path = sourcePath,
                 StorageType = StorageTypes.S3,
                 ExternalLocation = $"s3://{obj.BucketName}/{obj.Key}",
-                Digest = sha256Digest,
+                Digest = sha256,
                 ContentType = GetDefaultContentType(nameAndParentPath.Name) // we may overwrite this later, e.g., from PREMIS data
             });
         }
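
The request above is built with a single `ListObjectsV2` call in mind; the comment in the first hunk flags the 1,000-key page limit. A sketch of the paging loop a production version would need (not part of this commit; it reuses the `s3Client` and `listObjectsReq` from the surrounding controller code):

```csharp
// Sketch only: gather every page of results, not just the first 1,000 keys.
var allObjects = new List<S3Object>();
ListObjectsV2Response resp;
do
{
    resp = await s3Client.ListObjectsV2Async(listObjectsReq);
    allObjects.AddRange(resp.S3Objects);
    // feed the token back in to request the next page
    listObjectsReq.ContinuationToken = resp.NextContinuationToken;
} while (resp.IsTruncated);
```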
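
Everything in this file now depends on the source objects carrying a SHA-256 in their S3 attributes. S3 only stores one when the uploader asks for it, so whatever pipeline populates the source folder has to opt in at PUT time. A minimal upload-side sketch, assuming the AWS SDK for .NET and illustrative bucket/key names (not part of this commit):

```csharp
using Amazon.S3;
using Amazon.S3.Model;

var s3Client = new AmazonS3Client(); // assumes ambient AWS credentials/region

var put = new PutObjectRequest
{
    BucketName = "source-bucket",
    Key = "objects/file001.tiff",
    FilePath = "/staging/file001.tiff",
    // Without this, S3 stores no SHA-256, GetObjectAttributes returns no
    // ChecksumSHA256, and the import code above would throw.
    ChecksumAlgorithm = ChecksumAlgorithm.SHA256
};
await s3Client.PutObjectAsync(put);
```

One caveat: for multipart uploads the stored SHA-256 is generally a composite checksum of the parts rather than a digest of the whole object, in the same way the commit's own comments warn about multipart eTags, so large files may need extra handling.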
26 changes: 26 additions & 0 deletions LeedsExperiment/Preservation/AwsChecksum.cs
@@ -0,0 +1,26 @@
+using Amazon.S3;
+using Amazon.S3.Model;
+
+namespace Preservation;
+
+public class AwsChecksum
+{
+    public static async Task<string?> GetHexChecksumAsync(IAmazonS3 s3Client, string bucket, string key)
+    {
+        var objAttrsRequest = new GetObjectAttributesRequest()
+        {
+            BucketName = bucket,
+            Key = key,
+            ObjectAttributes = [ObjectAttributes.Checksum]
+        };
+        var objAttrsResponse = await s3Client!.GetObjectAttributesAsync(objAttrsRequest);
+        string? base64Sha256 = objAttrsResponse?.Checksum?.ChecksumSHA256;
+        if (!string.IsNullOrWhiteSpace(base64Sha256))
+        {
+            byte[] bytes = Convert.FromBase64String(base64Sha256);
+            return BitConverter.ToString(bytes).Replace("-", "").ToLowerInvariant();
+
+        }
+        return null;
+    }
+}
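
For reference, a hypothetical caller (not in this commit) that fetches the stored checksum and cross-checks it against a locally computed digest. It assumes .NET 7+ for `SHA256.HashData(Stream)`, and uses `Convert.ToHexString` as the terser equivalent of the `BitConverter` conversion above:

```csharp
using System.Security.Cryptography;
using Amazon.S3;
using Preservation;

var s3Client = new AmazonS3Client(); // assumes ambient AWS credentials/region
string? remoteHex = await AwsChecksum.GetHexChecksumAsync(
    s3Client, "source-bucket", "objects/file001.tiff");

// Compute the same digest over a local copy for comparison
await using var stream = File.OpenRead("/staging/file001.tiff");
string localHex = Convert.ToHexString(SHA256.HashData(stream)).ToLowerInvariant();

if (remoteHex is null)
{
    // uploaded without ChecksumAlgorithm = SHA256, so nothing stored to compare
    throw new InvalidOperationException("No SHA-256 stored for this object");
}
if (remoteHex != localHex)
{
    throw new InvalidOperationException("Stored and computed SHA-256 differ");
}
```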
86 changes: 47 additions & 39 deletions LeedsExperiment/Preservation/FedoraWrapper.cs
@@ -207,48 +207,56 @@ public async Task<Binary> PutBinary(BinaryFile binaryFile, Transaction? transact
         return await PutOrPostBinary(HttpMethod.Put, binaryFile, transaction);
     }
 
-    private async void EnsureChecksum(BinaryFile binaryFile)
-    {
-        if(apiOptions.RequireDigestOnBinaryFile && string.IsNullOrWhiteSpace(binaryFile.Digest))
-        {
-            throw new InvalidOperationException($"Missing digest on incoming BinaryFile {binaryFile.Path}");
-        }
-        string? expected;
-        switch (binaryFile.StorageType)
-        {
-            case StorageTypes.FileSystem:
-                var fi = new FileInfo(binaryFile.ExternalLocation);
-                expected = Checksum.Sha256FromFile(fi);
-                break;
-            case StorageTypes.S3:
-                // TODO - get the SHA256 algorithm from AWS directly rather than compute it here
-                // If the S3 file does not already have the SHA-256 in metadata, then it's an error
-                // Our prep service
-                // GetObjectAttributesAsync
-                // Need to switch Fedora and OCFL to SHA256
-                // What does it mean if you switch the default algorithm in Fedora? It's used for OCFL...
-
-                var s3Uri = new AmazonS3Uri(binaryFile.ExternalLocation);
-
-                // This would be an efficient way of doing this - but with this naive implementation
-                // we're going to read the object twice
-                var s3Stream = await s3Client!.GetObjectStreamAsync(s3Uri.Bucket, s3Uri.Key, null);
-                expected = Checksum.Sha256FromStream(s3Stream);
-                // could get a byte array here and then pass it along eventually to MakeBinaryPutOrPost
-                // for now just read it twice.
-                // Later we'll get the sha256 checksum from metadata
-                // Or the MD5 from eTag?
-                // BEWARE that multipart uploads will not have the MD5 as the eTag.
-                break;
-            default:
-                throw new InvalidOperationException("Unknown storage type " + binaryFile.StorageType);
-        }
-        if (binaryFile.Digest != null && binaryFile.Digest != expected)
-        {
-            throw new InvalidOperationException("Initial checksum doesn't match");
-        }
-        binaryFile.Digest = expected;
-    }
+    private async void EnsureChecksum(BinaryFile binaryFile, bool validate)
+    {
+        bool isMissing = string.IsNullOrWhiteSpace(binaryFile.Digest);
+        if (isMissing || validate)
+        {
+            string? expected = null;
+            switch (binaryFile.StorageType)
+            {
+                case StorageTypes.FileSystem:
+                    if (isMissing && apiOptions.RequireDigestOnBinaryFileInfo)
+                    {
+                        throw new InvalidOperationException($"Missing digest on incoming BinaryFile FileInfo {binaryFile.Path}");
+                    }
+                    var fi = new FileInfo(binaryFile.ExternalLocation);
+                    expected = Checksum.Sha256FromFile(fi);
+                    break;
+                case StorageTypes.S3:
+                    if (isMissing && apiOptions.RequireDigestOnBinaryS3)
+                    {
+                        throw new InvalidOperationException($"Missing digest on incoming BinaryFile in S3 {binaryFile.Path}");
+                    }
+                    var s3Uri = new AmazonS3Uri(binaryFile.ExternalLocation);
+                    // Get the SHA256 algorithm from AWS directly rather than compute it here
+                    // If the S3 file does not already have the SHA-256 in metadata, then it's an error
+                    expected = await AwsChecksum.GetHexChecksumAsync(s3Client, s3Uri.Bucket, s3Uri.Key);
+                    if (string.IsNullOrWhiteSpace(expected))
+                    {
+                        throw new InvalidOperationException($"S3 Key at {s3Uri} does not have SHA256 Checksum in its attributes");
+                    }
+
+                    // This would be an efficient way of doing this - but with this naive implementation
+                    // we're going to read the object twice
+                    // var s3Stream = await s3Client!.GetObjectStreamAsync(s3Uri.Bucket, s3Uri.Key, null);
+                    // expected = Checksum.Sha256FromStream(s3Stream);
+                    // could get a byte array here and then pass it along eventually to MakeBinaryPutOrPost
+                    // for now just read it twice.
+                    // Later we'll get the sha256 checksum from metadata
+                    // Or the MD5 from eTag?
+                    // BEWARE that multipart uploads will not have the MD5 as the eTag.
+                    break;
+                default:
+                    throw new InvalidOperationException("Unknown storage type " + binaryFile.StorageType);
+            }
+            // validation
+            if (!string.IsNullOrWhiteSpace(binaryFile.Digest) && binaryFile.Digest != expected)
+            {
+                throw new InvalidOperationException("Initial checksum doesn't match");
+            }
+            binaryFile.Digest = expected;
+        }
+    }
 
     private Uri GetFedoraUriWithinArchivalGroup(Uri archivalGroupUri, string path)
@@ -268,7 +276,7 @@ private Uri GetFedoraUriWithinArchivalGroup(Uri archivalGroupUri, string path)
     private async Task<Binary> PutOrPostBinary(HttpMethod httpMethod, BinaryFile binaryFile, Transaction? transaction = null)
     {
         // verify that parent is a container first?
-        EnsureChecksum(binaryFile);
+        EnsureChecksum(binaryFile, false);
         var fedoraLocation = GetFedoraUriWithinArchivalGroup(binaryFile.Parent, binaryFile.Path);
         var req = await MakeBinaryPutOrPost(httpMethod, fedoraLocation, binaryFile, transaction);
         var response = await httpClient.SendAsync(req);
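
One observation on the method rather than on the change itself: `EnsureChecksum` is `async void` and the call above is not awaited, so `PutOrPostBinary` can start building the Fedora request before the S3 checksum lookup finishes, and any exception it throws cannot be observed by the caller. A sketch of an awaitable variant (not what the commit ships; the body is elided):

```csharp
// Sketch only: same logic as EnsureChecksum above, but returning Task so the
// caller can await completion and see any InvalidOperationException it throws.
private async Task EnsureChecksumAsync(BinaryFile binaryFile, bool validate)
{
    // ... body as in EnsureChecksum above ...
}

// The call site in PutOrPostBinary then becomes:
// await EnsureChecksumAsync(binaryFile, false);
```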
3 changes: 2 additions & 1 deletion LeedsExperiment/Preservation/PreservationApiOptions.cs
@@ -5,5 +5,6 @@ public class PreservationApiOptions
     public required string Prefix { get; set; }
     public required string StagingBucket { get; set; }
     public required int StorageMapCacheTimeSeconds { get; set; } = 5;
-    public bool RequireDigestOnBinaryFile { get; set; } = true;
+    public bool RequireDigestOnBinaryFileInfo { get; set; } = false;
+    public bool RequireDigestOnBinaryS3 { get; set; } = true;
 }
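
The renamed flags split the old `RequireDigestOnBinaryFile` switch by storage type. With these defaults, an S3 `BinaryFile` must arrive with its digest already set (the S3 attribute is then fetched as the expected value to cross-check), while a filesystem `BinaryFile` may omit it and have the SHA-256 computed from the file. A hypothetical instantiation, just to show how the `required` members and the new flags fit together (all values invented):

```csharp
var options = new PreservationApiOptions
{
    Prefix = "example",                    // required members must be supplied
    StagingBucket = "staging-bucket",
    StorageMapCacheTimeSeconds = 5,
    RequireDigestOnBinaryFileInfo = false, // FileSystem: a missing digest is computed from the file
    RequireDigestOnBinaryS3 = true         // S3: a missing digest is an error in EnsureChecksum
};
```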
