From ca1f1482ae13950c7349a915417e1c651890dca5 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 5 Aug 2024 19:40:33 +0000 Subject: [PATCH 1/4] Re-enabled address change tracking --- mdio/variable.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mdio/variable.h b/mdio/variable.h index 364bb19..2577bc6 100644 --- a/mdio/variable.h +++ b/mdio/variable.h @@ -1132,9 +1132,7 @@ class Variable { if (attributes.get() != nullptr && attributes->get() != nullptr) { std::uintptr_t newAddress = reinterpret_cast(&(**attributes)); - // TODO(BrianMichell): Leaving this as active is causing segfaults. - // The features requiring it are low priority. - // attributesAddress = newAddress; + attributesAddress = newAddress; } // It is fine that this will only change in the "collection" instance of the // Variable, because that is the only one that will be operated on by the From 762f4e05901444444be3b4d87ea81f555a706b6a Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 5 Aug 2024 19:41:45 +0000 Subject: [PATCH 2/4] Added true modified Variable tracking back --- mdio/dataset.h | 9 ++++++++- mdio/utils.h | 1 + mdio/variable.h | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/mdio/dataset.h b/mdio/dataset.h index 394e0c1..05a0924 100644 --- a/mdio/dataset.h +++ b/mdio/dataset.h @@ -834,7 +834,14 @@ class Dataset { // Build out list of modified variables std::vector modifiedVariables; for (const auto& key : keys) { - modifiedVariables.push_back(key); + auto var = variables.at(key).value(); + if (var.was_updated() || var.should_publish()) { + modifiedVariables.push_back(key); + } + if (var.should_publish()) { + // Reset the flag. This should only get set by the trim util. + var.set_metadata_publish_flag(false); + } } // If nothing changed, we don't want to perform any writes diff --git a/mdio/utils.h b/mdio/utils.h index 3984a8e..f3b5b5e 100644 --- a/mdio/utils.h +++ b/mdio/utils.h @@ -62,6 +62,7 @@ Future TrimDataset(std::string dataset_path, for (auto& varIdentifier : ds.variables.get_iterable_accessor()) { MDIO_ASSIGN_OR_RETURN(auto var, ds.variables.at(varIdentifier)) + var.set_metadata_publish_flag(true); if (var.dimensions().labels().back() == "") { auto spec = var.spec(); diff --git a/mdio/variable.h b/mdio/variable.h index 2577bc6..d0fa4ae 100644 --- a/mdio/variable.h +++ b/mdio/variable.h @@ -1098,6 +1098,22 @@ class Variable { return attributesAddress != currentAddress; } + /** + * @brief Sets the flag whether the metadata should get republished. + * Intended for internal use with the trimming utility. + * + * @param shouldPublish True if the metadata should get republished. + */ + void set_metadata_publish_flag(const bool shouldPublish) { + if (!toPublish) { + // This should never be the case, but better safe than sorry + toPublish = std::make_shared>( + std::make_shared(shouldPublish)); + } else { + *toPublish = std::make_shared(shouldPublish); + } + } + // ===========================Member data getters=========================== const std::string& get_variable_name() const { return variableName; } @@ -1118,6 +1134,20 @@ class Variable { return attributesAddress; } + /** + * @brief Gets whether the metadata should get republished. + * + * @return True if the metadata should get republished. + */ + bool should_publish() const { + if (toPublish && *toPublish) { + // Deref the shared_ptr so we're not increasing refcount + return **toPublish; + } + // If the flag was a nullptr, err on the side of caution and republish + return true; + } + private: /** * This method should NEVER be called by the user. @@ -1148,6 +1178,9 @@ class Variable { tensorstore::TensorStore store; // The address of the attributes. This MUST NEVER be touched by the user. std::uintptr_t attributesAddress; + // The metadata will need to be updated if the trim util was used on it. + std::shared_ptr> toPublish = + std::make_shared>(std::make_shared(false)); }; // Tensorstore Array's don't have an IndexDomain and so they can't be slice with From b8f956ae91e4490b40b80ba1ecab2bf0660b1558 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 5 Aug 2024 19:43:09 +0000 Subject: [PATCH 3/4] Switched to shared_ptr and captured variable in lambda to fix heap-use-after-free behavior --- mdio/dataset.h | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/mdio/dataset.h b/mdio/dataset.h index 05a0924..5e79f56 100644 --- a/mdio/dataset.h +++ b/mdio/dataset.h @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -854,15 +855,14 @@ class Dataset { // We need to update the entire .zmetadata file std::vector json_vars; - std::vector> - vars; // Keeps the Variables in memory. Fix for premature reference - // decrement in LLVM compiler. - for (auto key : keys) { - auto var = variables.at(key).value(); + std::vector>> + vars; // Keeps the Variables in memory using shared_ptr + for (const auto& key : keys) { + auto var = std::make_shared>(variables.at(key).value()); vars.push_back(var); // Get the JSON, drop transform, and add attributes nlohmann::json json = - var.get_store().spec().value().ToJson(IncludeDefaults{}).value(); + var->get_store().spec().value().ToJson(IncludeDefaults{}).value(); json.erase("transform"); json.erase("dtype"); json["metadata"].erase("filters"); @@ -873,7 +873,7 @@ class Dataset { std::string path = json["kvstore"]["path"].get(); path.pop_back(); json["kvstore"]["path"] = path; - nlohmann::json meta = var.getMetadata(); + nlohmann::json meta = var->getMetadata(); if (meta.contains("coordinates")) { meta["attributes"]["coordinates"] = meta["coordinates"]; meta.erase("coordinates"); @@ -909,18 +909,16 @@ class Dataset { promises; std::vector futures; - vars.clear(); // Clear the vector so we can add only the modified Variables - for (auto key : modifiedVariables) { + for (const auto& key : modifiedVariables) { auto pair = tensorstore::PromiseFuturePair< tensorstore::TimestampedStorageGeneration>::Make(); - auto var = variables.at(key).value(); - vars.push_back(var); - auto updateFuture = var.PublishMetadata(); + auto var = std::make_shared>(variables.at(key).value()); + auto updateFuture = var->PublishMetadata(); updateFuture.ExecuteWhenReady( - [promise = std::move(pair.promise)]( - tensorstore::ReadyFuture< - tensorstore::TimestampedStorageGeneration> - readyFut) { promise.SetResult(readyFut.result()); }); + [promise = std::move(pair.promise), + var](tensorstore::ReadyFuture< + tensorstore::TimestampedStorageGeneration> + readyFut) { promise.SetResult(readyFut.result()); }); variableFutures.push_back(std::move(updateFuture)); futures.push_back(std::move(pair.future)); } From 4372c6ff4297105aa474be741f4a983c87bed496 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 5 Aug 2024 19:43:26 +0000 Subject: [PATCH 4/4] Expand test coverage --- mdio/variable_test.cc | 45 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/mdio/variable_test.cc b/mdio/variable_test.cc index c95d018..a880c51 100644 --- a/mdio/variable_test.cc +++ b/mdio/variable_test.cc @@ -481,6 +481,51 @@ TEST(Variable, mismatchAttrs) { std::filesystem::remove_all("test.zarr"); } +TEST(Variable, updateMetadata) { + nlohmann::json json = GetJsonSpecStruct(); + auto variableRes = + mdio::Variable<>::Open(json, mdio::constants::kCreateClean); + ASSERT_TRUE(variableRes.status().ok()) << variableRes.status(); + auto variable = variableRes.value(); + + // Update the metadata + nlohmann::json attrs = { + {"count", 100}, + {"min", -1000.0}, + {"max", 1000.0}, + {"sum", 0.0}, + {"sumSquares", 0.0}, + {"histogram", {{"binCenters", {1.0, 2.0, 3.0}}, {"counts", {1, 2, 3}}}}}; + auto attrsUpdateRes = variable.UpdateAttributes(attrs); + ASSERT_TRUE(attrsUpdateRes.status().ok()) << attrsUpdateRes.status(); + + auto publishFuture = variable.PublishMetadata(); + EXPECT_TRUE(publishFuture.status().ok()) << publishFuture.status(); +} + +TEST(Variable, publishNoStats) { + nlohmann::json json = GetJsonSpecStruct(); + auto variableRes = + mdio::Variable<>::Open(json, mdio::constants::kCreateClean); + ASSERT_TRUE(variableRes.status().ok()) << variableRes.status(); + auto variable = variableRes.value(); + + auto publishFuture = variable.PublishMetadata(); + EXPECT_TRUE(publishFuture.status().ok()) << publishFuture.status(); +} + +TEST(Variable, publishNoAttrs) { + nlohmann::json json = GetJsonSpecStruct(); + json["attributes"].erase("long_name"); + auto variableRes = + mdio::Variable<>::Open(json, mdio::constants::kCreateClean); + ASSERT_TRUE(variableRes.status().ok()) << variableRes.status(); + auto variable = variableRes.value(); + + auto publishFuture = variable.PublishMetadata(); + EXPECT_TRUE(publishFuture.status().ok()) << publishFuture.status(); +} + TEST(Variable, slice) { auto variable = mdio::Variable<>::Open(json_good, mdio::constants::kCreateClean).value();