From bea6a0bab52979fc522acc97d5dc24119027a57f Mon Sep 17 00:00:00 2001 From: Shawn Wang Date: Wed, 8 Jan 2025 15:44:13 +0800 Subject: [PATCH] sparse: add sparse index file header v1 Introduce a sparse index file header to support different index types in the future. Note that it also upgrades the global version from 6 to 7 so that old indexes keep working normally. Signed-off-by: Shawn Wang --- include/knowhere/utils.h | 12 ++ include/knowhere/version.h | 2 +- src/index/sparse/sparse_index_node.cc | 12 +- src/index/sparse/sparse_inverted_index.h | 175 +++++++++++++++++++---- 4 files changed, 168 insertions(+), 33 deletions(-) diff --git a/include/knowhere/utils.h b/include/knowhere/utils.h index bd8ddca24..fa4d14085 100644 --- a/include/knowhere/utils.h +++ b/include/knowhere/utils.h @@ -200,6 +200,18 @@ readBinaryPOD(R& in, T& podRef) { in.read((char*)&podRef, sizeof(T)); } +template <typename W> +static void +writeBinaryString(W& out, const std::string& str) { + out.write((char*)str.data(), str.size()); +} + +template <typename R> +static void +readBinaryString(R& in, std::string& str) { + in.read((char*)str.data(), str.size()); +} + // taken from // https://github.com/Microsoft/BLAS-on-flash/blob/master/include/utils.h // round up X to the nearest multiple of Y diff --git a/include/knowhere/version.h b/include/knowhere/version.h index 244d5b66c..bb2087525 100644 --- a/include/knowhere/version.h +++ b/include/knowhere/version.h @@ -21,7 +21,7 @@ namespace knowhere { namespace { static constexpr int32_t default_version = 0; static constexpr int32_t minimal_version = 0; -static constexpr int32_t current_version = 6; +static constexpr int32_t current_version = 7; } // namespace class Version { diff --git a/src/index/sparse/sparse_index_node.cc b/src/index/sparse/sparse_index_node.cc index 9ce1c6290..b55454d55 100644 --- a/src/index/sparse/sparse_index_node.cc +++ b/src/index/sparse/sparse_index_node.cc @@ -37,8 +37,10 @@ class SparseInvertedIndexNode : public IndexNode { static_assert(std::is_same_v, 
"SparseInvertedIndexNode only support float"); public: - explicit SparseInvertedIndexNode(const int32_t& /*version*/, const Object& /*object*/) - : search_pool_(ThreadPool::GetGlobalSearchThreadPool()), build_pool_(ThreadPool::GetGlobalBuildThreadPool()) { + explicit SparseInvertedIndexNode(const int32_t& version, const Object& /*object*/) + : IndexNode(version), + search_pool_(ThreadPool::GetGlobalSearchThreadPool()), + build_pool_(ThreadPool::GetGlobalBuildThreadPool()) { } ~SparseInvertedIndexNode() override { @@ -245,7 +247,7 @@ class SparseInvertedIndexNode : public IndexNode { return Status::empty_index; } MemoryIOWriter writer; - RETURN_IF_ERROR(index_->Save(writer)); + RETURN_IF_ERROR(index_->Save(writer, this->version_.VersionNumber())); std::shared_ptr data(writer.data()); binset.Append(Type(), data, writer.tellg()); return Status::success; @@ -269,7 +271,7 @@ class SparseInvertedIndexNode : public IndexNode { return index_or.error(); } index_ = index_or.value(); - return index_->Load(reader, 0, ""); + return index_->Load(reader, 0, "", this->version_.VersionNumber()); } Status @@ -309,7 +311,7 @@ class SparseInvertedIndexNode : public IndexNode { MemoryIOReader map_reader(reinterpret_cast(mapped_memory), map_size); auto supplement_target_filename = filename + ".knowhere_sparse_index_supplement"; - return index_->Load(map_reader, map_flags, supplement_target_filename); + return index_->Load(map_reader, map_flags, supplement_target_filename, this->version_.VersionNumber()); } static std::unique_ptr diff --git a/src/index/sparse/sparse_inverted_index.h b/src/index/sparse/sparse_inverted_index.h index 1033a46a1..54e007259 100644 --- a/src/index/sparse/sparse_inverted_index.h +++ b/src/index/sparse/sparse_inverted_index.h @@ -47,12 +47,12 @@ class BaseInvertedIndex { virtual ~BaseInvertedIndex() = default; virtual Status - Save(MemoryIOWriter& writer) = 0; + Save(MemoryIOWriter& writer, IndexVersion index_version) = 0; // supplement_target_filename: when in 
mmap mode, we need an extra file to store the mmaped index data structure. // this file will be created during loading and deleted in the destructor. virtual Status - Load(MemoryIOReader& reader, int map_flags, const std::string& supplement_target_filename) = 0; + Load(MemoryIOReader& reader, int map_flags, const std::string& supplement_target_filename, IndexVersion index_version) = 0; virtual Status Train(const SparseRow* data, size_t rows) = 0; @@ -156,27 +156,27 @@ class InvertedIndex : public BaseInvertedIndex { } } + /** + * Sparse Index file layout (version 0) + * + * 1. size_t rows + * 2. size_t cols + * 3. DType value_threshold_ (deprecated) + * 4. for each row: + * 1. size_t len + * 2. for each non-zero value: + * 1. table_t idx + * 2. DType val (when QType is different from DType, the QType value of val is stored as a DType with + * precision loss) + * + * inverted_index_ids_, inverted_index_vals_ and max_score_in_dim_ are + * not serialized, they will be constructed dynamically during + * deserialization. + * + * Data are densely packed in serialized bytes and no padding is added. + */ Status - Save(MemoryIOWriter& writer) override { - /** - * Layout: - * - * 1. size_t rows - * 2. size_t cols - * 3. DType value_threshold_ (deprecated) - * 4. for each row: - * 1. size_t len - * 2. for each non-zero value: - * 1. table_t idx - * 2. DType val (when QType is different from DType, the QType value of val is stored as a DType with - * precision loss) - * - * inverted_index_ids_, inverted_index_vals_ and max_score_in_dim_ are - * not serialized, they will be constructed dynamically during - * deserialization. - * - * Data are densely packed in serialized bytes and no padding is added. 
- */ + save_index_to_binary_v0(MemoryIOWriter& writer) { DType deprecated_value_threshold = 0; writeBinaryPOD(writer, n_rows_internal_); writeBinaryPOD(writer, max_dim_); @@ -189,8 +189,8 @@ class InvertedIndex : public BaseInvertedIndex { } auto dim_map_reverse = std::unordered_map(); - for (auto dim_it = dim_map_.begin(); dim_it != dim_map_.end(); ++dim_it) { - dim_map_reverse[dim_it->second] = dim_it->first; + for (auto & dim_it : dim_map_) { + dim_map_reverse[dim_it.second] = dim_it.first; } for (table_t vec_id = 0; vec_id < n_rows_internal_; ++vec_id) { @@ -209,11 +209,81 @@ class InvertedIndex : public BaseInvertedIndex { } writer.write(raw_row.data(), raw_row.size() * SparseRow::element_size()); } - return Status::success; } + + /** + * Sparse Index file format (version 1) + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Magic Number | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Version | Index Type | Index Flags | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Index Fmt | | + * +-+-+-+-+-+-+-+-+ + + * | Index Data | + * + + + * | | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | ES0 Fmt(opt) | | + * +-+-+-+-+-+-+-+-+ + + * | Extra Section 0 Data | + * + + + * | | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | ES1 Fmt(opt) | | + * +-+-+-+-+-+-+-+-+ + + * | Extra Section 1 Data | + * + + + * | | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * Specification: + * 1. Magic Number must be the 8-byte string "SPRS_IDX", stored without a length prefix or terminator (it occupies 8 bytes, i.e. two 32-bit rows of the diagram above). + * 2. Version is 1 for now. + * 3. Index Type currently only uses 0, which represents "Sparse Inverted Index". + * 4. Index Flags indicate which extra data sections are present. + * The meaning of each flag bit may differ between index types. 
+ * For Sparse Inverted Index: + * - bit0: 1 means the file contains the maxscore section of each dimension. + * - bit1: 1 means the file contains the blockmax section of each dimension. + * 5. Index Format is the format of the index data. + * - 0: the index data uses the old format (version 0). + * 6. Index Data contains both the metadata and the main data of the index. + * 7. Extra data sections are used to save other data, such as the per-dimension blockmax and maxscore values. + * These data sections are optional and their presence is indicated by the index flags. + */ + Status + save_index_to_binary_v1(MemoryIOWriter& writer) { + // NOTE: remember to update the binary version when the binary format changes + uint8_t binary_version = 1; + uint8_t index_type = 0; + uint16_t index_flags = 0; + writeBinaryString(writer, "SPRS_IDX"); + writeBinaryPOD(writer, binary_version); + writeBinaryPOD(writer, index_type); + if constexpr (algo == InvertedIndexAlgo::DAAT_WAND || algo == InvertedIndexAlgo::DAAT_MAXSCORE) { + index_flags |= 1 << 0; + } + writeBinaryPOD(writer, index_flags); + // use the old format (0) for now + uint8_t index_fmt = 0; + writeBinaryPOD(writer, index_fmt); + return save_index_to_binary_v0(writer); + } + Status - Load(MemoryIOReader& reader, int map_flags, const std::string& supplement_target_filename) override { + Save(MemoryIOWriter& writer, IndexVersion index_version) override { + if (index_version <= 6) { + return save_index_to_binary_v0(writer); + } else { + return save_index_to_binary_v1(writer); + } + } + + Status + load_index_from_binary_v0(MemoryIOReader& reader, int map_flags, const std::string& supplement_target_filename) { DType deprecated_value_threshold; int64_t rows; readBinaryPOD(reader, rows); @@ -249,10 +319,61 @@ class InvertedIndex : public BaseInvertedIndex { } n_rows_internal_ = rows; - return Status::success; } + Status + load_index_from_binary_v1(MemoryIOReader& reader, int map_flags, const std::string& supplement_target_filename) { + // check 
header + std::string magic_number(8, '\0'); + uint8_t file_version; + uint8_t index_type; + uint16_t index_flags; + uint8_t index_fmt; + + readBinaryString(reader, magic_number); + if (magic_number != "SPRS_IDX") { + LOG_KNOWHERE_ERROR_ << "Invalid BinarySet: wrong magic number."; + return Status::invalid_binary_set; + } + readBinaryPOD(reader, file_version); + if (file_version != 1) { + LOG_KNOWHERE_ERROR_ << "Invalid BinarySet: file version cannot be recognized."; + return Status::invalid_binary_set; + } + readBinaryPOD(reader, index_type); + if (index_type != 0) { + LOG_KNOWHERE_ERROR_ << "Invalid BinarySet: index type cannot be recognized."; + return Status::invalid_binary_set; + } + readBinaryPOD(reader, index_flags); + if constexpr (algo == InvertedIndexAlgo::DAAT_WAND || algo == InvertedIndexAlgo::DAAT_MAXSCORE) { + if (!(index_flags & (1 << 0))) { + LOG_KNOWHERE_ERROR_ << "Invalid BinarySet: dimension maxscore cannot be found, which is required for " + "DAAT_WAND or DAAT_MAXSCORE algorithm."; + return Status::invalid_binary_set; + } + } + readBinaryPOD(reader, index_fmt); + switch (index_fmt) { + case 0: // old format + return load_index_from_binary_v0(reader, map_flags, supplement_target_filename); + default: + LOG_KNOWHERE_ERROR_ << "Invalid BinarySet: index format cannot be recognized."; + return Status::invalid_binary_set; + } + + } + + Status + Load(MemoryIOReader& reader, int map_flags, const std::string& supplement_target_filename, IndexVersion index_version) override { + if (index_version <= 6) { + return load_index_from_binary_v0(reader, map_flags, supplement_target_filename); + } else { + return load_index_from_binary_v1(reader, map_flags, supplement_target_filename); + } + } + // memory in reader must be guaranteed to be valid during the lifetime of this object. Status PrepareMmap(MemoryIOReader& reader, size_t rows, int map_flags, const std::string& supplement_target_filename) {