From 09dda841fd016f293cb24b14c1f11f5d48eee39b Mon Sep 17 00:00:00 2001 From: Sanhaoji2 Date: Sat, 19 Oct 2024 16:45:43 +0800 Subject: [PATCH 1/9] Add filter streamging interface --- src/index.cpp | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/src/index.cpp b/src/index.cpp index 0b01afa20..4daf2839c 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1806,9 +1806,46 @@ void Index::build(const std::string &data_file, const size_t nu size_t points_to_load = num_points_to_load == 0 ? _max_points : num_points_to_load; auto s = std::chrono::high_resolution_clock::now(); + + std::vector tags; + + if (_enable_tags) + { + if (filter_params.tags_file.empty()) + { + throw ANNException("Tag filename isn't set, while _enable_tags is set", -1, __FUNCSIG__, __FILE__, __LINE__); + } + else + { + if (file_exists(filter_params.tags_file)) + { + diskann::cout << "Loading tags from " << filter_params.tags_file << " for vamana index build" << std::endl; + TagT* tag_data = nullptr; + size_t npts, ndim; + diskann::load_bin(filter_params.tags_file, tag_data, npts, ndim); + if (npts < num_points_to_load) + { + std::stringstream sstream; + sstream << "Loaded " << npts << " tags, insufficient to populate tags for " << num_points_to_load + << " points to load"; + throw diskann::ANNException(sstream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); + } + tags.resize(num_points_to_load); + memcpy(tags.data(), tag_data, sizeof(TagT) * num_points_to_load); + + delete[] tag_data; + } + else + { + throw diskann::ANNException(std::string("Tag file") + filter_params.tags_file + " does not exist", -1, __FUNCSIG__, + __FILE__, __LINE__); + } + } + } + if (filter_params.label_file == "") { - this->build(data_file.c_str(), points_to_load); + this->build(data_file.c_str(), points_to_load, tags); } else { @@ -1823,7 +1860,7 @@ void Index::build(const std::string &data_file, const size_t nu // LabelT unv_label_as_num = 0; this->set_universal_label(unv_label_as_num); } - this->build_filtered_index(data_file.c_str(), labels_file_to_use, points_to_load); + this->build_filtered_index(data_file.c_str(), labels_file_to_use, points_to_load, tags); } std::chrono::duration diff = std::chrono::high_resolution_clock::now() - s; std::cout << "Indexing time: " << diff.count() << "\n"; From ebcf94630fe556978937500b94d5b1cf89306fdd Mon Sep 17 00:00:00 2001 From: Sanhaoji2 Date: Wed, 27 Nov 2024 14:39:04 +0800 Subject: [PATCH 2/9] tmp save --- include/index.h | 12 +- src/CMakeLists.txt | 2 +- src/dll/CMakeLists.txt | 2 +- src/index.cpp | 366 ++++++++++++++++++----------------------- 4 files changed, 167 insertions(+), 215 deletions(-) diff --git a/include/index.h b/include/index.h index 320942013..3ddae5297 100644 --- a/include/index.h +++ b/include/index.h @@ -190,7 +190,7 @@ template clas DISKANN_DLLEXPORT void load(AlignedFileReader &reader, uint32_t num_threads, uint32_t search_l); #else // Reads the number of frozen points from graph's metadata file section. - DISKANN_DLLEXPORT static size_t get_graph_num_frozen_points(const std::string &graph_file); +// DISKANN_DLLEXPORT static size_t get_graph_num_frozen_points(const std::string &graph_file); DISKANN_DLLEXPORT void load(const char *index_file, uint32_t num_threads, uint32_t search_l); #endif @@ -291,7 +291,7 @@ template clas // repositions frozen points to the end of _data - if they have been moved // during deletion - DISKANN_DLLEXPORT void reposition_frozen_point_to_end(); +// DISKANN_DLLEXPORT void reposition_frozen_point_to_end(); DISKANN_DLLEXPORT void reposition_points(uint32_t old_location_start, uint32_t new_location_start, uint32_t num_locations); @@ -358,7 +358,7 @@ template clas // generates 1 frozen point that will never be deleted from the graph // This is not visible to the user - void generate_frozen_point(); +// void generate_frozen_point(); // determines navigating node of the graph by calculating medoid of datafopt uint32_t calculate_entry_point(); @@ -423,7 +423,7 @@ template clas // graph, mode = _consolidated_order in case of lazy deletion and // _compacted_order in case of eager deletion DISKANN_DLLEXPORT void compact_data(); - DISKANN_DLLEXPORT void compact_frozen_point(); +// DISKANN_DLLEXPORT void compact_frozen_point(); // Remove deleted nodes from adjacency list of node loc // Replace removed neighbors with second order neighbors. @@ -476,8 +476,8 @@ template clas // externally and won't be returned by search. At least 1 frozen point is // needed for a dynamic index. The frozen points have consecutive locations. // See also _start below. - size_t _num_frozen_pts = 0; - size_t _frozen_pts_used = 0; +// size_t _num_frozen_pts = 0; +// size_t _frozen_pts_used = 0; size_t _node_size; size_t _data_len; size_t _neighbor_len; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cbca26440..23a24bbdc 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -13,7 +13,7 @@ else() linux_aligned_file_reader.cpp math_utils.cpp natural_number_map.cpp in_mem_data_store.cpp in_mem_graph_store.cpp natural_number_set.cpp memory_mapper.cpp partition.cpp pq.cpp - pq_flash_index.cpp scratch.cpp logger.cpp utils.cpp filter_utils.cpp index_factory.cpp abstract_index.cpp pq_l2_distance.cpp pq_data_store.cpp) + pq_flash_index.cpp scratch.cpp logger.cpp utils.cpp filter_utils.cpp index_factory.cpp abstract_index.cpp pq_l2_distance.cpp pq_data_store.cpp neighbor_list.cpp in_mem_static_graph_store.cpp) if (RESTAPI) list(APPEND CPP_SOURCES restapi/search_wrapper.cpp restapi/server.cpp) endif() diff --git a/src/dll/CMakeLists.txt b/src/dll/CMakeLists.txt index 096d1b76e..11dde1432 100644 --- a/src/dll/CMakeLists.txt +++ b/src/dll/CMakeLists.txt @@ -4,7 +4,7 @@ add_library(${PROJECT_NAME} SHARED dllmain.cpp ../abstract_data_store.cpp ../partition.cpp ../pq.cpp ../pq_flash_index.cpp ../logger.cpp ../utils.cpp ../windows_aligned_file_reader.cpp ../distance.cpp ../pq_l2_distance.cpp ../memory_mapper.cpp ../index.cpp ../in_mem_data_store.cpp ../pq_data_store.cpp ../in_mem_graph_store.cpp ../math_utils.cpp ../disk_utils.cpp ../filter_utils.cpp - ../ann_exception.cpp ../natural_number_set.cpp ../natural_number_map.cpp ../scratch.cpp ../index_factory.cpp ../abstract_index.cpp) + ../ann_exception.cpp ../natural_number_set.cpp ../natural_number_map.cpp ../scratch.cpp ../index_factory.cpp ../abstract_index.cpp ../neighbor_list.cpp ../in_mem_static_graph_store.cpp) set(TARGET_DIR "$<$:${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}>$<$:${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}>") diff --git a/src/index.cpp b/src/index.cpp index 4daf2839c..5227e980a 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -36,7 +36,7 @@ Index::Index(const IndexConfig &index_config, std::shared_ptr graph_store, std::shared_ptr> pq_data_store) : _dist_metric(index_config.metric), _dim(index_config.dimension), _max_points(index_config.max_points), - _num_frozen_pts(index_config.num_frozen_pts), _dynamic_index(index_config.dynamic_index), + _dynamic_index(index_config.dynamic_index), _enable_tags(index_config.enable_tags), _indexingMaxC(DEFAULT_MAXC), _query_scratch(nullptr), _pq_dist(index_config.pq_dist_build), _use_opq(index_config.use_opq), _filtered_index(index_config.filtered_index), _num_pq_chunks(index_config.num_pq_chunks), @@ -60,17 +60,17 @@ Index::Index(const IndexConfig &index_config, std::shared_ptr size_t Index size_t Index 0) - { - std::memset((char *)&tag_data[_start], 0, sizeof(TagT) * _num_frozen_pts); - } + try { - tag_bytes_written = save_bin(tags_file, tag_data, _nd + _num_frozen_pts, 1); + tag_bytes_written = save_bin(tags_file, tag_data, _nd, 1); } catch (std::system_error &e) { @@ -240,7 +237,7 @@ template size_t Indexsave(data_file, (location_t)(_nd + _num_frozen_pts)); + return _data_store->save(data_file, (location_t)(_nd)); } // save the graph index on a file as an adjacency list. For each point, @@ -248,7 +245,7 @@ template size_t Index size_t Index::save_graph(std::string graph_file) { - return _graph_store->store(graph_file, _nd + _num_frozen_pts, _num_frozen_pts, _start); + return _graph_store->store(graph_file, _nd, 0, _start); } template @@ -280,7 +277,7 @@ void Index::save(const char *filename, bool compact_before_save if (compact_before_save) { compact_data(); - compact_frozen_point(); + // compact_frozen_point(); } else { @@ -321,7 +318,7 @@ void Index::save(const char *filename, bool compact_before_save { std::ofstream label_writer(std::string(filename) + "_labels.txt"); assert(label_writer.is_open()); - for (uint32_t i = 0; i < _nd + _num_frozen_pts; i++) + for (uint32_t i = 0; i < _nd; i++) { for (uint32_t j = 0; j + 1 < _location_to_labels[i].size(); j++) { @@ -348,7 +345,7 @@ void Index::save(const char *filename, bool compact_before_save // write updated labels std::ofstream raw_label_writer(std::string(filename) + "_raw_labels.txt"); assert(raw_label_writer.is_open()); - for (uint32_t i = 0; i < _nd + _num_frozen_pts; i++) + for (uint32_t i = 0; i < _nd; i++) { for (uint32_t j = 0; j + 1 < _location_to_labels[i].size(); j++) { @@ -392,7 +389,7 @@ void Index::save(const char *filename, bool compact_before_save // If frozen points were temporarily compacted to _nd, move back to // _max_points. - reposition_frozen_point_to_end(); + //reposition_frozen_point_to_end(); diskann::cout << "Time taken for save: " << timer.elapsed() / 1000000.0 << "s." << std::endl; } @@ -436,7 +433,7 @@ size_t Index::load_tags(const std::string tag_filename) throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); } - const size_t num_data_points = file_num_points - _num_frozen_pts; + const size_t num_data_points = file_num_points; _location_to_tag.reserve(num_data_points); _tag_to_location.reserve(num_data_points); for (uint32_t i = 0; i < (uint32_t)num_data_points; i++) @@ -487,10 +484,10 @@ size_t Index::load_data(std::string filename) throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); } - if (file_num_points > _max_points + _num_frozen_pts) + if (file_num_points > _max_points) { // update and tag lock acquired in load() before calling load_data - resize(file_num_points - _num_frozen_pts); + resize(file_num_points); } #ifdef EXEC_ENV_OLS @@ -586,7 +583,7 @@ void Index::load(const char *filename, uint32_t num_threads, ui std::stringstream stream; stream << "ERROR: When loading index, loaded " << data_file_num_pts << " points from datafile, " << graph_num_pts << " from graph, and " << tags_file_num_pts - << " tags, with num_frozen_pts being set to " << _num_frozen_pts << " in constructor." << std::endl; + << " tags in constructor." << std::endl; diskann::cerr << stream.str() << std::endl; throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); } @@ -595,7 +592,7 @@ void Index::load(const char *filename, uint32_t num_threads, ui { _label_map = load_label_map(labels_map_file); parse_label_file_in_bitset(labels_file, label_num_pts, _label_map.size()); - assert(label_num_pts == data_file_num_pts - _num_frozen_pts); + assert(label_num_pts == data_file_num_pts); if (file_exists(labels_to_medoids)) { std::ifstream medoid_stream(labels_to_medoids); @@ -637,7 +634,7 @@ void Index::load(const char *filename, uint32_t num_threads, ui } } - _nd = data_file_num_pts - _num_frozen_pts; + _nd = data_file_num_pts; _empty_slots.clear(); _empty_slots.reserve(_max_points); for (auto i = _nd; i < _max_points; i++) @@ -645,8 +642,8 @@ void Index::load(const char *filename, uint32_t num_threads, ui _empty_slots.insert((uint32_t)i); } - reposition_frozen_point_to_end(); - diskann::cout << "Num frozen points:" << _num_frozen_pts << " _nd: " << _nd << " _start: " << _start +// reposition_frozen_point_to_end(); + diskann::cout << " _nd: " << _nd << " _start: " << _start << " size(_location_to_tag): " << _location_to_tag.size() << " size(_tag_to_location):" << _tag_to_location.size() << " Max points: " << _max_points << std::endl; @@ -663,24 +660,24 @@ void Index::load(const char *filename, uint32_t num_threads, ui } #ifndef EXEC_ENV_OLS -template -size_t Index::get_graph_num_frozen_points(const std::string &graph_file) -{ - size_t expected_file_size; - uint32_t max_observed_degree, start; - size_t file_frozen_pts; - - std::ifstream in; - in.exceptions(std::ios::badbit | std::ios::failbit); - - in.open(graph_file, std::ios::binary); - in.read((char *)&expected_file_size, sizeof(size_t)); - in.read((char *)&max_observed_degree, sizeof(uint32_t)); - in.read((char *)&start, sizeof(uint32_t)); - in.read((char *)&file_frozen_pts, sizeof(size_t)); - - return file_frozen_pts; -} +//template +//size_t Index::get_graph_num_frozen_points(const std::string &graph_file) +//{ +// size_t expected_file_size; +// uint32_t max_observed_degree, start; +// size_t file_frozen_pts; +// +// std::ifstream in; +// in.exceptions(std::ios::badbit | std::ios::failbit); +// +// in.open(graph_file, std::ios::binary); +// in.read((char *)&expected_file_size, sizeof(size_t)); +// in.read((char *)&max_observed_degree, sizeof(uint32_t)); +// in.read((char *)&start, sizeof(uint32_t)); +// in.read((char *)&file_frozen_pts, sizeof(size_t)); +// +// return file_frozen_pts; +//} #endif #ifdef EXEC_ENV_OLS @@ -695,7 +692,7 @@ size_t Index::load_graph(std::string filename, size_t expected_ #endif auto res = _graph_store->load(filename, expected_num_points); _start = std::get<1>(res); - _num_frozen_pts = std::get<2>(res); +// _num_frozen_pts = std::get<2>(res); return std::get<0>(res); } @@ -743,18 +740,10 @@ template uint32_t Index std::vector Index::get_init_ids() { std::vector init_ids; - init_ids.reserve(1 + _num_frozen_pts); + init_ids.reserve(1); init_ids.emplace_back(_start); - for (uint32_t frozen = (uint32_t)_max_points; frozen < _max_points + _num_frozen_pts; frozen++) - { - if (frozen != _start) - { - init_ids.emplace_back(frozen); - } - } - return init_ids; } @@ -818,7 +807,7 @@ std::pair Index::iterate_to_fixed_point( } // Decide whether to use bitset or robin set to mark visited nodes - auto total_num_points = _max_points + _num_frozen_pts; + auto total_num_points = _max_points; bool fast_iterate = total_num_points <= MAX_POINTS_FOR_USING_BITSET; if (fast_iterate) @@ -875,7 +864,7 @@ std::pair Index::iterate_to_fixed_point( // Initialize the candidate pool with starting points for (auto id : init_ids) { - if (id >= _max_points + _num_frozen_pts) + if (id >= _max_points) { diskann::cerr << "Out of range loc found as an edge : " << id << std::endl; throw diskann::ANNException(std::string("Wrong loc") + std::to_string(id), -1, __FUNCSIG__, __FILE__, @@ -951,7 +940,7 @@ std::pair Index::iterate_to_fixed_point( auto neighbour_list = _graph_store->get_neighbours(n); for (auto id : neighbour_list) { - assert(id < _max_points + _num_frozen_pts); + assert(id < _max_points); if (!is_not_visited(id)) { @@ -983,7 +972,7 @@ std::pair Index::iterate_to_fixed_point( _locks[n].unlock_shared(); for (auto id : tmp_neighbor_list) { - assert(id < _max_points + _num_frozen_pts); + assert(id < _max_points); if (!is_not_visited(id)) { @@ -1112,7 +1101,7 @@ void Index::search_for_point_and_prune(int location, uint32_t L prune_neighbors(location, pool, pruned_list, scratch); assert(!pruned_list.empty()); - assert(_graph_store->get_total_points() == _max_points + _num_frozen_pts); + assert(_graph_store->get_total_points() == _max_points); } template @@ -1266,7 +1255,7 @@ void Index::inter_insert(uint32_t n, std::vector &pru for (auto des : src_pool) { // des.loc is the loc of the neighbors of n - assert(des < _max_points + _num_frozen_pts); + assert(des < _max_points); bool prune_needed = false; { @@ -1345,23 +1334,14 @@ template void Index visit_order; std::vector pool, tmp; tsl::robin_set visited; - visit_order.reserve(_nd + _num_frozen_pts); + visit_order.reserve(_nd); for (uint32_t i = 0; i < (uint32_t)_nd; i++) { visit_order.emplace_back(i); } - // If there are any frozen points, add them all. - for (uint32_t frozen = (uint32_t)_max_points; frozen < _max_points + _num_frozen_pts; frozen++) - { - visit_order.emplace_back(frozen); - } - // if there are frozen points, the first such one is set to be the _start - if (_num_frozen_pts > 0) - _start = (uint32_t)_max_points; - else - _start = calculate_entry_point(); + _start = calculate_entry_point(); diskann::Timer link_timer; @@ -1450,7 +1430,7 @@ void Index::prune_all_neighbors(const uint32_t max_degree, cons diskann::Timer timer; #pragma omp parallel for - for (int64_t node = 0; node < (int64_t)(_max_points + _num_frozen_pts); node++) + for (int64_t node = 0; node < (int64_t)(_max_points); node++) { if ((size_t)node < _nd || (size_t)node >= _max_points) { @@ -1483,7 +1463,7 @@ void Index::prune_all_neighbors(const uint32_t max_degree, cons diskann::cout << "Prune time : " << timer.elapsed() / 1000 << "ms" << std::endl; size_t max = 0, min = 1 << 30, total = 0, cnt = 0; - for (size_t i = 0; i < _max_points + _num_frozen_pts; i++) + for (size_t i = 0; i < _max_points; i++) { if (i < _nd || i >= _max_points) { @@ -1500,7 +1480,7 @@ void Index::prune_all_neighbors(const uint32_t max_degree, cons if (_nd > 0) { diskann::cout << "Index built with degree: max:" << max - << " avg:" << (float)total / (float)(_nd + _num_frozen_pts) << " min:" << min + << " avg:" << (float)total / (float)(_nd) << " min:" << min << " count(deg<2):" << cnt << std::endl; } } @@ -1514,17 +1494,15 @@ void Index::set_start_points(const T *data, size_t data_count) if (_nd > 0) throw ANNException("Can not set starting point for a non-empty index", -1, __FUNCSIG__, __FILE__, __LINE__); - if (data_count != _num_frozen_pts * _dim) + if (data_count != _dim) throw ANNException("Invalid number of points", -1, __FUNCSIG__, __FILE__, __LINE__); // memcpy(_data + _aligned_dim * _max_points, data, _aligned_dim * // sizeof(T) * _num_frozen_pts); - for (location_t i = 0; i < _num_frozen_pts; i++) - { - _data_store->set_vector((location_t)(i + _max_points), data + i * _dim); - } + _data_store->set_vector((location_t)(_max_points), data); + _has_built = true; - diskann::cout << "Index start points set: #" << _num_frozen_pts << std::endl; + diskann::cout << "Index start points set in location: #" << _max_points << std::endl; } template @@ -1553,24 +1531,21 @@ void Index::set_start_points_at_random(T radius, uint32_t rando std::normal_distribution<> d{0.0, 1.0}; std::vector points_data; - points_data.reserve(_dim * _num_frozen_pts); + points_data.reserve(_dim); std::vector real_vec(_dim); - for (size_t frozen_point = 0; frozen_point < _num_frozen_pts; frozen_point++) + double norm_sq = 0.0; + for (size_t i = 0; i < _dim; ++i) { - double norm_sq = 0.0; - for (size_t i = 0; i < _dim; ++i) - { - auto r = d(gen); - real_vec[i] = r; - norm_sq += r * r; - } - - const double norm = std::sqrt(norm_sq); - for (auto iter : real_vec) - points_data.push_back(static_cast(iter * radius / norm)); + auto r = d(gen); + real_vec[i] = r; + norm_sq += r * r; } + const double norm = std::sqrt(norm_sq); + for (auto iter : real_vec) + points_data.push_back(static_cast(iter * radius / norm)); + set_start_points(points_data.data(), points_data.size()); } @@ -1611,7 +1586,7 @@ void Index::build_with_data_populated(const std::vector & _data_store->get_aligned_dim()); } - generate_frozen_point(); +// generate_frozen_point(); link(); size_t max = 0, min = SIZE_MAX, total = 0, cnt = 0; @@ -1624,7 +1599,7 @@ void Index::build_with_data_populated(const std::vector & if (pool.size() < 2) cnt++; } - diskann::cout << "Index built with degree: max:" << max << " avg:" << (float)total / (float)(_nd + _num_frozen_pts) + diskann::cout << "Index built with degree: max:" << max << " avg:" << (float)total / (float)(_nd) << " min:" << min << " count(deg<2):" << cnt << std::endl; _has_built = true; @@ -2495,39 +2470,39 @@ template size_t Index void Index::generate_frozen_point() -{ - if (_num_frozen_pts == 0) - return; - - if (_num_frozen_pts > 1) - { - throw ANNException("More than one frozen point not supported in generate_frozen_point", -1, __FUNCSIG__, - __FILE__, __LINE__); - } - - if (_nd == 0) - { - throw ANNException("ERROR: Can not pick a frozen point since nd=0", -1, __FUNCSIG__, __FILE__, __LINE__); - } - size_t res = calculate_entry_point(); - - // REFACTOR PQ: Not sure if we should do this for both stores. - if (_pq_dist) - { - // copy the PQ data corresponding to the point returned by - // calculate_entry_point - // memcpy(_pq_data + _max_points * _num_pq_chunks, - // _pq_data + res * _num_pq_chunks, - // _num_pq_chunks * DIV_ROUND_UP(NUM_PQ_BITS, 8)); - _pq_data_store->copy_vectors((location_t)res, (location_t)_max_points, 1); - } - else - { - _data_store->copy_vectors((location_t)res, (location_t)_max_points, 1); - } - _frozen_pts_used++; -} +//template void Index::generate_frozen_point() +//{ +// if (_num_frozen_pts == 0) +// return; +// +// if (_num_frozen_pts > 1) +// { +// throw ANNException("More than one frozen point not supported in generate_frozen_point", -1, __FUNCSIG__, +// __FILE__, __LINE__); +// } +// +// if (_nd == 0) +// { +// throw ANNException("ERROR: Can not pick a frozen point since nd=0", -1, __FUNCSIG__, __FILE__, __LINE__); +// } +// size_t res = calculate_entry_point(); +// +// // REFACTOR PQ: Not sure if we should do this for both stores. +// if (_pq_dist) +// { +// // copy the PQ data corresponding to the point returned by +// // calculate_entry_point +// // memcpy(_pq_data + _max_points * _num_pq_chunks, +// // _pq_data + res * _num_pq_chunks, +// // _num_pq_chunks * DIV_ROUND_UP(NUM_PQ_BITS, 8)); +// _pq_data_store->copy_vectors((location_t)res, (location_t)_max_points, 1); +// } +// else +// { +// _data_store->copy_vectors((location_t)res, (location_t)_max_points, 1); +// } +// _frozen_pts_used++; +//} template int Index::enable_delete() { @@ -2703,7 +2678,7 @@ consolidation_report Index::consolidate_deletes(const IndexWrit num_calls_to_process_delete += 1; } } - for (int64_t loc = _max_points; loc < (int64_t)(_max_points + _num_frozen_pts); loc++) + for (int64_t loc = _max_points; loc < (int64_t)(_max_points); loc++) { ScratchStoreManager> manager(_query_scratch); auto scratch = manager.scratch_space(); @@ -2732,25 +2707,25 @@ consolidation_report Index::consolidate_deletes(const IndexWrit duration); } -template void Index::compact_frozen_point() -{ - if (_nd < _max_points && _num_frozen_pts > 0) - { - reposition_points((uint32_t)_max_points, (uint32_t)_nd, (uint32_t)_num_frozen_pts); - _start = (uint32_t)_nd; - - if (_filtered_index && _dynamic_index) - { - // update medoid id's as frozen points are treated as medoid - for (auto &[label, medoid_id] : _label_to_start_id) - { - /* if (label == _universal_label) - continue;*/ - _label_to_start_id[label] = (uint32_t)_nd + (medoid_id - (uint32_t)_max_points); - } - } - } -} +//template void Index::compact_frozen_point() +//{ +// if (_nd < _max_points && _num_frozen_pts > 0) +// { +// reposition_points((uint32_t)_max_points, (uint32_t)_nd, (uint32_t)_num_frozen_pts); +// _start = (uint32_t)_nd; +// +// if (_filtered_index && _dynamic_index) +// { +// // update medoid id's as frozen points are treated as medoid +// for (auto &[label, medoid_id] : _label_to_start_id) +// { +// /* if (label == _universal_label) +// continue;*/ +// _label_to_start_id[label] = (uint32_t)_nd + (medoid_id - (uint32_t)_max_points); +// } +// } +// } +//} // Should be called after acquiring _update_lock template void Index::compact_data() @@ -2774,7 +2749,7 @@ template void Index new_location = std::vector(_max_points + _num_frozen_pts, UINT32_MAX); + std::vector new_location = std::vector(_max_points, UINT32_MAX); uint32_t new_counter = 0; std::set empty_locations; @@ -2790,10 +2765,6 @@ template void Index void Index new_adj_list; - if ((new_location[old] < _max_points) // If point continues to exist - || (old >= _max_points && old < _max_points + _num_frozen_pts)) + if (new_location[old] < _max_points) // If point continues to exist { auto neighbour_list = _graph_store->get_neighbours((location_t)old); new_adj_list.reserve(neighbour_list.size()); @@ -2959,7 +2929,7 @@ void Index::reposition_points(uint32_t old_location_start, uint const uint32_t location_delta = new_location_start - old_location_start; std::vector updated_neighbours_location; - for (uint32_t i = 0; i < _max_points + _num_frozen_pts; i++) + for (uint32_t i = 0; i < _max_points; i++) { auto i_neighbours = _graph_store->get_neighbours((location_t)i); std::vector i_neighbours_copy; @@ -3027,35 +2997,35 @@ void Index::reposition_points(uint32_t old_location_start, uint _data_store->move_vectors(old_location_start, new_location_start, num_locations); } -template void Index::reposition_frozen_point_to_end() -{ - if (_num_frozen_pts == 0) - return; - - if (_nd == _max_points) - { - diskann::cout << "Not repositioning frozen point as it is already at the end." << std::endl; - return; - } - - reposition_points((uint32_t)_nd, (uint32_t)_max_points, (uint32_t)_num_frozen_pts); - _start = (uint32_t)_max_points; - - // update medoid id's as frozen points are treated as medoid - if (_filtered_index && _dynamic_index) - { - for (auto &[label, medoid_id] : _label_to_start_id) - { - /*if (label == _universal_label) - continue;*/ - _label_to_start_id[label] = (uint32_t)_max_points + (medoid_id - (uint32_t)_nd); - } - } -} +//template void Index::reposition_frozen_point_to_end() +//{ +// if (_num_frozen_pts == 0) +// return; +// +// if (_nd == _max_points) +// { +// diskann::cout << "Not repositioning frozen point as it is already at the end." << std::endl; +// return; +// } +// +// reposition_points((uint32_t)_nd, (uint32_t)_max_points, (uint32_t)_num_frozen_pts); +// _start = (uint32_t)_max_points; +// +// // update medoid id's as frozen points are treated as medoid +// if (_filtered_index && _dynamic_index) +// { +// for (auto &[label, medoid_id] : _label_to_start_id) +// { +// /*if (label == _universal_label) +// continue;*/ +// _label_to_start_id[label] = (uint32_t)_max_points + (medoid_id - (uint32_t)_nd); +// } +// } +//} template void Index::resize(size_t new_max_points) { - const size_t new_internal_points = new_max_points + _num_frozen_pts; + const size_t new_internal_points = new_max_points ; auto start = std::chrono::high_resolution_clock::now(); assert(_empty_slots.size() == 0); // should not resize if there are empty slots. @@ -3063,12 +3033,6 @@ template void Indexresize_graph(new_internal_points); _locks = std::vector(new_internal_points); - if (_num_frozen_pts != 0) - { - reposition_points((uint32_t)_max_points, (uint32_t)new_max_points, (uint32_t)_num_frozen_pts); - _start = (uint32_t)new_max_points; - } - _max_points = new_max_points; _empty_slots.reserve(_max_points); for (auto i = _nd; i < _max_points; i++) @@ -3151,28 +3115,16 @@ int Index::insert_point(const T *point, const TagT tag, const s return -1; } - _location_to_labels[location] = labels; - + // don't support new label for (LabelT label : labels) { if (_labels.find(label) == _labels.end()) { - if (_frozen_pts_used >= _num_frozen_pts) - { - throw ANNException( - "Error: For dynamic filtered index, the number of frozen points should be atleast equal " - "to number of unique labels.", - -1); - } - - auto fz_location = (int)(_max_points) + _frozen_pts_used; // as first _fz_point - _labels.insert(label); - _label_to_start_id[label] = (uint32_t)fz_location; - _location_to_labels[fz_location] = {label}; - _data_store->set_vector((location_t)fz_location, point); - _frozen_pts_used++; + return -1; } } + + _location_to_labels[location] = labels; } if (location == -1) @@ -3409,7 +3361,7 @@ template void Index ul(_update_lock); - boost::dynamic_bitset<> visited(_max_points + _num_frozen_pts); + boost::dynamic_bitset<> visited(_max_points); size_t MAX_BFS_LEVELS = 32; auto bfs_sets = new tsl::robin_set[MAX_BFS_LEVELS]; @@ -3417,7 +3369,7 @@ template void Index Date: Wed, 27 Nov 2024 16:23:30 +0800 Subject: [PATCH 3/9] Fix some issue --- include/index.h | 2 + src/index.cpp | 98 +++++++++++++++++++++++++++++++++++-------------- 2 files changed, 72 insertions(+), 28 deletions(-) diff --git a/include/index.h b/include/index.h index 3ddae5297..46a771e67 100644 --- a/include/index.h +++ b/include/index.h @@ -413,6 +413,8 @@ template clas size_t release_location(int location); size_t release_locations(const tsl::robin_set &locations); + bool is_frozen_point(uint32_t location) const; + // Resize the index when no slots are left for insertion. // Acquire exclusive _update_lock and _tag_lock before calling. void resize(size_t new_max_points); diff --git a/src/index.cpp b/src/index.cpp index 5227e980a..0ff4fc332 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1914,7 +1914,14 @@ void Index::parse_label_file(const std::string &label_file, siz { line_cnt++; } - _location_to_labels.resize(line_cnt, std::vector()); + if (_dynamic_index) + { + _location_to_labels.resize(_max_points, std::vector()); + } + else + { + _location_to_labels.resize(line_cnt, std::vector()); + } infile.clear(); infile.seekg(0, std::ios::beg); @@ -1994,7 +2001,14 @@ void Index::parse_label_file_in_bitset(const std::string& label } _bitmask_buf._bitmask_size = simple_bitmask::get_bitmask_size(num_labels); - _bitmask_buf._buf.resize(line_cnt * _bitmask_buf._bitmask_size, 0); + if (_dynamic_index) + { + _bitmask_buf._buf.resize(_max_points * _bitmask_buf._bitmask_size, 0); + } + else + { + _bitmask_buf._buf.resize(line_cnt * _bitmask_buf._bitmask_size, 0); + } infile.clear(); infile.seekg(0, std::ios::beg); @@ -2621,13 +2635,13 @@ consolidation_report Index::consolidate_deletes(const IndexWrit throw ANNException(err, -1, __FUNCSIG__, __FILE__, __LINE__); } - if (_location_to_tag.size() + _delete_set->size() != _nd) - { - diskann::cerr << "Error: _location_to_tag.size (" << _location_to_tag.size() << ") + _delete_set->size (" - << _delete_set->size() << ") != _nd(" << _nd << ") "; - return consolidation_report(diskann::consolidation_report::status_code::INCONSISTENT_COUNT_ERROR, 0, 0, 0, - 0, 0, 0, 0); - } + //if (_location_to_tag.size() + _delete_set->size() != _nd) + //{ + // diskann::cerr << "Error: _location_to_tag.size (" << _location_to_tag.size() << ") + _delete_set->size (" + // << _delete_set->size() << ") != _nd(" << _nd << ") "; + // return consolidation_report(diskann::consolidation_report::status_code::INCONSISTENT_COUNT_ERROR, 0, 0, 0, + // 0, 0, 0, 0); + //} if (_location_to_tag.size() != _tag_to_location.size()) { @@ -2678,13 +2692,6 @@ consolidation_report Index::consolidate_deletes(const IndexWrit num_calls_to_process_delete += 1; } } - for (int64_t loc = _max_points; loc < (int64_t)(_max_points); loc++) - { - ScratchStoreManager> manager(_query_scratch); - auto scratch = manager.scratch_space(); - process_delete(*old_delete_set, loc, range, maxc, alpha, scratch); - num_calls_to_process_delete += 1; - } std::unique_lock tl(_tag_lock); size_t ret_nd = release_locations(*old_delete_set); @@ -2914,6 +2921,24 @@ size_t Index::release_locations(const tsl::robin_set return _nd; } +template +bool Index::is_frozen_point(uint32_t location) const +{ + if (_filtered_index) + { + for (const auto kv : _label_to_start_id) + { + if (kv.second == location) + { + return true; + } + } + return false; + } + + return _start == location; +} + template void Index::reposition_points(uint32_t old_location_start, uint32_t new_location_start, uint32_t num_locations) @@ -3099,19 +3124,13 @@ int Index::insert_point(const T *point, const TagT tag, const s -1, __FUNCSIG__, __FILE__, __LINE__); } - std::shared_lock shared_ul(_update_lock); - std::unique_lock tl(_tag_lock); - std::unique_lock dl(_delete_lock); - - auto location = reserve_location(); if (_filtered_index) { if (labels.empty()) { - release_location(location); std::cerr << "Error: Can't insert point with tag " + get_tag_string(tag) + - " . there are no labels for the point." - << std::endl; + " . there are no labels for the point." + << std::endl; return -1; } @@ -3123,10 +3142,14 @@ int Index::insert_point(const T *point, const TagT tag, const s return -1; } } - - _location_to_labels[location] = labels; } + std::shared_lock shared_ul(_update_lock); + std::unique_lock tl(_tag_lock); + std::unique_lock dl(_delete_lock); + + auto location = reserve_location(); + if (location == -1) { #if EXPAND_IF_FULL @@ -3167,6 +3190,18 @@ int Index::insert_point(const T *point, const TagT tag, const s } // cant insert as active pts >= max_pts dl.unlock(); + if (_filtered_index) + { + // _location_to_labels[location] = labels; + auto bitsets = _bitmask_buf.get_bitmask(location); + memset(bitsets, 0, _bitmask_buf._bitmask_size); + simple_bitmask bm(bitsets, _bitmask_buf._bitmask_size); + for (LabelT label : labels) + { + bm.set(label); + } + } + // Insert tag and mapping to location if (_enable_tags) { @@ -3271,7 +3306,11 @@ template int Index assert(_tag_to_location[tag] < _max_points); const auto location = _tag_to_location[tag]; - _delete_set->insert(location); + if (!is_frozen_point(location)) + { + _delete_set->insert(location); + } + _location_to_tag.erase(location); _tag_to_location.erase(tag); return 0; @@ -3298,7 +3337,10 @@ void Index::lazy_delete(const std::vector &tags, std::vec else { const auto location = _tag_to_location[tag]; - _delete_set->insert(location); + if (!is_frozen_point(location)) + { + _delete_set->insert(location); + } _location_to_tag.erase(location); _tag_to_location.erase(tag); } From 541322781b48fd898dbbed962c5b95de7e8d5b2d Mon Sep 17 00:00:00 2001 From: Sanhaoji2 Date: Wed, 27 Nov 2024 17:17:43 +0800 Subject: [PATCH 4/9] Fix issue --- src/index.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/index.cpp b/src/index.cpp index 0ff4fc332..1beb23b08 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -139,18 +139,16 @@ Index::Index(Metric m, const size_t dim, const size_t max_point .with_data_type(diskann_type_to_name()) .build(), IndexFactory::construct_datastore(DataStoreStrategy::MEMORY, - (max_points == 0 ? (size_t)1 : max_points) + - (dynamic_index && num_frozen_pts == 0 ? (size_t)1 : num_frozen_pts), + (max_points == 0 ? (size_t)1 : max_points), dim, m), IndexFactory::construct_graphstore(GraphStoreStrategy::MEMORY, - (max_points == 0 ? (size_t)1 : max_points) + - (dynamic_index && num_frozen_pts == 0 ? (size_t)1 : num_frozen_pts), + (max_points == 0 ? (size_t)1 : max_points), (size_t)((index_parameters == nullptr ? 0 : index_parameters->max_degree) * defaults::GRAPH_SLACK_FACTOR * 1.05))) { if (_pq_dist) { - _pq_data_store = IndexFactory::construct_pq_datastore(DataStoreStrategy::MEMORY, max_points + num_frozen_pts, + _pq_data_store = IndexFactory::construct_pq_datastore(DataStoreStrategy::MEMORY, max_points, dim, m, num_pq_chunks, use_opq); } else From 1ebc6c9db044439afb8c5b2f2fe974a55e832d1b Mon Sep 17 00:00:00 2001 From: Sanhaoji2 Date: Wed, 27 Nov 2024 17:45:11 +0800 Subject: [PATCH 5/9] Fix issue --- src/index_factory.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/index_factory.cpp b/src/index_factory.cpp index 5c7dbee6b..18cddae9d 100644 --- a/src/index_factory.cpp +++ b/src/index_factory.cpp @@ -122,7 +122,7 @@ std::shared_ptr> IndexFactory::construct_pq_datastore(DataStoreSt template std::unique_ptr IndexFactory::create_instance() { - size_t num_points = _config->max_points + _config->num_frozen_pts; + size_t num_points = _config->max_points; size_t dim = _config->dimension; // auto graph_store = construct_graphstore(_config->graph_strategy, num_points); auto data_store = construct_datastore(_config->data_strategy, num_points, dim, _config->metric); From f95a52661cde8aeb830e1e9b6b0353997db4f292 Mon Sep 17 00:00:00 2001 From: Sanhaoji2 Date: Thu, 28 Nov 2024 18:21:25 +0800 Subject: [PATCH 6/9] Fix insert interface issue --- include/abstract_index.h | 6 +-- include/index.h | 2 +- src/abstract_index.cpp | 111 ++++++++++++++------------------------- src/index.cpp | 12 +++-- 4 files changed, 51 insertions(+), 80 deletions(-) diff --git a/include/abstract_index.h b/include/abstract_index.h index 7c84a8ec9..3f7dae7cf 100644 --- a/include/abstract_index.h +++ b/include/abstract_index.h @@ -80,8 +80,8 @@ class AbstractIndex float *distances); // insert points with labels, labels should be present for filtered index - template - int insert_point(const data_type *point, const tag_type tag, const std::vector &labels); + template + int insert_point(const data_type *point, const tag_type tag, const std::vector &labels); // insert point for unfiltered index build. do not use with filtered index template int insert_point(const data_type *point, const tag_type tag); @@ -116,7 +116,7 @@ class AbstractIndex virtual std::pair _search_with_filters(const DataType &query, const std::string &filter_label, const size_t K, const uint32_t L, std::any &indices, float *distances) = 0; - virtual int _insert_point(const DataType &data_point, const TagType tag, Labelvector &labels) = 0; + virtual int _insert_point(const DataType &data_point, const TagType tag, const std::vector &labels) = 0; virtual int _insert_point(const DataType &data_point, const TagType tag) = 0; virtual int _lazy_delete(const TagType &tag) = 0; virtual void _lazy_delete(TagVector &tags, TagVector &failed_tags) = 0; diff --git a/include/index.h b/include/index.h index 46a771e67..05c868e31 100644 --- a/include/index.h +++ b/include/index.h @@ -328,7 +328,7 @@ template clas float *distances) override; virtual int _insert_point(const DataType &data_point, const TagType tag) override; - virtual int _insert_point(const DataType &data_point, const TagType tag, Labelvector &labels) override; + virtual int _insert_point(const DataType &data_point, const TagType tag, const std::vector &labels) override; virtual int _lazy_delete(const TagType &tag) override; diff --git a/src/abstract_index.cpp b/src/abstract_index.cpp index 7550bda3a..6047170a9 100644 --- a/src/abstract_index.cpp +++ b/src/abstract_index.cpp @@ -57,13 +57,12 @@ int AbstractIndex::insert_point(const data_type *point, const tag_type tag) return this->_insert_point(any_point, any_tag); } -template -int AbstractIndex::insert_point(const data_type *point, const tag_type tag, const std::vector &labels) +template +int AbstractIndex::insert_point(const data_type *point, const tag_type tag, const std::vector& labels) { auto any_point = std::any(point); auto any_tag = std::any(tag); - auto any_labels = Labelvector(labels); - return this->_insert_point(any_point, any_tag, any_labels); + return this->_insert_point(any_point, any_tag, labels); } template int AbstractIndex::lazy_delete(const tag_type &tag) @@ -259,75 +258,41 @@ template DISKANN_DLLEXPORT int AbstractIndex::insert_point(c template DISKANN_DLLEXPORT int AbstractIndex::insert_point(const uint8_t* point, const tag_uint128 tag); template DISKANN_DLLEXPORT int AbstractIndex::insert_point(const int8_t* point, const tag_uint128 tag); -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const float *point, const int32_t tag, const std::vector &labels); -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const uint8_t *point, const int32_t tag, const std::vector &labels); -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const int8_t *point, const int32_t tag, const std::vector &labels); - -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const float *point, const uint32_t tag, const std::vector &labels); -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const uint8_t *point, const uint32_t tag, const std::vector &labels); -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const int8_t *point, const uint32_t tag, const std::vector &labels); - -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const float *point, const int64_t tag, const std::vector &labels); -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const uint8_t *point, const int64_t tag, const std::vector &labels); -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const int8_t *point, const int64_t tag, const std::vector &labels); - -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const float *point, const uint64_t tag, const std::vector &labels); -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const uint8_t *point, const uint64_t tag, const std::vector &labels); -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const int8_t *point, const uint64_t tag, const std::vector &labels); - -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const float* point, const tag_uint128 tag, const std::vector& labels); -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const uint8_t* point, const tag_uint128 tag, const std::vector& labels); -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const int8_t* point, const tag_uint128 tag, const std::vector& labels); - -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const float *point, const int32_t tag, const std::vector &labels); -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const uint8_t *point, const int32_t tag, const std::vector &labels); -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const int8_t *point, const int32_t tag, const std::vector &labels); - -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const float *point, const uint32_t tag, const std::vector &labels); -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const uint8_t *point, const uint32_t tag, const std::vector &labels); -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const int8_t *point, const uint32_t tag, const std::vector &labels); - -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const float *point, const int64_t tag, const std::vector &labels); -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const uint8_t *point, const int64_t tag, const std::vector &labels); -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const int8_t *point, const int64_t tag, const std::vector &labels); - -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const float *point, const uint64_t tag, const std::vector &labels); -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const uint8_t *point, const uint64_t tag, const std::vector &labels); -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const int8_t *point, const uint64_t tag, const std::vector &labels); - -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const float* point, const tag_uint128 tag, const std::vector& labels); -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const uint8_t* point, const tag_uint128 tag, const std::vector& labels); -template DISKANN_DLLEXPORT int AbstractIndex::insert_point( - const int8_t* point, const tag_uint128 tag, const std::vector& labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const float *point, const int32_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const uint8_t *point, const int32_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const int8_t *point, const int32_t tag, const std::vector &labels); + +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const float *point, const uint32_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const uint8_t *point, const uint32_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const int8_t *point, const uint32_t tag, const std::vector &labels); + +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const float *point, const int64_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const uint8_t *point, const int64_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const int8_t *point, const int64_t tag, const std::vector &labels); + +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const float *point, const uint64_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const uint8_t *point, const uint64_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const int8_t *point, const uint64_t tag, const std::vector &labels); + +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const float* point, const tag_uint128 tag, const std::vector& labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const uint8_t* point, const tag_uint128 tag, const std::vector& labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const int8_t* point, const tag_uint128 tag, const std::vector& labels); + template DISKANN_DLLEXPORT int AbstractIndex::lazy_delete(const int32_t &tag); template DISKANN_DLLEXPORT int AbstractIndex::lazy_delete(const uint32_t &tag); diff --git a/src/index.cpp b/src/index.cpp index 1beb23b08..dd732b9c8 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -3085,12 +3085,18 @@ int Index::_insert_point(const DataType &point, const TagType t } template -int Index::_insert_point(const DataType &point, const TagType tag, Labelvector &labels) +int Index::_insert_point(const DataType &point, const TagType tag, const std::vector& labels) { try { - return this->insert_point(std::any_cast(point), std::any_cast(tag), - labels.get>()); + std::vector converted_labels; + converted_labels.reserve(labels.size()); + for (const auto& label : labels) + { + auto converted_label = this->get_converted_label(label); + converted_labels.push_back(converted_label); + } + return this->insert_point(std::any_cast(point), std::any_cast(tag), converted_labels); } catch (const std::bad_any_cast &anycast_e) { From 764be36ec82b14de5b757ecf56988b07e6cf4b2f Mon Sep 17 00:00:00 2001 From: Sanhaoji2 Date: Mon, 2 Dec 2024 13:25:05 +0800 Subject: [PATCH 7/9] Fix insert with filtered label --- include/index.h | 7 ++++ src/index.cpp | 107 +++++++++++++++++++++++++++++------------------- 2 files changed, 73 insertions(+), 41 deletions(-) diff --git a/include/index.h b/include/index.h index 05c868e31..a5a2b35cc 100644 --- a/include/index.h +++ b/include/index.h @@ -384,6 +384,13 @@ template clas InMemQueryScratch *scratch, bool use_filter = false, uint32_t filteredLindex = 0); + void search_for_point_and_prune(int location, uint32_t Lindex, std::vector& pruned_list, + const std::vector& labels, + InMemQueryScratch* scratch, + uint32_t filteredLindex); + + void prune_search_result(int location, std::vector& pruned_list, InMemQueryScratch* scratch); + void prune_neighbors(const uint32_t location, std::vector &pool, std::vector &pruned_list, InMemQueryScratch *scratch); diff --git a/src/index.cpp b/src/index.cpp index dd732b9c8..d01017add 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1034,53 +1034,79 @@ void Index::search_for_point_and_prune(int location, uint32_t L { _data_store->get_vector(location, scratch->aligned_query()); iterate_to_fixed_point(scratch, Lindex, init_ids, false, unused_filter_label, false); + prune_search_result(location, pruned_list, scratch); } else { + std::vector labels; std::shared_lock tl(_tag_lock, std::defer_lock); if (_dynamic_index) tl.lock(); - std::vector filter_specific_start_nodes; - for (auto &x : _location_to_labels[location]) - filter_specific_start_nodes.emplace_back(_label_to_start_id[x]); + + labels = _location_to_labels[location]; if (_dynamic_index) tl.unlock(); - _data_store->get_vector(location, scratch->aligned_query()); - iterate_to_fixed_point(scratch, filteredLindex, filter_specific_start_nodes, true, - _location_to_labels[location], false); + search_for_point_and_prune(location, Lindex, pruned_list, labels, scratch, filteredLindex); + } + + assert(_graph_store->get_total_points() == _max_points); +} + +template +void Index::search_for_point_and_prune( + int location, uint32_t Lindex, + std::vector& pruned_list, + const std::vector& labels, + InMemQueryScratch* scratch, + uint32_t filteredLindex) +{ + std::vector filter_specific_start_nodes; + for (auto& x : labels) + filter_specific_start_nodes.emplace_back(_label_to_start_id[x]); + + _data_store->get_vector(location, scratch->aligned_query()); + iterate_to_fixed_point(scratch, filteredLindex, filter_specific_start_nodes, true, + labels, false); - if (Lindex > 0) + if (Lindex > 0) + { + // combine candidate pools obtained with filter and unfiltered criteria. + const std::vector init_ids = get_init_ids(); + const std::vector unused_filter_label; + std::set best_candidate_pool; + for (auto filtered_neighbor : scratch->pool()) { - // combine candidate pools obtained with filter and unfiltered criteria. - std::set best_candidate_pool; - for (auto filtered_neighbor : scratch->pool()) - { - best_candidate_pool.insert(filtered_neighbor); - } + best_candidate_pool.insert(filtered_neighbor); + } - // clear scratch for finding unfiltered candidates - scratch->clear(); + // clear scratch for finding unfiltered candidates + scratch->clear(); - _data_store->get_vector(location, scratch->aligned_query()); - iterate_to_fixed_point(scratch, Lindex, init_ids, false, unused_filter_label, false); + _data_store->get_vector(location, scratch->aligned_query()); + iterate_to_fixed_point(scratch, Lindex, init_ids, false, unused_filter_label, false); - for (auto unfiltered_neighbour : scratch->pool()) + for (auto unfiltered_neighbour : scratch->pool()) + { + // insert if this neighbour is not already in best_candidate_pool + if (best_candidate_pool.find(unfiltered_neighbour) == best_candidate_pool.end()) { - // insert if this neighbour is not already in best_candidate_pool - if (best_candidate_pool.find(unfiltered_neighbour) == best_candidate_pool.end()) - { - best_candidate_pool.insert(unfiltered_neighbour); - } + best_candidate_pool.insert(unfiltered_neighbour); } - - scratch->pool().clear(); - std::copy(best_candidate_pool.begin(), best_candidate_pool.end(), std::back_inserter(scratch->pool())); } + + scratch->pool().clear(); + std::copy(best_candidate_pool.begin(), best_candidate_pool.end(), std::back_inserter(scratch->pool())); } - auto &pool = scratch->pool(); + prune_search_result(location, pruned_list, scratch); +} + +template +void Index::prune_search_result(int location, std::vector& pruned_list, InMemQueryScratch* scratch) +{ + auto& pool = scratch->pool(); for (uint32_t i = 0; i < pool.size(); i++) { @@ -1099,7 +1125,6 @@ void Index::search_for_point_and_prune(int location, uint32_t L prune_neighbors(location, pool, pruned_list, scratch); assert(!pruned_list.empty()); - assert(_graph_store->get_total_points() == _max_points); } template @@ -3194,18 +3219,6 @@ int Index::insert_point(const T *point, const TagT tag, const s } // cant insert as active pts >= max_pts dl.unlock(); - if (_filtered_index) - { - // _location_to_labels[location] = labels; - auto bitsets = _bitmask_buf.get_bitmask(location); - memset(bitsets, 0, _bitmask_buf._bitmask_size); - simple_bitmask bm(bitsets, _bitmask_buf._bitmask_size); - for (LabelT label : labels) - { - bm.set(label); - } - } - // Insert tag and mapping to location if (_enable_tags) { @@ -3230,7 +3243,7 @@ int Index::insert_point(const T *point, const TagT tag, const s if (_filtered_index) { // when filtered the best_candidates will share the same label ( label_present > distance) - search_for_point_and_prune(location, _indexingQueueSize, pruned_list, scratch, true, _filterIndexingQueueSize); + search_for_point_and_prune(location, _indexingQueueSize, pruned_list, labels, scratch, _filterIndexingQueueSize); } else { @@ -3257,6 +3270,18 @@ int Index::insert_point(const T *point, const TagT tag, const s _graph_store->set_neighbours(location, neighbor_links); assert(_graph_store->get_neighbours(location).size() <= _indexingRange); + if (_filtered_index) + { + // _location_to_labels[location] = labels; + auto bitsets = _bitmask_buf.get_bitmask(location); + memset(bitsets, 0, _bitmask_buf._bitmask_size); + simple_bitmask bm(bitsets, _bitmask_buf._bitmask_size); + for (LabelT label : labels) + { + bm.set(label); + } + } + if (_conc_consolidate) tlock.unlock(); } From f78f289382241d01bdc9cde427c09e6788b96cf6 Mon Sep 17 00:00:00 2001 From: Sanhaoji2 Date: Mon, 2 Dec 2024 19:20:46 +0800 Subject: [PATCH 8/9] Fix entry point missing --- src/index.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/index.cpp b/src/index.cpp index d01017add..dd53fed80 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2439,8 +2439,8 @@ size_t Index::search_with_tags(const T *query, const uint64_t K std::shared_lock ul(_update_lock); - const std::vector init_ids = get_init_ids(); - + std::vector init_ids = get_init_ids(); + //_distance->preprocess_query(query, _data_store->get_dims(), // scratch->aligned_query()); _data_store->preprocess_query(query, scratch); @@ -2453,6 +2453,18 @@ size_t Index::search_with_tags(const T *query, const uint64_t K { std::vector filter_vec; auto converted_label = this->get_converted_label(filter_label); + + if (_label_to_start_id.find(converted_label) != _label_to_start_id.end()) + { + init_ids.emplace_back(_label_to_start_id[converted_label]); + } + else + { + diskann::cout << "No filtered medoid found. exitting " + << std::endl; // RKNOTE: If universal label found start there + throw diskann::ANNException("No filtered medoid found. exitting ", -1); + } + filter_vec.push_back(converted_label); iterate_to_fixed_point(scratch, L, init_ids, true, filter_vec, true); } From 6d18a73d2a5b4a89787d12937979c3e180145975 Mon Sep 17 00:00:00 2001 From: Sanhaoji2 Date: Tue, 3 Dec 2024 15:32:42 +0800 Subject: [PATCH 9/9] Fix frozen point check --- src/index.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/index.cpp b/src/index.cpp index dd53fed80..12f2e6792 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2968,7 +2968,6 @@ bool Index::is_frozen_point(uint32_t location) const return true; } } - return false; } return _start == location;