From 1a462da1cca13918e74b134b8679b22f14fe1e69 Mon Sep 17 00:00:00 2001 From: Shawn Wang Date: Wed, 8 Jan 2025 23:04:53 +0800 Subject: [PATCH] sparse: add max score ratio downscaling for approximate searching Signed-off-by: Shawn Wang --- .../sparse/sparse_inverted_index_config.h | 33 ++++++++++++------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/src/index/sparse/sparse_inverted_index_config.h b/src/index/sparse/sparse_inverted_index_config.h index 7c56494eb..4763eb162 100644 --- a/src/index/sparse/sparse_inverted_index_config.h +++ b/src/index/sparse/sparse_inverted_index_config.h @@ -43,22 +43,31 @@ class SparseInvertedIndexConfig : public BaseConfig { .for_search() .for_range_search(); /** - * The term frequency part of score of BM25 is: - * tf * (k1 + 1) / (tf + k1 * (1 - b + b * (doc_len / avgdl))) - * as more documents being added to the collection, avgdl can also - * change. In WAND index we precompute and cache this score in order to - * speed up the search process, but if avgdl changes, we need to - * re-compute such score which is expensive. To avoid this, we upscale - * the max score by a ratio to compensate for avgdl changes. This will - * make the max score larger than the actual max score, it makes the - * filtering less aggressive, but guarantees the correctness. - * The larger the ratio, the less aggressive the filtering is. + * wand_bm25_max_score_ratio serves two purposes: + * 1. to upscale the max score to compensate for avgdl changes + * The term frequency part of the BM25 score is: + * tf * (k1 + 1) / (tf + k1 * (1 - b + b * (doc_len / avgdl))) + * As more documents are added to the collection, avgdl can also + * change. In the WAND index we precompute and cache this score to + * speed up the search process, but if avgdl changes, we need to + * recompute this score, which is expensive. To avoid this, we upscale + * the max score by a ratio to compensate for avgdl changes.
This will + * make the max score larger than the actual max score, which makes the + * filtering less aggressive but guarantees correctness. + * 2. to downscale the max score for approximate searching + * In the searching process, we use the sum of the max scores to + * filter the candidate vectors. If the sum is smaller than the + * threshold, the current vector is skipped. If approximate searching is + * enabled, we can make the skipping more aggressive by downscaling the + * max score. Since it is relatively unlikely that the max scores of all + * dims in the query occur on the same vector, this won't lead to a + * sharp change in the recall rate within a certain range. */ KNOWHERE_CONFIG_DECLARE_FIELD(wand_bm25_max_score_ratio) - .set_range(1.0, 1.3) + .set_range(0.5, 1.3) .set_default(1.05) .description("ratio to upscale max score to compensate for avgdl changes") - .for_train() + .for_train_and_search() .for_deserialize() .for_deserialize_from_file(); }