From 1a462da1cca13918e74b134b8679b22f14fe1e69 Mon Sep 17 00:00:00 2001
From: Shawn Wang <shawn.wang@zilliz.com>
Date: Wed, 8 Jan 2025 23:04:53 +0800
Subject: [PATCH] sparse: add max score ratio downscaling for approximate
 searching

Signed-off-by: Shawn Wang <shawn.wang@zilliz.com>
---
 .../sparse/sparse_inverted_index_config.h     | 33 ++++++++++++-------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/src/index/sparse/sparse_inverted_index_config.h b/src/index/sparse/sparse_inverted_index_config.h
index 7c56494eb..4763eb162 100644
--- a/src/index/sparse/sparse_inverted_index_config.h
+++ b/src/index/sparse/sparse_inverted_index_config.h
@@ -43,22 +43,31 @@ class SparseInvertedIndexConfig : public BaseConfig {
             .for_search()
             .for_range_search();
         /**
-         * The term frequency part of score of BM25 is:
-         * tf * (k1 + 1) / (tf + k1 * (1 - b + b * (doc_len / avgdl)))
-         * as more documents being added to the collection, avgdl can also
-         * change. In WAND index we precompute and cache this score in order to
-         * speed up the search process, but if avgdl changes, we need to
-         * re-compute such score which is expensive. To avoid this, we upscale
-         * the max score by a ratio to compensate for avgdl changes. This will
-         * make the max score larger than the actual max score, it makes the
-         * filtering less aggressive, but guarantees the correctness.
-         * The larger the ratio, the less aggressive the filtering is.
+         * wand_bm25_max_score_ratio serves two purposes:
+         * 1. to upscale the max score to compensate for avgdl changes
+         *    The term frequency part of score of BM25 is:
+         *    tf * (k1 + 1) / (tf + k1 * (1 - b + b * (doc_len / avgdl)))
+         *    As more documents are added to the collection, avgdl can also
+         *    change. In WAND index we precompute and cache this score in order to
+         *    speed up the search process, but if avgdl changes, we need to
+         *    re-compute such score which is expensive. To avoid this, we upscale
+         *    the max score by a ratio to compensate for avgdl changes. This
+         *    makes the max score larger than the actual max score, which makes
+         *    the filtering less aggressive but guarantees correctness.
+         * 2. to downscale the max score for approximate searching
+         *    In the searching process, we use the sum of the max scores to
+         *    filter the candidate vectors. If the sum is smaller than the
+         *    threshold, the current vector is skipped. If approximate searching
+         *    is enabled, we can make the skipping more aggressive by downscaling
+         *    the max score. Since the probability that the max scores of all
+         *    dims in the query appear on the same vector is relatively small,
+         *    moderate downscaling won't lead to a sharp change in recall.
          */
         KNOWHERE_CONFIG_DECLARE_FIELD(wand_bm25_max_score_ratio)
-            .set_range(1.0, 1.3)
+            .set_range(0.5, 1.3)
             .set_default(1.05)
             .description("ratio to upscale max score to compensate for avgdl changes")
-            .for_train()
+            .for_train_and_search()
             .for_deserialize()
             .for_deserialize_from_file();
     }