From 5328f9748cea0da35e0574e0b5b565a33d0acb0f Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Wed, 28 Aug 2024 08:26:09 -0400 Subject: [PATCH] fix trap with knnPerfTest.py that quietly overrode distance metric specification; added a WTF comment --- src/main/knn/KnnGraphTester.java | 5 ++++- src/python/knnPerfTest.py | 15 ++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/main/knn/KnnGraphTester.java b/src/main/knn/KnnGraphTester.java index 060c4074e..c19775250 100644 --- a/src/main/knn/KnnGraphTester.java +++ b/src/main/knn/KnnGraphTester.java @@ -281,7 +281,10 @@ private void run(String... args) throws Exception { case "euclidean": similarityFunction = VectorSimilarityFunction.EUCLIDEAN; break; - case "angular": + case "angular": // TODO: why is angular a synonym for DOT_PRODUCT? this only holds true if vectors are normalized to unit + // sphere? but also, low values for angular mean the vectors are similar, but high values of dot_product mean + // the vectors are similar + case "dot_product": similarityFunction = VectorSimilarityFunction.DOT_PRODUCT; break; default: diff --git a/src/python/knnPerfTest.py b/src/python/knnPerfTest.py index 9b98be9b2..847e98301 100644 --- a/src/python/knnPerfTest.py +++ b/src/python/knnPerfTest.py @@ -53,6 +53,7 @@ 'numMergeWorker': (12,), 'numMergeThread': (4,), 'encoding': ('float32',), + # 'metric': ('angular',), # default is angular (dot_product) #'quantize': (True,), #'fanout': (0,), #'topK': (10,), @@ -78,9 +79,9 @@ def run_knn_benchmark(checkout, values): #dim = 100 #doc_vectors = constants.GLOVE_VECTOR_DOCS_FILE #query_vectors = '%s/luceneutil/tasks/vector-task-100d.vec' % constants.BASE_DIR - dim = 768 - doc_vectors = '/lucenedata/enwiki/enwiki-20120502-lines-1k-mpnet.vec' - query_vectors = '/lucenedata/enwiki/enwiki-20120502.mpnet.vec' + #dim = 768 + #doc_vectors = '/lucenedata/enwiki/enwiki-20120502-lines-1k-mpnet.vec' + #query_vectors = '/lucenedata/enwiki/enwiki-20120502.mpnet.vec' #dim = 384 #doc_vectors = '%s/data/enwiki-20120502-lines-1k-minilm.vec' % constants.BASE_DIR #query_vectors = '%s/luceneutil/tasks/vector-task-minilm.vec' % constants.BASE_DIR @@ -92,9 +93,9 @@ def run_knn_benchmark(checkout, values): #query_vectors = '/d/electronics_query_vectors.bin' # Cohere dataset - #dim = 768 - #doc_vectors = '%s/data/cohere-wikipedia-768.vec' % constants.BASE_DIR - #query_vectors = '%s/data/cohere-wikipedia-queries-768.vec' % constants.BASE_DIR + dim = 768 + doc_vectors = '%s/data/cohere-wikipedia-768.vec' % constants.BASE_DIR + query_vectors = '%s/data/cohere-wikipedia-queries-768.vec' % constants.BASE_DIR cp = benchUtil.classPathToString(benchUtil.getClassPath(checkout)) cmd = constants.JAVA_EXE.split(' ') + ['-cp', cp, '--add-modules', 'jdk.incubator.vector', @@ -128,7 +129,7 @@ def run_knn_benchmark(checkout, values): '-docs', doc_vectors, '-reindex', '-search', query_vectors, - '-metric', 'euclidean', + #'-metric', 'euclidean', # '-numMergeThread', '8', '-numMergeWorker', '8', # '-forceMerge', '-quiet']