Skip to content

Commit

Permalink
fix trap with knnPerfTest.py that quietly overrode distance metric specification; added a WTF comment
Browse files Browse the repository at this point in the history
  • Loading branch information
mikemccand committed Aug 28, 2024
1 parent 632db26 commit 5328f97
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 8 deletions.
5 changes: 4 additions & 1 deletion src/main/knn/KnnGraphTester.java
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,10 @@ private void run(String... args) throws Exception {
case "euclidean":
similarityFunction = VectorSimilarityFunction.EUCLIDEAN;
break;
case "angular":
case "angular": // TODO: why is angular a synonym for DOT_PRODUCT? this only holds true if the vectors are normalized to the
// unit sphere? Also, low values for angular mean the vectors are similar, whereas high values of dot_product mean
// the vectors are similar
case "dot_product":
similarityFunction = VectorSimilarityFunction.DOT_PRODUCT;
break;
default:
Expand Down
15 changes: 8 additions & 7 deletions src/python/knnPerfTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
'numMergeWorker': (12,),
'numMergeThread': (4,),
'encoding': ('float32',),
# 'metric': ('angular',), # default is angular (dot_product)
#'quantize': (True,),
#'fanout': (0,),
#'topK': (10,),
Expand All @@ -78,9 +79,9 @@ def run_knn_benchmark(checkout, values):
#dim = 100
#doc_vectors = constants.GLOVE_VECTOR_DOCS_FILE
#query_vectors = '%s/luceneutil/tasks/vector-task-100d.vec' % constants.BASE_DIR
dim = 768
doc_vectors = '/lucenedata/enwiki/enwiki-20120502-lines-1k-mpnet.vec'
query_vectors = '/lucenedata/enwiki/enwiki-20120502.mpnet.vec'
#dim = 768
#doc_vectors = '/lucenedata/enwiki/enwiki-20120502-lines-1k-mpnet.vec'
#query_vectors = '/lucenedata/enwiki/enwiki-20120502.mpnet.vec'
#dim = 384
#doc_vectors = '%s/data/enwiki-20120502-lines-1k-minilm.vec' % constants.BASE_DIR
#query_vectors = '%s/luceneutil/tasks/vector-task-minilm.vec' % constants.BASE_DIR
Expand All @@ -92,9 +93,9 @@ def run_knn_benchmark(checkout, values):
#query_vectors = '/d/electronics_query_vectors.bin'

# Cohere dataset
#dim = 768
#doc_vectors = '%s/data/cohere-wikipedia-768.vec' % constants.BASE_DIR
#query_vectors = '%s/data/cohere-wikipedia-queries-768.vec' % constants.BASE_DIR
dim = 768
doc_vectors = '%s/data/cohere-wikipedia-768.vec' % constants.BASE_DIR
query_vectors = '%s/data/cohere-wikipedia-queries-768.vec' % constants.BASE_DIR
cp = benchUtil.classPathToString(benchUtil.getClassPath(checkout))
cmd = constants.JAVA_EXE.split(' ') + ['-cp', cp,
'--add-modules', 'jdk.incubator.vector',
Expand Down Expand Up @@ -128,7 +129,7 @@ def run_knn_benchmark(checkout, values):
'-docs', doc_vectors,
'-reindex',
'-search', query_vectors,
'-metric', 'euclidean',
#'-metric', 'euclidean',
# '-numMergeThread', '8', '-numMergeWorker', '8',
# '-forceMerge',
'-quiet']
Expand Down

0 comments on commit 5328f97

Please sign in to comment.