From e8cc6811ffa7b299c03371f4d6649b96c1533208 Mon Sep 17 00:00:00 2001 From: luyuncheng Date: Tue, 12 Nov 2024 21:12:20 +0800 Subject: [PATCH 1/4] Introduce FaissEngineFlatVector Readers read flatValues directly from faiss file Signed-off-by: luyuncheng --- .../knn/common/FieldInfoExtractor.java | 6 + .../FaissEngineFlatKnnVectorsReader.java | 190 ++++++++++++++++++ .../FaissEngineFlatVectorValues.java | 145 +++++++++++++ .../FaissEngineKnnVectorsReader.java | 51 +++++ .../NativeEngines990KnnVectorsReader.java | 9 +- .../knn/index/codec/util/KNNCodecUtil.java | 2 +- ...NativeEngineFlatKnnVectorsReaderTests.java | 160 +++++++++++++++ 7 files changed, 560 insertions(+), 3 deletions(-) create mode 100644 src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatKnnVectorsReader.java create mode 100644 src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatVectorValues.java create mode 100644 src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineKnnVectorsReader.java create mode 100644 src/test/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEngineFlatKnnVectorsReaderTests.java diff --git a/src/main/java/org/opensearch/knn/common/FieldInfoExtractor.java b/src/main/java/org/opensearch/knn/common/FieldInfoExtractor.java index 8a77b595f..12455f73b 100644 --- a/src/main/java/org/opensearch/knn/common/FieldInfoExtractor.java +++ b/src/main/java/org/opensearch/knn/common/FieldInfoExtractor.java @@ -15,6 +15,7 @@ import org.opensearch.knn.indices.ModelUtil; import static org.opensearch.knn.common.KNNConstants.MODEL_ID; +import static org.opensearch.knn.common.KNNConstants.PARAMETERS; import static org.opensearch.knn.indices.ModelUtil.getModelMetadata; import org.opensearch.knn.index.engine.qframe.QuantizationConfig; import org.opensearch.knn.index.engine.qframe.QuantizationConfigParser; @@ -103,4 +104,9 @@ public static SpaceType getSpaceType(final ModelDao modelDao, final FieldInfo fi } return modelMetadata.getSpaceType(); } + + public static String getParameters(final FieldInfo fieldInfo) { + final String parameters = fieldInfo.getAttribute(PARAMETERS); + return parameters; + } } diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatKnnVectorsReader.java b/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatKnnVectorsReader.java new file mode 100644 index 000000000..f78c41440 --- /dev/null +++ b/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatKnnVectorsReader.java @@ -0,0 +1,190 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.knn.index.codec.KNN990Codec; + +import lombok.AllArgsConstructor; +import lombok.Getter; +import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.RamUsageEstimator; +import org.opensearch.knn.common.FieldInfoExtractor; +import org.opensearch.knn.index.SpaceType; +import org.opensearch.knn.index.VectorDataType; +import org.opensearch.knn.index.codec.util.KNNCodecUtil; +import org.opensearch.knn.index.engine.KNNEngine; + +import java.io.IOException; +import java.nio.file.Files; +import java.util.HashMap; +import java.util.Map; + +import static org.opensearch.knn.index.engine.KNNEngine.FAISS; + +/** + * There is 3 Index in one faiss file <-id-><-hnsw-><-Storage-> + * File Structure like followings: + * |-typeIDMap-||-id_header-| + * |-typeHnsw-||-hnsw_header-||-hnswGraph-| + * |-typeStorage-||-storage_Header-||-storageVector-| + * |-idmap_vector-| + * + * header would like: + * |dim|ntotal|dummy|dummy|is_trained|metric_type|metric_arg| + * + * Example for HNSW32,Flat: + * |idMapType|idMapHeader|hnswType|hnswHeader|hnswGraph|flatType|flatHeader|Vectors|IdVector|FOOTER_MAGIC+CHECKSUM| + */ +@Getter +public class FaissEngineFlatKnnVectorsReader extends FaissEngineKnnVectorsReader { + + // 1. A Footer magic number (int - 4 bytes) + // 2. A checksum algorithm id (int - 4 bytes) + // 3. A checksum (long - bytes) + // The checksum is computed on all the bytes written to the file up to that point. + // Logic where footer is written in Lucene can be found here: + // https://github.com/apache/lucene/blob/branch_9_0/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java#L390-L412 + public static final int FOOT_MAGIC_SIZE = RamUsageEstimator.primitiveSizes.get(Integer.TYPE); + public static final int ALGORITHM_SIZE = RamUsageEstimator.primitiveSizes.get(Integer.TYPE); + public static final int CHECKSUM_SIZE = RamUsageEstimator.primitiveSizes.get(Long.TYPE); + public static final int FLOAT_SIZE = RamUsageEstimator.primitiveSizes.get(Float.TYPE); + public static final int SIZET_SIZE = RamUsageEstimator.primitiveSizes.get(Long.TYPE); + public static final int FOOTER_SIZE = FOOT_MAGIC_SIZE + ALGORITHM_SIZE + CHECKSUM_SIZE; + + private Map fieldFileMap; + private Map fieldMetaMap; + @Override + public void checkIntegrity() throws IOException { + + } + + public FaissEngineFlatKnnVectorsReader(SegmentReadState state) throws IOException { + fieldFileMap = new HashMap<>(); + fieldMetaMap = new HashMap<>(); + boolean success = false; + try { + for (FieldInfo field : state.fieldInfos) { + + KNNEngine knnEngine = KNNCodecUtil.getNativeKNNEngine(field); + if (knnEngine == null || FAISS != knnEngine) { + continue; + } + final String vectorIndexFileName = KNNCodecUtil.getNativeEngineFileFromFieldInfo(field, state.segmentInfo); + if (vectorIndexFileName == null) { + continue; + } + //TODO for fp16, pq + VectorDataType vectorDataType = FieldInfoExtractor.extractVectorDataType(field); + SpaceType spaceType = FieldInfoExtractor.getSpaceType(null, field); + if (vectorDataType != VectorDataType.FLOAT) { + continue; + } + String parameter = FieldInfoExtractor.getParameters(field); + System.out.print(parameter); + if (parameter == null || parameter.contains("BHNSW")) { + continue; + } + //TODO if not exist file, change to lucene flatVector + IndexInput in = state.directory.openInput(vectorIndexFileName, state.context.withRandomAccess()); + if(in == null) { + continue; + } + fieldFileMap.put(field.getName(), in); + } + success = true; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(this); + } + } + + for(Map.Entry entry : fieldFileMap.entrySet()) { + IndexInput in = entry.getValue(); + int h = in.readInt(); + MetaInfo metaInfo = read_index_header(in); + fieldMetaMap.put(entry.getKey(), metaInfo); + } + } + + @Override + public FloatVectorValues getFloatVectorValues(String field) throws IOException { + MetaInfo metaInfo = fieldMetaMap.get(field); + IndexInput input = fieldFileMap.get(field); + FaissEngineFlatVectorValues vectorValues = new FaissEngineFlatVectorValues(metaInfo, input); + return vectorValues; + } + + @Override + public ByteVectorValues getByteVectorValues(String field) throws IOException { + return null; + } + + @Override + public boolean isNativeVectors(String field) { + return fieldFileMap.containsKey(field) && fieldMetaMap.containsKey(field); + } + + private MetaInfo read_index_header(IndexInput in) throws IOException { + + int d = in.readInt(); + long ntotal = in.readLong(); + long dummy; + dummy = in.readLong(); + dummy = in.readLong(); + byte is_trained = in.readByte(); + // + int metric_type = in.readInt(); + float metric_arg = 0; + if (metric_type > 1) { + metric_arg = Float.intBitsToFloat(in.readInt()); + } + long filesize = in.length(); + // There is (ntotal+1) * idx_t and FOOTER_SIZE + long idSeek = filesize - (ntotal + 1) * SIZET_SIZE - FOOTER_SIZE; + //in.seek(idSeek); +// long size = in.readLong(); + +// long[] ids = new long[(int) ntotal]; +// in.readLongs(ids, 0, (int) ntotal); + long vectorSeek = idSeek - (FLOAT_SIZE * d) * ntotal - SIZET_SIZE; +// in.seek(vectorSeek); + +// float[] v = new float[(int) (d * ntotal)]; +// size = in.readLong(); +// System.out.println("Vector Size: " + size + " d * ntotal" + d * ntotal); +// for(int i = 0; i < ntotal; i++) { +// in.readFloats(v, i * d, d); +// System.out.println("vector:"); +// for (int j = 0; j < d; j++) { +// System.out.println(v[i*d + j]); +// } +// } + return new MetaInfo(d, ntotal, is_trained, metric_type, metric_arg, idSeek, vectorSeek); + } + @Override + public void close() throws IOException { + for(Map.Entry entry : fieldFileMap.entrySet()) { + IndexInput in = entry.getValue(); + IOUtils.close(in); + } + } + + + @AllArgsConstructor + @Getter + public class MetaInfo { + int d; + long ntotal; + byte isTrained; + int metricType; + float metricArg; + long idSeek; + long vectorSeek; + } +} diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatVectorValues.java b/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatVectorValues.java new file mode 100644 index 000000000..168f028eb --- /dev/null +++ b/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatVectorValues.java @@ -0,0 +1,145 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.knn.index.codec.KNN990Codec; + +import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil; +import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.VectorScorer; +import org.apache.lucene.store.IndexInput; +import org.opensearch.knn.index.KNNVectorSimilarityFunction; +import org.opensearch.knn.index.SpaceType; + +import java.io.IOException; +import java.util.Arrays; + +import static org.opensearch.knn.index.codec.KNN990Codec.FaissEngineFlatKnnVectorsReader.FLOAT_SIZE; +import static org.opensearch.knn.index.codec.KNN990Codec.FaissEngineFlatKnnVectorsReader.SIZET_SIZE; + +public class FaissEngineFlatVectorValues extends FloatVectorValues { + private static final int BUCKET_VECTORS = 64; //every time read only bucket size vectors. + protected FaissEngineFlatKnnVectorsReader.MetaInfo metaInfo; + protected final IndexInput slice; + protected final VectorSimilarityFunction similarityFunction; + protected final FlatVectorsScorer flatVectorsScorer; + protected final float[] value; + protected final long[] ids; + protected final float[] buf; + protected int docId = -1; + protected int ord = -1; + + public FaissEngineFlatVectorValues(FaissEngineFlatKnnVectorsReader.MetaInfo metaInfo, IndexInput input) throws IOException { + this.metaInfo = metaInfo; + this.slice = input.clone(); + this.similarityFunction = getVectorSimilarityFunction(metaInfo.metricType).getVectorSimilarityFunction(); + this.flatVectorsScorer = FlatVectorScorerUtil.getLucene99FlatVectorsScorer(); + this.value = new float[(int) (metaInfo.d * metaInfo.ntotal)]; + this.ids= new long[(int) metaInfo.ntotal]; + this.buf = new float[metaInfo.d]; + readIds(); + } + + protected void readIds() throws IOException { + slice.seek(metaInfo.idSeek); + long size = slice.readLong(); + assert size == metaInfo.ntotal; + slice.readLongs(ids, 0, (int) metaInfo.ntotal); + } + + protected void readBucketVectors() throws IOException { + assert ord >= 0; + assert ord <= metaInfo.ntotal; + int bucketIndex = ord / BUCKET_VECTORS; + slice.seek(metaInfo.vectorSeek + SIZET_SIZE + bucketIndex * BUCKET_VECTORS * FLOAT_SIZE * metaInfo.d); + + for (int i = 0, o = ord; + i < BUCKET_VECTORS && o < metaInfo.ntotal; + i++, o++) { + slice.readFloats(value, i * metaInfo.d, metaInfo.d); + } + } +// public void readInfo() throws IOException { +// slice.seek(metaInfo.idSeek); +// long size = slice.readLong(); +// assert size == metaInfo.ntotal; +// slice.readLongs(ids, 0, (int) metaInfo.ntotal); +// +// slice.seek(metaInfo.vectorSeek); +// size = slice.readLong(); +// for(int i = 0; i < metaInfo.ntotal; i++) { +// slice.readFloats(value, i * metaInfo.d, metaInfo.d); +// } +// } + + @Override + public int dimension() { + return metaInfo.d; + } + + @Override + public int size() { + return (int) metaInfo.ntotal; + } + + @Override + public float[] vectorValue() throws IOException { + if(ord % BUCKET_VECTORS == 0) { + readBucketVectors(); + } + int bucketOrder = ord % BUCKET_VECTORS; + + System.arraycopy(value, bucketOrder * metaInfo.d, buf, 0, metaInfo.d); + return buf; + } + + @Override + public VectorScorer scorer(float[] floats) throws IOException { + //TODO + return null; + } + + @Override + public int docID() { + return docId; + } + + @Override + public int nextDoc() throws IOException { + return advance(docId + 1); + } + + @Override + public int advance(int target) throws IOException { + ord = Arrays.binarySearch(ids, ord + 1, ids.length, target); + if (ord < 0) { + ord = -(ord + 1); + } + assert ord <= ids.length; + if (ord == ids.length) { + docId = NO_MORE_DOCS; + } else { + docId = (int) ids[ord]; + } + return docId; + } + + KNNVectorSimilarityFunction getVectorSimilarityFunction(int metricType) { + // Ref from jni/external/faiss/c_api/Index_c.h + switch (metricType) { + case 0: + return SpaceType.INNER_PRODUCT.getKnnVectorSimilarityFunction(); + case 1: + return SpaceType.L2.getKnnVectorSimilarityFunction(); + case 2: + return SpaceType.L1.getKnnVectorSimilarityFunction(); + case 3: + return SpaceType.LINF.getKnnVectorSimilarityFunction(); + default: + return SpaceType.L2.getKnnVectorSimilarityFunction(); + } + } +} diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineKnnVectorsReader.java b/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineKnnVectorsReader.java new file mode 100644 index 000000000..661bba642 --- /dev/null +++ b/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineKnnVectorsReader.java @@ -0,0 +1,51 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.knn.index.codec.KNN990Codec; + +import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FloatVectorValues; + +import java.io.Closeable; +import java.io.IOException; + +/** Reads vectors from faiss index. like Lucene KnnVectorsReader without search */ +public abstract class FaissEngineKnnVectorsReader implements Closeable { + + /** Sole constructor */ + protected FaissEngineKnnVectorsReader() {} + + /** + * Checks consistency of this reader. + * + *

Note that this may be costly in terms of I/O, e.g. may involve computing a checksum value + * against large data files. + * + * @lucene.internal + */ + public abstract void checkIntegrity() throws IOException; + + /** + * Returns the {@link FloatVectorValues} for the given {@code field}. The behavior is undefined if + * the given field doesn't have KNN vectors enabled on its {@link FieldInfo}. The return value is + * never {@code null}. + */ + public abstract FloatVectorValues getFloatVectorValues(String field) throws IOException; + + /** + * Returns the {@link ByteVectorValues} for the given {@code field}. The behavior is undefined if + * the given field doesn't have KNN vectors enabled on its {@link FieldInfo}. The return value is + * never {@code null}. + */ + public abstract ByteVectorValues getByteVectorValues(String field) throws IOException; + + /** + * Return true if and only if we can get native engine files and extract docValues. + * @param field KNN vectors enabled on its {@link FieldInfo} + * @return boolean for native engines vectors + */ + public abstract boolean isNativeVectors(String field); +} diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEngines990KnnVectorsReader.java b/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEngines990KnnVectorsReader.java index efabc3a70..b0c3e0c1d 100644 --- a/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEngines990KnnVectorsReader.java +++ b/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEngines990KnnVectorsReader.java @@ -45,14 +45,16 @@ public class NativeEngines990KnnVectorsReader extends KnnVectorsReader { private final FlatVectorsReader flatVectorsReader; + private final FaissEngineKnnVectorsReader faissEngineKnnVectorsReader; private Map quantizationStateCacheKeyPerField; private SegmentReadState segmentReadState; private final List cacheKeys; - public NativeEngines990KnnVectorsReader(final SegmentReadState state, final FlatVectorsReader flatVectorsReader) { + public NativeEngines990KnnVectorsReader(final SegmentReadState state, final FlatVectorsReader flatVectorsReader) throws IOException { this.flatVectorsReader = flatVectorsReader; this.segmentReadState = state; this.cacheKeys = getVectorCacheKeysFromSegmentReaderState(state); + this.faissEngineKnnVectorsReader = new FaissEngineFlatKnnVectorsReader(state); loadCacheKeyMap(); } @@ -77,6 +79,9 @@ public void checkIntegrity() throws IOException { */ @Override public FloatVectorValues getFloatVectorValues(final String field) throws IOException { + if (faissEngineKnnVectorsReader.isNativeVectors(field)) { + faissEngineKnnVectorsReader.getFloatVectorValues(field); + } return flatVectorsReader.getFloatVectorValues(field); } @@ -188,7 +193,7 @@ public void close() throws IOException { cacheKeys.forEach(nativeMemoryCacheManager::invalidate); // Close a reader. - IOUtils.close(flatVectorsReader); + IOUtils.close(flatVectorsReader, faissEngineKnnVectorsReader); // Clean up quantized state cache. if (quantizationStateCacheKeyPerField != null) { diff --git a/src/main/java/org/opensearch/knn/index/codec/util/KNNCodecUtil.java b/src/main/java/org/opensearch/knn/index/codec/util/KNNCodecUtil.java index 3ccfc3c2b..08dd96ebb 100644 --- a/src/main/java/org/opensearch/knn/index/codec/util/KNNCodecUtil.java +++ b/src/main/java/org/opensearch/knn/index/codec/util/KNNCodecUtil.java @@ -122,7 +122,7 @@ public static String getNativeEngineFileFromFieldInfo(FieldInfo field, SegmentIn * @param field which field we need produce from engine * @return if and only if Native Engine we return specific engine, else return null */ - private static KNNEngine getNativeKNNEngine(@NonNull FieldInfo field) { + public static KNNEngine getNativeKNNEngine(@NonNull FieldInfo field) { final KNNEngine engine = FieldInfoExtractor.extractKNNEngine(field); if (KNNEngine.getEnginesThatCreateCustomSegmentFiles().contains(engine)) { return engine; diff --git a/src/test/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEngineFlatKnnVectorsReaderTests.java b/src/test/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEngineFlatKnnVectorsReaderTests.java new file mode 100644 index 000000000..be8cdd653 --- /dev/null +++ b/src/test/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEngineFlatKnnVectorsReaderTests.java @@ -0,0 +1,160 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.knn.index.codec.KNN990Codec; + +import com.google.common.collect.ImmutableMap; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.opensearch.Version; +import org.opensearch.common.xcontent.XContentFactory; +import org.opensearch.knn.KNNTestCase; +import org.opensearch.knn.common.KNNConstants; +import org.opensearch.knn.index.SpaceType; +import org.opensearch.knn.index.VectorDataType; +import org.opensearch.knn.index.codec.KNNCodecTestUtil; +import org.opensearch.knn.index.codec.nativeindex.NativeIndexWriter; +import org.opensearch.knn.index.codec.util.KNNCodecUtil; +import org.opensearch.knn.index.engine.KNNEngine; +import org.opensearch.knn.index.engine.KNNMethodConfigContext; +import org.opensearch.knn.index.engine.KNNMethodContext; +import org.opensearch.knn.index.engine.MethodComponentContext; +import org.opensearch.knn.index.mapper.KNNVectorFieldMapper; +import org.opensearch.knn.index.vectorvalues.KNNVectorValues; +import org.opensearch.knn.index.vectorvalues.KNNVectorValuesFactory; +import org.opensearch.knn.index.vectorvalues.TestVectorValues; +import org.opensearch.knn.plugin.stats.KNNGraphValue; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import static org.opensearch.knn.common.FieldInfoExtractor.extractVectorDataType; +import static org.opensearch.knn.common.KNNConstants.METHOD_HNSW; +import static org.opensearch.knn.common.KNNConstants.METHOD_PARAMETER_EF_CONSTRUCTION; +import static org.opensearch.knn.common.KNNConstants.METHOD_PARAMETER_EF_SEARCH; +import static org.opensearch.knn.common.KNNConstants.METHOD_PARAMETER_M; +import static org.opensearch.knn.index.codec.KNNCodecTestUtil.assertFileInCorrectLocation; +import static org.opensearch.knn.index.codec.KNNCodecTestUtil.assertLoadableByEngine; +import static org.opensearch.knn.index.codec.KNNCodecTestUtil.assertValidFooter; + +public class NativeEngineFlatKnnVectorsReaderTests extends KNNTestCase { + private static final int EF_SEARCH = 10; + private static final Map HNSW_METHODPARAMETERS = Map.of(METHOD_PARAMETER_EF_SEARCH, EF_SEARCH); + + private static Directory directory; + private static Codec codec; + + @BeforeClass + public static void setStaticVariables() { + directory = newFSDirectory(createTempDir()); + codec = new KNN990Codec(); + } + + @AfterClass + public static void closeStaticVariables() throws IOException { + directory.close(); + } + + public void testAddKNNFloatField_FaissEngine_ReadFaissFile_success() throws IOException { + String segmentName = "_0"; + int docsInSegment = 100; + String fieldName = String.format("testField%s", randomAlphaOfLength(4)); + + KNNEngine knnEngine = KNNEngine.FAISS; + SpaceType spaceType = SpaceType.INNER_PRODUCT; + int dimension = 16; + + SegmentInfo segmentInfo = KNNCodecTestUtil.segmentInfoBuilder() + .directory(directory) + .segmentName(segmentName) + .docsInSegment(docsInSegment) + .codec(codec) + .build(); + KNNMethodConfigContext knnMethodConfigContext = KNNMethodConfigContext.builder() + .vectorDataType(VectorDataType.FLOAT) + .versionCreated(Version.CURRENT) + .build(); + KNNMethodContext knnMethodContext = new KNNMethodContext( + knnEngine, + spaceType, + new MethodComponentContext(METHOD_HNSW, ImmutableMap.of(METHOD_PARAMETER_M, 16, METHOD_PARAMETER_EF_CONSTRUCTION, 512)) + ); + + String parameterString = XContentFactory.jsonBuilder() + .map(knnEngine.getKNNLibraryIndexingContext(knnMethodContext, knnMethodConfigContext).getLibraryParameters()) + .toString(); + + FieldInfo[] fieldInfoArray = new FieldInfo[] { + KNNCodecTestUtil.FieldInfoBuilder.builder(fieldName) + .addAttribute(KNNVectorFieldMapper.KNN_FIELD, "true") + .addAttribute(KNNConstants.KNN_ENGINE, knnEngine.getName()) + .addAttribute(KNNConstants.SPACE_TYPE, spaceType.getValue()) + .addAttribute(KNNConstants.PARAMETERS, parameterString) + .build() }; + + + FieldInfos fieldInfos = new FieldInfos(fieldInfoArray); + SegmentWriteState state = new SegmentWriteState(null, directory, segmentInfo, fieldInfos, null, IOContext.DEFAULT); + + long initialRefreshOperations = KNNGraphValue.REFRESH_TOTAL_OPERATIONS.getValue(); + + // Add documents to the field + float[][] vectorsData = TestVectorValues.getRandomVectors(docsInSegment, dimension); + List vectorList = new ArrayList<>(); + for(int i = 0; i < docsInSegment; i++) { + vectorList.add(vectorsData[i]); + } + TestVectorValues.PreDefinedFloatVectorValues preDefinedFloatVectorValues = new TestVectorValues.PreDefinedFloatVectorValues(vectorList); + + FieldInfo field = fieldInfoArray[0]; + final VectorDataType vectorDataType = extractVectorDataType(field); + final KNNVectorValues knnVectorValues = KNNVectorValuesFactory.getVectorValues(vectorDataType, preDefinedFloatVectorValues); + + NativeIndexWriter.getWriter(field, state).flushIndex(knnVectorValues, (int) knnVectorValues.totalLiveDocs()); + + // The document should be created in the correct location + String expectedFile = KNNCodecUtil.buildEngineFileName(segmentName, knnEngine.getVersion(), fieldName, knnEngine.getExtension()); + assertFileInCorrectLocation(state, expectedFile); + + // The footer should be valid + assertValidFooter(state.directory, expectedFile); + + // The document should be readable by faiss + assertLoadableByEngine(HNSW_METHODPARAMETERS, state, expectedFile, knnEngine, spaceType, dimension); + + // The graph creation statistics should be updated + assertEquals(1 + initialRefreshOperations, (long) KNNGraphValue.REFRESH_TOTAL_OPERATIONS.getValue()); + + // Files Should set into segment info + segmentInfo.setFiles(Collections.singleton(expectedFile)); + + // Reader From Faiss File and get FloatVectorValues + SegmentReadState readState = new SegmentReadState(directory, segmentInfo, fieldInfos, IOContext.DEFAULT); + FaissEngineFlatKnnVectorsReader faissReader = new FaissEngineFlatKnnVectorsReader(readState); + FloatVectorValues vectorValues = faissReader.getFloatVectorValues(fieldName); + + FaissEngineFlatKnnVectorsReader.MetaInfo metaInfo = faissReader.getFieldMetaMap().get(fieldName); + + for (int i = 0; i < metaInfo.ntotal; i++){ + vectorValues.nextDoc(); + float[] actualVector = vectorValues.vectorValue(); + float[] expectVector = vectorsData[i]; + assertArrayEquals(actualVector, expectVector, 0.001f); + } + faissReader.close(); + } +} From ec8bd78bb72525e245b7cb1fb9da4fa3b599a4f2 Mon Sep 17 00:00:00 2001 From: luyuncheng Date: Tue, 12 Nov 2024 21:19:13 +0800 Subject: [PATCH 2/4] Introduce FaissEngineFlatVector Readers read flatValues directly from faiss file Signed-off-by: luyuncheng --- .../index/codec/KNN990Codec/FaissEngineFlatKnnVectorsReader.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatKnnVectorsReader.java b/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatKnnVectorsReader.java index f78c41440..0c049757c 100644 --- a/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatKnnVectorsReader.java +++ b/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatKnnVectorsReader.java @@ -86,7 +86,6 @@ public FaissEngineFlatKnnVectorsReader(SegmentReadState state) throws IOExceptio continue; } String parameter = FieldInfoExtractor.getParameters(field); - System.out.print(parameter); if (parameter == null || parameter.contains("BHNSW")) { continue; } From f093253286b9cb8b52940b59fd1bab1b156342fe Mon Sep 17 00:00:00 2001 From: luyuncheng Date: Tue, 12 Nov 2024 21:26:40 +0800 Subject: [PATCH 3/4] Spotless Signed-off-by: luyuncheng --- .../FaissEngineFlatKnnVectorsReader.java | 50 +++++++++---------- .../FaissEngineFlatVectorValues.java | 36 +++++++------ .../FaissEngineKnnVectorsReader.java | 2 - ...NativeEngineFlatKnnVectorsReaderTests.java | 47 ++++++++--------- 4 files changed, 66 insertions(+), 69 deletions(-) diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatKnnVectorsReader.java b/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatKnnVectorsReader.java index 0c049757c..6f064cdd5 100644 --- a/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatKnnVectorsReader.java +++ b/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatKnnVectorsReader.java @@ -21,7 +21,6 @@ import org.opensearch.knn.index.engine.KNNEngine; import java.io.IOException; -import java.nio.file.Files; import java.util.HashMap; import java.util.Map; @@ -59,6 +58,7 @@ public class FaissEngineFlatKnnVectorsReader extends FaissEngineKnnVectorsReader private Map fieldFileMap; private Map fieldMetaMap; + @Override public void checkIntegrity() throws IOException { @@ -79,7 +79,7 @@ public FaissEngineFlatKnnVectorsReader(SegmentReadState state) throws IOExceptio if (vectorIndexFileName == null) { continue; } - //TODO for fp16, pq + // TODO for fp16, pq VectorDataType vectorDataType = FieldInfoExtractor.extractVectorDataType(field); SpaceType spaceType = FieldInfoExtractor.getSpaceType(null, field); if (vectorDataType != VectorDataType.FLOAT) { @@ -89,21 +89,21 @@ public FaissEngineFlatKnnVectorsReader(SegmentReadState state) throws IOExceptio if (parameter == null || parameter.contains("BHNSW")) { continue; } - //TODO if not exist file, change to lucene flatVector + // TODO if not exist file, change to lucene flatVector IndexInput in = state.directory.openInput(vectorIndexFileName, state.context.withRandomAccess()); - if(in == null) { + if (in == null) { continue; } fieldFileMap.put(field.getName(), in); } success = true; - } finally { + } finally { if (success == false) { IOUtils.closeWhileHandlingException(this); } } - for(Map.Entry entry : fieldFileMap.entrySet()) { + for (Map.Entry entry : fieldFileMap.entrySet()) { IndexInput in = entry.getValue(); int h = in.readInt(); MetaInfo metaInfo = read_index_header(in); @@ -132,7 +132,7 @@ public boolean isNativeVectors(String field) { private MetaInfo read_index_header(IndexInput in) throws IOException { int d = in.readInt(); - long ntotal = in.readLong(); + long ntotal = in.readLong(); long dummy; dummy = in.readLong(); dummy = in.readLong(); @@ -146,35 +146,35 @@ private MetaInfo read_index_header(IndexInput in) throws IOException { long filesize = in.length(); // There is (ntotal+1) * idx_t and FOOTER_SIZE long idSeek = filesize - (ntotal + 1) * SIZET_SIZE - FOOTER_SIZE; - //in.seek(idSeek); -// long size = in.readLong(); + // in.seek(idSeek); + // long size = in.readLong(); -// long[] ids = new long[(int) ntotal]; -// in.readLongs(ids, 0, (int) ntotal); + // long[] ids = new long[(int) ntotal]; + // in.readLongs(ids, 0, (int) ntotal); long vectorSeek = idSeek - (FLOAT_SIZE * d) * ntotal - SIZET_SIZE; -// in.seek(vectorSeek); - -// float[] v = new float[(int) (d * ntotal)]; -// size = in.readLong(); -// System.out.println("Vector Size: " + size + " d * ntotal" + d * ntotal); -// for(int i = 0; i < ntotal; i++) { -// in.readFloats(v, i * d, d); -// System.out.println("vector:"); -// for (int j = 0; j < d; j++) { -// System.out.println(v[i*d + j]); -// } -// } + // in.seek(vectorSeek); + + // float[] v = new float[(int) (d * ntotal)]; + // size = in.readLong(); + // System.out.println("Vector Size: " + size + " d * ntotal" + d * ntotal); + // for(int i = 0; i < ntotal; i++) { + // in.readFloats(v, i * d, d); + // System.out.println("vector:"); + // for (int j = 0; j < d; j++) { + // System.out.println(v[i*d + j]); + // } + // } return new MetaInfo(d, ntotal, is_trained, metric_type, metric_arg, idSeek, vectorSeek); } + @Override public void close() throws IOException { - for(Map.Entry entry : fieldFileMap.entrySet()) { + for (Map.Entry entry : fieldFileMap.entrySet()) { IndexInput in = entry.getValue(); IOUtils.close(in); } } - @AllArgsConstructor @Getter public class MetaInfo { diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatVectorValues.java b/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatVectorValues.java index 168f028eb..5f7c7cfab 100644 --- a/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatVectorValues.java +++ b/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatVectorValues.java @@ -21,7 +21,7 @@ import static org.opensearch.knn.index.codec.KNN990Codec.FaissEngineFlatKnnVectorsReader.SIZET_SIZE; public class FaissEngineFlatVectorValues extends FloatVectorValues { - private static final int BUCKET_VECTORS = 64; //every time read only bucket size vectors. + private static final int BUCKET_VECTORS = 64; // every time read only bucket size vectors. protected FaissEngineFlatKnnVectorsReader.MetaInfo metaInfo; protected final IndexInput slice; protected final VectorSimilarityFunction similarityFunction; @@ -38,7 +38,7 @@ public FaissEngineFlatVectorValues(FaissEngineFlatKnnVectorsReader.MetaInfo meta this.similarityFunction = getVectorSimilarityFunction(metaInfo.metricType).getVectorSimilarityFunction(); this.flatVectorsScorer = FlatVectorScorerUtil.getLucene99FlatVectorsScorer(); this.value = new float[(int) (metaInfo.d * metaInfo.ntotal)]; - this.ids= new long[(int) metaInfo.ntotal]; + this.ids = new long[(int) metaInfo.ntotal]; this.buf = new float[metaInfo.d]; readIds(); } @@ -56,24 +56,22 @@ protected void readBucketVectors() throws IOException { int bucketIndex = ord / BUCKET_VECTORS; slice.seek(metaInfo.vectorSeek + SIZET_SIZE + bucketIndex * BUCKET_VECTORS * FLOAT_SIZE * metaInfo.d); - for (int i = 0, o = ord; - i < BUCKET_VECTORS && o < metaInfo.ntotal; - i++, o++) { + for (int i = 0, o = ord; i < BUCKET_VECTORS && o < metaInfo.ntotal; i++, o++) { slice.readFloats(value, i * metaInfo.d, metaInfo.d); } } -// public void readInfo() throws IOException { -// slice.seek(metaInfo.idSeek); -// long size = slice.readLong(); -// assert size == metaInfo.ntotal; -// slice.readLongs(ids, 0, (int) metaInfo.ntotal); -// -// slice.seek(metaInfo.vectorSeek); -// size = slice.readLong(); -// for(int i = 0; i < metaInfo.ntotal; i++) { -// slice.readFloats(value, i * metaInfo.d, metaInfo.d); -// } -// } + // public void readInfo() throws IOException { + // slice.seek(metaInfo.idSeek); + // long size = slice.readLong(); + // assert size == metaInfo.ntotal; + // slice.readLongs(ids, 0, (int) metaInfo.ntotal); + // + // slice.seek(metaInfo.vectorSeek); + // size = slice.readLong(); + // for(int i = 0; i < metaInfo.ntotal; i++) { + // slice.readFloats(value, i * metaInfo.d, metaInfo.d); + // } + // } @Override public int dimension() { @@ -87,7 +85,7 @@ public int size() { @Override public float[] vectorValue() throws IOException { - if(ord % BUCKET_VECTORS == 0) { + if (ord % BUCKET_VECTORS == 0) { readBucketVectors(); } int bucketOrder = ord % BUCKET_VECTORS; @@ -98,7 +96,7 @@ public float[] vectorValue() throws IOException { @Override public VectorScorer scorer(float[] floats) throws IOException { - //TODO + // TODO return null; } diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineKnnVectorsReader.java b/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineKnnVectorsReader.java index 661bba642..0aea9fdb6 100644 --- a/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineKnnVectorsReader.java +++ b/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineKnnVectorsReader.java @@ -23,8 +23,6 @@ protected FaissEngineKnnVectorsReader() {} * *

Note that this may be costly in terms of I/O, e.g. may involve computing a checksum value * against large data files. - * - * @lucene.internal */ public abstract void checkIntegrity() throws IOException; diff --git a/src/test/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEngineFlatKnnVectorsReaderTests.java b/src/test/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEngineFlatKnnVectorsReaderTests.java index be8cdd653..a6c4c7c2a 100644 --- a/src/test/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEngineFlatKnnVectorsReaderTests.java +++ b/src/test/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEngineFlatKnnVectorsReaderTests.java @@ -79,33 +79,32 @@ public void testAddKNNFloatField_FaissEngine_ReadFaissFile_success() throws IOEx int dimension = 16; SegmentInfo segmentInfo = KNNCodecTestUtil.segmentInfoBuilder() - .directory(directory) - .segmentName(segmentName) - .docsInSegment(docsInSegment) - .codec(codec) - .build(); + .directory(directory) + .segmentName(segmentName) + .docsInSegment(docsInSegment) + .codec(codec) + .build(); KNNMethodConfigContext knnMethodConfigContext = KNNMethodConfigContext.builder() - .vectorDataType(VectorDataType.FLOAT) - .versionCreated(Version.CURRENT) - .build(); + .vectorDataType(VectorDataType.FLOAT) + .versionCreated(Version.CURRENT) + .build(); KNNMethodContext knnMethodContext = new KNNMethodContext( - knnEngine, - spaceType, - new MethodComponentContext(METHOD_HNSW, ImmutableMap.of(METHOD_PARAMETER_M, 16, METHOD_PARAMETER_EF_CONSTRUCTION, 512)) + knnEngine, + spaceType, + new MethodComponentContext(METHOD_HNSW, ImmutableMap.of(METHOD_PARAMETER_M, 16, METHOD_PARAMETER_EF_CONSTRUCTION, 512)) ); String parameterString = XContentFactory.jsonBuilder() - .map(knnEngine.getKNNLibraryIndexingContext(knnMethodContext, knnMethodConfigContext).getLibraryParameters()) - .toString(); + .map(knnEngine.getKNNLibraryIndexingContext(knnMethodContext, knnMethodConfigContext).getLibraryParameters()) + .toString(); FieldInfo[] fieldInfoArray = new FieldInfo[] { - KNNCodecTestUtil.FieldInfoBuilder.builder(fieldName) - .addAttribute(KNNVectorFieldMapper.KNN_FIELD, "true") - .addAttribute(KNNConstants.KNN_ENGINE, knnEngine.getName()) - .addAttribute(KNNConstants.SPACE_TYPE, spaceType.getValue()) - .addAttribute(KNNConstants.PARAMETERS, parameterString) - .build() }; - + KNNCodecTestUtil.FieldInfoBuilder.builder(fieldName) + .addAttribute(KNNVectorFieldMapper.KNN_FIELD, "true") + .addAttribute(KNNConstants.KNN_ENGINE, knnEngine.getName()) + .addAttribute(KNNConstants.SPACE_TYPE, spaceType.getValue()) + .addAttribute(KNNConstants.PARAMETERS, parameterString) + .build() }; FieldInfos fieldInfos = new FieldInfos(fieldInfoArray); SegmentWriteState state = new SegmentWriteState(null, directory, segmentInfo, fieldInfos, null, IOContext.DEFAULT); @@ -115,10 +114,12 @@ public void testAddKNNFloatField_FaissEngine_ReadFaissFile_success() throws IOEx // Add documents to the field float[][] vectorsData = TestVectorValues.getRandomVectors(docsInSegment, dimension); List vectorList = new ArrayList<>(); - for(int i = 0; i < docsInSegment; i++) { + for (int i = 0; i < docsInSegment; i++) { vectorList.add(vectorsData[i]); } - TestVectorValues.PreDefinedFloatVectorValues preDefinedFloatVectorValues = new TestVectorValues.PreDefinedFloatVectorValues(vectorList); + TestVectorValues.PreDefinedFloatVectorValues preDefinedFloatVectorValues = new TestVectorValues.PreDefinedFloatVectorValues( + vectorList + ); FieldInfo field = fieldInfoArray[0]; final VectorDataType vectorDataType = extractVectorDataType(field); @@ -149,7 +150,7 @@ public void testAddKNNFloatField_FaissEngine_ReadFaissFile_success() throws IOEx FaissEngineFlatKnnVectorsReader.MetaInfo metaInfo = faissReader.getFieldMetaMap().get(fieldName); - for (int i = 0; i < metaInfo.ntotal; i++){ + for (int i = 0; i < metaInfo.ntotal; i++) { vectorValues.nextDoc(); float[] actualVector = vectorValues.vectorValue(); float[] expectVector = vectorsData[i]; From 576a5a3e6d61fb03b2ed8a9cdacf333cae1d5bd3 Mon Sep 17 00:00:00 2001 From: luyuncheng Date: Tue, 12 Nov 2024 21:31:52 +0800 Subject: [PATCH 4/4] Spotless Signed-off-by: luyuncheng --- .../codec/KNN990Codec/FaissEngineFlatKnnVectorsReader.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatKnnVectorsReader.java b/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatKnnVectorsReader.java index 6f064cdd5..fe1d61ca3 100644 --- a/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatKnnVectorsReader.java +++ b/src/main/java/org/opensearch/knn/index/codec/KNN990Codec/FaissEngineFlatKnnVectorsReader.java @@ -27,7 +27,7 @@ import static org.opensearch.knn.index.engine.KNNEngine.FAISS; /** - * There is 3 Index in one faiss file <-id-><-hnsw-><-Storage-> + * There is 3 Index in one faiss file |id|hnsw|Storage| * File Structure like followings: * |-typeIDMap-||-id_header-| * |-typeHnsw-||-hnsw_header-||-hnswGraph-|