# prepare_data.py (example/index_support) populates a Milvus server with two
# small collections and builds indexes on them, using PyMilvus, the Python SDK
# of Milvus.
# 1. connect to Milvus
# 2. create collection `hello_milvus` (explicit primary keys)
# 3. insert data into `hello_milvus`
# 4. create collection `hello_milvus2` (auto-generated primary keys) and
#    insert the same batch into it twice
# 5. create indexes on both collections
import time
import os
import numpy as np
from pymilvus import (
    connections,
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection,
)

fmt = "\n=== {:30} ===\n"
# Not used by this script; kept so the module-level name stays available.
search_latency_fmt = "search latency = {:.4f}s"
num_entities, dim = 3000, 8

#################################################################################
# 1. connect to Milvus
# Add a new connection alias `default` for the Milvus server at `<host>:19530`.
# Actually the "default" alias is built in to PyMilvus.
# If the address of Milvus is the same as `localhost:19530`, you can omit all
# parameters and call the method as: `connections.connect()`.
#
# Note: the `using` parameter of the following methods defaults to "default".
print(fmt.format("start connecting to Milvus"))

# The server host comes from the MILVUS_HOST environment variable and falls
# back to "localhost" only when the variable is not set at all.
host = os.environ.get("MILVUS_HOST", "localhost")
print(fmt.format(f"Milvus host: {host}"))
connections.connect("default", host=host, port="19530")

# Informational only: the script goes on to open/create the collection either way.
has = utility.has_collection("hello_milvus")
print(f"Does collection hello_milvus exist in Milvus: {has}")

#################################################################################
# 2. create collection
# We're going to create a collection with 4 fields.
# +-+------------+------------+------------------+------------------------------+
# | | field name | field type | other attributes |       field description      |
# +-+------------+------------+------------------+------------------------------+
# |1|    "pk"    |    Int64   |  is_primary=True |      "primary field"         |
# | |            |            |   auto_id=False  |                              |
# +-+------------+------------+------------------+------------------------------+
# |2|  "random"  |    Double  |                  |      "a double field"        |
# +-+------------+------------+------------------+------------------------------+
# |3|   "var"    |   VarChar  | max_length=65535 |      "a string field"        |
# +-+------------+------------+------------------+------------------------------+
# |4|"embeddings"| FloatVector|     dim=8        |  "float vector with dim 8"   |
# +-+------------+------------+------------------+------------------------------+
fields = [
    FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False),
    FieldSchema(name="random", dtype=DataType.DOUBLE),
    FieldSchema(name="var", dtype=DataType.VARCHAR, max_length=65535),
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim),
]

schema = CollectionSchema(fields, "hello_milvus")

print(fmt.format("Create collection `hello_milvus`"))
hello_milvus = Collection("hello_milvus", schema, consistency_level="Strong")

################################################################################
# 3. insert data
# We are going to insert 3000 rows of data into `hello_milvus`.
# Data to be inserted must be organized in fields (one column per field, in
# schema order).
#
# The insert() method returns:
# - either automatically generated primary keys by Milvus if auto_id=True in the schema;
# - or the existing primary key field from the entities if auto_id=False in the schema.

print(fmt.format("Start inserting entities"))
# Fixed seed so every run generates the same random columns.
rng = np.random.default_rng(seed=19530)
entities = [
    # provide the pk field because `auto_id` is set to False
    list(range(num_entities)),
    rng.random(num_entities).tolist(),  # field random, only supports list
    [str(i) for i in range(num_entities)],  # field var: the row number as a string
    rng.random((num_entities, dim)),  # field embeddings, supports numpy.ndarray and list
]

insert_result = hello_milvus.insert(entities)
hello_milvus.flush()
print(f"Number of entities in hello_milvus: {hello_milvus.num_entities}")  # check the num_entities

################################################################################
# 4. create another collection
# `hello_milvus2` has the same four fields, but its primary key uses
# auto_id=True, so no pk column is supplied when inserting.
fields2 = [
    FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="random", dtype=DataType.DOUBLE),
    FieldSchema(name="var", dtype=DataType.VARCHAR, max_length=65535),
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim),
]

schema2 = CollectionSchema(fields2, "hello_milvus2")

print(fmt.format("Create collection `hello_milvus2`"))
hello_milvus2 = Collection("hello_milvus2", schema2, consistency_level="Strong")

entities2 = [
    rng.random(num_entities).tolist(),  # field random, only supports list
    [str(i) for i in range(num_entities)],  # field var: the row number as a string
    rng.random((num_entities, dim)),  # field embeddings, supports numpy.ndarray and list
]

# The same batch is inserted and flushed twice.
# NOTE(review): presumably intentional, so the collection holds two separately
# flushed batches (2 * num_entities rows) for the index tests -- confirm.
insert_result2 = hello_milvus2.insert(entities2)
hello_milvus2.flush()
insert_result2 = hello_milvus2.insert(entities2)
hello_milvus2.flush()

################################################################################
# 5. create indexes
# Both collections get the same IVF_FLAT / L2 index on the vector field.
index_params = {"index_type": "IVF_FLAT", "params": {"nlist": 128}, "metric_type": "L2"}
hello_milvus.create_index("embeddings", index_params)

hello_milvus2.create_index("embeddings", index_params)
# `hello_milvus2` additionally gets a "Trie" index on its VARCHAR field.
index_params2 = {"index_type": "Trie"}
hello_milvus2.create_index("var", index_params2)

print(f"Number of entities in hello_milvus2: {hello_milvus2.num_entities}")  # check the num_entities