I cannot get prefix search to work on a scalar field. I can, however, get infix search to work, which suggests that my field is not being tokenized properly.
ENV: Standalone Milvus 2.4.8 on Ubuntu 22.04
Here is my schema:
// Collection definition: five fields, a partition key, and three indexes.
{
fields: [
// Primary key, stored as a VarChar.
{
name: 'id',
description: 'Id field',
data_type: DataType.VarChar,
is_primary_key: true,
max_length: this.NORMALISED_GUID_LENGTH
},
// Embedding vector; its dimensionality is taken from the embedding model in use.
{
name: 'vector',
description: 'Vector field',
data_type: DataType.FloatVector,
dim: this.embeddingModel.dims
},
// Partition tag; also named as partition_key_field below.
{
name: 'tag',
description: 'The partition tag',
data_type: DataType.VarChar,
max_length: this.NORMALISED_TAG_LENGTH
},
// All channel ids for an entry are stored together in this single VarChar
// (the sample value in the question is one space-separated string).
{
name: 'channels',
description: 'The channels that may access this entry',
data_type: DataType.VarChar,
max_length: this.MAX_CHANNELS_LENGTH
},
// Free-form metadata stored as JSON.
{
name: "payload",
description: 'The payload meta data',
data_type: DataType.JSON
}
],
partition_key_field: 'tag',
index_params: [
// Vector index: DISKANN with the inner-product (IP) metric.
{
field_name: "vector",
index_type: "DISKANN",
metric_type: "IP"
},
// Scalar INVERTED index on the partition tag.
{
field_name: "tag",
index_name: "tag_index",
index_type: "INVERTED"
},
// Scalar INVERTED index on the whole channels string.
// NOTE(review): the LIKE results described in the question suggest the value is
// matched as one string rather than per space-separated token — confirm.
{
field_name: "channels",
index_name: "channels_index",
index_type: "INVERTED"
}
]
}
The field of interest is 'channels'.
The channels field can look like this:
"48d302b1963841c39790fecf56b91ddc c8a80b710e455460ae8b2399f5adfef5 b9e8870f9f994bc6a172e118fa6e7c8a"
When I search it the filter looks like this (but it does not work):
'channels like "c8a80b710e455460ae8b2399f5adfef5%"' // note it is the 2nd token in the channels field above
The following infix search works:
'channels like "%c8a80b710e455460ae8b2399f5adfef5%"' // note it is the 2nd token in the channels field above
And the following prefix search works but only for the first token in the channels field:
'channels like "48d302b1963841c39790fecf56b91ddc%"' // note it is the 1st token in the channels field above
What am I doing wrong?
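An Array field might be the proper solution. As your own results show, a prefix pattern on a VarChar field is matched against the whole stored string, so only the first channel id can ever match "xxx%". Define "channels" as an Array-type field instead; you can then use ARRAY_CONTAINS/ARRAY_CONTAINS_ANY/ARRAY_CONTAINS_ALL to filter on its elements. An example: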
import random
from pymilvus import (
MilvusClient, DataType,
)
# Connect to the local Milvus endpoint and show which collections already exist.
client = MilvusClient(uri="http://localhost:19530")
print(client.list_collections())

collection_name = "AAA"

# Explicit schema with dynamic fields disabled. "channels" is an ARRAY of
# VARCHAR so that every channel id is its own element instead of being part
# of one delimited string.
schema = client.create_schema(enable_dynamic_field=False)
schema.add_field(
    field_name="id",
    datatype=DataType.VARCHAR,
    is_primary=True,
    auto_id=False,
    max_length=65535,
)
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=128)
schema.add_field(
    field_name="tag",
    datatype=DataType.VARCHAR,
    max_length=65535,
    is_partition_key=True,
)
schema.add_field(
    field_name="channels",
    datatype=DataType.ARRAY,
    element_type=DataType.VARCHAR,
    max_capacity=1000,
    max_length=65535,
)

# Drop any earlier collection with this name so the demo starts from scratch.
client.drop_collection(collection_name=collection_name)
client.create_collection(collection_name=collection_name, schema=schema)
print(f"Collection '{collection_name}' created")

# Index the vector field and the tag field, then load the collection.
index_params = client.prepare_index_params()
index_params.add_index(field_name="vector", index_type="FLAT", metric_type="IP")
index_params.add_index(field_name="tag", index_type="INVERTED")
client.create_index(collection_name=collection_name, index_params=index_params)
client.load_collection(collection_name=collection_name)

# Three sample rows; each gets its own random 128-dimensional vector and a
# different combination of channel ids.
data = [
    {
        "id": row_id,
        "vector": [random.random() for _ in range(128)],
        "tag": row_tag,
        "channels": row_channels,
    }
    for row_id, row_tag, row_channels in (
        ("A", "a_tag", ["48d302b1963841c39790fecf56b91ddc", "c8a80b710e455460ae8b2399f5adfef5", "b9e8870f9f994bc6a172e118fa6e7c8a"]),
        ("B", "b_tag", ["12345", "678910", "c8a80b710e455460ae8b2399f5adfef5"]),
        ("C", "c_tag", ["48d302b1963841c39790fecf56b91ddc", "bbb", "aaa"]),
    )
]
ids = client.insert(collection_name=collection_name, data=data)
print("insert done")
def search(filter: str):
    """Run one filtered vector search and print every hit.

    A fresh random 128-dimensional query vector is generated on each call,
    so only the scalar ``filter`` expression decides which rows can appear;
    the ordering of the printed hits is arbitrary.

    Args:
        filter: Boolean filter expression passed unchanged to
            ``client.search``.
    """
    print("\nSearch with filter: " + filter)
    query_vector = [random.random() for _ in range(128)]
    result_sets = client.search(
        collection_name=collection_name,
        data=[query_vector],
        limit=10,
        filter=filter,
        anns_field="vector",
        search_params={},
        # Presumably requested so the rows inserted just before are visible.
        consistency_level="Strong",
        output_fields=["channels"],
    )
    # The result is iterated as a sequence of hit lists; print every hit.
    for hit in (entry for hits in result_sets for entry in hits):
        print(hit)
# Try each array operator in turn on the "channels" field: a single-id
# ARRAY_CONTAINS, then ARRAY_CONTAINS_ALL and ARRAY_CONTAINS_ANY with id lists.
for channel_filter in (
    'ARRAY_CONTAINS(channels, "c8a80b710e455460ae8b2399f5adfef5")',
    'ARRAY_CONTAINS_ALL(channels, ["48d302b1963841c39790fecf56b91ddc", "c8a80b710e455460ae8b2399f5adfef5", "b9e8870f9f994bc6a172e118fa6e7c8a"])',
    'ARRAY_CONTAINS_ANY(channels, ["48d302b1963841c39790fecf56b91ddc", "c8a80b710e455460ae8b2399f5adfef5"])',
):
    search(filter=channel_filter)