Cannot get prefix search to work on a scalar text field

I cannot get prefix search to work on a scalar field. I can, however, get infix search to work, which suggests that my field is not being tokenized properly.

ENV: Standalone Milvus 2.4.8 on Ubuntu 22.04

Here is my schema:

// Collection definition from the question: field schema, partition key, and index parameters.
{
fields: [
                {
                    name: 'id',
                    description: 'Id field',
                    data_type: DataType.VarChar,
                    is_primary_key: true,
                    max_length: this.NORMALISED_GUID_LENGTH
                },
                {
                    name: 'vector',
                    description: 'Vector field',
                    data_type: DataType.FloatVector,
                    dim: this.embeddingModel.dims
                },
                {
                    name: 'tag',
                    description: 'The partition tag',
                    data_type: DataType.VarChar,
                    max_length: this.NORMALISED_TAG_LENGTH
                },
                {
                    // The field the question is about. It is a single VarChar value; per the
                    // example given in the question it holds several space-separated ids.
                    name: 'channels',
                    description: 'The channels that may access this entry',
                    data_type: DataType.VarChar,
                    max_length: this.MAX_CHANNELS_LENGTH
                },
                {
                    name: "payload",
                    description: 'The payload meta data',
                    data_type: DataType.JSON
                }
            ],
            // 'tag' (declared above as VarChar) is used as the partition key.
            partition_key_field: 'tag',
            index_params: [
                {
                    field_name: "vector",
                    index_type: "DISKANN",
                    metric_type: "IP"
                },
                {
                    field_name: "tag",
                    index_name: "tag_index",
                    index_type: "INVERTED"
                },
                {
                    // NOTE(review): presumably this index covers each 'channels' string as a
                    // whole rather than per space-separated id — confirm against the Milvus docs.
                    field_name: "channels",
                    index_name: "channels_index",
                    index_type: "INVERTED"
                }
            ]
}

The field of interest is 'channels'.

The channels field can look like this: "48d302b1963841c39790fecf56b91ddc c8a80b710e455460ae8b2399f5adfef5 b9e8870f9f994bc6a172e118fa6e7c8a"

When I search it the filter looks like this (but it does not work): 'channels like "c8a80b710e455460ae8b2399f5adfef5%"' // note it is the 2nd token in the channels field above

The following infix search works: 'channels like "%c8a80b710e455460ae8b2399f5adfef5%"' // note it is the 2nd token in the channels field above

And the following prefix search works but only for the first token in the channels field: 'channels like "48d302b1963841c39790fecf56b91ddc%"' // note it is the 1st token in the channels field above

What am I doing wrong?

Solution

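An Array field is probably the right solution here. A `like "abc%"` filter on a VarChar field is matched against the whole stored string, not against each space-separated token, which is why the prefix search only succeeds for the first token. Define "channels" as an Array-type field with VarChar elements; you can then use ARRAY_CONTAINS / ARRAY_CONTAINS_ANY / ARRAY_CONTAINS_ALL to filter on its elements. An example: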

import random
from pymilvus import (
MilvusClient, DataType,
)
# Connect to a local Milvus instance and show which collections already exist.
client = MilvusClient(uri="http://localhost:19530")
print(client.list_collections())

collection_name = "AAA"

# Schema: string primary key, 128-dim float vector, partition-key tag, and
# "channels" as an array of strings (instead of one space-separated string).
schema = client.create_schema(enable_dynamic_field=False)
schema.add_field(
    field_name="id",
    datatype=DataType.VARCHAR,
    is_primary=True,
    auto_id=False,
    max_length=65535,
)
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=128)
schema.add_field(
    field_name="tag",
    datatype=DataType.VARCHAR,
    max_length=65535,
    is_partition_key=True,
)
schema.add_field(
    field_name="channels",
    datatype=DataType.ARRAY,
    element_type=DataType.VARCHAR,
    max_capacity=1000,
    max_length=65535,
)

# Drop any collection left over from a previous run, then create it afresh.
client.drop_collection(collection_name=collection_name)
client.create_collection(collection_name=collection_name, schema=schema)
print(f"Collection '{collection_name}' created")

# Index the vector and tag fields, then load the collection.
index_params = client.prepare_index_params()
index_params.add_index(field_name="vector", index_type="FLAT", metric_type="IP")
index_params.add_index(field_name="tag", index_type="INVERTED")
client.create_index(collection_name=collection_name, index_params=index_params)
client.load_collection(collection_name=collection_name)

# Three sample rows: (id, tag, channels). Each gets its own random vector.
_rows = (
    (
        "A",
        "a_tag",
        [
            "48d302b1963841c39790fecf56b91ddc",
            "c8a80b710e455460ae8b2399f5adfef5",
            "b9e8870f9f994bc6a172e118fa6e7c8a",
        ],
    ),
    ("B", "b_tag", ["12345", "678910", "c8a80b710e455460ae8b2399f5adfef5"]),
    ("C", "c_tag", ["48d302b1963841c39790fecf56b91ddc", "bbb", "aaa"]),
)
data = [
    {
        "id": row_id,
        "vector": [random.random() for _ in range(128)],
        "tag": row_tag,
        "channels": row_channels,
    }
    for row_id, row_tag, row_channels in _rows
]
ids = client.insert(collection_name=collection_name, data=data)
print("insert done")
def search(filter: str) -> None:
    """Search the collection with a random query vector and print each hit.

    The query vector is generated randomly on every call, so the order of
    the hits is not meaningful; the output shows which entities pass
    ``filter`` and what their ``channels`` values are.

    Args:
        filter: Boolean filter expression passed through to
            ``client.search``. The name shadows the ``filter`` builtin but
            is kept because callers pass it by keyword.
    """
    print("\nSearch with filter: " + filter)
    target_vector = [random.random() for _ in range(128)]
    results = client.search(
        collection_name=collection_name,
        data=[target_vector],
        limit=10,
        filter=filter,
        anns_field="vector",
        search_params={},
        consistency_level="Strong",
        output_fields=["channels"],
    )
    # A single query vector is sent; print every hit in each returned group.
    for hits in results:
        for hit in hits:
            print(hit)
# Run one search per array filter operator, in the same order as before:
# ARRAY_CONTAINS, then ARRAY_CONTAINS_ALL, then ARRAY_CONTAINS_ANY.
for _filter_expr in (
    'ARRAY_CONTAINS(channels, "c8a80b710e455460ae8b2399f5adfef5")',
    'ARRAY_CONTAINS_ALL(channels, ["48d302b1963841c39790fecf56b91ddc", "c8a80b710e455460ae8b2399f5adfef5", "b9e8870f9f994bc6a172e118fa6e7c8a"])',
    'ARRAY_CONTAINS_ANY(channels, ["48d302b1963841c39790fecf56b91ddc", "c8a80b710e455460ae8b2399f5adfef5"])',
):
    search(filter=_filter_expr)