database vector-database milvus

Cannot get prefix search to work on a scalar text field


I cannot get prefix search to work on a scalar field. I can, however, get infix search to work, which suggests that my field is not properly tokenized.

ENV: Standalone Milvus 2.4.8 on Ubuntu 22.04

Here is my schema:

// Collection definition: field list, partition key, and the indexes to build.
{
fields: [
                // Primary key, stored as a VarChar.
                {
                    name: 'id',
                    description: 'Id field',
                    data_type: DataType.VarChar,
                    is_primary_key: true,
                    max_length: this.NORMALISED_GUID_LENGTH
                },
                // Embedding; dimensionality comes from the embedding model in use.
                {
                    name: 'vector',
                    description: 'Vector field',
                    data_type: DataType.FloatVector,
                    dim: this.embeddingModel.dims
                },
                // Used as the partition key (see partition_key_field below).
                {
                    name: 'tag',
                    description: 'The partition tag',
                    data_type: DataType.VarChar,
                    max_length: this.NORMALISED_TAG_LENGTH
                },
                // A single VarChar value; per the example further down it holds
                // several channel ids separated by spaces in one string.
                {
                    name: 'channels',
                    description: 'The channels that may access this entry',
                    data_type: DataType.VarChar,
                    max_length: this.MAX_CHANNELS_LENGTH
                },
                {
                    name: "payload",
                    description: 'The payload meta data',
                    data_type: DataType.JSON
                }
            ],
            partition_key_field: 'tag',
            index_params: [
                {
                    field_name: "vector",
                    index_type: "DISKANN",
                    metric_type: "IP"
                },
                {
                    field_name: "tag",
                    index_name: "tag_index",
                    index_type: "INVERTED"
                },
                // NOTE(review): this index is built on the 'channels' string as one
                // scalar value; it presumably does not split the value on spaces --
                // confirm against the Milvus INVERTED index documentation.
                {
                    field_name: "channels",
                    index_name: "channels_index",
                    index_type: "INVERTED"
                }
            ]
}

The field of interest is 'channels'.

The channels field can look like this: "48d302b1963841c39790fecf56b91ddc c8a80b710e455460ae8b2399f5adfef5 b9e8870f9f994bc6a172e118fa6e7c8a"

When I search it the filter looks like this (but it does not work): 'channels like "c8a80b710e455460ae8b2399f5adfef5%"' // note it is the 2nd token in the channels field above

The following infix search works: 'channels like "%c8a80b710e455460ae8b2399f5adfef5%"' // note it is the 2nd token in the channels field above

And the following prefix search works but only for the first token in the channels field: 'channels like "48d302b1963841c39790fecf56b91ddc%"' // note it is the 1st token in the channels field above

What am I doing wrong?


Solution

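  • A `like` filter is matched against the whole VarChar value, not against its space-separated tokens, so a prefix pattern only matches when the entire string starts with it — which is why only the first channel is found. An Array field is probably the proper solution: define "channels" as an Array-type field, and then use ARRAY_CONTAINS / ARRAY_CONTAINS_ANY / ARRAY_CONTAINS_ALL to filter on its elements. An example: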

    import random
    from pymilvus import (
    MilvusClient, DataType,
    )
    # Connect to the local Milvus instance and show the collections it already has.
    client = MilvusClient(uri="http://localhost:19530")
    print(client.list_collections())

    collection_name = "AAA"

    # Fixed schema (dynamic fields disabled): string primary key, 128-dim float
    # vector, partition-key tag, and "channels" as an array of strings.
    schema = client.create_schema(enable_dynamic_field=False)
    schema.add_field(
        "id",
        DataType.VARCHAR,
        is_primary=True,
        auto_id=False,
        max_length=65535,
    )
    schema.add_field("vector", DataType.FLOAT_VECTOR, dim=128)
    schema.add_field(
        "tag",
        DataType.VARCHAR,
        max_length=65535,
        is_partition_key=True,
    )
    schema.add_field(
        field_name="channels",
        datatype=DataType.ARRAY,
        element_type=DataType.VARCHAR,
        max_capacity=1000,
        max_length=65535,
    )

    # Drop any collection of the same name first, then create it from the schema.
    client.drop_collection(collection_name=collection_name)
    client.create_collection(collection_name=collection_name, schema=schema)
    print(f"Collection '{collection_name}' created")

    # Index the vector and tag fields, then load the collection.
    index_params = client.prepare_index_params()
    index_params.add_index(
        field_name="vector",
        index_type="FLAT",
        metric_type="IP",
    )
    index_params.add_index(field_name="tag", index_type="INVERTED")
    client.create_index(collection_name, index_params)
    client.load_collection(collection_name=collection_name)

    # Three sample rows: (id, tag, channels). Each one gets a random vector.
    _rows = (
        (
            "A",
            "a_tag",
            [
                "48d302b1963841c39790fecf56b91ddc",
                "c8a80b710e455460ae8b2399f5adfef5",
                "b9e8870f9f994bc6a172e118fa6e7c8a",
            ],
        ),
        ("B", "b_tag", ["12345", "678910", "c8a80b710e455460ae8b2399f5adfef5"]),
        ("C", "c_tag", ["48d302b1963841c39790fecf56b91ddc", "bbb", "aaa"]),
    )
    data = [
        {
            "id": row_id,
            "vector": [random.random() for _ in range(128)],
            "tag": row_tag,
            "channels": row_channels,
        }
        for row_id, row_tag, row_channels in _rows
    ]
    ids = client.insert(collection_name=collection_name, data=data)
    print("insert done")
    def search(filter: str) -> None:
        """Run a filtered vector search on the collection and print every hit.

        The query vector is random, so the ordering of the hits carries no
        meaning; the point is to show which rows pass ``filter``.

        Args:
            filter: Milvus filter expression, e.g.
                ``'ARRAY_CONTAINS(channels, "abc")'``. The name shadows the
                builtin but is kept because callers pass it by keyword.
        """
        print("\nSearch with filter: " + filter)
        target_vector = [random.random() for _ in range(128)]
        results = client.search(
            collection_name=collection_name,
            data=[target_vector],
            limit=10,
            filter=filter,
            anns_field="vector",
            search_params={},
            # Presumably requested so the rows inserted just before are visible
            # to this search -- confirm against the Milvus consistency docs.
            consistency_level="Strong",
            output_fields=["channels"],
        )
        # One list of hits comes back per query vector; only one was sent.
        for hits in results:
            for hit in hits:
                print(hit)
    # Demonstrate the three array operators on the "channels" field, in order:
    # a single element, all of the listed elements, any of the listed elements.
    for _expr in (
        'ARRAY_CONTAINS(channels, "c8a80b710e455460ae8b2399f5adfef5")',
        'ARRAY_CONTAINS_ALL(channels, ["48d302b1963841c39790fecf56b91ddc", "c8a80b710e455460ae8b2399f5adfef5", "b9e8870f9f994bc6a172e118fa6e7c8a"])',
        'ARRAY_CONTAINS_ANY(channels, ["48d302b1963841c39790fecf56b91ddc", "c8a80b710e455460ae8b2399f5adfef5"])',
    ):
        search(filter=_expr)