Tags: azure, azure-ai-search, microsoft-copilot

How to populate an Azure AI Search field with a custom skill?


I am trying to populate a parsedDate field in an Azure AI Search index so that I can later create a scoring profile that uses freshness to improve search results.

I defined the index, skillset, and indexer according to the documentation, but when I run the indexer the field is always null.

This is how I defined the skill in my skillset:


  {
      "@odata.type": "#Microsoft.Skills.Custom.WebApiSkill",
      "name": "#0",
      "description": "A custom skill that parses dates from file names",
      "uri": "https://az-function.azurewebsites.net/api/custom-skill-date-extraction?code=ABC",
      "httpMethod": "POST",
      "timeout": "PT30S",
      "batchSize": 1,
      "context": "/document",
      "inputs": [
        {
          "name": "fileName",
          "source": "/document/metadata_storage_path" 
        }
      ],
      "outputs": [
        {
          "name": "parsedDate",
          "targetName": "parsedDate"
        }
      ]
    }

This is how I defined the index:

def main(req: func.HttpRequest) -> func.HttpResponse:
    logging.info('Python HTTP trigger function processed a request.')

    # Environment Variables
    endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
    endpoint_openai = os.environ["AZURE_OPENAI_ENDPOINT"]
    deployment_id = os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT_ID"]
    credential_search = AzureKeyCredential(os.environ["AZURE_SEARCH_ADMIN_KEY"]) if os.environ["AZURE_SEARCH_ADMIN_KEY"] else DefaultAzureCredential()
    credential = DefaultAzureCredential()
    customer = readRequestBody(req)
    _ , datasource_name = utils.getStorageAccountInfo(customer, credential)
    index_name = utils.get_index_name( datasource_name)

    # Logic for creating a search index
    try:
        index_client = SearchIndexClient(endpoint=endpoint, credential=credential_search)
        fields = [
            SearchField(name="parent_id", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True),
            SearchField(name="parsedDate", type=SearchFieldDataType.DateTimeOffset, sortable=True, filterable=True, facetable=True),
            SearchField(name="title", type=SearchFieldDataType.String),
            SearchField(name="chunk_id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True, analyzer_name="keyword"),
            SearchField(name="chunk", type=SearchFieldDataType.String, sortable=False, filterable=False, facetable=False),
            SearchField(name="vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),
        ]

        vector_search = VectorSearch(
            algorithms=[
                HnswAlgorithmConfiguration(
                    name="myHnsw",
                    parameters=HnswParameters(
                        m=4,
                        ef_construction=400,
                        ef_search=500,
                        metric=VectorSearchAlgorithmMetric.COSINE,
                    ),
                ),
                ExhaustiveKnnAlgorithmConfiguration(
                    name="myExhaustiveKnn",
                    parameters=ExhaustiveKnnParameters(
                        metric=VectorSearchAlgorithmMetric.COSINE,
                    ),
                ),
            ],
            profiles=[
                VectorSearchProfile(
                    name="myHnswProfile",
                    algorithm_configuration_name="myHnsw",
                    vectorizer="myOpenAI",
                ),
                VectorSearchProfile(
                    name="myExhaustiveKnnProfile",
                    algorithm_configuration_name="myExhaustiveKnn",
                    vectorizer="myOpenAI",
                ),
            ],
            vectorizers=[
                AzureOpenAIVectorizer(
                    name="myOpenAI",
                    kind="azureOpenAI",
                    azure_open_ai_parameters=AzureOpenAIParameters(
                        resource_uri=endpoint_openai,
                        deployment_id=deployment_id,
                        api_key=credential_search,
                    ),
                ),
            ],
        )
        semantic_search = SemanticSearch(configurations=[SemanticConfiguration(
            name="my-semantic-config",
            prioritized_fields=SemanticPrioritizedFields(content_fields=[SemanticField(field_name="chunk"),SemanticField(field_name="title")] )
        )])

        index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search, semantic_search=semantic_search)
        result = index_client.create_or_update_index(index)
        return func.HttpResponse(f"{result.name} created", status_code=200)
    except Exception as e:
        return func.HttpResponse(f"Failed to create or update the index. Error: {str(e)}", status_code=500)

And finally, how I configured the indexer:


def main(req: func.HttpRequest) -> func.HttpResponse:
    logging.info('Python HTTP trigger function processed a request.')
    
    # Environment Variables
    endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
    credential_search = AzureKeyCredential(os.environ["AZURE_SEARCH_ADMIN_KEY"])
    customer = readRequestBody(req)
    credential = DefaultAzureCredential()
    _, data_source_name= utils.getStorageAccountInfo(customer, credential)
    index_name = utils.get_index_name(data_source_name)
    skillset_name = utils.get_skillset_name(data_source_name)

    
    # Indexer creation logic
    try:
        indexer_name = f"{data_source_name}-indexer"
        indexer = SearchIndexer(
            name=indexer_name,
            description="Indexer to index documents and generate embeddings",
            skillset_name=skillset_name,
            target_index_name=index_name,
            data_source_name=data_source_name,
            field_mappings=[FieldMapping(source_field_name="metadata_storage_name", target_field_name="title"),
                            FieldMapping(source_field_name="parsedDate", target_field_name="parsedDate")],
            parameters=IndexingParameters(
                configuration={
                    "dataToExtract": "contentAndMetadata",
                    "imageAction": "generateNormalizedImages"
                }
            )
        )
        
        indexer_client = SearchIndexerClient(endpoint, credential_search)
        indexer_result = indexer_client.create_or_update_indexer(indexer)
        
        # Run the indexer
        indexer_client.run_indexer(indexer_name)
        message = f'{indexer_name} is created and running. If queries return no results, please wait a bit and try again.'
        logging.info(message)
        return func.HttpResponse(message, status_code=200)
    except Exception as e:
        error_message = f"Failed to create or run the indexer. Error: {str(e)}"
        logging.error(error_message)
        return func.HttpResponse(error_message, status_code=500)

I have also tried using output_field_mappings in the indexer, but to no avail.

Any suggestion would be great.



Solution

  • You will receive null values for the following reasons:

    1. Incorrect outputFieldMappings in the indexer, for example a mismatch between the skillset output targetName and the sourceFieldName in the indexer definition.
    2. Missing or empty outputFieldMappings.
    3. A mismatch between the output name in the skillset and the key returned in data by the custom Web API.

    Example: if parsedDate is the output name in the skillset and your custom Web API returns data like the example below (note the lowercase key parsedate), you will get a null value. The key inside data must exactly match the skillset output name.

    {
      "values": [
        {
          "recordId": "0",
          "data": {
            "parsedate": "2015-01-01T00:00:00.000Z"
          }
        }
      ]
    }
    

    Next, use the following code in your Azure function when returning values:

        req_body = req.get_json()
        values = req_body.get('values')
        res = []
        for record in values:
            tmp = record
            # The key in 'data' must match the skillset output name exactly.
            tmp['data'] = {'parsedDate': "parsed_date_from_path"}  # example: 2015-01-01T00:00:00.000Z
            res.append(tmp)
        if res:
            return func.HttpResponse(json.dumps({"values": res}), mimetype="application/json")
    
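    For a fuller picture, here is a minimal sketch of the whole custom skill function, assuming the date is embedded in the blob path in YYYY-MM-DD form; the regex and the warning handling are illustrative additions, not part of the original setup:

        import json
        import logging
        import re

        import azure.functions as func


        def main(req: func.HttpRequest) -> func.HttpResponse:
            logging.info('Custom skill: parsing dates from file paths.')
            req_body = req.get_json()
            results = []

            for record in req_body.get("values", []):
                file_name = record.get("data", {}).get("fileName", "")
                # Assumed naming convention: the path contains a date such as 2015-01-01.
                match = re.search(r"(\d{4}-\d{2}-\d{2})", file_name)

                output = {"recordId": record["recordId"], "data": {}, "errors": [], "warnings": []}
                if match:
                    # Return ISO 8601 so the value can be indexed as Edm.DateTimeOffset.
                    output["data"]["parsedDate"] = f"{match.group(1)}T00:00:00Z"
                else:
                    output["warnings"].append({"message": f"No date found in '{file_name}'."})
                results.append(output)

            return func.HttpResponse(json.dumps({"values": results}), mimetype="application/json")

    The parsedDate key in data is what the skillset output name refers to; after enrichment it is exposed as /document/parsedDate for the outputFieldMappings below.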

    Next, configure outputFieldMappings. Given this output definition in the skillset:

    "outputs": [
            {
              "name": "parsedDate",
              "targetName": "parsedDate"
            }]
    

    For the above output in the skillset, you need to provide outputFieldMappings in the indexer like this:

    "outputFieldMappings": [
        {
          "sourceFieldName": "/document/parsedDate", // the skillset output (context + targetName)
          "targetFieldName": "parsedDate" // the target index field name
        }
      ]
    

    Or in code:

    indexer_name = "vs-code-2-indexer"
    indexer = SearchIndexer(
        name=indexer_name,
        description="Indexer to index documents and generate embeddings",
        skillset_name="skillset1712921532571",
        target_index_name="vs-code-2",
        data_source_name="hotels-sample",
        output_field_mappings=[FieldMapping(source_field_name="/document/parsedDate", target_field_name="parsedDate")]
    )
    

    If the above solution doesn't work, delete the current index and create a new index with the same definition. Then reset and run the indexer with the above outputFieldMappings.
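
    As a minimal sketch of that reset-and-run step with the azure-search-documents Python SDK (the endpoint/key environment variable names and the indexer name are reused from the examples above):

        import os

        from azure.core.credentials import AzureKeyCredential
        from azure.search.documents.indexes import SearchIndexerClient

        endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
        credential = AzureKeyCredential(os.environ["AZURE_SEARCH_ADMIN_KEY"])
        indexer_client = SearchIndexerClient(endpoint, credential)

        # Reset clears the indexer's change-tracking state so all documents are reprocessed,
        # then a run re-indexes them with the updated outputFieldMappings in place.
        indexer_client.reset_indexer("vs-code-2-indexer")
        indexer_client.run_indexer("vs-code-2-indexer")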

    (Image: Indexer Configuration)