I am trying to populate a ParsedDate field in an Azure Search Index so that I can later on create a scoring profile to improve the search results (using Freshness).
I defined the index, skillset and indexer according to the documentation but when I run the indexer, the field is always Null.
This is how I defined the skill in my skillset:
{
"@odata.type": "#Microsoft.Skills.Custom.WebApiSkill",
"name": "#0",
"description": "A custom skill that parses dates from file names",
"uri": "https://az-function.azurewebsites.net/api/custom-skill-date-extraction?code=ABC",
"httpMethod": "POST",
"timeout": "PT30S",
"batchSize": 1,
"context": "/document",
"inputs": [
{
"name": "fileName",
"source": "/document/metadata_storage_path"
}
],
"outputs": [
{
"name": "parsedDate",
"targetName": "parsedDate"
}
]
}
This is how I defined the index:
def main(req: func.HttpRequest) -> func.HttpResponse:
    """Create or update the Azure AI Search index for a customer.

    Reads the customer from the request body, derives the data-source and
    index names from the customer's storage account, and creates/updates a
    vector + semantic search index.

    Returns:
        200 with the created index name on success;
        500 with the error message if index creation fails.
    """
    logging.info('Python HTTP trigger function processed a request.')

    # Environment variables
    endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
    endpoint_openai = os.environ["AZURE_OPENAI_ENDPOINT"]
    deployment_id = os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT_ID"]
    # BUG FIX: os.environ["AZURE_SEARCH_ADMIN_KEY"] raises KeyError when the
    # variable is unset, so the DefaultAzureCredential fallback could never be
    # reached. Use .get() so a missing/empty key falls back as intended.
    admin_key = os.environ.get("AZURE_SEARCH_ADMIN_KEY")
    credential_search = AzureKeyCredential(admin_key) if admin_key else DefaultAzureCredential()
    credential = DefaultAzureCredential()

    customer = readRequestBody(req)
    _, datasource_name = utils.getStorageAccountInfo(customer, credential)
    index_name = utils.get_index_name(datasource_name)

    # Logic for creating a search index
    try:
        index_client = SearchIndexClient(endpoint=endpoint, credential=credential_search)

        # Index schema. "parsedDate" is populated by the custom skill via the
        # indexer's output field mappings; it is sortable/filterable so a
        # freshness scoring profile can use it.
        fields = [
            SearchField(name="parent_id", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True),
            SearchField(name="parsedDate", type=SearchFieldDataType.DateTimeOffset, sortable=True, filterable=True, facetable=True),
            SearchField(name="title", type=SearchFieldDataType.String),
            SearchField(name="chunk_id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True, analyzer_name="keyword"),
            SearchField(name="chunk", type=SearchFieldDataType.String, sortable=False, filterable=False, facetable=False),
            SearchField(name="vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),
        ]

        # Vector search: HNSW (approximate) and exhaustive KNN algorithms, each
        # exposed through a profile that shares the Azure OpenAI vectorizer.
        vector_search = VectorSearch(
            algorithms=[
                HnswAlgorithmConfiguration(
                    name="myHnsw",
                    parameters=HnswParameters(
                        m=4,
                        ef_construction=400,
                        ef_search=500,
                        metric=VectorSearchAlgorithmMetric.COSINE,
                    ),
                ),
                ExhaustiveKnnAlgorithmConfiguration(
                    name="myExhaustiveKnn",
                    parameters=ExhaustiveKnnParameters(
                        metric=VectorSearchAlgorithmMetric.COSINE,
                    ),
                ),
            ],
            profiles=[
                VectorSearchProfile(
                    name="myHnswProfile",
                    algorithm_configuration_name="myHnsw",
                    vectorizer="myOpenAI",
                ),
                VectorSearchProfile(
                    name="myExhaustiveKnnProfile",
                    algorithm_configuration_name="myExhaustiveKnn",
                    vectorizer="myOpenAI",
                ),
            ],
            vectorizers=[
                AzureOpenAIVectorizer(
                    name="myOpenAI",
                    kind="azureOpenAI",
                    azure_open_ai_parameters=AzureOpenAIParameters(
                        resource_uri=endpoint_openai,
                        deployment_id=deployment_id,
                        # NOTE(review): this passes the *search* credential object,
                        # but api_key is expected to be the Azure OpenAI API key
                        # string — confirm, and likely replace with
                        # os.environ["AZURE_OPENAI_KEY"] (or omit to use managed
                        # identity).
                        api_key=credential_search,
                    ),
                ),
            ],
        )

        semantic_search = SemanticSearch(configurations=[SemanticConfiguration(
            name="my-semantic-config",
            prioritized_fields=SemanticPrioritizedFields(
                content_fields=[SemanticField(field_name="chunk"), SemanticField(field_name="title")]
            ),
        )])

        index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search, semantic_search=semantic_search)
        result = index_client.create_or_update_index(index)
        return func.HttpResponse(f"{result.name} created", status_code=200)
    except Exception as e:
        return func.HttpResponse(f"Failed to create or update the index. Error: {str(e)}", status_code=500)
And finally, this is how I configured the indexer (Python):
def main(req: func.HttpRequest) -> func.HttpResponse:
    """Create (or update) and run the search indexer for a customer.

    Wires the data source, skillset, and index together, mapping source
    metadata and skillset outputs into index fields.

    Returns:
        200 with a status message on success;
        500 with the error message if indexer creation or execution fails.
    """
    logging.info('Python HTTP trigger function processed a request.')

    # Environment Variables
    endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
    credential_search = AzureKeyCredential(os.environ["AZURE_SEARCH_ADMIN_KEY"])
    customer = readRequestBody(req)
    credential = DefaultAzureCredential()
    _, data_source_name = utils.getStorageAccountInfo(customer, credential)
    index_name = utils.get_index_name(data_source_name)
    skillset_name = utils.get_skillset_name(data_source_name)

    # Indexer creation logic
    try:
        indexer_name = f"{data_source_name}-indexer"
        indexer = SearchIndexer(
            name=indexer_name,
            description="Indexer to index documents and generate embeddings",
            skillset_name=skillset_name,
            target_index_name=index_name,
            data_source_name=data_source_name,
            # field_mappings copy values that exist on the SOURCE document
            # (blob metadata) into index fields.
            field_mappings=[
                FieldMapping(source_field_name="metadata_storage_name", target_field_name="title"),
            ],
            # BUG FIX: parsedDate is produced by the custom skill, not by the
            # source document, so mapping it in field_mappings left the index
            # field null. Skillset outputs must be mapped via
            # output_field_mappings using the enriched-document path
            # ("/document/<targetName>" from the skill's output).
            output_field_mappings=[
                FieldMapping(source_field_name="/document/parsedDate", target_field_name="parsedDate"),
            ],
            parameters=IndexingParameters(
                configuration={
                    "dataToExtract": "contentAndMetadata",
                    "imageAction": "generateNormalizedImages",
                }
            ),
        )

        indexer_client = SearchIndexerClient(endpoint, credential_search)
        indexer_result = indexer_client.create_or_update_indexer(indexer)

        # Run the indexer
        indexer_client.run_indexer(indexer_name)
        message = f'{indexer_name} is created and running. If queries return no results, please wait a bit and try again.'
        logging.info(message)
        return func.HttpResponse(message, status_code=200)
    except Exception as e:
        error_message = f"Failed to create or run the indexer. Error: {str(e)}"
        logging.error(error_message)
        return func.HttpResponse(error_message, status_code=500)
I have also tried using out_field_mappings in the indexer but to no avail.
Any suggestion would be great.
Azure AI Search
You will receive null values for the following reasons: an incorrect `outputFieldMappings` configuration in the indexer — for example, a name mismatch between the skillset output `targetName` and the `sourceFieldName` in the indexer definition's `outputFieldMappings`.
Example:
If `parsedDate` is given in the output `name` field of the skillset but your custom skill returns data like the example below, you will get a null value — the key returned in the skill's response must exactly match the output name defined in the skillset.
{
"values": [
{
"recordId": "0",
"data": {
"parsedate": "2015-01-01T00:00:00.000Z"
}
}
]
}
Next, use the following code in your Azure function when returning values:
req_body = req.get_json()
values = req_body.get('values')
res = []
for i in values:
tmp = i
tmp['data'] = {'parsedDte': "parsed_date_from_path"} # example: 2015-01-01T00:00:00.000Z
res.append(tmp)
if res:
return func.HttpResponse(json.dumps({"values": res}), mimetype="application/json")
Configure outputFieldMappings
:
"outputs": [
{
"name": "parsedDate",
"targetName": "parsedDate"
}]
For the above output in the skillset, you need to provide outputFieldMappings
like this:
"outputFieldMappings": [
{
"sourceFieldName": "/document/parsedDate", # output of skillset
"targetFieldName": "parsedDate" # Target index field name
}
]
Or in code:
# Example: define an indexer that routes the skillset's enriched output
# into the target index field via output_field_mappings.
indexer_name = "vs-code-2-indexer"
date_output_mapping = FieldMapping(
    source_field_name="/document/parsedDate",  # path of the skillset output
    target_field_name="parsedDate",            # index field to populate
)
indexer = SearchIndexer(
    name=indexer_name,
    description="Indexer to index documents and generate embeddings",
    skillset_name="skillset1712921532571",
    target_index_name="vs-code-2",
    data_source_name="hotels-sample",
    output_field_mappings=[date_output_mapping],
)
If the above solution doesn't work, delete the current index and create a new index with the same definition. Then reset and run the indexer with the above outputFieldMappings
.