I have the following custom WebApiSkill:
@app.route(route="CustomSplitSkill", auth_level=func.AuthLevel.FUNCTION)
def CustomSplitPageSkill(req: func.HttpRequest) -> func.HttpResponse:
    """Custom WebApiSkill entry point.

    Cleans the input text of each record, splits it into overlapping
    chunks, and returns the chunks plus their page numbers in the
    WebApiSkill response shape ({"values": [{"recordId", "data"}, ...]}).

    Returns HTTP 400 with "Invalid input" when the body is not JSON,
    and HTTP 400 with "Function app crashed" on any processing failure.
    """
    logging.info('Python HTTP trigger function processed a request.')
    try:
        req_body = req.get_json()
    except ValueError:
        return func.HttpResponse("Invalid input", status_code=400)
    try:
        # 'values' is the expected top-level key in the request body.
        response_body = {"values": []}
        for value in req_body.get('values', []):
            recordId = value.get('recordId')
            text = value.get('data', {}).get('text', '')
            # Replace runs of quotes/commas/dots/newlines and digit runs
            # with a single space.
            cleaned_text = re.sub(r"[',.\n]+|\d+", ' ', text)
            # Collapse multiple spaces and trim leading/trailing spaces.
            cleaned_text = re.sub(r'\s{2,}', ' ', cleaned_text).strip()
            # Drop sequences of ". " occurring three or more times.
            cleaned_text = re.sub(r"(\. ){3,}", "", cleaned_text)
            chunks, page_numbers = split_text_into_chunks_with_overlap(
                cleaned_text, chunk_size=256, overlap_size=20)
            # Response object for this specific document.
            response_record = {
                "recordId": recordId,
                "data": {
                    "textItems": chunks,         # list of str chunks
                    "numberItems": page_numbers  # list of int page numbers
                }
            }
            response_body['values'].append(response_record)
        return func.HttpResponse(json.dumps(response_body), mimetype="application/json")
    except Exception:
        # Catch any processing failure (the original caught only
        # ValueError, so most crashes escaped unhandled); log the stack
        # trace and report an explicit error to the indexer.
        logging.exception("CustomSplitSkill failed")
        return func.HttpResponse("Function app crashed", status_code=400)
The inputs and outputs of this skill in the skillset are defined like this:
inputs=[
InputFieldMappingEntry(name="text", source="/document/content")
],
outputs=[
OutputFieldMappingEntry(name="textItems", target_name="pages"),
OutputFieldMappingEntry(name="numberItems", target_name="numbers")
],
And the SearchIndexerIndexProjectionSelector is configured in the following way:
# Index projections: emit one child index document per text chunk.
index_projections = SearchIndexerIndexProjections(
selectors=[
SearchIndexerIndexProjectionSelector(
target_index_name=index_name,
parent_key_field_name="parent_id",
# Each element of /document/pages/* becomes its own index document.
source_context="/document/pages/*",
mappings=[
InputFieldMappingEntry(name="chunk", source="/document/pages/*"),
InputFieldMappingEntry(name="vector", source="/document/pages/*/vector"),
InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),
# NOTE(review): /document/numbers/* lies outside the source_context
# above, so it is not iterated per chunk — the whole array is passed
# as one literal value, producing the Edm.String-vs-Edm.Int32 error.
InputFieldMappingEntry(name="page_number", source="/document/numbers/*"),
],
),
],
parameters=SearchIndexerIndexProjectionsParameters(
# Only the chunk documents are indexed; the parent document is skipped.
projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
),
)
My search fields look like this:
# Index schema for the chunked documents.
fields = [
# Key of the parent (source) document; populated via parent_key_field_name.
SearchField(
name="parent_id",
type=SearchFieldDataType.String,
sortable=True,
filterable=True,
facetable=True
),
# Source file name (mapped from metadata_storage_name).
SearchField(
name="title",
type=SearchFieldDataType.String
),
# Document key for each chunk; keyword analyzer keeps the key un-tokenized.
SearchField(
name="chunk_id",
type=SearchFieldDataType.String,
key=True,
sortable=True,
filterable=True,
facetable=True,
analyzer_name="keyword"
),
# The chunk text itself.
SearchField(
name="chunk",
type=SearchFieldDataType.String,
sortable=False,
filterable=False,
facetable=False
),
# Embedding vector for the chunk (1536 dims, e.g. text-embedding-ada-002).
SearchField(
name="vector",
type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
vector_search_dimensions=1536,
vector_search_profile_name="myHnswProfile"
),
# Page number of the chunk — expects a single Int32 per chunk document.
SearchField(
name="page_number",
type=SearchFieldDataType.Int32,
sortable=True,
filterable=True,
facetable=True
),
]
I get the following error:
The data field 'page_number' in the document with key 'xyz' has an invalid value of type 'Edm.String' ('String maps to Edm.String'). The expected type was 'Edm.Int32'.
When changing the field type to String, the index creation passes, with the following result under page_number:
"page_number": "[1,2,3,4,5,6,7,...]"
But I want a single value under each chunk.
The reason you are getting this error is the source_context
in the index projection.
The context /document/pages/*
works for chunk
because chunk is produced under that context, but numbers
lives under the separate /document/numbers/*
context, so it is not projected per chunk into the target index; instead the whole array is passed as one literal value.
To fix this, you need to return a list of dictionaries — each containing one chunk and its number — from the custom Web API skill.
Below is the example.
# Each subdata element pairs one text chunk with its page number, so a
# single source_context (/document/subdata/*) can project both fields
# per chunk document.
response_record = {
"recordId": recordId,
"data": {
"subdata": [
{'textItems': 'This is string 1.', 'numberItems': 1},
{'textItems': 'This is string 2.', 'numberItems': 2},
{'textItems': 'This is string 3.', 'numberItems': 3},
{'textItems': 'This is string 4.', 'numberItems': 4},
{'textItems': 'This is string 5.', 'numberItems': 5},
{'textItems': 'This is string 6.', 'numberItems': 6}]
}
}
Here, I am returning the field subdata,
a list of dictionaries with the fields textItems
and numberItems
.
You can use the code below to convert your output into that shape.
combined_list = [{'textItems': text, 'numberItems': number} for text, number in zip(chunks, page_numbers)]
Next, you can reference this directly in the index projection; there is no need for output field mappings.
Skillset definition:
"indexProjections": {
"selectors": [
{
"targetIndexName": "targetindex",
"parentKeyFieldName": "parent_id",
"sourceContext": "/document/subdata/*",
"mappings": [
{
"name": "chunk",
"source": "/document/subdata/*/textItems"
},
{
"name": "page_number",
"source": "/document/subdata/*/numberItems"
}
]
}
],
"parameters": {}
},
Here, I set sourceContext
to /document/subdata/*
; within that context I reference
/document/subdata/*/textItems
for chunk
and /document/subdata/*/numberItems
for page_number
.
Output:
Note: Use metadata_storage_path
or metadata_storage_name
to differentiate the chunks belonging to each document.
UPDATE
Skillset
{
"@odata.context": "https://jgsaisearch.search.windows.net/$metadata#skillsets/$entity",
"@odata.etag": "\"0x8DC8B62DB084EBF\"",
"name": "skillset1718252807308",
"description": "",
"skills": [
{
"@odata.type": "#Microsoft.Skills.Custom.WebApiSkill",
"name": "jgs_func",
"description": "",
"context": "/document",
"uri": "https://jgsfunctionapp.azurewebsites.net/api/http_trigger1?code=TITuBRAeLG0EfpzrhmylNpvEJRITVUUmtb1JKi1MNYXuAzFuLFTeUg==",
"httpMethod": "POST",
"timeout": "PT30S",
"batchSize": 1,
"degreeOfParallelism": 1,
"authResourceId": "",
"inputs": [
{
"name": "content",
"source": "/document/content"
}
],
"outputs": [
{
"name": "subdata",
"targetName": "subdata"
}
],
"httpHeaders": {},
"authIdentity": null
}
],
"cognitiveServices": {
"@odata.type": "#Microsoft.Azure.Search.DefaultCognitiveServices",
"description": null
},
"knowledgeStore": null,
"indexProjections": {
"selectors": [
{
"targetIndexName": "targetindex",
"parentKeyFieldName": "parent_id",
"sourceContext": "/document/subdata/*",
"mappings": [
{
"name": "chunk",
"source": "/document/subdata/*/textItems",
"sourceContext": null,
"inputs": []
},
{
"name": "page_number",
"source": "/document/subdata/*/numberItems",
"sourceContext": null,
"inputs": []
}
]
}
],
"parameters": {}
},
"encryptionKey": null
}