
Error in Azure Cognitive Search when storing the document page associated with each chunk extracted from a PDF in a custom WebApiSkill


I have the following custom WebApiSkill:

import json
import logging
import re

import azure.functions as func

app = func.FunctionApp()

@app.route(route="CustomSplitSkill", auth_level=func.AuthLevel.FUNCTION)
def CustomSplitAndPageSkill(req: func.HttpRequest) -> func.HttpResponse:
    logging.info('Python HTTP trigger function processed a request.')

    try:
        req_body = req.get_json()
    except ValueError:
        return func.HttpResponse("Invalid input", status_code=400)

    try:
        # 'values' expected top-level key in the request body
        response_body = {"values": []}
        for value in req_body.get('values', []):
            recordId = value.get('recordId')
            text = value.get('data', {}).get('text', '')

            # Remove sequences of dots, numbers following them, and
            # any additional punctuation or newline characters, replacing them with a single space
            cleaned_text = re.sub(r"[',.\n]+|\d+", ' ', text)

            # Replace multiple spaces with a single space and trim leading/trailing spaces
            cleaned_text = re.sub(r'\s{2,}', ' ', cleaned_text).strip()

            # Pattern to match sequences of ". " occurring more than twice
            cleaned_text = re.sub(r"(\. ){3,}", "", cleaned_text)

            chunks, page_numbers = split_text_into_chunks_with_overlap(cleaned_text, chunk_size=256, overlap_size=20)
            
            # response object for specific pdf
            response_record = {
                "recordId": recordId,
                "data": {
                    "textItems": chunks,  # chunks is a str list
                    "numberItems": page_numbers # page_numbers is an int list
                }
            }
            response_body['values'].append(response_record)

        return func.HttpResponse(json.dumps(response_body), mimetype="application/json")
    except ValueError:
        return func.HttpResponse("Function app crashed", status_code=400)

The inputs and outputs of this skill in the skillset are defined like this:

inputs=[
    InputFieldMappingEntry(name="text", source="/document/content")
],
outputs=[
    OutputFieldMappingEntry(name="textItems", target_name="pages"),
    OutputFieldMappingEntry(name="numberItems", target_name="numbers")
],

And the SearchIndexerIndexProjectionSelector is configured in the following way:

index_projections = SearchIndexerIndexProjections(  
        selectors=[  
            SearchIndexerIndexProjectionSelector(  
                target_index_name=index_name,  
                parent_key_field_name="parent_id",  
                source_context="/document/pages/*",  
                mappings=[  
                    InputFieldMappingEntry(name="chunk", source="/document/pages/*"),  
                    InputFieldMappingEntry(name="vector", source="/document/pages/*/vector"),  
                    InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),
                    InputFieldMappingEntry(name="page_number", source="/document/numbers/*"), 
                ],  
            ),  
        ],  
        parameters=SearchIndexerIndexProjectionsParameters(  
            projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS  
        ),  
    )

My search fields look like this:

fields = [  
        SearchField(
            name="parent_id",
            type=SearchFieldDataType.String,
            sortable=True,
            filterable=True,
            facetable=True
        ),  
        SearchField(
            name="title",
            type=SearchFieldDataType.String
        ),  
        SearchField(
            name="chunk_id",
            type=SearchFieldDataType.String,
            key=True,
            sortable=True,
            filterable=True,
            facetable=True,
            analyzer_name="keyword"
        ),  
        SearchField(
            name="chunk",
            type=SearchFieldDataType.String,
            sortable=False,
            filterable=False,
            facetable=False
        ),  
        SearchField(
            name="vector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            vector_search_dimensions=1536,
            vector_search_profile_name="myHnswProfile"
        ),
        SearchField(
            name="page_number",
            type=SearchFieldDataType.Int32,
            sortable=True,
            filterable=True,
            facetable=True
        ), 
    ] 

I get the following error:

The data field 'page_number' in the document with key 'xyz' has an invalid value of type 'Edm.String' ('String maps to Edm.String'). The expected type was 'Edm.Int32'.

When I change the field type to String, the index creation passes, but page_number ends up holding the whole list as a string:

"page_number": "[1,2,3,4,5,6,7,...]"

But I want a single value for each chunk.


Solution

  • The reason you are getting this error is the source_context in the index projection.

    The context /document/pages/* works for chunk because the chunks live directly under that context, but the page numbers live under /document/numbers/*, so they are not projected per chunk into the target index; instead the whole array is taken as a literal value.

    To fix this, return a list of dictionaries containing both the chunk and its page number from the custom WebApiSkill.

    Below is an example.

    response_record = {
                    "recordId": recordId,
                    "data": {
                        "subdata": [
                        {'textItems': 'This is string 1.', 'numberItems': 1},
                        {'textItems': 'This is string 2.', 'numberItems': 2},
                        {'textItems': 'This is string 3.', 'numberItems': 3},
                        {'textItems': 'This is string 4.', 'numberItems': 4},
                        {'textItems': 'This is string 5.', 'numberItems': 5},
                        {'textItems': 'This is string 6.', 'numberItems': 6}]
                        
                    }
                }
    

    Here, I am returning a field subdata that holds a list of dictionaries with the fields textItems and numberItems.

    You can use the code below to convert your existing chunks and page_numbers lists into that structure.

    combined_list = [{'textItems': text, 'numberItems': number} for text, number in zip(chunks, page_numbers)]
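
    Putting it together, a minimal sketch of the relevant part of your custom skill (keeping the variable names from your question, and assuming split_text_into_chunks_with_overlap behaves as in your original code) would be:

    chunks, page_numbers = split_text_into_chunks_with_overlap(cleaned_text, chunk_size=256, overlap_size=20)

    # Pair each chunk with its page number so both end up under the same context
    combined_list = [{'textItems': text, 'numberItems': number} for text, number in zip(chunks, page_numbers)]

    response_record = {
        "recordId": recordId,
        "data": {
            "subdata": combined_list
        }
    }
    response_body['values'].append(response_record)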
    

    Next, you can reference this directly in the index projection; there is no need for additional output field mappings.

    Skillset definition:

      "indexProjections": {
        "selectors": [
          {
            "targetIndexName": "targetindex",
            "parentKeyFieldName": "parent_id",
            "sourceContext": "/document/subdata/*",
            "mappings": [
              {
                "name": "chunk",
                "source": "/document/subdata/*/textItems"
              },
              {
                "name": "page_number",
                "source": "/document/subdata/*/numberItems"
              }
            ]
          }
        ],
        "parameters": {}
      },
    

    Here, I set sourceContext to /document/subdata/*; within that context, /document/subdata/*/textItems maps to chunk and /document/subdata/*/numberItems maps to page_number.
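
    If you are defining the skillset with the Python SDK as in your question, the equivalent projection would look roughly like this (a sketch reusing the model classes and field names from your original code; where the vector output lands depends on where your embedding skill runs):

    index_projections = SearchIndexerIndexProjections(
        selectors=[
            SearchIndexerIndexProjectionSelector(
                target_index_name=index_name,
                parent_key_field_name="parent_id",
                source_context="/document/subdata/*",
                mappings=[
                    InputFieldMappingEntry(name="chunk", source="/document/subdata/*/textItems"),
                    InputFieldMappingEntry(name="page_number", source="/document/subdata/*/numberItems"),
                    InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),
                    # If an embedding skill runs over this context, map its output here,
                    # e.g. source="/document/subdata/*/vector" (assumed path).
                ],
            ),
        ],
        parameters=SearchIndexerIndexProjectionsParameters(
            projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
        ),
    )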

    Output: each chunk is indexed as a separate document with its own single page_number value.

    Note: Use metadata_storage_path or metadata_filename to differentiate the chunks of each source document.
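
    For example, you could add a mapping like this inside the selector, reusing the title field and metadata_storage_name source from your original projection:

      {
        "name": "title",
        "source": "/document/metadata_storage_name"
      }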

    UPDATE

    Skillset

    {
      "@odata.context": "https://jgsaisearch.search.windows.net/$metadata#skillsets/$entity",
      "@odata.etag": "\"0x8DC8B62DB084EBF\"",
      "name": "skillset1718252807308",
      "description": "",
      "skills": [
        {
          "@odata.type": "#Microsoft.Skills.Custom.WebApiSkill",
          "name": "jgs_func",
          "description": "",
          "context": "/document",
          "uri": "https://jgsfunctionapp.azurewebsites.net/api/http_trigger1?code=TITuBRAeLG0EfpzrhmylNpvEJRITVUUmtb1JKi1MNYXuAzFuLFTeUg==",
          "httpMethod": "POST",
          "timeout": "PT30S",
          "batchSize": 1,
          "degreeOfParallelism": 1,
          "authResourceId": "",
          "inputs": [
            {
              "name": "content",
              "source": "/document/content"
            }
          ],
          "outputs": [
            {
              "name": "subdata",
              "targetName": "subdata"
            }
          ],
          "httpHeaders": {},
          "authIdentity": null
        }
      ],
      "cognitiveServices": {
        "@odata.type": "#Microsoft.Azure.Search.DefaultCognitiveServices",
        "description": null
      },
      "knowledgeStore": null,
      "indexProjections": {
        "selectors": [
          {
            "targetIndexName": "targetindex",
            "parentKeyFieldName": "parent_id",
            "sourceContext": "/document/subdata/*",
            "mappings": [
              {
                "name": "chunk",
                "source": "/document/subdata/*/textItems",
                "sourceContext": null,
                "inputs": []
              },
              {
                "name": "page_number",
                "source": "/document/subdata/*/numberItems",
                "sourceContext": null,
                "inputs": []
              }
            ]
          }
        ],
        "parameters": {}
      },
      "encryptionKey": null
    }