Tags: json, azure, indexing, azure-cognitive-services, indexer

How do I create a Text Split skill in Azure Cognitive Search, and how do I reference it in an indexer?


I don't know how to create a Text Split skill and reference it in an indexer.

I created an indexer after importing my data, and I used built-in skills such as Language Detection and Key Phrase Extraction. However, I can't find a way to add a Text Split skill to the indexer I created.

index:

{
  "@odata.context": "https://mssearchser.search.windows.net/$metadata#indexes/$entity",
  "@odata.etag": "\"0x8DC690C7EB356B6\"",
  "name": "azureblob-index",
  "defaultScoringProfile": "",
  "fields": [
    {
      "name": "content",
      "type": "Edm.String",
      "searchable": true,
      "filterable": false,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": "standard.lucene",
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "metadata_storage_content_type",
      "type": "Edm.String",
      "searchable": false,
      "filterable": false,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "metadata_storage_size",
      "type": "Edm.Int64",
      "searchable": false,
      "filterable": true,
      "retrievable": true,
      "stored": true,
      "sortable": true,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "metadata_storage_last_modified",
      "type": "Edm.DateTimeOffset",
      "searchable": false,
      "filterable": true,
      "retrievable": true,
      "stored": true,
      "sortable": true,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "metadata_storage_content_md5",
      "type": "Edm.String",
      "searchable": false,
      "filterable": false,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "metadata_storage_name",
      "type": "Edm.String",
      "searchable": false,
      "filterable": false,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "metadata_storage_path",
      "type": "Edm.String",
      "searchable": false,
      "filterable": false,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": true,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "metadata_storage_file_extension",
      "type": "Edm.String",
      "searchable": false,
      "filterable": false,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "metadata_content_type",
      "type": "Edm.String",
      "searchable": false,
      "filterable": false,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "metadata_author",
      "type": "Edm.String",
      "searchable": true,
      "filterable": true,
      "retrievable": true,
      "stored": true,
      "sortable": true,
      "facetable": true,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": "standard.lucene",
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "metadata_creation_date",
      "type": "Edm.DateTimeOffset",
      "searchable": false,
      "filterable": false,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "metadata_last_modified",
      "type": "Edm.DateTimeOffset",
      "searchable": false,
      "filterable": false,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "locations",
      "type": "Collection(Edm.String)",
      "searchable": true,
      "filterable": true,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": "standard.lucene",
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "keyphrases",
      "type": "Collection(Edm.String)",
      "searchable": true,
      "filterable": true,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": "standard.lucene",
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "language",
      "type": "Edm.String",
      "searchable": false,
      "filterable": true,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "merged_content",
      "type": "Edm.String",
      "searchable": true,
      "filterable": false,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": "standard.lucene",
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "text",
      "type": "Collection(Edm.String)",
      "searchable": true,
      "filterable": false,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": "standard.lucene",
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "layoutText",
      "type": "Collection(Edm.String)",
      "searchable": true,
      "filterable": false,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": "standard.lucene",
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    }
  ],
  "scoringProfiles": [],
  "corsOptions": null,
  "suggesters": [],
  "analyzers": [],
  "normalizers": [],
  "tokenizers": [],
  "tokenFilters": [],
  "charFilters": [],
  "encryptionKey": null,
  "similarity": {
    "@odata.type": "#Microsoft.Azure.Search.BM25Similarity",
    "k1": null,
    "b": null
  },
  "semantic": null,
  "vectorSearch": null
}

indexer:

{
  "@odata.context": "https://mssearchser.search.windows.net/$metadata#indexers/$entity",
  "@odata.etag": "\"0x8DC690CB8941711\"",
  "name": "terminal-indexer",
  "description": "",
  "dataSourceName": "terminal-data",
  "skillsetName": "terminal-skillset",
  "targetIndexName": "azureblob-index",
  "disabled": null,
  "schedule": null,
  "parameters": {
    "batchSize": null,
    "maxFailedItems": 0,
    "maxFailedItemsPerBatch": 0,
    "base64EncodeKeys": null,
    "configuration": {
      "dataToExtract": "contentAndMetadata",
      "parsingMode": "default",
      "imageAction": "generateNormalizedImages"
    }
  },
  "fieldMappings": [
    {
      "sourceFieldName": "metadata_storage_path",
      "targetFieldName": "metadata_storage_path",
      "mappingFunction": {
        "name": "base64Encode",
        "parameters": null
      }
    }
  ],
  "outputFieldMappings": [
    {
      "sourceFieldName": "/document/merged_content/locations",
      "targetFieldName": "locations"
    },
    {
      "sourceFieldName": "/document/merged_content/keyphrases",
      "targetFieldName": "keyphrases"
    },
    {
      "sourceFieldName": "/document/language",
      "targetFieldName": "language"
    },
    {
      "sourceFieldName": "/document/merged_content",
      "targetFieldName": "merged_content"
    },
    {
      "sourceFieldName": "/document/normalized_images/*/text",
      "targetFieldName": "text"
    },
    {
      "sourceFieldName": "/document/normalized_images/*/layoutText",
      "targetFieldName": "layoutText"
    }
  ],
  "cache": null,
  "encryptionKey": null
}

Solution

  • Sample Definition:

    {
        "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
        "textSplitMode" : "pages", 
        "maximumPageLength": 1000,
        "pageOverlapLength": 100,
        "maximumPagesToTake": 1,
        "defaultLanguageCode": "en",
        "inputs": [
            {
                "name": "text",
                "source": "/document/content"
            },
            {
                "name": "languageCode",
                "source": "/document/language"
            }
        ],
        "outputs": [
            {
                "name": "textItems",
                "targetName": "mypages"
            }
        ]
    }
    
    
    
    

    Sample Input:

    {
        "values": [
            {
                "recordId": "1",
                "data": {
                    "text": "This is the loan application for Joe Romero...",
                    "languageCode": "en"
                }
            },
            {
                "recordId": "2",
                "data": {
                    "text": "This is the second document...",
                    "languageCode": "en"
                }
            }
        ]
    }
    
    

    Sample Output:

    {
        "values": [
            {
                "recordId": "1",
                "data": {
                    "textItems": [
                        "This is the loan...Here is the overlap part",
                        "Here is the overlap part...In the next section, we continue..."
                    ]
                }
            },
            {
                "recordId": "2",
                "data": {
                    "textItems": [
                        "This is the second document...Here is the overlap part...",
                        "Here is the overlap part...In the next section of the second doc..."
                    ]
                }
            }
        ]
    }
    
    
    
    

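    Skillset for your data:

    Note: change each input's source to match the enrichment paths produced for your own data. Also be aware that "maximumPagesToTake": 1 returns only the first chunk of each document; set it to 0 (the default) if you want every page to be emitted.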

    {
      "@odata.context": "https://[your-search-service].search.windows.net/$metadata#skillsets/$entity",
      "@odata.etag": "\"0x8DC690CB8941711\"",
      "name": "terminal-skillset1",
      "description": "",
      "skills": [
        {
          "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
          "textSplitMode": "pages",
          "maximumPageLength": 1000,
          "pageOverlapLength": 100,
          "maximumPagesToTake": 1,
          "defaultLanguageCode": "en",
          "inputs": [
            {
              "name": "text",
              "source": "/document/merged_content"
            },
            {
              "name": "languageCode",
              "source": "/document/language"
            }
          ],
          "outputs": [
            {
              "name": "textItems",
              "targetName": "mypages"
            }
          ]
        }
      ]
    }
    
    

    enter image description here

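    Reference the skillset in the indexer:

    Set skillsetName to the skillset that contains the Split skill: either add the skill to your existing terminal-skillset, or use the new skillset's name (terminal-skillset1 in the sample above). Then adjust sourceFieldName and targetFieldName in outputFieldMappings to match your own fields.

    
    {
      "@odata.context": "https://mssearchser.search.windows.net/$metadata#indexers/$entity",
      "@odata.etag": "\"0x8DC690CB8941711\"",
      "name": "terminal-indexer1",
      "description": "",
      "dataSourceName": "terminal-data",
      "skillsetName": "terminal-skillset",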
      "targetIndexName": "azureblob-index",
      "disabled": null,
      "schedule": null,
      "parameters": {
        "batchSize": null,
        "maxFailedItems": 0,
        "maxFailedItemsPerBatch": 0,
        "base64EncodeKeys": null,
        "configuration": {
          "dataToExtract": "contentAndMetadata",
          "parsingMode": "default",
          "imageAction": "generateNormalizedImages"
        }
      },
      "fieldMappings": [
        {
          "sourceFieldName": "metadata_storage_path",
          "targetFieldName": "metadata_storage_path",
          "mappingFunction": {
            "name": "base64Encode",
            "parameters": null
          }
        }
      ],
      "outputFieldMappings": [
        {
          "sourceFieldName": "/document/merged_content/locations",
          "targetFieldName": "locations"
        },
        {
          "sourceFieldName": "/document/merged_content/keyphrases",
          "targetFieldName": "keyphrases"
        },
        {
          "sourceFieldName": "/document/language",
          "targetFieldName": "language"
        },
        {
          "sourceFieldName": "/document/merged_content",
          "targetFieldName": "merged_content"
        },
        {
          "sourceFieldName": "/document/normalized_images/*/text",
          "targetFieldName": "text"
        },
        {
          "sourceFieldName": "/document/normalized_images/*/layoutText",
          "targetFieldName": "layoutText"
        },
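        {
          "sourceFieldName": "/document/mypages/*",
          "targetFieldName": "textItems"
        }
      ],
      "cache": null,
      "encryptionKey": null
    }

    Note: in the last output field mapping, /document/mypages/* is the output of the Split skill (its targetName is "mypages"), and textItems is the index field that receives the chunks. That field must exist in the target index with type Collection(Edm.String); the index shown in the question does not define it, so add it (or map to a field you already have) before running the indexer.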