I don't know how to create a Text Split skill and reference it in an indexer
I created an indexer after importing my data, and I used built-in skills such as Language Detection and Key Phrase Extraction. However, I can't find a way to add a Text Split skill (`#Microsoft.Skills.Text.SplitSkill`) to the indexer I created.
index:
{
"@odata.context": "https://mssearchser.search.windows.net/$metadata#indexes/$entity",
"@odata.etag": "\"0x8DC690C7EB356B6\"",
"name": "azureblob-index",
"defaultScoringProfile": "",
"fields": [
{
"name": "content",
"type": "Edm.String",
"searchable": true,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": "standard.lucene",
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
},
{
"name": "metadata_storage_content_type",
"type": "Edm.String",
"searchable": false,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
},
{
"name": "metadata_storage_size",
"type": "Edm.Int64",
"searchable": false,
"filterable": true,
"retrievable": true,
"stored": true,
"sortable": true,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
},
{
"name": "metadata_storage_last_modified",
"type": "Edm.DateTimeOffset",
"searchable": false,
"filterable": true,
"retrievable": true,
"stored": true,
"sortable": true,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
},
{
"name": "metadata_storage_content_md5",
"type": "Edm.String",
"searchable": false,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
},
{
"name": "metadata_storage_name",
"type": "Edm.String",
"searchable": false,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
},
{
"name": "metadata_storage_path",
"type": "Edm.String",
"searchable": false,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": true,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
},
{
"name": "metadata_storage_file_extension",
"type": "Edm.String",
"searchable": false,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
},
{
"name": "metadata_content_type",
"type": "Edm.String",
"searchable": false,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
},
{
"name": "metadata_author",
"type": "Edm.String",
"searchable": true,
"filterable": true,
"retrievable": true,
"stored": true,
"sortable": true,
"facetable": true,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": "standard.lucene",
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
},
{
"name": "metadata_creation_date",
"type": "Edm.DateTimeOffset",
"searchable": false,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
},
{
"name": "metadata_last_modified",
"type": "Edm.DateTimeOffset",
"searchable": false,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
},
{
"name": "locations",
"type": "Collection(Edm.String)",
"searchable": true,
"filterable": true,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": "standard.lucene",
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
},
{
"name": "keyphrases",
"type": "Collection(Edm.String)",
"searchable": true,
"filterable": true,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": "standard.lucene",
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
},
{
"name": "language",
"type": "Edm.String",
"searchable": false,
"filterable": true,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
},
{
"name": "merged_content",
"type": "Edm.String",
"searchable": true,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": "standard.lucene",
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
},
{
"name": "text",
"type": "Collection(Edm.String)",
"searchable": true,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": "standard.lucene",
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
},
{
"name": "layoutText",
"type": "Collection(Edm.String)",
"searchable": true,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": "standard.lucene",
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
}
],
"scoringProfiles": [],
"corsOptions": null,
"suggesters": [],
"analyzers": [],
"normalizers": [],
"tokenizers": [],
"tokenFilters": [],
"charFilters": [],
"encryptionKey": null,
"similarity": {
"@odata.type": "#Microsoft.Azure.Search.BM25Similarity",
"k1": null,
"b": null
},
"semantic": null,
"vectorSearch": null
}
indexer:
{
"@odata.context": "https://mssearchser.search.windows.net/$metadata#indexers/$entity",
"@odata.etag": "\"0x8DC690CB8941711\"",
"name": "terminal-indexer",
"description": "",
"dataSourceName": "terminal-data",
"skillsetName": "terminal-skillset",
"targetIndexName": "azureblob-index",
"disabled": null,
"schedule": null,
"parameters": {
"batchSize": null,
"maxFailedItems": 0,
"maxFailedItemsPerBatch": 0,
"base64EncodeKeys": null,
"configuration": {
"dataToExtract": "contentAndMetadata",
"parsingMode": "default",
"imageAction": "generateNormalizedImages"
}
},
"fieldMappings": [
{
"sourceFieldName": "metadata_storage_path",
"targetFieldName": "metadata_storage_path",
"mappingFunction": {
"name": "base64Encode",
"parameters": null
}
}
],
"outputFieldMappings": [
{
"sourceFieldName": "/document/merged_content/locations",
"targetFieldName": "locations"
},
{
"sourceFieldName": "/document/merged_content/keyphrases",
"targetFieldName": "keyphrases"
},
{
"sourceFieldName": "/document/language",
"targetFieldName": "language"
},
{
"sourceFieldName": "/document/merged_content",
"targetFieldName": "merged_content"
},
{
"sourceFieldName": "/document/normalized_images/*/text",
"targetFieldName": "text"
},
{
"sourceFieldName": "/document/normalized_images/*/layoutText",
"targetFieldName": "layoutText"
}
],
"cache": null,
"encryptionKey": null
}
Sample Definition:
{
"@odata.type": "#Microsoft.Skills.Text.SplitSkill",
"textSplitMode" : "pages",
"maximumPageLength": 1000,
"pageOverlapLength": 100,
"maximumPagesToTake": 1,
"defaultLanguageCode": "en",
"inputs": [
{
"name": "text",
"source": "/document/content"
},
{
"name": "languageCode",
"source": "/document/language"
}
],
"outputs": [
{
"name": "textItems",
"targetName": "mypages"
}
]
}
Sample Input:
{
"values": [
{
"recordId": "1",
"data": {
"text": "This is the loan application for Joe Romero...",
"languageCode": "en"
}
},
{
"recordId": "2",
"data": {
"text": "This is the second document...",
"languageCode": "en"
}
}
]
}
Sample Output:
{
"values": [
{
"recordId": "1",
"data": {
"textItems": [
"This is the loan...Here is the overlap part",
"Here is the overlap part...In the next section, we continue..."
]
}
},
{
"recordId": "2",
"data": {
"textItems": [
"This is the second document...Here is the overlap part...",
"Here is the overlap part...In the next section of the second doc..."
]
}
}
]
}
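Skillset for your data:
Note: change each input's `source` so that it points to the enrichment path that holds your own text (here `/document/merged_content`). Also note that `maximumPagesToTake: 1` keeps only the first chunk of each document; set it to 0 (the default) to return all chunks.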
{
"@odata.context": "https://[your-search-service].search.windows.net/$metadata#skillsets/$entity",
"@odata.etag": "\"0x8DC690CB8941711\"",
"name": "terminal-skillset1",
"description": "",
"skills": [
{
"@odata.type": "#Microsoft.Skills.Text.SplitSkill",
"textSplitMode": "pages",
"maximumPageLength": 1000,
"pageOverlapLength": 100,
"maximumPagesToTake": 1,
"defaultLanguageCode": "en",
"inputs": [
{
"name": "text",
"source": "/document/merged_content"
},
{
"name": "languageCode",
"source": "/document/language"
}
],
"outputs": [
{
"name": "textItems",
"targetName": "mypages"
}
]
}
]
}
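Then reference the skillset in the indexer:
Change `sourceFieldName` and `targetFieldName` to match your own fields. The `skillsetName` must be exactly the `name` of the skillset that contains the Split skill (the example skillset above is named `terminal-skillset1`), and the `targetFieldName` (here `textItems`) must already exist in the index as a `Collection(Edm.String)` field; the index shown in the question does not have it yet.
The `//` comments in the JSON below are explanatory only; remove them before sending the request, because strict JSON does not allow comments.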
{
"@odata.context": "https://mssearchser.search.windows.net/$metadata#indexers/$entity",
"@odata.etag": "\"0x8DC690CB8941711\"",
"name": "terminal-indexer1",
"description": "",
"dataSourceName": "terminal-data",
"skillsetName": "terminal-skillset", // Reference your skillset here
"targetIndexName": "azureblob-index",
"disabled": null,
"schedule": null,
"parameters": {
"batchSize": null,
"maxFailedItems": 0,
"maxFailedItemsPerBatch": 0,
"base64EncodeKeys": null,
"configuration": {
"dataToExtract": "contentAndMetadata",
"parsingMode": "default",
"imageAction": "generateNormalizedImages"
}
},
"fieldMappings": [
{
"sourceFieldName": "metadata_storage_path",
"targetFieldName": "metadata_storage_path",
"mappingFunction": {
"name": "base64Encode",
"parameters": null
}
}
],
"outputFieldMappings": [
{
"sourceFieldName": "/document/merged_content/locations",
"targetFieldName": "locations"
},
{
"sourceFieldName": "/document/merged_content/keyphrases",
"targetFieldName": "keyphrases"
},
{
"sourceFieldName": "/document/language",
"targetFieldName": "language"
},
{
"sourceFieldName": "/document/merged_content",
"targetFieldName": "merged_content"
},
{
"sourceFieldName": "/document/normalized_images/*/text",
"targetFieldName": "text"
},
{
"sourceFieldName": "/document/normalized_images/*/layoutText",
"targetFieldName": "layoutText"
},
{
"sourceFieldName": "/document/mypages/*", // Reference the output field from the skillset
"targetFieldName": "textItems" // Map to your desired field
}
],
"cache": null,
"encryptionKey": null
}