I see the following under my indexer settings:
When hovering over it I read the following:
True means the original file data obtained from your blob data source is preserved. This allows passing the original file to a custom skill, or to the Document Extraction skill.
How do I read the original PDF file from the associated blob data source in a custom WebApiSkill?
# Read the "file_data" skill input from the record's "data" object; yields '' when it is absent.
# NOTE(review): the full handler further down reads the base64 string from the nested 'data' member of this value.
file_data_base64 = value.get('data', {}).get('file_data', '')
...
I enabled "Allow Skillset to read file data" in the indexer. My full setup:
inputs=[
InputFieldMappingEntry(name="file_data", source="/document/file_data")
],
import azure.functions as func
import datetime
import json
import logging
import base64
import fitz
from io import BytesIO
# Function app instance; the route decorator below registers the HTTP handler on it.
app = func.FunctionApp()
# Request INFO-level output from the root logger (basicConfig does nothing if
# the root logger already has handlers configured).
logging.basicConfig(level=logging.INFO)
@app.route(route="CustomSplitSkill", auth_level=func.AuthLevel.FUNCTION)
def CustomSplitSkill(req: func.HttpRequest) -> func.HttpResponse:
    """Handle a custom Web API skill request carrying base64-encoded PDF data.

    For every record in the request's ``values`` list, the base64 string at
    ``data.file_data.data`` is decoded, opened as a PDF with PyMuPDF, its page
    text is extracted, and a response record with the same ``recordId`` is
    appended to the output.

    Args:
        req: Incoming HTTP request whose JSON body has a top-level ``values``
            list.

    Returns:
        A JSON ``{"values": [...]}`` response on success. A 400 response when
        the body is not a JSON object, a record carries no file data, or the
        PDF cannot be opened or has no pages. A 500 response on any other
        processing failure.
    """
    logging.info('Python HTTP trigger function processed a request.')

    try:
        req_body = req.get_json()
        logging.info('Request body parsed successfully.')
    except ValueError as e:
        # `e` has to be bound here; otherwise the log call raises NameError
        # and a malformed body never reaches the 400 response.
        logging.error(f"Invalid input: {e}")
        return func.HttpResponse("Invalid input", status_code=400)

    # A JSON array or scalar parses fine but has no `.get`; reject it early.
    if not isinstance(req_body, dict):
        logging.error("Invalid input: request body is not a JSON object.")
        return func.HttpResponse("Invalid input", status_code=400)

    # 'values' expected top-level key in the request body
    response_body = {"values": []}

    for value in req_body.get('values') or []:
        record_id = value.get('recordId')

        # `file_data` is an object whose `data` member holds the base64
        # payload. Anything else (missing, null, plain string) is treated as
        # "no file data" so the 400 below is returned instead of an
        # AttributeError from calling `.get` on a non-dict.
        file_entry = (value.get('data') or {}).get('file_data')
        file_data_base64 = (
            file_entry.get('data', '') if isinstance(file_entry, dict) else ''
        )
        if not file_data_base64:
            logging.error("No file_data found in the request.")
            return func.HttpResponse(
                "Invalid input: No file_data found", status_code=400
            )

        try:
            file_data = base64.b64decode(file_data_base64)

            try:
                pdf_document = fitz.open(
                    stream=BytesIO(file_data), filetype='pdf'
                )
            except fitz.FileDataError as e:
                logging.error(f"Failed to open PDF document: {e}")
                return func.HttpResponse(
                    "Failed to open PDF document", status_code=400
                )
            except Exception as e:
                logging.error(
                    f"An unexpected error occurred while opening the PDF document: {e}"
                )
                return func.HttpResponse(
                    "An unexpected error occurred", status_code=500
                )

            # Always release the document, including on the early return.
            try:
                if pdf_document.page_count == 0:
                    logging.error("No pages found in the PDF document.")
                    return func.HttpResponse(
                        "Invalid PDF: No pages found", status_code=400
                    )

                # Join once instead of growing a string page by page.
                extracted_text = "".join(
                    pdf_document.load_page(page_num).get_text()
                    for page_num in range(pdf_document.page_count)
                )
            finally:
                pdf_document.close()

            # Placeholder payload: the chunking of `extracted_text` and the
            # associated page extraction were removed from this sample for
            # simplicity.
            combined_list = [
                {'textItems': ['text1', 'text2'], 'numberItems': [0, 1]}
            ]

            response_body['values'].append(
                {
                    "recordId": record_id,
                    "data": {
                        "subdata": combined_list,
                    },
                }
            )
        except Exception as e:
            logging.error(f"Error processing file_data: {e}")
            return func.HttpResponse(
                "Error processing file_data", status_code=500
            )

    logging.info('Function executed successfully.')
    return func.HttpResponse(
        json.dumps(response_body), mimetype="application/json"
    )
The error:
Message:
Could not execute skill because the Web Api request failed.
Details:
Web Api response status: 'NotFound', Web Api response details: ''
Because I use projections, I cannot debug this properly: debug sessions are not supported with projections. The logging does not seem to record the specific error either, despite the error handling and checks.
The input to your WebApiSkill in your skillset has to be set to:
"inputs": [
{
"name": "file_data",
"source": "/document/file_data"
}
]
Enable "Allow skillset to read file data" in the indexer:
# Indexer configuration that keeps the original file data obtained from the
# blob data source, so it can be passed to a custom skill.
indexing_parameters_configuration = IndexingParametersConfiguration(
allow_skillset_to_read_file_data=True
)
# Clear query_timeout after construction; otherwise the request fails with:
# HTTP response error: () Configuration property 'queryTimeout' is not supported for the data source of type 'azureblob'.
indexing_parameters_configuration.query_timeout = None
# Indexer definition wiring the data source, skillset and target index
# together and attaching the configuration built above.
indexer = SearchIndexer(
name=indexer_name,
description="Chunks, pages and embeddings",
skillset_name=skillset_name,
target_index_name=target_index_name,
data_source_name=data_source_name,
parameters={"configuration": indexing_parameters_configuration}
)
The `file_data` input is an object whose `data` member holds the base64-encoded file content, so it first has to be decoded and then opened as a byte stream with the PDF reader of your choice:
import re
import azure.functions as func
import datetime
import json
import logging
import base64
import fitz
from io import BytesIO
# Function app instance; the route decorator below registers the HTTP handler on it.
app = func.FunctionApp()
# Request INFO-level output from the root logger (basicConfig does nothing if
# the root logger already has handlers configured).
logging.basicConfig(level=logging.INFO)
@app.route(route="CustomSplitSkill", auth_level=func.AuthLevel.FUNCTION)
def CustomSplitSkill(req: func.HttpRequest) -> func.HttpResponse:
    """Handle a custom Web API skill request carrying base64-encoded PDF data.

    For every record in the request's ``values`` list, the base64 string at
    ``data.file_data.data`` is decoded and opened as a PDF with PyMuPDF to
    validate it, and a response record with the same ``recordId`` is appended
    to the output.

    Args:
        req: Incoming HTTP request whose JSON body has a top-level ``values``
            list.

    Returns:
        A JSON ``{"values": [...]}`` response on success. A 400 response when
        the body is not a JSON object, a record carries no file data, or the
        PDF cannot be opened or has no pages. A 500 response on any other
        processing failure.
    """
    logging.info('Python HTTP trigger function processed a request.')

    try:
        req_body = req.get_json()
        logging.info('Request body parsed successfully.')
    except ValueError as e:
        # `e` has to be bound here; otherwise the log call raises NameError
        # and a malformed body never reaches the 400 response.
        logging.error(f"Invalid input: {e}")
        return func.HttpResponse("Invalid input", status_code=400)

    # A JSON array or scalar parses fine but has no `.get`; reject it early.
    if not isinstance(req_body, dict):
        logging.error("Invalid input: request body is not a JSON object.")
        return func.HttpResponse("Invalid input", status_code=400)

    # 'values' expected top-level key in the request body
    response_body = {"values": []}

    for value in req_body.get('values') or []:
        record_id = value.get('recordId')

        # `file_data` is an object whose `data` member holds the base64
        # payload. Anything else (missing, null, plain string) is treated as
        # "no file data" so the 400 below is returned instead of an
        # AttributeError from calling `.get` on a non-dict.
        file_entry = (value.get('data') or {}).get('file_data')
        file_data_base64 = (
            file_entry.get('data', '') if isinstance(file_entry, dict) else ''
        )
        if not file_data_base64:
            logging.error("No file_data found in the request.")
            return func.HttpResponse(
                "Invalid input: No file_data found", status_code=400
            )

        try:
            file_data = base64.b64decode(file_data_base64)

            try:
                pdf_document = fitz.open(
                    stream=BytesIO(file_data), filetype='pdf'
                )
            except fitz.FileDataError as e:
                logging.error(f"Failed to open PDF document: {e}")
                return func.HttpResponse(
                    "Failed to open PDF document", status_code=400
                )
            except Exception as e:
                logging.error(
                    f"An unexpected error occurred while opening the PDF document: {e}"
                )
                return func.HttpResponse(
                    "An unexpected error occurred", status_code=500
                )

            # Always release the document, including on the early return.
            try:
                if pdf_document.page_count == 0:
                    logging.error("No pages found in the PDF document.")
                    return func.HttpResponse(
                        "Invalid PDF: No pages found", status_code=400
                    )
            finally:
                pdf_document.close()

            # Placeholder payload: the chunking and associated page
            # extraction were removed from this sample to reduce the amount
            # of code.
            combined_list = [
                {'textItems': ['text1', 'text2'], 'numberItems': [0, 1]}
            ]

            response_body['values'].append(
                {
                    "recordId": record_id,
                    "data": {
                        "subdata": combined_list,
                    },
                }
            )
        except Exception as e:
            logging.error(f"Error processing file_data: {e}")
            return func.HttpResponse(
                "Error processing file_data", status_code=500
            )

    logging.info('Function executed successfully.')
    return func.HttpResponse(
        json.dumps(response_body), mimetype="application/json"
    )