I am trying to implement an AI document summarizer utilizing Azure Language. I got it to work when using Azure Document Intelligence and TextAnalyticsClient
as seen in this code:
def format_bounding_box(bounding_box):
if not bounding_box:
return "N/A"
reshaped_bounding_box = np.array(bounding_box).reshape(-1, 2)
return ", ".join(["[{}, {}]".format(x, y) for x, y in reshaped_bounding_box])
enter code here
def summary_single_blob(client, blob_ID):
language_endpoint = settings.AZURE_LANGUAGE['AZURE_LANGUAGE_ENDPOINT']
language_key = settings.AZURE_LANGUAGE['AZURE_LANGUAGE_KEY']
document_intelligence_endpoint = settings.AZURE_DOCUMENT_INTELLIGENCE['AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT']
document_intelligence_key = settings.AZURE_DOCUMENT_INTELLIGENCE['AZURE_DOCUMENT_INTELLIGENCE_KEY']
text_analytics_client = TextAnalyticsClient(
endpoint=language_endpoint,
credential=AzureKeyCredential(language_key),
)
document_intelligence_client = DocumentIntelligenceClient(
endpoint=document_intelligence_endpoint, credential=AzureKeyCredential(document_intelligence_key)
)
blob_service_client = BlobServiceClient.from_connection_string(os.getenv("STORAGE_CON_STRING"))
try:
blobs = client.list_blobs()
for blob in blobs:
if blob.name == blob_ID:
blob_client = client.get_blob_client(blob)
blob_props = blob_client.get_blob_properties()
metadata = blob_props.metadata
sas_token = generate_blob_sas(
account_name=blob_service_client.account_name,
account_key=os.getenv("AZURE_ACCOUNT_KEY"),
container_name=client.container_name,
blob_name=blob.name,
permission=BlobSasPermissions(read=True, list=True),
expiry=datetime.utcnow() + timedelta(minutes=15)
)
blob_url = f"{blob_client.url}?{sas_token}"
poller = document_intelligence_client.begin_analyze_document("prebuilt-read", AnalyzeDocumentRequest(url_source=blob_url))
result = poller.result()
text_content = ""
for page in result.pages:
print("----Analyzing Read from page #{}----".format(page.page_number))
for _, line in enumerate(page.lines):
text_content += line.content + " "
print("----------------------------------------")
document = [text_content]
poller_language = text_analytics_client.begin_extract_summary(document)
extract_summary_results = poller_language.result()
for result in extract_summary_results:
print(result)
if result.kind == "ExtractiveSummarization":
print("Summary extracted: \n{}".format(
" ".join([sentence.text for sentence in result.sentences]))
)
elif result.is_error is True:
print("...Is an error with code '{}' and message '{}'".format(
result.error.code, result.error.message
))
print(f"No blob found with ID '{blob_ID}'.")
return False
except Exception as e:
print(f"An error occurred while deleting blob with ID '{blob_ID}': {str(e)}")
return False
But I run into the issue that my files can be too long which causes the issue of:
An error occurred while deleting blob with ID '8335_2_0.pdf': (InvalidDocumentBatch) Request Payload sent is too large to be processed. Limit request size to: 125000
Code: InvalidDocumentBatch
Message: Request Payload sent is too large to be processed. Limit request size to: 125000
Therefore, I decided to use the following code that utilizes my blobstorages:
def summary_single_blob(client, blob_ID):
language_endpoint = settings.AZURE_LANGUAGE['AZURE_LANGUAGE_ENDPOINT']
language_key = settings.AZURE_LANGUAGE['AZURE_LANGUAGE_KEY']
blob_service_client = BlobServiceClient.from_connection_string(os.getenv("STORAGE_CON_STRING"))
endpoint = f"{language_endpoint}language/analyze-documents/jobs?api-version=2024-11-15-preview"
# Set headers
headers = {
"Content-Type": "application/json",
"Ocp-Apim-Subscription-Key": language_key
}
target_container_name = "summary-blobs"
try:
blobs = client.list_blobs()
for blob in blobs:
if blob.name == blob_ID:
blob_client = client.get_blob_client(blob)
blob_props = blob_client.get_blob_properties()
metadata = blob_props.metadata
sas_token = generate_blob_sas(
account_name=blob_service_client.account_name,
account_key=os.getenv("AZURE_ACCOUNT_KEY"),
container_name=client.container_name,
blob_name=blob.name,
permission=BlobSasPermissions(read=True, list=True),
expiry=datetime.utcnow() + timedelta(hours=24)
)
blob_url = f"{blob_client.url}?{sas_token}"
print("Blob URL:", blob_url)
sas_token_target = generate_container_sas(
account_name=blob_service_client.account_name,
account_key=os.getenv("AZURE_ACCOUNT_KEY"),
container_name=target_container_name,
permission=ContainerSasPermissions(write=True, list=True),
expiry=datetime.utcnow() + timedelta(hours=24)
)
container_target_sas_url = f"https://{blob_service_client.account_name}.blob.core.windows.net/{target_container_name}?{sas_token_target}"
data = {
"tasks": [
{
"kind": "AbstractiveSummarization",
"parameters": {
"sentenceCount": 6
}
}
],
"analysisInput": {
"documents": [
{
"source": {
"location": f"{blob_url}"
},
"targets": {
"location": f"{container_target_sas_url}"
}
}
]
}
}
# Make the POST request
response = requests.post(endpoint, headers=headers, json=data)
# Print response
print(response.status_code)
print(response.json())
print(f"No blob found with ID '{blob_ID}'.")
return False
except Exception as e:
print(f"An error occurred while deleting blob with ID '{blob_ID}': {str(e)}")
return False
But here I get the issue that:
{'error': {'code': 'InvalidBody', 'innerError': {'requestId': 'e08a8acd-ddb9-44f0-86ff-d7473cd495fe'}, 'message': 'At least one input is missing an Id attribute.'}}
If I insert the id into the json like this:
data = {
"tasks": [
{
"kind": "AbstractiveSummarization",
"parameters": {
"sentenceCount": 6
}
}
],
"analysisInput": {
"documents": [
{
"id": "1",
"source": {
"location": f"{blob_url}"
},
"targets": {
"location": f"{container_target_sas_url}"
}
}
]
}
}
And get the error that:
{'error': {'code': 'InvalidRequest', 'details': [{'code': 'InvalidArgument', 'details': [{'code': 'InvalidArgument', 'message': "The value of parameter 'Location' cannot be null or empty."}], 'message': 'Invalid document in request.', 'innererror': {'code': 'InvalidDocument', 'message': 'Invalid document in request.'}}], 'message': 'Invalid analysisInput in request.', 'innererror': {'code': 'InvalidRequestBodyFormat', 'message': 'The request body is invalid or malformed.'}}}
Has anyone else faced similar issues, and how did you resolve them?
Replace the field targets
with target
. Check the required input data properly.
data = {
"analysisInput": {
"documents": [
{
"id":"1",
"source": {
"location": f"https://<name>.blob.core.windows.net/data/pdf/requirements.txt?{sasToken}"
},
"target": {
"location": f"https://<name>.blob.core.windows.net/data?{sasToken}"
}
}
]
},
"tasks": [
{
"kind": "AbstractiveSummarization",
"parameters": {
"sentenceCount": 6
}
}
],
}
Output:
{'jobId': '48bf530xxxxx-yyyy-zzzzz44365',
'lastUpdatedDateTime': '2025-02-03T05:32:44Z',
'createdDateTime': '2025-02-03T05:32:28Z',
'expirationDateTime': '2025-02-04T05:32:28Z',
'status': 'succeeded',
'errors': [],
'tasks': {'completed': 1,
'failed': 0,
'inProgress': 0,
'total': 1,
'items': [{'kind': 'AbstractiveSummarizationLROResults',
'lastUpdateDateTime': '2025-02-03T05:32:44.177498Z',
'status': 'succeeded',
'results': {'documents': [{'id': '1',
'source': {'kind': 'AzureBlob',
'location': 'https://xyz.blob.core.windows.net/data/pdf/requirements.txt'},
'targets': [{'kind': 'AzureBlob',
'location': 'https://xyz.blob.core.windows.net/data/48bf5307-cbc0-4a2c-b341-8427b1d44365/AbstractiveSummarization/0001/requirements.json'}],
'warnings': []}],
'errors': [],
'modelVersion': '2024-10-25-phi35'}}]}}
and in storage account.