azureazure-functionsazure-language-understanding

Summarization of docx or pdf document


I am trying to implement an AI document summarizer utilizing Azure Language. I got it to work when using Azure Document Intelligence and TextAnalyticsClient as seen in this code:

def format_bounding_box(bounding_box):
    if not bounding_box:
        return "N/A"
    reshaped_bounding_box = np.array(bounding_box).reshape(-1, 2)
    return ", ".join(["[{}, {}]".format(x, y) for x, y in reshaped_bounding_box])
    enter code here

def summary_single_blob(client, blob_ID):
    language_endpoint = settings.AZURE_LANGUAGE['AZURE_LANGUAGE_ENDPOINT']
    language_key = settings.AZURE_LANGUAGE['AZURE_LANGUAGE_KEY']

    document_intelligence_endpoint = settings.AZURE_DOCUMENT_INTELLIGENCE['AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT']
    document_intelligence_key = settings.AZURE_DOCUMENT_INTELLIGENCE['AZURE_DOCUMENT_INTELLIGENCE_KEY']

    text_analytics_client = TextAnalyticsClient(
        endpoint=language_endpoint,
        credential=AzureKeyCredential(language_key),
    )

    document_intelligence_client  = DocumentIntelligenceClient(
        endpoint=document_intelligence_endpoint, credential=AzureKeyCredential(document_intelligence_key)
    )
    blob_service_client = BlobServiceClient.from_connection_string(os.getenv("STORAGE_CON_STRING"))

    try:
        blobs = client.list_blobs()

        for blob in blobs:
            if blob.name == blob_ID:  
                blob_client = client.get_blob_client(blob)
                blob_props = blob_client.get_blob_properties()
                metadata = blob_props.metadata

                sas_token = generate_blob_sas(
                    account_name=blob_service_client.account_name,
                    account_key=os.getenv("AZURE_ACCOUNT_KEY"),
                    container_name=client.container_name,
                    blob_name=blob.name,
                    permission=BlobSasPermissions(read=True, list=True),
                    expiry=datetime.utcnow() + timedelta(minutes=15)
                )

                blob_url = f"{blob_client.url}?{sas_token}"
                poller = document_intelligence_client.begin_analyze_document("prebuilt-read", AnalyzeDocumentRequest(url_source=blob_url))
                result = poller.result()

                text_content = ""

                for page in result.pages:
                    print("----Analyzing Read from page #{}----".format(page.page_number))

                    for _, line in enumerate(page.lines):
                        text_content += line.content + " "
                print("----------------------------------------")

                document = [text_content]

                poller_language = text_analytics_client.begin_extract_summary(document)
                extract_summary_results = poller_language.result()

                for result in extract_summary_results:
                    print(result)
                    if result.kind == "ExtractiveSummarization":
                        print("Summary extracted: \n{}".format(
                            " ".join([sentence.text for sentence in result.sentences]))
                        )
                    elif result.is_error is True:
                        print("...Is an error with code '{}' and message '{}'".format(
                            result.error.code, result.error.message
                        ))


        print(f"No blob found with ID '{blob_ID}'.")
        return False
        
    except Exception as e:
        print(f"An error occurred while deleting blob with ID '{blob_ID}': {str(e)}")
        return False

But I run into the issue that my files can be too long which causes the issue of:

An error occurred while deleting blob with ID '8335_2_0.pdf': (InvalidDocumentBatch) Request Payload sent is too large to be processed. Limit request size to: 125000
Code: InvalidDocumentBatch
Message: Request Payload sent is too large to be processed. Limit request size to: 125000

Therefore, I decided to use the following code that utilizes my blobstorages:

def summary_single_blob(client, blob_ID):
    language_endpoint = settings.AZURE_LANGUAGE['AZURE_LANGUAGE_ENDPOINT']
    language_key = settings.AZURE_LANGUAGE['AZURE_LANGUAGE_KEY']

    blob_service_client = BlobServiceClient.from_connection_string(os.getenv("STORAGE_CON_STRING"))


    endpoint = f"{language_endpoint}language/analyze-documents/jobs?api-version=2024-11-15-preview"

    # Set headers
    headers = {
        "Content-Type": "application/json",
        "Ocp-Apim-Subscription-Key": language_key
    }

    target_container_name = "summary-blobs"

    try:
        blobs = client.list_blobs()

        for blob in blobs:
            if blob.name == blob_ID:  
                blob_client = client.get_blob_client(blob)
                blob_props = blob_client.get_blob_properties()
                metadata = blob_props.metadata

                sas_token = generate_blob_sas(
                    account_name=blob_service_client.account_name,
                    account_key=os.getenv("AZURE_ACCOUNT_KEY"),
                    container_name=client.container_name,
                    blob_name=blob.name,
                    permission=BlobSasPermissions(read=True, list=True),
                    expiry=datetime.utcnow() + timedelta(hours=24)
                )

                blob_url = f"{blob_client.url}?{sas_token}"
                print("Blob URL:", blob_url)
                
                sas_token_target = generate_container_sas(
                    account_name=blob_service_client.account_name,
                    account_key=os.getenv("AZURE_ACCOUNT_KEY"),
                    container_name=target_container_name,
                    permission=ContainerSasPermissions(write=True, list=True),
                    expiry=datetime.utcnow() + timedelta(hours=24)
                )

                container_target_sas_url = f"https://{blob_service_client.account_name}.blob.core.windows.net/{target_container_name}?{sas_token_target}"
                
                
                data = {
                            "tasks": [
                            {
                                "kind": "AbstractiveSummarization",
                                "parameters": {
                                "sentenceCount": 6
                                }
                            }
                            ],
                            "analysisInput": {
                            "documents": [
                                {
                                "source": {
                                    "location": f"{blob_url}"
                                },
                                "targets": {
                                    "location": f"{container_target_sas_url}"
                                }
                                }
                            ]
                            }
                        }

                # Make the POST request
                response = requests.post(endpoint, headers=headers, json=data)  

                # Print response
                print(response.status_code)
                print(response.json())

        print(f"No blob found with ID '{blob_ID}'.")
        return False
        
    except Exception as e:
        print(f"An error occurred while deleting blob with ID '{blob_ID}': {str(e)}")
        return False

But here I get the issue that:

{'error': {'code': 'InvalidBody', 'innerError': {'requestId': 'e08a8acd-ddb9-44f0-86ff-d7473cd495fe'}, 'message': 'At least one input is missing an Id attribute.'}}

If I insert the id into the json like this:

data = {
                            "tasks": [
                            {
                                "kind": "AbstractiveSummarization",
                                "parameters": {
                                "sentenceCount": 6
                                }
                            }
                            ],
                            "analysisInput": {
                            "documents": [
                                {
                                "id": "1",
                                "source": {
                                    "location": f"{blob_url}"
                                },
                                "targets": {
                                    "location": f"{container_target_sas_url}"
                                }
                                }
                            ]
                            }
                        }

And get the error that:

{'error': {'code': 'InvalidRequest', 'details': [{'code': 'InvalidArgument', 'details': [{'code': 'InvalidArgument', 'message': "The value of parameter 'Location' cannot be null or empty."}], 'message': 'Invalid document in request.', 'innererror': {'code': 'InvalidDocument', 'message': 'Invalid document in request.'}}], 'message': 'Invalid analysisInput in request.', 'innererror': {'code': 'InvalidRequestBodyFormat', 'message': 'The request body is invalid or malformed.'}}}

Has anyone else faced similar issues, and how did you resolve them?


Solution

  • Replace the field targets with target. Check the required input data properly.

    data = {
        "analysisInput": {
            "documents": [
                {
                    "id":"1",
                    "source": {
                        "location": f"https://<name>.blob.core.windows.net/data/pdf/requirements.txt?{sasToken}"
                        },
                    "target": {
                        "location": f"https://<name>.blob.core.windows.net/data?{sasToken}"
                        }
                }
            ]
        },
        "tasks": [
            {
                "kind": "AbstractiveSummarization",
                "parameters": {
                "sentenceCount": 6
                }
            }
        ],
    }
    

    Output:

    {'jobId': '48bf530xxxxx-yyyy-zzzzz44365',
     'lastUpdatedDateTime': '2025-02-03T05:32:44Z',
     'createdDateTime': '2025-02-03T05:32:28Z',
     'expirationDateTime': '2025-02-04T05:32:28Z',
     'status': 'succeeded',
     'errors': [],
     'tasks': {'completed': 1,
      'failed': 0,
      'inProgress': 0,
      'total': 1,
      'items': [{'kind': 'AbstractiveSummarizationLROResults',
        'lastUpdateDateTime': '2025-02-03T05:32:44.177498Z',
        'status': 'succeeded',
        'results': {'documents': [{'id': '1',
           'source': {'kind': 'AzureBlob',
            'location': 'https://xyz.blob.core.windows.net/data/pdf/requirements.txt'},
           'targets': [{'kind': 'AzureBlob',
             'location': 'https://xyz.blob.core.windows.net/data/48bf5307-cbc0-4a2c-b341-8427b1d44365/AbstractiveSummarization/0001/requirements.json'}],
           'warnings': []}],
         'errors': [],
         'modelVersion': '2024-10-25-phi35'}}]}}
    

    and in storage account.

    enter image description here