python, google-cloud-platform, google-cloud-storage, cloud-document-ai

google.api_core.exceptions.InternalServerError: 500 Failed to process all the documents


I am getting this error when trying to implement the Document OCR from google cloud in python as explained here: https://cloud.google.com/document-ai/docs/ocr#documentai_process_document-python.

When I run

operation.result(timeout=None)

I get this error

Traceback (most recent call last):
      File "<input>", line 1, in <module>
      File "/Users/Niolo/Desktop/project/venv/lib/python3.8/site-packages/google/api_core/future/polling.py", line 134, in result
        raise self._exception
    google.api_core.exceptions.InternalServerError: 500 Failed to process all the documents

My full code

import re
import os
from google.cloud import storage
from google.cloud import documentai_v1beta3 as documentai
from google.api_core.client_options import ClientOptions


# Configuration for the Document AI batch OCR request.
project_id = 'my_project_id'
location = 'eu'  # Format is 'us' or 'eu'
processor_id = 'my_processor_id'  # Create processor in Cloud Console
# NOTE(review): the input is a .py file, but the request below declares
# mime_type="application/pdf"; the processor cannot parse it, which is the
# likely cause of "500 Failed to process all the documents" -- confirm.
gcs_input_uri = "gs://my_bucket/toy1.py"
gcs_output_uri = "gs://my_bucket"
# NOTE(review): this prefix is appended to gcs_output_uri to build the
# destination path, so it should be a folder name (e.g. "test"), not a
# second "gs://" scheme.
gcs_output_uri_prefix = "gs://"

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/Niolo/Desktop/Work/DocumentAI/OCR/key.json"


def batch_process_documents(
    project_id,
    location,
    processor_id,
    gcs_input_uri,
    gcs_output_uri,
    gcs_output_uri_prefix,
    timeout: int = 300,
):
    """Run a Document AI batch OCR job and print the extracted content.

    Args:
        project_id: GCP project that owns the processor.
        location: Processor region; 'us' or 'eu'.
        processor_id: ID of the processor created in the Cloud Console.
        gcs_input_uri: gs:// URI of the source document; must match the
            mime_type declared below (application/pdf).
        gcs_output_uri: gs:// URI of the destination bucket.
        gcs_output_uri_prefix: folder name inside the bucket for results.
        timeout: seconds to wait for the long-running operation.

    Raises:
        google.api_core.exceptions.GoogleAPIError: if the batch job fails.
        concurrent.futures.TimeoutError: if the job exceeds `timeout`.
    """
    # Set endpoint to EU (must agree with `location`).
    options = ClientOptions(api_endpoint="eu-documentai.googleapis.com:443")
    # Instantiates a client
    client = documentai.DocumentProcessorServiceClient(client_options=options)
    destination_uri = f"{gcs_output_uri}/{gcs_output_uri_prefix}/"

    # 'mime_type' can be 'application/pdf', 'image/tiff',
    # and 'image/gif', or 'application/json'
    input_config = documentai.types.document_processor_service.BatchProcessRequest.BatchInputConfig(
        gcs_source=gcs_input_uri, mime_type="application/pdf"
    )

    # Where to write results
    output_config = documentai.types.document_processor_service.BatchProcessRequest.BatchOutputConfig(
        gcs_destination=destination_uri
    )

    # Fully-qualified processor resource name.
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    request = documentai.types.document_processor_service.BatchProcessRequest(
        name=name,
        input_configs=[input_config],
        output_config=output_config,
    )

    # batch_process_documents returns a long-running operation.
    operation = client.batch_process_documents(request)

    # Wait for the operation to finish.
    # BUG FIX: honour the `timeout` parameter instead of blocking forever
    # with timeout=None.
    operation.result(timeout=timeout)

    # Results are written to GCS. Use a regex to split the destination URI
    # into bucket and object prefix.
    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    output_bucket = match.group(1)
    prefix = match.group(2)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print("Output files:")

    for i, blob in enumerate(blob_list):
        # Only JSON output is parsed; stop at the first unsupported file.
        if ".json" not in blob.name:
            print(f"skipping non-supported file type {blob.name}")
            return
        # Download the contents of this blob as a bytes object.
        blob_as_bytes = blob.download_as_bytes()

        document = documentai.types.Document.from_json(blob_as_bytes)
        print(f"Fetched file {i + 1}")

        # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

        # Read the text recognition output from the processor
        for page in document.pages:
            for form_field in page.form_fields:
                field_name = get_text(form_field.field_name, document)
                field_value = get_text(form_field.field_value, document)
                print("Extracted key value pair:")
                print(f"\t{field_name}, {field_value}")
            # BUG FIX: iterate the paragraphs of the current page; the
            # original re-iterated document.pages, so `paragraph.layout`
            # was actually a Page's layout, not a Paragraph's.
            for paragraph in page.paragraphs:
                paragraph_text = get_text(paragraph.layout, document)
                print(f"Paragraph text:\n{paragraph_text}")

Solution

  • For the following variables you need to supply them the correct values.

    gcs_input_uri = 'gs://cloud-samples-data/documentai/loan_form.pdf'

    gcs_output_uri = 'gs://samplebucket'

    gcs_output_uri_prefix = 'test'

    Pass the timeout to operation.result(), since client.batch_process_documents(request) returns a long-running operation.

    An object representing a long-running operation. The result type for the operation will be `BatchProcessResponse`: the response message for the batch process document method.

    # Wait for the operation to finish
    operation.result(timeout=timeout)
    

    Here is the working code:

    import re
    import os
    from google.cloud import storage
    from google.cloud import documentai_v1beta3 as documentai
    from google.api_core.client_options import ClientOptions    
    
    # Configuration: replace the placeholders with your own project,
    # processor, and credentials. The input must be a real PDF that
    # matches the mime_type declared in the request.
    project_id = 'tiph-ricconoel-batch8'
    location = 'eu'  # Format is 'us' or 'eu'
    processor_id = 'your_processor_id'  # Create processor in Cloud Console
    gcs_input_uri = 'gs://cloud-samples-data/documentai/loan_form.pdf'
    gcs_output_uri = 'gs://samplebucket'
    gcs_output_uri_prefix = 'test'  # folder name inside the output bucket
    
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = '/full_path/your_json_file.json'
    
    def batch_process_documents(
        project_id,
        location,
        processor_id,
        gcs_input_uri,
        gcs_output_uri,
        gcs_output_uri_prefix,
        timeout: int = 300,
    ):
        """Run a Document AI batch OCR job and print the extracted content.

        Args:
            project_id: GCP project that owns the processor.
            location: Processor region; 'us' or 'eu'.
            processor_id: ID of the processor created in the Cloud Console.
            gcs_input_uri: gs:// URI of the source PDF.
            gcs_output_uri: gs:// URI of the destination bucket.
            gcs_output_uri_prefix: folder name inside the bucket for results.
            timeout: seconds to wait for the long-running operation.

        Raises:
            google.api_core.exceptions.GoogleAPIError: if the batch job fails.
            concurrent.futures.TimeoutError: if the job exceeds `timeout`.
        """
        # Set endpoint to EU (must agree with `location`).
        options = ClientOptions(api_endpoint="eu-documentai.googleapis.com:443")
        # Instantiates a client
        client = documentai.DocumentProcessorServiceClient(client_options=options)
    
        destination_uri = f"{gcs_output_uri}/{gcs_output_uri_prefix}/"
    
        # 'mime_type' can be 'application/pdf', 'image/tiff',
        # and 'image/gif', or 'application/json'
        input_config = documentai.types.document_processor_service.BatchProcessRequest.BatchInputConfig(
            gcs_source=gcs_input_uri, mime_type="application/pdf"
        )
    
        # Where to write results
        output_config = documentai.types.document_processor_service.BatchProcessRequest.BatchOutputConfig(
            gcs_destination=destination_uri
        )
    
        # Fully-qualified processor resource name.
        name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
        request = documentai.types.document_processor_service.BatchProcessRequest(
            name=name,
            input_configs=[input_config],
            output_config=output_config,
        )
    
        # batch_process_documents returns a long-running operation.
        operation = client.batch_process_documents(request)
    
        # Wait for the operation to finish (honours the timeout parameter).
        operation.result(timeout=timeout)
    
        # Results are written to GCS. Use a regex to split the destination
        # URI into bucket and object prefix.
        match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
        output_bucket = match.group(1)
        prefix = match.group(2)
    
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(output_bucket)
        blob_list = list(bucket.list_blobs(prefix=prefix))
        print("Output files:")
    
        for i, blob in enumerate(blob_list):
            # Only JSON output is parsed; stop at the first unsupported file.
            if ".json" not in blob.name:
                print(f"skipping non-supported file type {blob.name}")
                return
            # Download the contents of this blob as a bytes object.
            blob_as_bytes = blob.download_as_bytes()
    
            document = documentai.types.Document.from_json(blob_as_bytes)
            print(f"Fetched file {i + 1}")
    
            # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
    
            # Read the text recognition output from the processor
            for page in document.pages:
                for form_field in page.form_fields:
                    field_name = get_text(form_field.field_name, document)
                    field_value = get_text(form_field.field_value, document)
                    print("Extracted key value pair:")
                    print(f"\t{field_name}, {field_value}")
                # BUG FIX: iterate the paragraphs of the current page; the
                # original re-iterated document.pages, so `paragraph.layout`
                # was a Page's layout, not a Paragraph's.
                for paragraph in page.paragraphs:
                    paragraph_text = get_text(paragraph.layout, document)
                    print(f"Paragraph text:\n{paragraph_text}")
    

    This will create the output file in gs://samplebucket/test/xxxxx/x/output.json. See testing below:

    enter image description here

    enter image description here