google-cloud-platformcloud-document-aigcp-ai-platform-notebookgoogle-cloud-document-ai

Google Document AI batch processing failing


Following Google's documentation, I am trying to perform a Document AI OCR batch request (async), and I constantly receive an error. I tried both with gcs_input_uri and gcs_input_prefix. I can not find any relevant logs in the GCP console, and no operations in the destination bucket 'operations' tab.

The error and code follow. I'd appreciate any assistance!

Error:

google.api_core.exceptions.InvalidArgument: 400 Failed to process all
documents. 3: Failed to process all documents.

Code:

import re
from typing import Optional
from google.api_core.client_options import ClientOptions
from google.api_core.exceptions import InternalServerError
from google.api_core.exceptions import RetryError
from google.cloud import documentai  # type: ignore
from google.cloud import storage

project_id = "document-ocr-xxxxx"
location = "us"
processor_id = "7797cdfaxxxxxx"  # Create processor before running sample
field_mask = "entities.id, entities.confidence, entities.type, entities.mentionText"
gcs_output_uri = "gs://score/"
input_mime_type = "application/pdf"
gcs_input_prefix = "gs://score_input/"


def batch_process_documents(
        project_id: str,
        location: str,
        processor_id: str,
        gcs_output_uri: str,
        gcs_input_prefix: Optional[str] = None,
        timeout: int = 400,
) -> None:
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    gcs_input_uri = 'gs://score_input/278040xxxxx.pdf'
    if gcs_input_uri:
        gcs_document = documentai.GcsDocument(
            gcs_uri=gcs_input_uri, mime_type=input_mime_type
        )
        gcs_documents = documentai.GcsDocuments(documents=[gcs_document])
        input_config = documentai.BatchDocumentsInputConfig(gcs_documents=gcs_documents)
    else:
        gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_input_prefix)
        input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix)

    gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
        gcs_uri=gcs_output_uri, field_mask=field_mask
    )

    output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)
    name = client.processor_path(project_id, location, processor_id)
    request = documentai.BatchProcessRequest(
        name=name,
        input_documents=input_config,
        document_output_config=output_config,
    )

    operation = client.batch_process_documents(request)

    try:
        print(f"Waiting for operation {operation.operation.name} to complete...")
        operation.result(timeout=timeout)
    except (RetryError, InternalServerError) as e:
        print(e.message)


batch_process_documents(project_id, location, processor_id, gcs_output_uri, gcs_input_prefix)

Output:

(venv) ubuntu@MacBook-Pro-2 playground %  python doc_batch.py 
Waiting for operation projects/106038xxxxxx/locations/us/operations/1505725931527xxxxxx to complete...
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/runpy.py", line 198, in _run_module_as_main
    return _run_code(code, main_globals, None,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/runpy.py", line 88, in _run_code
    exec(code, run_globals)
  File "/Users/ubuntu/.cursor/extensions/ms-python.debugpy-2024.6.0-darwin-arm64/bundled/libs/debugpy/adapter/../../debugpy/launcher/../../debugpy/__main__.py", line 39, in <module>
    cli.main()
  File "/Users/ubuntu/.cursor/extensions/ms-python.debugpy-2024.6.0-darwin-arm64/bundled/libs/debugpy/adapter/../../debugpy/launcher/../../debugpy/../debugpy/server/cli.py", line 430, in main
    run()
  File "/Users/ubuntu/.cursor/extensions/ms-python.debugpy-2024.6.0-darwin-arm64/bundled/libs/debugpy/adapter/../../debugpy/launcher/../../debugpy/../debugpy/server/cli.py", line 284, in run_file
    runpy.run_path(target, run_name="__main__")
  File "/Users/ubuntu/.cursor/extensions/ms-python.debugpy-2024.6.0-darwin-arm64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 321, in run_path
    return _run_module_code(code, init_globals, run_name,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ubuntu/.cursor/extensions/ms-python.debugpy-2024.6.0-darwin-arm64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 135, in _run_module_code
    _run_code(code, mod_globals, init_globals,
  File "/Users/ubuntu/.cursor/extensions/ms-python.debugpy-2024.6.0-darwin-arm64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 124, in _run_code
    exec(code, run_globals)
  File "/Users/ubuntu/source/playground/doc_batch.py", line 151, in <module>
    batch_process_documents(project_id, location, processor_id, gcs_output_uri, gcs_input_prefix)
  File "/Users/ubuntu/source/playground/doc_batch.py", line 90, in batch_process_documents
    operation.result(timeout=timeout)
  File "/Users/ubuntu/source/playground/venv/lib/python3.12/site-packages/google/api_core/future/polling.py", line 261, in result
    raise self._exception
google.api_core.exceptions.InvalidArgument: 400 Failed to process all documents. 3: Failed to process all documents.
(venv) ubuntu@MacBook-Pro-2 playground % 

Solution

  • Firstly, the problem was with my list of field to include

    field_mask = "entities.id, entities.confidence, entities.type, entities.mentionText"
    

    after looking into the API which was not easy to come by, I realized that the entities.* element does not support retrieving its children. only top level element is possible to retrieve "entities" (unlike the page element for instance).

    Unfortunately, the batch process request did not show this error, but after using the API to retrieve the operation status (get_operation), I could see the full error message.