I upload a pdf file to my streamlit application like this:
# Minimal Streamlit script: accept a PDF upload and pass it to the analyzer.
import streamlit as st
# Restrict the upload widget to PDF files.
uploaded_file = st.file_uploader("Upload pdf file", type="pdf")
# NOTE(review): the definition of analyze_general_document shown in this post
# takes a second `secrets` parameter that this call does not pass, and the
# traceback shows the real call site passing it - confirm this line is only
# a simplified excerpt.
result = analyze_general_document(uploaded_file)
I want to analyze this PDF using the Azure Document Intelligence
Python package like this:
from io import BytesIO
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
def set_client(secrets: dict):
    """Create a ``DocumentAnalysisClient`` from the given settings.

    Args:
        secrets: Mapping that must contain ``"AI_DOCS_BASE"`` (passed as the
            client endpoint) and ``"AI_DOCS_KEY"`` (wrapped in an
            ``AzureKeyCredential``).

    Returns:
        The configured ``DocumentAnalysisClient``.

    Raises:
        KeyError: If either key is missing from ``secrets``.
    """
    endpoint = secrets["AI_DOCS_BASE"]
    credential = AzureKeyCredential(secrets["AI_DOCS_KEY"])
    return DocumentAnalysisClient(endpoint=endpoint, credential=credential)
def analyze_general_document(uploaded_file, secrets: dict):
    """Start a ``prebuilt-document`` analysis of the uploaded file.

    Prints the upload's reported type and size, then submits the file object
    itself to the service. The returned poller is not used further in this
    snippet, so the function returns ``None``.

    Args:
        uploaded_file: Upload object exposing ``type`` and ``size``.
        secrets: Settings forwarded to ``set_client``.
    """
    print(f"File type: {uploaded_file.type}")
    print(f"File size: {uploaded_file.size} bytes")

    analysis_client = set_client(secrets)
    # URL-based alternative that was tried earlier:
    #   analysis_client.begin_analyze_document_from_url("prebuilt-document", formUrl)
    # NOTE(review): the stream is handed over at whatever position it is
    # currently at; if something has already read from it, presumably little
    # or nothing is left to send - confirm against the caller.
    operation = analysis_client.begin_analyze_document(
        "prebuilt-document",
        document=uploaded_file,
    )
I can successfully print the file type and file size as you can see in the terminal output:
File type: application/pdf
File size: 6928426 bytes
Also opening the file with PyMuPDF
works fine as well.
However, the method begin_analyze_document
throws the following exception:
Traceback (most recent call last):
File "C:\Users\myuser\AppData\Local\miniconda3\envs\projectai\Lib\site-packages\streamlit\runtime\scriptrunner\exec_code.py", line 88, in exec_func_with_error_handling
result = func()
^^^^^^
File "C:\Users\myuser\AppData\Local\miniconda3\envs\projectai\Lib\site-packages\streamlit\runtime\scriptrunner\script_runner.py", line 579, in code_to_exec
exec(code, module.__dict__)
File "C:\Users\myuser\Documents\visual-studio-code\project\project-ai-docs\webapp\app.py", line 79, in <module>
main()
File "C:\Users\myuser\Documents\visual-studio-code\project\project-ai-docs\webapp\app.py", line 61, in main
zip_content = process_pdf(uploaded_file, secrets)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\myuser\Documents\visual-studio-code\project\project-ai-docs\webapp\app_backend.py", line 40, in process_pdf
analyze_general_document(uploaded_file, secrets)
File "C:\Users\myuser\Documents\visual-studio-code\project\project-ai-docs\webapp\az_document_intelligence.py", line 18, in analyze_general_document
poller = client.begin_analyze_document("prebuilt-document", document=uploaded_file)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\myuser\AppData\Local\miniconda3\envs\projectai\Lib\site-packages\azure\core\tracing\decorator.py", line 105, in wrapper_use_tracer
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\myuser\AppData\Local\miniconda3\envs\projectai\Lib\site-packages\azure\ai\formrecognizer\_document_analysis_client.py", line 129, in begin_analyze_document
return _client_op_path.begin_analyze_document( # type: ignore
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\myuser\AppData\Local\miniconda3\envs\projectai\Lib\site-packages\azure\core\tracing\decorator.py", line 105, in wrapper_use_tracer
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\myuser\AppData\Local\miniconda3\envs\projectai\Lib\site-packages\azure\ai\formrecognizer\_generated\v2023_07_31\operations\_document_models_operations.py", line 518, in begin_analyze_document
raw_result = self._analyze_document_initial( # type: ignore
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\myuser\AppData\Local\miniconda3\envs\projectai\Lib\site-packages\azure\ai\formrecognizer\_generated\v2023_07_31\operations\_document_models_operations.py", line 443, in _analyze_document_initial
raise HttpResponseError(response=response)
azure.core.exceptions.HttpResponseError: (InvalidRequest) Invalid request.
Code: InvalidRequest
Message: Invalid request.
Inner error: {
"code": "InvalidContent",
"message": "The file is corrupted or format is unsupported. Refer to documentation for the list of supported formats."
}
Why is the pdf considered invalid? I also tried wrapping it in a BytesIO object like this but it didn't work either:
def analyze_general_document(uploaded_file, secrets: dict):
    """Start a ``prebuilt-document`` analysis from an in-memory copy.

    Same as passing the upload directly, except the remaining stream content
    is first read into bytes and wrapped in a fresh ``BytesIO``. The returned
    poller is not used further in this snippet, so the function returns
    ``None``.

    Args:
        uploaded_file: Upload object exposing ``type``, ``size`` and ``read``.
        secrets: Settings forwarded to ``set_client``.
    """
    print(f"File type: {uploaded_file.type}")
    print(f"File size: {uploaded_file.size} bytes")

    # NOTE(review): read() starts at the current stream position, which is
    # not reset here - if the stream was consumed earlier this is presumably
    # empty; confirm against the caller.
    payload = uploaded_file.read()

    analysis_client = set_client(secrets)
    # URL-based alternative that was tried earlier:
    #   analysis_client.begin_analyze_document_from_url("prebuilt-document", formUrl)
    operation = analysis_client.begin_analyze_document(
        "prebuilt-document",
        document=BytesIO(payload),
    )
Azure Document Intelligence (formrecognizer) - 'InvalidContent' when passing pdf
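You can use the code below to analyze a PDF file uploaded through Streamlit with Azure Document Intelligence in Python. Note that it resets the file pointer with `seek(0)` before every read: if the stream has already been consumed (for example by another PDF library), `read()` returns no data, which is the likely cause of the `InvalidContent` error.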
Code:
import streamlit as st
from io import BytesIO
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
from PyPDF2 import PdfReader
def set_client(secrets: dict):
    """Build the Azure Document Intelligence client.

    Args:
        secrets: Mapping with ``"AI_DOCS_BASE"`` (used as the endpoint) and
            ``"AI_DOCS_KEY"`` (used as the API key).

    Returns:
        A ``DocumentAnalysisClient`` authenticated with the given key.

    Raises:
        KeyError: If either key is missing from ``secrets``.
    """
    base_url = secrets["AI_DOCS_BASE"]
    api_key = secrets["AI_DOCS_KEY"]
    client = DocumentAnalysisClient(
        endpoint=base_url,
        credential=AzureKeyCredential(api_key),
    )
    return client
def validate_pdf(file):
    """Check that ``file`` parses as a PDF with at least one page.

    Args:
        file: Seekable binary file-like object holding the PDF.

    Raises:
        ValueError: If the stream cannot be rewound or parsed, or if the PDF
            has no pages. The underlying exception is attached as
            ``__cause__`` so the original parser error stays visible.
    """
    try:
        # Rewind first: an earlier reader may have left the pointer at EOF.
        file.seek(0)
        page_count = len(PdfReader(file).pages)
        if page_count == 0:
            raise ValueError("The PDF has no pages.")
        print(f"PDF is valid with {page_count} pages.")
    except Exception as e:
        # Chain the cause instead of discarding it, so the traceback still
        # shows what actually went wrong inside the PDF parser.
        raise ValueError(f"PDF validation failed: {e}") from e
def check_pdf_metadata(file):
    """Reject encrypted PDFs and print the page count otherwise.

    Args:
        file: Seekable binary file-like object holding the PDF.

    Raises:
        ValueError: If the PDF reports itself as encrypted.
    """
    file.seek(0)  # start parsing from the beginning of the stream
    pdf = PdfReader(file)
    if not pdf.is_encrypted:
        print(f"PDF has {len(pdf.pages)} pages and is not encrypted.")
        return
    raise ValueError("Encrypted PDFs are not supported.")
def analyze_general_document(uploaded_file, secrets: dict):
    """Validate an uploaded PDF and analyze it with ``prebuilt-document``.

    Args:
        uploaded_file: Seekable binary file-like object holding the PDF.
        secrets: Settings forwarded to ``set_client``.

    Returns:
        The analysis result returned by the poller.

    Raises:
        ValueError: If local validation rejects the PDF (unparseable, no
            pages, or encrypted).
        RuntimeError: If the service call fails. The original exception is
            attached as ``__cause__``.
    """
    validate_pdf(uploaded_file)
    check_pdf_metadata(uploaded_file)

    # The validators read the stream, so rewind before taking the payload.
    # Reading from a consumed stream would yield empty bytes.
    uploaded_file.seek(0)
    file_bytes = uploaded_file.read()
    # Leave the stream rewound so later consumers are not handed an
    # exhausted stream.
    uploaded_file.seek(0)

    client = set_client(secrets)
    try:
        print("Sending file to Azure Document Intelligence...")
        poller = client.begin_analyze_document(
            "prebuilt-document",
            document=BytesIO(file_bytes),
        )
        result = poller.result()
    except Exception as e:
        # Keep the service error chained so its details stay in the traceback.
        raise RuntimeError(f"Azure Document Intelligence error: {e}") from e

    print("Analysis successful!")
    return result
# Streamlit entry point
def main():
    """Render the upload page and show the analysis of the chosen PDF.

    Shows the file's name, type and size, runs the analysis, and displays
    either the result as JSON or the error message.
    """
    st.title("Azure Document Intelligence PDF Analyzer")
    uploaded_file = st.file_uploader("Upload PDF file", type="pdf")
    if uploaded_file is None:
        # Nothing selected yet, so there is nothing to analyze.
        return

    # Basic details of the upload
    st.write(f"File Name: {uploaded_file.name}")
    st.write(f"File Type: {uploaded_file.type}")
    st.write(f"File Size: {uploaded_file.size} bytes")

    # Placeholder Azure settings - substitute the real endpoint and key, and
    # avoid committing real credentials to source control.
    azure_settings = {
        "AI_DOCS_BASE": "https://xxxxxxx.cognitiveservices.azure.com/",
        "AI_DOCS_KEY": "xxxxx",
    }

    try:
        analysis = analyze_general_document(uploaded_file, azure_settings)
        st.success("PDF analysis completed!")
        st.json(analysis.to_dict())  # render the full result in the page
    except Exception as exc:
        st.error(f"Error: {exc}")


if __name__ == "__main__":
    main()
Output:
PS C:\Users\xxxx> streamlit run set.py
You can now view your Streamlit app in your browser.
Local URL: http://localhost:8xxx1
Network URL: http://192.168.1.8:8xxx1
PDF is valid with 20 pages.
PDF has 20 pages and is not encrypted.
Sending file to Azure Document Intelligence...
Analysis successful!
Browser:
Reference: azure.ai.formrecognizer.DocumentAnalysisClient class | Microsoft Learn