I am trying to use Google's AutoML Natural Language to extract entities from PDF documents and export those entities and their associated values to CSV files. I have already trained an entity extraction model in the AutoML UI, and am now writing a Python program to invoke the model and perform the post-processing.
For now, I am just experimenting with some simple code that runs an entity-extraction prediction on a single PDF file. However, I am running into some issues and would like some help; below is my code:
import sys
import os
from google.api_core.client_options import ClientOptions
from google.cloud import automl_v1
# Service-account key file the Google client libraries authenticate with.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/jetsonwu/intelligent-upload/AutoML_NLP/GCP/automl-pipeline-354119-bc09422edc36.json"

# Identifier of the trained entity-extraction model.
# NOTE(review): this is a bare model ID; the working variant of this script
# uses a full "projects/.../locations/.../models/..." resource path instead.
model_id = "TEN7692325184920879104"

# Cloud Storage URI of the PDF to run the prediction on.
file_path = "gs://intelligent_upload/electric-bill/pdf/China World Trade Center EB1.pdf"
def inline_text_payload(file_path):
    """Build a ``text_snippet`` prediction payload from a local file.

    The file at ``file_path`` is opened in binary mode and its raw bytes are
    used, undecoded, as the snippet content. The MIME type is always
    ``text/plain``.
    """
    with open(file_path, 'rb') as source:
        raw_bytes = source.read()
    snippet = {'content': raw_bytes, 'mime_type': 'text/plain'}
    return {'text_snippet': snippet}
def pdf_payload(file_path):
    """Build a ``document`` prediction payload that references one source URI.

    ``file_path`` is placed, unvalidated, as the only entry of
    ``input_uris`` under ``document.input_config.gcs_source``.
    """
    gcs_source = {'input_uris': [file_path]}
    input_config = {'gcs_source': gcs_source}
    return {'document': {'input_config': input_config}}
def get_prediction(file_path, model_name):
    """Send ``file_path`` to the AutoML model ``model_name`` and return the reply.

    The input is wrapped with ``pdf_payload``; swap in
    ``inline_text_payload(file_path)`` to predict on a local plain-text file
    instead.

    NOTE(review): the post this snippet comes from reports that this call
    fails with "404 / MethodNotImplemented", and that switching the endpoint
    to 'automl.googleapis.com' resolved it.
    """
    prediction_client = automl_v1.PredictionServiceClient(
        client_options=ClientOptions(api_endpoint='us-automl.googleapis.com'),
    )
    payload = pdf_payload(file_path)
    # predict() is a blocking call: it returns once the service has answered.
    return prediction_client.predict(name=model_name, payload=payload, params={})
if __name__ == '__main__':
    # Positional command-line arguments: <file_path> <model_name>.
    cli_args = sys.argv
    file_path = cli_args[1]
    # NOTE(review): model_name is read here but never used; the call below
    # passes the module-level model_id instead.
    model_name = cli_args[2]
    prediction = get_prediction(file_path, model_id)
    print(prediction)
The code above is pretty much copied straight from the AutoML UI, but I am getting some errors and could not figure out why... Here is the error message:
E0627 22:10:44.828979000 4336043392 hpack_parser.cc:1234] Error parsing metadata: error=invalid value key=content-type value=text/html; charset=UTF-8
---------------------------------------------------------------------------
_InactiveRpcError Traceback (most recent call last)
File ~/Library/Python/3.8/lib/python/site-packages/google/api_core/grpc_helpers.py:50, in _wrap_unary_errors.<locals>.error_remapped_callable(*args, **kwargs)
49 try:
---> 50 return callable_(*args, **kwargs)
51 except grpc.RpcError as exc:
File ~/Library/Python/3.8/lib/python/site-packages/grpc/_channel.py:946, in _UnaryUnaryMultiCallable.__call__(self, request, timeout, metadata, credentials, wait_for_ready, compression)
944 state, call, = self._blocking(request, timeout, metadata, credentials,
945 wait_for_ready, compression)
--> 946 return _end_unary_response_blocking(state, call, False, None)
File ~/Library/Python/3.8/lib/python/site-packages/grpc/_channel.py:849, in _end_unary_response_blocking(state, call, with_call, deadline)
848 else:
--> 849 raise _InactiveRpcError(state)
_InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
status = StatusCode.UNIMPLEMENTED
details = "Received http2 header with status: 404"
debug_error_string = "{"created":"@1656382244.829029000","description":"Error received from peer ipv4:142.250.191.234:443","file":"src/core/lib/surface/call.cc","file_line":967,"grpc_message":"Received http2 header with status: 404","grpc_status":12}"
>
The above exception was the direct cause of the following exception:
MethodNotImplemented Traceback (most recent call last)
...
50 return callable_(*args, **kwargs)
51 except grpc.RpcError as exc:
---> 52 raise exceptions.from_grpc_error(exc) from exc
MethodNotImplemented: 501 Received http2 header with status: 404
For the file path, I am using one of the PDF files that I previously uploaded to Cloud Storage. Would anyone please take a look and help? Much appreciated!
We were able to replicate your scenario and make a solution based on this documentation:
AutoML Natural Language supports both a global API endpoint (automl.googleapis.com) and a European Union endpoint (eu-automl.googleapis.com).
I changed the `api_endpoint` from `us-automl.googleapis.com` to `automl.googleapis.com`. I also changed `model_id` so that it contains the full model resource path. See the working code below:
import sys
import os
from google.api_core.client_options import ClientOptions
from google.cloud import automl_v1
# Service-account key file the Google client libraries authenticate with.
# Replace <your-path> with the directory that holds your key file.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/<your-path>/tiph-anjelab-318a5dcad3c6.json"

# Full resource path of the trained model (not just the bare model ID).
# Fill in both placeholders: your project ID and your model ID.
model_id = "projects/<your-project-id>/locations/us-central1/models/<your-model-id>"

# Local plain-text file to run the prediction on.
file_path = "<your-path>/test.txt"
def inline_text_payload(file_path):
    """Return a ``text_snippet`` payload holding the contents of a local file.

    The file is read in binary mode, so ``content`` carries the raw bytes
    exactly as stored on disk; ``mime_type`` is fixed to ``text/plain``.
    """
    with open(file_path, mode='rb') as handle:
        data = handle.read()
    payload = {'text_snippet': {}}
    payload['text_snippet']['content'] = data
    payload['text_snippet']['mime_type'] = 'text/plain'
    return payload
def pdf_payload(file_path):
    """Return a ``document`` payload pointing at a single source URI.

    The given ``file_path`` becomes the sole element of ``input_uris``
    inside ``document.input_config.gcs_source``; it is not validated.
    """
    uris = [file_path]
    return dict(document=dict(input_config=dict(gcs_source=dict(input_uris=uris))))
def get_prediction(file_path, model_name, api_endpoint='automl.googleapis.com'):
    """Run an AutoML Natural Language prediction and return the response.

    Args:
        file_path: Input to predict on. A ``gs://`` URI is sent as a document
            reference (``pdf_payload``); anything else is treated as a local
            plain-text file and sent inline (``inline_text_payload``).
        model_name: Full model resource path, i.e.
            ``projects/<project>/locations/<location>/models/<model-id>``.
        api_endpoint: API host to connect to. Defaults to the global endpoint
            ``automl.googleapis.com``; pass ``eu-automl.googleapis.com`` for
            the European Union endpoint.

    Returns:
        Whatever ``PredictionServiceClient.predict`` returns for the request.
    """
    options = ClientOptions(api_endpoint=api_endpoint)
    prediction_client = automl_v1.PredictionServiceClient(client_options=options)

    # Choose the payload from the input location, so nobody has to edit
    # comments to switch between text and PDF input. str() keeps path-like
    # objects working for the local-file case.
    if str(file_path).startswith('gs://'):
        payload = pdf_payload(file_path)
    else:
        payload = inline_text_payload(file_path)

    # predict() is a blocking call: it returns once the service has answered.
    response = prediction_client.predict(name=model_name, payload=payload, params={})
    return response
if __name__ == '__main__':
    # Uses the module-level file_path and model_id. To take them from the
    # command line instead, read sys.argv[1] and sys.argv[2] here.
    prediction = get_prediction(file_path, model_id)
    print(prediction)
Output:
payload {
annotation_spec_id: "6167291562079289344"
display_name: "Modifier"
text_extraction {
score: 0.996536374092102
text_segment {
start_offset: 13
end_offset: 39
content: "hereditary hemochromatosis"
}
}
}
payload {
annotation_spec_id: "1555605543651901440"
display_name: "DiseaseClass"
text_extraction {
score: 0.9985179901123047
text_segment {
start_offset: 180
end_offset: 207
content: "autosomal recessive disease"
}
}
}
payload {
annotation_spec_id: "2708527048258748416"
display_name: "SpecificDisease"
text_extraction {
score: 0.999455988407135
text_segment {
start_offset: 208
end_offset: 234
content: "hereditary hemochromatosis"
}
}
}
payload {
annotation_spec_id: "6167291562079289344"
display_name: "Modifier"
text_extraction {
score: 0.9980571269989014
text_segment {
start_offset: 2323
end_offset: 2349
content: "hereditary hemochromatotic"
}
}
}