When I convert the message output from the Google Cloud Natural Language API to JSON using protobuf's MessageToJson, the sentiment score and magnitude fields end up with no value at all. If I skip the conversion and just print the result, they do have values. How can this be fixed? I tried both
json_results = MessageToJson(result, preserving_proto_field_name=True)
and
json_results = MessageToJson(result)
and I am not sure why this happens.
Here's an example resulting file:
$ cat 10.json
{
"entities": [
{
"name": "Professor",
"type": "PERSON",
"salience": 0.47092151641845703,
"mentions": [
{
"text": {
"content": "Professor",
"begin_offset": 47
},
"type": "COMMON",
"sentiment": {}
}
],
"sentiment": {}
},
{
"name": "Did U of California Fire Tenured Riverside",
"type": "ORGANIZATION",
"salience": 0.2889040410518646,
"mentions": [
{
"text": {
"content": "Did U of California Fire Tenured Riverside",
"begin_offset": 4
},
"type": "PROPER",
"sentiment": {}
}
],
"sentiment": {}
},
{
"name": "Ted Cruz",
"type": "PERSON",
"metadata": {
"wikipedia_url": "https://en.wikipedia.org/wiki/Ted_Cruz",
"mid": "/m/07j6ty"
},
"salience": 0.1294257491827011,
"mentions": [
{
"text": {
"content": "Ted Cruz",
"begin_offset": 60
},
"type": "PROPER",
"sentiment": {}
}
],
"sentiment": {}
},
{
"name": "some_url",
"type": "OTHER",
"salience": 0.0676858201622963,
"mentions": [
{
"text": {
"content": "some_url",
"begin_offset": 92
},
"type": "PROPER",
"sentiment": {}
}
],
"sentiment": {}
},
{
"name": "Higher Ed",
"type": "OTHER",
"metadata": {
"wikipedia_url": "https://en.wikipedia.org/wiki/Higher_education",
"mid": "/m/03r55"
},
"salience": 0.043062858283519745,
"mentions": [
{
"text": {
"content": "Higher Ed",
"begin_offset": 73
},
"type": "PROPER",
"sentiment": {}
}
],
"sentiment": {}
}
],
"language": "en"
}
Here's the code:
# Copyright 2016 Google, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This application demonstrates how to perform basic operations with the
Google Cloud Natural Language API
For more information, see the documentation at
https://cloud.google.com/natural-language/docs.
"""
import argparse
import sys
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
import six
import json
from google.protobuf.json_format import MessageToDict, MessageToJson
# [START def_sentiment_text]
def sentiment_text(text):
"""Detects sentiment in the text."""
client = language.LanguageServiceClient()
if isinstance(text, six.binary_type):
text = text.decode('utf-8')
# Instantiates a plain text document.
# [START migration_document_text]
# [START migration_analyze_sentiment]
document = types.Document(
content=text,
type=enums.Document.Type.PLAIN_TEXT)
# [END migration_document_text]
# Detects sentiment in the document. You can also analyze HTML with:
# document.type == enums.Document.Type.HTML
sentiment = client.analyze_sentiment(document).document_sentiment
print('Score: {}'.format(sentiment.score))
print('Magnitude: {}'.format(sentiment.magnitude))
# [END migration_analyze_sentiment]
# [END def_sentiment_text]
# [START def_sentiment_file]
def sentiment_file(gcs_uri):
"""Detects sentiment in the file located in Google Cloud Storage."""
client = language.LanguageServiceClient()
# Instantiates a plain text document.
# [START migration_document_gcs_uri]
document = types.Document(
gcs_content_uri=gcs_uri,
type=enums.Document.Type.PLAIN_TEXT)
# [END migration_document_gcs_uri]
# Detects sentiment in the document. You can also analyze HTML with:
# document.type == enums.Document.Type.HTML
sentiment = client.analyze_sentiment(document).document_sentiment
print('Score: {}'.format(sentiment.score))
print('Magnitude: {}'.format(sentiment.magnitude))
# [END def_sentiment_file]
# [START def_entities_text]
def entities_text(text):
"""Detects entities in the text."""
client = language.LanguageServiceClient()
if isinstance(text, six.binary_type):
text = text.decode('utf-8')
# Instantiates a plain text document.
# [START migration_analyze_entities]
document = types.Document(
content=text,
type=enums.Document.Type.PLAIN_TEXT)
# Detects entities in the document. You can also analyze HTML with:
# document.type == enums.Document.Type.HTML
entities = client.analyze_entities(document).entities
# entity types from enums.Entity.Type
entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION',
'EVENT', 'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')
for entity in entities:
print('=' * 20)
print(u'{:<16}: {}'.format('name', entity.name))
print(u'{:<16}: {}'.format('type', entity_type[entity.type]))
print(u'{:<16}: {}'.format('metadata', entity.metadata))
print(u'{:<16}: {}'.format('salience', entity.salience))
print(u'{:<16}: {}'.format('wikipedia_url',
entity.metadata.get('wikipedia_url', '-')))
# [END migration_analyze_entities]
# [END def_entities_text]
# [START def_entities_file]
def entities_file(gcs_uri):
"""Detects entities in the file located in Google Cloud Storage."""
client = language.LanguageServiceClient()
# Instantiates a plain text document.
document = types.Document(
gcs_content_uri=gcs_uri,
type=enums.Document.Type.PLAIN_TEXT)
# Detects sentiment in the document. You can also analyze HTML with:
# document.type == enums.Document.Type.HTML
entities = client.analyze_entities(document).entities
# entity types from enums.Entity.Type
entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION',
'EVENT', 'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')
for entity in entities:
print('=' * 20)
print(u'{:<16}: {}'.format('name', entity.name))
print(u'{:<16}: {}'.format('type', entity_type[entity.type]))
print(u'{:<16}: {}'.format('metadata', entity.metadata))
print(u'{:<16}: {}'.format('salience', entity.salience))
print(u'{:<16}: {}'.format('wikipedia_url',
entity.metadata.get('wikipedia_url', '-')))
# [END def_entities_file]
# [START def_syntax_text]
def syntax_text(text):
"""Detects syntax in the text."""
client = language.LanguageServiceClient()
if isinstance(text, six.binary_type):
text = text.decode('utf-8')
# Instantiates a plain text document.
# [START migration_analyze_syntax]
document = types.Document(
content=text,
type=enums.Document.Type.PLAIN_TEXT)
# Detects syntax in the document. You can also analyze HTML with:
# document.type == enums.Document.Type.HTML
tokens = client.analyze_syntax(document).tokens
# part-of-speech tags from enums.PartOfSpeech.Tag
pos_tag = ('UNKNOWN', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM',
'PRON', 'PRT', 'PUNCT', 'VERB', 'X', 'AFFIX')
for token in tokens:
print(u'{}: {}'.format(pos_tag[token.part_of_speech.tag],
token.text.content))
# [END migration_analyze_syntax]
# [END def_syntax_text]
# [START def_syntax_file]
def syntax_file(gcs_uri):
"""Detects syntax in the file located in Google Cloud Storage."""
client = language.LanguageServiceClient()
# Instantiates a plain text document.
document = types.Document(
gcs_content_uri=gcs_uri,
type=enums.Document.Type.PLAIN_TEXT)
# Detects syntax in the document. You can also analyze HTML with:
# document.type == enums.Document.Type.HTML
tokens = client.analyze_syntax(document).tokens
# part-of-speech tags from enums.PartOfSpeech.Tag
pos_tag = ('UNKNOWN', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM',
'PRON', 'PRT', 'PUNCT', 'VERB', 'X', 'AFFIX')
for token in tokens:
print(u'{}: {}'.format(pos_tag[token.part_of_speech.tag],
token.text.content))
# [END def_syntax_file]
# [START def_entity_sentiment_text]
def entity_sentiment_text(text, line_number):
"""Detects entity sentiment in the provided text."""
client = language.LanguageServiceClient()
if isinstance(text, six.binary_type):
text = text.decode('utf-8')
document = types.Document(
content=text.encode('utf-8'),
type=enums.Document.Type.PLAIN_TEXT)
# Detect and send native Python encoding to receive correct word offsets.
encoding = enums.EncodingType.UTF32
if sys.maxunicode == 65535:
encoding = enums.EncodingType.UTF16
result = client.analyze_entity_sentiment(document, encoding)
json_results = MessageToJson(result, preserving_proto_field_name=True)
print(type(result))
'''
for entity in result.entities:
print('Mentions: ')
print(u'Name: "{}"'.format(entity.name))
for mention in entity.mentions:
print(u' Begin Offset : {}'.format(mention.text.begin_offset))
print(u' Content : {}'.format(mention.text.content))
print(u' Magnitude : {}'.format(mention.sentiment.magnitude))
print(u' Sentiment : {}'.format(mention.sentiment.score))
print(u' Type : {}'.format(mention.type))
print(u'Salience: {}'.format(entity.salience))
print(u'Sentiment: {}\n'.format(entity.sentiment))
'''
print(result)
file_name = str(line_number)+".json"
with open(file_name, 'w') as outfile:
outfile.write(json_results)
#json.dump(result, outfile)
#for entity in result.entities:
# for mention in entity.mentions:
# [END def_entity_sentiment_text]
def entity_sentiment_file(gcs_uri):
"""Detects entity sentiment in a Google Cloud Storage file."""
client = language.LanguageServiceClient()
document = types.Document(
gcs_content_uri=gcs_uri,
type=enums.Document.Type.PLAIN_TEXT)
# Detect and send native Python encoding to receive correct word offsets.
encoding = enums.EncodingType.UTF32
if sys.maxunicode == 65535:
encoding = enums.EncodingType.UTF16
result = client.analyze_entity_sentiment(document, encoding)
for entity in result.entities:
print(u'Name: "{}"'.format(entity.name))
for mention in entity.mentions:
print(u' Begin Offset : {}'.format(mention.text.begin_offset))
print(u' Content : {}'.format(mention.text.content))
print(u' Magnitude : {}'.format(mention.sentiment.magnitude))
print(u' Sentiment : {}'.format(mention.sentiment.score))
print(u' Type : {}'.format(mention.type))
print(u'Salience: {}'.format(entity.salience))
print(u'Sentiment: {}\n'.format(entity.sentiment))
# [START def_classify_text]
def classify_text(text):
"""Classifies content categories of the provided text."""
client = language.LanguageServiceClient()
if isinstance(text, six.binary_type):
text = text.decode('utf-8')
document = types.Document(
content=text.encode('utf-8'),
type=enums.Document.Type.PLAIN_TEXT)
categories = client.classify_text(document).categories
for category in categories:
print(u'=' * 20)
print(u'{:<16}: {}'.format('name', category.name))
print(u'{:<16}: {}'.format('confidence', category.confidence))
# [END def_classify_text]
# [START def_classify_file]
def classify_file(gcs_uri):
"""Classifies content categories of the text in a Google Cloud Storage
file.
"""
client = language.LanguageServiceClient()
document = types.Document(
gcs_content_uri=gcs_uri,
type=enums.Document.Type.PLAIN_TEXT)
categories = client.classify_text(document).categories
for category in categories:
print(u'=' * 20)
print(u'{:<16}: {}'.format('name', category.name))
print(u'{:<16}: {}'.format('confidence', category.confidence))
# [END def_classify_file]
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
subparsers = parser.add_subparsers(dest='command')
classify_text_parser = subparsers.add_parser(
'classify-text', help=classify_text.__doc__)
classify_text_parser.add_argument('text')
classify_text_parser = subparsers.add_parser(
'classify-file', help=classify_file.__doc__)
classify_text_parser.add_argument('gcs_uri')
sentiment_entities_text_parser = subparsers.add_parser(
'sentiment-entities-text', help=entity_sentiment_text.__doc__)
sentiment_entities_text_parser.add_argument('text')
#added by mona
sentiment_entities_text_parser.add_argument('line_number')
sentiment_entities_file_parser = subparsers.add_parser(
'sentiment-entities-file', help=entity_sentiment_file.__doc__)
sentiment_entities_file_parser.add_argument('gcs_uri')
sentiment_text_parser = subparsers.add_parser(
'sentiment-text', help=sentiment_text.__doc__)
sentiment_text_parser.add_argument('text')
sentiment_file_parser = subparsers.add_parser(
'sentiment-file', help=sentiment_file.__doc__)
sentiment_file_parser.add_argument('gcs_uri')
entities_text_parser = subparsers.add_parser(
'entities-text', help=entities_text.__doc__)
entities_text_parser.add_argument('text')
entities_file_parser = subparsers.add_parser(
'entities-file', help=entities_file.__doc__)
entities_file_parser.add_argument('gcs_uri')
syntax_text_parser = subparsers.add_parser(
'syntax-text', help=syntax_text.__doc__)
syntax_text_parser.add_argument('text')
syntax_file_parser = subparsers.add_parser(
'syntax-file', help=syntax_file.__doc__)
syntax_file_parser.add_argument('gcs_uri')
args = parser.parse_args()
if args.command == 'sentiment-text':
sentiment_text(args.text)
elif args.command == 'sentiment-file':
sentiment_file(args.gcs_uri)
elif args.command == 'entities-text':
entities_text(args.text)
elif args.command == 'entities-file':
entities_file(args.gcs_uri)
elif args.command == 'syntax-text':
syntax_text(args.text)
elif args.command == 'syntax-file':
syntax_file(args.gcs_uri)
elif args.command == 'sentiment-entities-text':
entity_sentiment_text(args.text, args.line_number)
elif args.command == 'sentiment-entities-file':
entity_sentiment_file(args.gcs_uri)
elif args.command == 'classify-text':
classify_text(args.text)
elif args.command == 'classify-file':
classify_file(args.gcs_uri)
Here's the script I use to run the code:
#!/bin/bash
n=1
while read -u 3 -r line; do
echo $n "${line::30}"
python entity_sentiment.py sentiment-entities-text "$line" $n
((n++))
done 3< 10tweets.txt
which I run with
bash -x runjob.sh
Also, 10tweets.txt is:
$ cat 10tweets.txt
Trump on the other hand goes all in on water boarding AND some. #GOPDebate
RT @wpjenna Donald Trump promises that he will not touch the 2nd amendment -- "unless we're going to make it stronger."
Trump 23%, Rubio 19%, Kasich & Bush 14%, Christie 10%, Cruz 9% #NHPrimary
@realDonaldTrump Thank you for saying you won't use vulger language anymore. Talk about Sanders & Clinton. Take Cruz as VP. Mexican votes!!!
RT @SurfPHX Mr. Trump @realDonaldTrump tweeted 25 minutes ago. You all do realize, that our future President hardly sleeps. He's a Fighter and a Worker!
go, Bernie #DemDebate
Sanders calls out Clinton on taking Foreign Policy advice from Warmonger Henry Kissinger some_url via @YouTube
Cruz, Rubio, and the Moral Bankruptcy of Progressive Identity Politics some_url via @NRO
RT @scarylawyerguy "Who does Bernie Sanders listen to on foreign policy." - A question Hillary had to raise b/c the media will not. #DemDebate
Why Did U of California Fire Tenured Riverside Professor? / Ted Cruz and Higher Ed -- ... - some_url
If I just print the result instead of converting it, the output does show the magnitude and sentiment, as below:
Mentions:
Name: "Professor"
Begin Offset : 47
Content : Professor
Magnitude : 0.0
Sentiment : 0.0
Type : 2
Salience: 0.47092151641845703
Sentiment:
Mentions:
Name: "Did U of California Fire Tenured Riverside"
Begin Offset : 4
Content : Did U of California Fire Tenured Riverside
Magnitude : 0.0
Sentiment : 0.0
Type : 1
Salience: 0.2889040410518646
Sentiment:
Mentions:
Name: "Ted Cruz"
Begin Offset : 60
Content : Ted Cruz
Magnitude : 0.0
Sentiment : 0.0
Type : 1
Salience: 0.1294257491827011
Sentiment:
Mentions:
Name: "some_url"
Begin Offset : 92
Content : some_url
Magnitude : 0.0
Sentiment : 0.0
Type : 1
Salience: 0.0676858201622963
Sentiment:
Mentions:
Name: "Higher Ed"
Begin Offset : 73
Content : Higher Ed
Magnitude : 0.0
Sentiment : 0.0
Type : 1
Salience: 0.043062858283519745
Sentiment:
Basically, the sentiment field is empty, so there is no way to extract the score and magnitude from it the way I did previously with print:
print(u' Magnitude : {}'.format(mention.sentiment.magnitude))
print(u' Sentiment : {}'.format(mention.sentiment.score))
When a numeric field holds its default value (zero), proto3's JSON serialization omits it entirely, so a score or magnitude of 0 gets no field in the JSON at all. https://github.com/gogo/protobuf/issues/218 tracks the same behavior in the Go implementation, and there seems to be no fix for it there yet. The workaround is to treat a missing field as zero: if score or magnitude does not exist in the JSON, its value is 0 (see the sketch at the end of this answer). Here's an example in which some of the scores/magnitudes are zero, and therefore missing from the JSON file, while the rest are present:
{
"entities": [
{
"name": "RT @scarylawyerguy",
"type": "OTHER",
"salience": 0.4150770902633667,
"mentions": [
{
"text": {
"content": "RT @scarylawyerguy"
},
"type": "PROPER",
"sentiment": {}
}
],
"sentiment": {}
},
{
"name": "foreign policy",
"type": "OTHER",
"salience": 0.19249163568019867,
"mentions": [
{
"text": {
"content": "foreign policy",
"begin_offset": 57
},
"type": "COMMON",
"sentiment": {}
}
],
"sentiment": {}
},
{
"name": "Bernie Sanders",
"type": "PERSON",
"metadata": {
"wikipedia_url": "https://en.wikipedia.org/wiki/Bernie_Sanders",
"mid": "/m/01_gbv"
},
"salience": 0.13153041899204254,
"mentions": [
{
"text": {
"content": "Bernie Sanders",
"begin_offset": 29
},
"type": "PROPER",
"sentiment": {}
}
],
"sentiment": {}
},
{
"name": "question",
"type": "OTHER",
"salience": 0.08613643795251846,
"mentions": [
{
"text": {
"content": "question",
"begin_offset": 78
},
"type": "COMMON",
"sentiment": {
"magnitude": 0.10000000149011612,
"score": -0.10000000149011612
}
}
],
"sentiment": {
"magnitude": 0.10000000149011612,
"score": -0.10000000149011612
}
},
{
"name": "media",
"type": "OTHER",
"salience": 0.0647100880742073,
"mentions": [
{
"text": {
"content": "media",
"begin_offset": 116
},
"type": "COMMON",
"sentiment": {}
}
],
"sentiment": {}
},
{
"name": "Hillary",
"type": "PERSON",
"metadata": {
"wikipedia_url": "https://en.wikipedia.org/wiki/Hillary_Clinton",
"mid": "/m/0d06m5"
},
"salience": 0.054773446172475815,
"mentions": [
{
"text": {
"content": "Hillary",
"begin_offset": 87
},
"type": "PROPER",
"sentiment": {}
}
],
"sentiment": {}
},
{
"name": "b/c",
"type": "OTHER",
"salience": 0.028641967102885246,
"mentions": [
{
"text": {
"content": "b/c",
"begin_offset": 108
},
"type": "COMMON",
"sentiment": {}
}
],
"sentiment": {}
},
{
"name": "DemDebate",
"type": "OTHER",
"salience": 0.026638930663466454,
"mentions": [
{
"text": {
"content": "DemDebate",
"begin_offset": 133
},
"type": "PROPER",
"sentiment": {}
}
],
"sentiment": {}
}
],
"language": "en"
}
and the raw result for that is:
<class 'google.cloud.language_v1.types.AnalyzeEntitySentimentResponse'>
entities {
name: "RT @scarylawyerguy"
type: OTHER
salience: 0.4150770902633667
mentions {
text {
content: "RT @scarylawyerguy"
}
type: PROPER
sentiment {
}
}
sentiment {
}
}
entities {
name: "foreign policy"
type: OTHER
salience: 0.19249163568019867
mentions {
text {
content: "foreign policy"
begin_offset: 57
}
type: COMMON
sentiment {
}
}
sentiment {
}
}
entities {
name: "Bernie Sanders"
type: PERSON
metadata {
key: "mid"
value: "/m/01_gbv"
}
metadata {
key: "wikipedia_url"
value: "https://en.wikipedia.org/wiki/Bernie_Sanders"
}
salience: 0.13153041899204254
mentions {
text {
content: "Bernie Sanders"
begin_offset: 29
}
type: PROPER
sentiment {
}
}
sentiment {
}
}
entities {
name: "question"
type: OTHER
salience: 0.08613643795251846
mentions {
text {
content: "question"
begin_offset: 78
}
type: COMMON
sentiment {
magnitude: 0.10000000149011612
score: -0.10000000149011612
}
}
sentiment {
magnitude: 0.10000000149011612
score: -0.10000000149011612
}
}
entities {
name: "media"
type: OTHER
salience: 0.0647100880742073
mentions {
text {
content: "media"
begin_offset: 116
}
type: COMMON
sentiment {
}
}
sentiment {
}
}
entities {
name: "Hillary"
type: PERSON
metadata {
key: "mid"
value: "/m/0d06m5"
}
metadata {
key: "wikipedia_url"
value: "https://en.wikipedia.org/wiki/Hillary_Clinton"
}
salience: 0.054773446172475815
mentions {
text {
content: "Hillary"
begin_offset: 87
}
type: PROPER
sentiment {
}
}
sentiment {
}
}
entities {
name: "b/c"
type: OTHER
salience: 0.028641967102885246
mentions {
text {
content: "b/c"
begin_offset: 108
}
type: COMMON
sentiment {
}
}
sentiment {
}
}
entities {
name: "DemDebate"
type: OTHER
salience: 0.026638930663466454
mentions {
text {
content: "DemDebate"
begin_offset: 133
}
type: PROPER
sentiment {
}
}
sentiment {
}
}
language: "en"