json, serialization, google-cloud-platform, protocol-buffers, google-cloud-nl

Some fields get no value when converting the Google Cloud result to JSON using protobuf


When I convert the message output from the Google Cloud Natural Language API to JSON using protobuf, the sentiment score and magnitude fields come out with no value at all. If I skip the conversion and just print them, they do have values. How can this be fixed? I tried both json_results = MessageToJson(result, preserving_proto_field_name=True) and json_results = MessageToJson(result), and I am not sure why this happens.
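
For what it's worth, the same thing happens on a bare Sentiment message, so it does not seem specific to the entity-sentiment response. Here is a minimal sketch, assuming types.Sentiment from google.cloud.language is the same message type the response embeds:

from google.cloud.language import types
from google.protobuf.json_format import MessageToJson

# Both fields sit at their proto3 default of 0.0 ...
neutral = types.Sentiment(score=0.0, magnitude=0.0)
print(MessageToJson(neutral))    # prints '{}' - both fields dropped

# ... while non-default values are serialized as expected.
positive = types.Sentiment(score=0.8, magnitude=0.9)
print(MessageToJson(positive))   # score and magnitude both appear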

Here's an example resulting file:

$ cat 10.json 
{
  "entities": [
    {
      "name": "Professor",
      "type": "PERSON",
      "salience": 0.47092151641845703,
      "mentions": [
        {
          "text": {
            "content": "Professor",
            "begin_offset": 47
          },
          "type": "COMMON",
          "sentiment": {}
        }
      ],
      "sentiment": {}
    },
    {
      "name": "Did U of California Fire Tenured Riverside",
      "type": "ORGANIZATION",
      "salience": 0.2889040410518646,
      "mentions": [
        {
          "text": {
            "content": "Did U of California Fire Tenured Riverside",
            "begin_offset": 4
          },
          "type": "PROPER",
          "sentiment": {}
        }
      ],
      "sentiment": {}
    },
    {
      "name": "Ted Cruz",
      "type": "PERSON",
      "metadata": {
        "wikipedia_url": "https://en.wikipedia.org/wiki/Ted_Cruz",
        "mid": "/m/07j6ty"
      },
      "salience": 0.1294257491827011,
      "mentions": [
        {
          "text": {
            "content": "Ted Cruz",
            "begin_offset": 60
          },
          "type": "PROPER",
          "sentiment": {}
        }
      ],
      "sentiment": {}
    },
    {
      "name": "some_url",
      "type": "OTHER",
      "salience": 0.0676858201622963,
      "mentions": [
        {
          "text": {
            "content": "some_url",
            "begin_offset": 92
          },
          "type": "PROPER",
          "sentiment": {}
        }
      ],
      "sentiment": {}
    },
    {
      "name": "Higher Ed",
      "type": "OTHER",
      "metadata": {
        "wikipedia_url": "https://en.wikipedia.org/wiki/Higher_education",
        "mid": "/m/03r55"
      },
      "salience": 0.043062858283519745,
      "mentions": [
        {
          "text": {
            "content": "Higher Ed",
            "begin_offset": 73
          },
          "type": "PROPER",
          "sentiment": {}
        }
      ],
      "sentiment": {}
    }
  ],
  "language": "en"
}

Here's the code:

# Copyright 2016 Google, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This application demonstrates how to perform basic operations with the
Google Cloud Natural Language API

For more information, see the documentation at
https://cloud.google.com/natural-language/docs.
"""

import argparse
import sys

from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
import six

import json
from google.protobuf.json_format import MessageToDict, MessageToJson




# [START def_sentiment_text]
def sentiment_text(text):
    """Detects sentiment in the text."""
    client = language.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    # [START migration_document_text]
    # [START migration_analyze_sentiment]
    document = types.Document(
        content=text,
        type=enums.Document.Type.PLAIN_TEXT)
    # [END migration_document_text]

    # Detects sentiment in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    sentiment = client.analyze_sentiment(document).document_sentiment

    print('Score: {}'.format(sentiment.score))
    print('Magnitude: {}'.format(sentiment.magnitude))
    # [END migration_analyze_sentiment]
# [END def_sentiment_text]


# [START def_sentiment_file]
def sentiment_file(gcs_uri):
    """Detects sentiment in the file located in Google Cloud Storage."""
    client = language.LanguageServiceClient()

    # Instantiates a plain text document.
    # [START migration_document_gcs_uri]
    document = types.Document(
        gcs_content_uri=gcs_uri,
        type=enums.Document.Type.PLAIN_TEXT)
    # [END migration_document_gcs_uri]

    # Detects sentiment in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    sentiment = client.analyze_sentiment(document).document_sentiment

    print('Score: {}'.format(sentiment.score))
    print('Magnitude: {}'.format(sentiment.magnitude))
# [END def_sentiment_file]


# [START def_entities_text]
def entities_text(text):
    """Detects entities in the text."""
    client = language.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    # [START migration_analyze_entities]
    document = types.Document(
        content=text,
        type=enums.Document.Type.PLAIN_TEXT)

    # Detects entities in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    entities = client.analyze_entities(document).entities

    # entity types from enums.Entity.Type
    entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION',
                   'EVENT', 'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')

    for entity in entities:
        print('=' * 20)
        print(u'{:<16}: {}'.format('name', entity.name))
        print(u'{:<16}: {}'.format('type', entity_type[entity.type]))
        print(u'{:<16}: {}'.format('metadata', entity.metadata))
        print(u'{:<16}: {}'.format('salience', entity.salience))
        print(u'{:<16}: {}'.format('wikipedia_url',
              entity.metadata.get('wikipedia_url', '-')))
    # [END migration_analyze_entities]
# [END def_entities_text]


# [START def_entities_file]
def entities_file(gcs_uri):
    """Detects entities in the file located in Google Cloud Storage."""
    client = language.LanguageServiceClient()

    # Instantiates a plain text document.
    document = types.Document(
        gcs_content_uri=gcs_uri,
        type=enums.Document.Type.PLAIN_TEXT)

    # Detects entities in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    entities = client.analyze_entities(document).entities

    # entity types from enums.Entity.Type
    entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION',
                   'EVENT', 'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')

    for entity in entities:
        print('=' * 20)
        print(u'{:<16}: {}'.format('name', entity.name))
        print(u'{:<16}: {}'.format('type', entity_type[entity.type]))
        print(u'{:<16}: {}'.format('metadata', entity.metadata))
        print(u'{:<16}: {}'.format('salience', entity.salience))
        print(u'{:<16}: {}'.format('wikipedia_url',
              entity.metadata.get('wikipedia_url', '-')))
# [END def_entities_file]


# [START def_syntax_text]
def syntax_text(text):
    """Detects syntax in the text."""
    client = language.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    # [START migration_analyze_syntax]
    document = types.Document(
        content=text,
        type=enums.Document.Type.PLAIN_TEXT)

    # Detects syntax in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    tokens = client.analyze_syntax(document).tokens

    # part-of-speech tags from enums.PartOfSpeech.Tag
    pos_tag = ('UNKNOWN', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM',
               'PRON', 'PRT', 'PUNCT', 'VERB', 'X', 'AFFIX')

    for token in tokens:
        print(u'{}: {}'.format(pos_tag[token.part_of_speech.tag],
                               token.text.content))
    # [END migration_analyze_syntax]
# [END def_syntax_text]


# [START def_syntax_file]
def syntax_file(gcs_uri):
    """Detects syntax in the file located in Google Cloud Storage."""
    client = language.LanguageServiceClient()

    # Instantiates a plain text document.
    document = types.Document(
        gcs_content_uri=gcs_uri,
        type=enums.Document.Type.PLAIN_TEXT)

    # Detects syntax in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    tokens = client.analyze_syntax(document).tokens

    # part-of-speech tags from enums.PartOfSpeech.Tag
    pos_tag = ('UNKNOWN', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM',
               'PRON', 'PRT', 'PUNCT', 'VERB', 'X', 'AFFIX')

    for token in tokens:
        print(u'{}: {}'.format(pos_tag[token.part_of_speech.tag],
                               token.text.content))
# [END def_syntax_file]


# [START def_entity_sentiment_text]
def entity_sentiment_text(text, line_number):
    """Detects entity sentiment in the provided text."""
    client = language.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    document = types.Document(
        content=text.encode('utf-8'),
        type=enums.Document.Type.PLAIN_TEXT)

    # Detect and send native Python encoding to receive correct word offsets.
    encoding = enums.EncodingType.UTF32
    if sys.maxunicode == 65535:
        encoding = enums.EncodingType.UTF16

    result = client.analyze_entity_sentiment(document, encoding)
    json_results = MessageToJson(result, preserving_proto_field_name=True)

    print(type(result))

    '''
    for entity in result.entities:
        print('Mentions: ')
        print(u'Name: "{}"'.format(entity.name))
        for mention in entity.mentions:
            print(u'  Begin Offset : {}'.format(mention.text.begin_offset))
            print(u'  Content : {}'.format(mention.text.content))
            print(u'  Magnitude : {}'.format(mention.sentiment.magnitude))
            print(u'  Sentiment : {}'.format(mention.sentiment.score))
            print(u'  Type : {}'.format(mention.type))
        print(u'Salience: {}'.format(entity.salience))
        print(u'Sentiment: {}\n'.format(entity.sentiment))
    '''
    print(result)
    file_name = str(line_number)+".json"
    with open(file_name, 'w') as outfile:
        outfile.write(json_results)
        # json.dump(result, outfile)

# [END def_entity_sentiment_text]


def entity_sentiment_file(gcs_uri):
    """Detects entity sentiment in a Google Cloud Storage file."""
    client = language.LanguageServiceClient()

    document = types.Document(
        gcs_content_uri=gcs_uri,
        type=enums.Document.Type.PLAIN_TEXT)

    # Detect and send native Python encoding to receive correct word offsets.
    encoding = enums.EncodingType.UTF32
    if sys.maxunicode == 65535:
        encoding = enums.EncodingType.UTF16

    result = client.analyze_entity_sentiment(document, encoding)

    for entity in result.entities:
        print(u'Name: "{}"'.format(entity.name))
        for mention in entity.mentions:
            print(u'  Begin Offset : {}'.format(mention.text.begin_offset))
            print(u'  Content : {}'.format(mention.text.content))
            print(u'  Magnitude : {}'.format(mention.sentiment.magnitude))
            print(u'  Sentiment : {}'.format(mention.sentiment.score))
            print(u'  Type : {}'.format(mention.type))
        print(u'Salience: {}'.format(entity.salience))
        print(u'Sentiment: {}\n'.format(entity.sentiment))


# [START def_classify_text]
def classify_text(text):
    """Classifies content categories of the provided text."""
    client = language.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    document = types.Document(
        content=text.encode('utf-8'),
        type=enums.Document.Type.PLAIN_TEXT)

    categories = client.classify_text(document).categories

    for category in categories:
        print(u'=' * 20)
        print(u'{:<16}: {}'.format('name', category.name))
        print(u'{:<16}: {}'.format('confidence', category.confidence))
# [END def_classify_text]


# [START def_classify_file]
def classify_file(gcs_uri):
    """Classifies content categories of the text in a Google Cloud Storage
    file.
    """
    client = language.LanguageServiceClient()

    document = types.Document(
        gcs_content_uri=gcs_uri,
        type=enums.Document.Type.PLAIN_TEXT)

    categories = client.classify_text(document).categories

    for category in categories:
        print(u'=' * 20)
        print(u'{:<16}: {}'.format('name', category.name))
        print(u'{:<16}: {}'.format('confidence', category.confidence))
# [END def_classify_file]


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')

    classify_text_parser = subparsers.add_parser(
        'classify-text', help=classify_text.__doc__)
    classify_text_parser.add_argument('text')

    classify_text_parser = subparsers.add_parser(
        'classify-file', help=classify_file.__doc__)
    classify_text_parser.add_argument('gcs_uri')

    sentiment_entities_text_parser = subparsers.add_parser(
        'sentiment-entities-text', help=entity_sentiment_text.__doc__)
    sentiment_entities_text_parser.add_argument('text')
    #added by mona
    sentiment_entities_text_parser.add_argument('line_number')

    sentiment_entities_file_parser = subparsers.add_parser(
        'sentiment-entities-file', help=entity_sentiment_file.__doc__)
    sentiment_entities_file_parser.add_argument('gcs_uri')

    sentiment_text_parser = subparsers.add_parser(
        'sentiment-text', help=sentiment_text.__doc__)
    sentiment_text_parser.add_argument('text')

    sentiment_file_parser = subparsers.add_parser(
        'sentiment-file', help=sentiment_file.__doc__)
    sentiment_file_parser.add_argument('gcs_uri')

    entities_text_parser = subparsers.add_parser(
        'entities-text', help=entities_text.__doc__)
    entities_text_parser.add_argument('text')

    entities_file_parser = subparsers.add_parser(
        'entities-file', help=entities_file.__doc__)
    entities_file_parser.add_argument('gcs_uri')

    syntax_text_parser = subparsers.add_parser(
        'syntax-text', help=syntax_text.__doc__)
    syntax_text_parser.add_argument('text')

    syntax_file_parser = subparsers.add_parser(
        'syntax-file', help=syntax_file.__doc__)
    syntax_file_parser.add_argument('gcs_uri')

    args = parser.parse_args()

    if args.command == 'sentiment-text':
        sentiment_text(args.text)
    elif args.command == 'sentiment-file':
        sentiment_file(args.gcs_uri)
    elif args.command == 'entities-text':
        entities_text(args.text)
    elif args.command == 'entities-file':
        entities_file(args.gcs_uri)
    elif args.command == 'syntax-text':
        syntax_text(args.text)
    elif args.command == 'syntax-file':
        syntax_file(args.gcs_uri)
    elif args.command == 'sentiment-entities-text':
        entity_sentiment_text(args.text, args.line_number)
    elif args.command == 'sentiment-entities-file':
        entity_sentiment_file(args.gcs_uri)
    elif args.command == 'classify-text':
        classify_text(args.text)
    elif args.command == 'classify-file':
        classify_file(args.gcs_uri)

Here's the script I use to run the code:

#!/bin/bash

n=1

while read -u 3 -r line; do
  echo $n "${line::30}"
  python entity_sentiment.py sentiment-entities-text "$line" $n
  ((n++))
done 3< 10tweets.txt

and then

bash -x runjob.sh

Also, 10tweets.txt is:

$ cat 10tweets.txt 
Trump on the other hand goes all in on water boarding AND some. #GOPDebate
RT @wpjenna Donald Trump promises that he will not touch the 2nd amendment -- "unless we're going to make it stronger."
Trump 23%, Rubio 19%, Kasich & Bush 14%, Christie 10%, Cruz 9% #NHPrimary
@realDonaldTrump Thank you for saying you won't use vulger language anymore. Talk about Sanders & Clinton. Take Cruz as VP. Mexican votes!!!
RT @SurfPHX Mr. Trump @realDonaldTrump tweeted 25 minutes ago. You all do realize, that our future President hardly sleeps. He's a Fighter and a Worker!
go, Bernie #DemDebate
Sanders calls out Clinton on taking Foreign Policy advice from Warmonger Henry Kissinger some_url via @YouTube
Cruz, Rubio, and the Moral Bankruptcy of Progressive Identity Politics some_url via @NRO
RT @scarylawyerguy "Who does Bernie Sanders listen to on foreign policy." - A question Hillary had to raise b/c the media will not. #DemDebate
Why Did U of California Fire Tenured Riverside Professor? / Ted Cruz and Higher Ed -- ... - some_url

If I just print the result instead, it shows the magnitude and sentiment as below:

$ cat 10.json 
Mentions: 
Name: "Professor"
  Begin Offset : 47
  Content : Professor
  Magnitude : 0.0
  Sentiment : 0.0
  Type : 2
Salience: 0.47092151641845703
Sentiment: 

Mentions: 
Name: "Did U of California Fire Tenured Riverside"
  Begin Offset : 4
  Content : Did U of California Fire Tenured Riverside
  Magnitude : 0.0
  Sentiment : 0.0
  Type : 1
Salience: 0.2889040410518646
Sentiment: 

Mentions: 
Name: "Ted Cruz"
  Begin Offset : 60
  Content : Ted Cruz
  Magnitude : 0.0
  Sentiment : 0.0
  Type : 1
Salience: 0.1294257491827011
Sentiment: 

Mentions: 
Name: "some_url"
  Begin Offset : 92
  Content : some_url
  Magnitude : 0.0
  Sentiment : 0.0
  Type : 1
Salience: 0.0676858201622963
Sentiment: 

Mentions: 
Name: "Higher Ed"
  Begin Offset : 73
  Content : Higher Ed
  Magnitude : 0.0
  Sentiment : 0.0
  Type : 1
Salience: 0.043062858283519745
Sentiment: 

Basically, the sentiment field in the JSON is empty, hence there is no way to extract the score and magnitude from it the way I did previously using print:

print(u'  Magnitude : {}'.format(mention.sentiment.magnitude))
print(u'  Sentiment : {}'.format(mention.sentiment.score))

Solution

  • When a numeric field's value is zero, it gets no key in the JSON at all. This is standard proto3 JSON behavior rather than a bug in the client; https://github.com/gogo/protobuf/issues/218 discusses the same thing and suggests there is no fix for it yet. But you can rely on the rule in reverse: if one of these fields is missing, its value is zero (see the sketch after the raw output below for reading the files back this way). Here's an example in which some of the score/magnitude values are zero, and hence absent from the JSON file, while the rest are present:

    {
      "entities": [
        {
          "name": "RT @scarylawyerguy",
          "type": "OTHER",
          "salience": 0.4150770902633667,
          "mentions": [
            {
              "text": {
                "content": "RT @scarylawyerguy"
              },
              "type": "PROPER",
              "sentiment": {}
            }
          ],
          "sentiment": {}
        },
        {
          "name": "foreign policy",
          "type": "OTHER",
          "salience": 0.19249163568019867,
          "mentions": [
            {
              "text": {
                "content": "foreign policy",
                "begin_offset": 57
              },
              "type": "COMMON",
              "sentiment": {}
            }
          ],
          "sentiment": {}
        },
        {
          "name": "Bernie Sanders",
          "type": "PERSON",
          "metadata": {
            "wikipedia_url": "https://en.wikipedia.org/wiki/Bernie_Sanders",
            "mid": "/m/01_gbv"
          },
          "salience": 0.13153041899204254,
          "mentions": [
            {
              "text": {
                "content": "Bernie Sanders",
                "begin_offset": 29
              },
              "type": "PROPER",
              "sentiment": {}
            }
          ],
          "sentiment": {}
        },
        {
          "name": "question",
          "type": "OTHER",
          "salience": 0.08613643795251846,
          "mentions": [
            {
              "text": {
                "content": "question",
                "begin_offset": 78
              },
              "type": "COMMON",
              "sentiment": {
                "magnitude": 0.10000000149011612,
                "score": -0.10000000149011612
              }
            }
          ],
          "sentiment": {
            "magnitude": 0.10000000149011612,
            "score": -0.10000000149011612
          }
        },
        {
          "name": "media",
          "type": "OTHER",
          "salience": 0.0647100880742073,
          "mentions": [
            {
              "text": {
                "content": "media",
                "begin_offset": 116
              },
              "type": "COMMON",
              "sentiment": {}
            }
          ],
          "sentiment": {}
        },
        {
          "name": "Hillary",
          "type": "PERSON",
          "metadata": {
            "wikipedia_url": "https://en.wikipedia.org/wiki/Hillary_Clinton",
            "mid": "/m/0d06m5"
          },
          "salience": 0.054773446172475815,
          "mentions": [
            {
              "text": {
                "content": "Hillary",
                "begin_offset": 87
              },
              "type": "PROPER",
              "sentiment": {}
            }
          ],
          "sentiment": {}
        },
        {
          "name": "b/c",
          "type": "OTHER",
          "salience": 0.028641967102885246,
          "mentions": [
            {
              "text": {
                "content": "b/c",
                "begin_offset": 108
              },
              "type": "COMMON",
              "sentiment": {}
            }
          ],
          "sentiment": {}
        },
        {
          "name": "DemDebate",
          "type": "OTHER",
          "salience": 0.026638930663466454,
          "mentions": [
            {
              "text": {
                "content": "DemDebate",
                "begin_offset": 133
              },
              "type": "PROPER",
              "sentiment": {}
            }
          ],
          "sentiment": {}
        }
      ],
      "language": "en"
    }
    

    and the raw result for that is:

    <class 'google.cloud.language_v1.types.AnalyzeEntitySentimentResponse'>
    entities {
      name: "RT @scarylawyerguy"
      type: OTHER
      salience: 0.4150770902633667
      mentions {
        text {
          content: "RT @scarylawyerguy"
        }
        type: PROPER
        sentiment {
        }
      }
      sentiment {
      }
    }
    entities {
      name: "foreign policy"
      type: OTHER
      salience: 0.19249163568019867
      mentions {
        text {
          content: "foreign policy"
          begin_offset: 57
        }
        type: COMMON
        sentiment {
        }
      }
      sentiment {
      }
    }
    entities {
      name: "Bernie Sanders"
      type: PERSON
      metadata {
        key: "mid"
        value: "/m/01_gbv"
      }
      metadata {
        key: "wikipedia_url"
        value: "https://en.wikipedia.org/wiki/Bernie_Sanders"
      }
      salience: 0.13153041899204254
      mentions {
        text {
          content: "Bernie Sanders"
          begin_offset: 29
        }
        type: PROPER
        sentiment {
        }
      }
      sentiment {
      }
    }
    entities {
      name: "question"
      type: OTHER
      salience: 0.08613643795251846
      mentions {
        text {
          content: "question"
          begin_offset: 78
        }
        type: COMMON
        sentiment {
          magnitude: 0.10000000149011612
          score: -0.10000000149011612
        }
      }
      sentiment {
        magnitude: 0.10000000149011612
        score: -0.10000000149011612
      }
    }
    entities {
      name: "media"
      type: OTHER
      salience: 0.0647100880742073
      mentions {
        text {
          content: "media"
          begin_offset: 116
        }
        type: COMMON
        sentiment {
        }
      }
      sentiment {
      }
    }
    entities {
      name: "Hillary"
      type: PERSON
      metadata {
        key: "mid"
        value: "/m/0d06m5"
      }
      metadata {
        key: "wikipedia_url"
        value: "https://en.wikipedia.org/wiki/Hillary_Clinton"
      }
      salience: 0.054773446172475815
      mentions {
        text {
          content: "Hillary"
          begin_offset: 87
        }
        type: PROPER
        sentiment {
        }
      }
      sentiment {
      }
    }
    entities {
      name: "b/c"
      type: OTHER
      salience: 0.028641967102885246
      mentions {
        text {
          content: "b/c"
          begin_offset: 108
        }
        type: COMMON
        sentiment {
        }
      }
      sentiment {
      }
    }
    entities {
      name: "DemDebate"
      type: OTHER
      salience: 0.026638930663466454
      mentions {
        text {
          content: "DemDebate"
          begin_offset: 133
        }
        type: PROPER
        sentiment {
        }
      }
      sentiment {
      }
    }
    language: "en"
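
    Since a missing score or magnitude always means 0.0, you can fill the
    defaults back in when loading the saved files. Here's a minimal sketch
    of that (load_entity_sentiment is a hypothetical helper; the field
    names match the preserving_proto_field_name=True output shown above):

    import json

    def load_entity_sentiment(path):
        """Loads a saved result file, treating missing sentiment
        fields as their proto3 default of 0.0."""
        with open(path) as f:
            data = json.load(f)
        for entity in data.get('entities', []):
            s = entity.get('sentiment', {})
            print(u'{}: score={}, magnitude={}'.format(
                entity['name'], s.get('score', 0.0), s.get('magnitude', 0.0)))
            for mention in entity.get('mentions', []):
                ms = mention.get('sentiment', {})
                print(u'  mention "{}": score={}, magnitude={}'.format(
                    mention['text']['content'],
                    ms.get('score', 0.0), ms.get('magnitude', 0.0)))

    load_entity_sentiment('10.json')

    Depending on your protobuf version, json_format.MessageToJson may also
    accept including_default_value_fields=True, which tells it to emit
    fields even when they sit at their default value; that would keep the
    zero scores and magnitudes in the JSON in the first place.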