python nlp google-cloud-platform google-cloud-nl

Google Cloud NL entity recognizer grouping words together


When attempting to find the entities in a long input of text, Google Cloud's Natural Language API is grouping words together and then assigning them an incorrect entity type. Here is my program:

def entity_recognizer(nouns):
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/superaitor/Downloads/link"
    text = ""
    for words in nouns:
        text += words + " "
    client = language.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    document = types.Document(
        content=text.encode('utf-8'),
        type=enums.Document.Type.PLAIN_TEXT)

    encoding = enums.EncodingType.UTF32
    if sys.maxunicode == 65535:
        encoding = enums.EncodingType.UTF16

    entity = client.analyze_entities(document, encoding).entities
    entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION',
                   'EVENT', 'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')

    for entity in entity:
        #if entity_type[entity.type] is "PERSON":
        print(entity_type[entity.type])
        print(entity.name)

Here nouns is a list of words. I then turn that list into a string (I've tried multiple ways of doing so, and all give the same result), yet the program spits out output like:

PERSON
liberty secularism etching domain professor lecturer tutor royalty 
government adviser commissioner
OTHER
business view society economy
OTHER
business
OTHER
verge industrialization market system custom shift rationality
OTHER
family kingdom life drunkenness college student appearance income family 
brink poverty life writer variety attitude capitalism age process 
production factory system

Any input on how to fix this?


Solution

  • To analyze entities in a text you can use a sample from the documentation which looks something like this:

    import argparse
    import sys
    
    from google.cloud import language
    from google.cloud.language import enums
    from google.cloud.language import types
    import six
    
    def entities_text(text):
        """Detects entities in the text."""
        client = language.LanguageServiceClient()
    
        if isinstance(text, six.binary_type):
            text = text.decode('utf-8')
    
        # Instantiates a plain text document.
        document = types.Document(
            content=text,
            type=enums.Document.Type.PLAIN_TEXT)
    
        # Detects entities in the document. You can also analyze HTML with:
        #   document.type == enums.Document.Type.HTML
        entities = client.analyze_entities(document).entities
    
        # entity types from enums.Entity.Type
        entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION',
                       'EVENT', 'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')
    
        for entity in entities:
            print('=' * 20)
            print(u'{:<16}: {}'.format('name', entity.name))
            print(u'{:<16}: {}'.format('type', entity_type[entity.type]))
            print(u'{:<16}: {}'.format('metadata', entity.metadata))
            print(u'{:<16}: {}'.format('salience', entity.salience))
            print(u'{:<16}: {}'.format('wikipedia_url',
                  entity.metadata.get('wikipedia_url', '-')))
    
    entities_text("Donald Trump is president of United States of America")
    

    The output of this sample is:

    ====================
    name            : Donald Trump
    type            : PERSON
    metadata        : <google.protobuf.pyext._message.ScalarMapContainer object at 0x7fd9d0125170>
    salience        : 0.9564903974533081
    wikipedia_url   : https://en.wikipedia.org/wiki/Donald_Trump
    ====================
    name            : United States of America
    type            : LOCATION
    metadata        : <google.protobuf.pyext._message.ScalarMapContainer object at 0x7fd9d01252b0>
    salience        : 0.04350961744785309
    wikipedia_url   : https://en.wikipedia.org/wiki/United_States
    

    As you can see in this example, Entity Analysis inspects the given text for known entities (proper nouns such as public figures, landmarks, etc.). It will not provide an entity for every word in the text.
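
    As a rough illustration (a minimal sketch reusing the entities_text helper above; the noun list is taken from your output and is only an example), compare what the API receives when you feed it a bag of space-joined nouns versus a real sentence. With no sentence structure, the API has to guess the entity boundaries itself, which is why adjacent words get grouped together:

    # Minimal sketch, assuming entities_text() from the sample above is defined
    # and credentials are already configured.
    nouns = ["business", "view", "society", "economy"]  # illustrative only

    # Space-joining unrelated nouns gives the API text with no sentence
    # structure, so it must guess where one entity ends and the next begins.
    entities_text(" ".join(nouns))

    # A real sentence gives the API enough context to pick out the known
    # entities ("Donald Trump", "United States of America") cleanly.
    entities_text("Donald Trump is president of United States of America")

    If what you actually need is a label for every word, Entity Analysis is not designed for that: tokens that are not recognized as known entities will typically be merged into a neighbouring mention or ignored, as in the output you posted.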