python · python-3.x · nltk · text-analysis

Create sentence (row) to POS tags counts (column) matrix from a dataframe


I am trying to build a matrix where the first row will be a part of speech, first column a sentence. values in the matrix should show the number of such POS in a sentence.

So I am creating POS tags in this way:

# Read the CSV by path: passing an open() handle leaks it, because pandas
# only closes file objects it opened itself.
data = pd.read_csv('myfile.csv', sep=';')

target = data["label"]   # keep the labels aside...
del data["label"]        # ...and drop them from the feature frame

data.sentence = data.sentence.str.lower()  # normalise case before tagging

# Tokenize and POS-tag each sentence; each element of line_tagged is a
# (token, POS-tag) pair.
for line in data.sentence:
    line_tagged = nltk.pos_tag(nltk.word_tokenize(line))
    print(line_tagged)

The output is:

[('together', 'RB'), ('with', 'IN'), ('the', 'DT'), ('6th', 'CD'), ('battalion', 'NN'), ('of', 'IN'), ('the', 'DT')]

How can I create a matrix which I have described above from such output?

UPDATE: The desired output is

                   NN  VB    IN    VBZ    DT
 I was there       1   1     1      0     0
 He came there     0   0     1      1     1

myfile.csv:

"A child who is exclusively or predominantly oral (using speech for communication) can experience social isolation from his or her hearing peers, particularly if no one takes the time to explicitly teach them social skills that other children acquire independently by virtue of having normal hearing.";"certain"
"Preliminary Discourse to the Encyclopedia of Diderot";"certain"
"d'Alembert claims that it would be ignorant to perceive that everything could be known about a particular subject.";"certain"
"However, as the overemphasis on parental influence of psychodynamics theory has been strongly criticized in the previous century, modern psychologists adopted interracial contact as a more important determinant than childhood experience on shaping people’s prejudice traits (Stephan & Rosenfield, 1978).";"uncertain"
"this can also be summarized as a distinguish behaviour on the peronnel level";"uncertain"

Solution

  • In Long:

    First let's add some headers to your csv so that it's more human-readable when accessing the columns:

    >>> import pandas as pd
    >>> df = pd.read_csv('myfile.csv', delimiter=';')
    >>> df.columns = ['sent', 'tag']
    >>> df['sent']
    0    Preliminary Discourse to the Encyclopedia of D...
    1    d'Alembert claims that it would be ignorant to...
    2    However, as the overemphasis on parental influ...
    3    this can also be summarized as a distinguish b...
    Name: sent, dtype: object
    >>> df['tag']
    0      certain
    1      certain
    2    uncertain
    3    uncertain
    

    Now let's create a function tok_and_tag that does word_tokenize and pos_tag in a chained manner:

    >>> from nltk import word_tokenize, pos_tag
    >>> from functools import partial
    >>> tok_and_tag = lambda x: pos_tag(word_tokenize(x))
    >>> df['sent'][0]
    'Preliminary Discourse to the Encyclopedia of Diderot'
    >>> tok_and_tag(df['sent'][0])
    [('Preliminary', 'JJ'), ('Discourse', 'NNP'), ('to', 'TO'), ('the', 'DT'), ('Encyclopedia', 'NNP'), ('of', 'IN'), ('Diderot', 'NNP')]
    

    Then, we can use df.apply to tokenize and tag the sentence column of the dataframe:

    >>> df['sent'].apply(tok_and_tag)
    0    [(Preliminary, JJ), (Discourse, NNP), (to, TO)...
    1    [(d'Alembert, NN), (claims, NNS), (that, IN), ...
    2    [(However, RB), (,, ,), (as, IN), (the, DT), (...
    3    [(this, DT), (can, MD), (also, RB), (be, VB), ...
    Name: sent, dtype: object
    

    If you want the sentences to be lowercased:

    >>> df['sent'].apply(str.lower)
    0    preliminary discourse to the encyclopedia of d...
    1    d'alembert claims that it would be ignorant to...
    2    however, as the overemphasis on parental influ...
    3    this can also be summarized as a distinguish b...
    Name: sent, dtype: object
    
    >>> df['lower_sent'] = df['sent'].apply(str.lower)
    
    >>> df['lower_sent'].apply(tok_and_tag)
    0    [(preliminary, JJ), (discourse, NN), (to, TO),...
    1    [(d'alembert, NN), (claims, NNS), (that, IN), ...
    2    [(however, RB), (,, ,), (as, IN), (the, DT), (...
    3    [(this, DT), (can, MD), (also, RB), (be, VB), ...
    Name: lower_sent, dtype: object
    

    Additionally, we need some way to get the POS vocabulary; we can use collections.Counter and itertools.chain to flatten the list of lists:

    >>> df['lower_sent']
    0    preliminary discourse to the encyclopedia of d...
    1    d'alembert claims that it would be ignorant to...
    2    however, as the overemphasis on parental influ...
    3    this can also be summarized as a distinguish b...
    Name: lower_sent, dtype: object
    
    >>> df['lower_sent'].apply(tok_and_tag)
    0    [(preliminary, JJ), (discourse, NN), (to, TO),...
    1    [(d'alembert, NN), (claims, NNS), (that, IN), ...
    2    [(however, RB), (,, ,), (as, IN), (the, DT), (...
    3    [(this, DT), (can, MD), (also, RB), (be, VB), ...
    Name: lower_sent, dtype: object
    
    >>> df['tagged_sent'] = df['lower_sent'].apply(tok_and_tag)
    
    >>> tokens, tags = zip(*chain(*df['tagged_sent'].tolist()))
    
    >>> tags
    ('JJ', 'NN', 'TO', 'DT', 'NN', 'IN', 'NN', 'NN', 'NNS', 'IN', 'PRP', 'MD', 'VB', 'JJ', 'TO', 'VB', 'IN', 'NN', 'MD', 'VB', 'VBN', 'IN', 'DT', 'JJ', 'NN', '.', 'RB', ',', 'IN', 'DT', 'NN', 'IN', 'JJ', 'NN', 'IN', 'NNS', 'NN', 'VBZ', 'VBN', 'RB', 'VBN', 'IN', 'DT', 'JJ', 'NN', ',', 'JJ', 'NNS', 'VBD', 'JJ', 'NN', 'IN', 'DT', 'RBR', 'JJ', 'NN', 'IN', 'NN', 'NN', 'IN', 'VBG', 'JJ', 'NN', 'NNS', '(', 'NN', 'CC', 'NN', ',', 'CD', ')', '.', 'DT', 'MD', 'RB', 'VB', 'VBN', 'IN', 'DT', 'JJ', 'NN', 'IN', 'DT', 'NNS', 'NN')
    
    >>> set(tags)
    {'CC', 'VB', ')', 'NNS', ',', 'JJ', 'VBZ', 'DT', 'NN', 'PRP', 'RBR', 'TO', 'VBD', '(', 'VBN', '.', 'MD', 'IN', 'RB', 'VBG', 'CD'}
    >>> possible_tags = sorted(set(tags))
    >>> possible_tags
    ['(', ')', ',', '.', 'CC', 'CD', 'DT', 'IN', 'JJ', 'MD', 'NN', 'NNS', 'PRP', 'RB', 'RBR', 'TO', 'VB', 'VBD', 'VBG', 'VBN', 'VBZ']
    
    >>> possible_tags_counter = Counter({p:0 for p in possible_tags})
    >>> possible_tags_counter
    Counter({'NNS': 0, 'VBZ': 0, 'DT': 0, '(': 0, 'JJ': 0, 'VBD': 0, ')': 0, 'RB': 0, 'VBG': 0, 'RBR': 0, 'VB': 0, 'IN': 0, 'CC': 0, ',': 0, 'PRP': 0, 'CD': 0, 'VBN': 0, '.': 0, 'MD': 0, 'NN': 0, 'TO': 0})
    

    To iterate through each tagged sentence and get the counts of POS:

    >>> df['tagged_sent'].apply(lambda x: Counter(list(zip(*x))[1]))
    0    {'NN': 3, 'IN': 1, 'TO': 1, 'DT': 1, 'JJ': 1}
    1    {'NN': 3, 'VB': 3, 'PRP': 1, 'TO': 1, 'DT': 1,...
    2    {')': 1, 'JJ': 6, 'NN': 11, 'CC': 1, 'NNS': 3,...
    3    {'DT': 3, 'VB': 1, 'NN': 2, 'VBN': 1, 'NNS': 1...
    Name: tagged_sent, dtype: object
    
    >>> df['pos_counts'] = df['tagged_sent'].apply(lambda x: Counter(list(zip(*x))[1]))
    
    >>> df['pos_counts']
    0    {'NN': 3, 'IN': 1, 'TO': 1, 'DT': 1, 'JJ': 1}
    1    {'NN': 3, 'VB': 3, 'PRP': 1, 'TO': 1, 'DT': 1,...
    2    {')': 1, 'JJ': 6, 'NN': 11, 'CC': 1, 'NNS': 3,...
    3    {'DT': 3, 'VB': 1, 'NN': 2, 'VBN': 1, 'NNS': 1...
    Name: pos_counts, dtype: object
    
    # Now we can add in the POS tags that don't appear in the sentence with 0 counts:
    
    >>> def add_pos_with_zero_counts(counter, keys_to_add):
    ...     for k in keys_to_add:
    ...         counter[k] = counter.get(k, 0)
    ...     return counter
    ... 
    >>> df['pos_counts'].apply(lambda x: add_pos_with_zero_counts(x, possible_tags))
    0    {'VB': 0, 'IN': 1, 'PRP': 0, 'DT': 1, 'CC': 0,...
    1    {'VB': 3, ')': 0, 'DT': 1, 'CC': 0, 'RB': 0, '...
    2    {'VB': 0, ')': 1, 'JJ': 6, 'NN': 11, 'CC': 1, ...
    3    {'VB': 1, 'IN': 2, 'PRP': 0, 'NN': 2, 'CC': 0,...
    Name: pos_counts, dtype: object
    
    >>> df['pos_counts_with_zero'] = df['pos_counts'].apply(lambda x: add_pos_with_zero_counts(x, possible_tags))
    

    Now flatten the values into the list:

    >>> df['pos_counts_with_zero'].apply(lambda x: [count for tag, count in sorted(x.most_common())])
    0    [0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 3, 0, 0, 0, 0, ...
    1    [0, 0, 0, 1, 0, 0, 1, 3, 2, 2, 3, 1, 1, 0, 0, ...
    2    [1, 1, 3, 1, 1, 1, 3, 7, 6, 0, 11, 3, 0, 2, 1,...
    3    [0, 0, 0, 0, 0, 0, 3, 2, 1, 1, 2, 1, 0, 1, 0, ...
    Name: pos_counts_with_zero, dtype: object
    
    >>> df['sent_vector'] = df['pos_counts_with_zero'].apply(lambda x: [count for tag, count in sorted(x.most_common())])
    

    Now you need to create a new matrix to store the BoW:

    >>> df2 = pd.DataFrame(df['sent_vector'].tolist())
    >>> df2.columns = sorted(possible_tags)
    

    And voila:

    >>> df2
       (  )  ,  .  CC  CD  DT  IN  JJ  MD ...   NNS  PRP  RB  RBR  TO  VB  VBD  \
    0  0  0  0  0   0   0   1   1   1   0 ...     0    0   0    0   1   0    0   
    1  0  0  0  1   0   0   1   3   2   2 ...     1    1   0    0   1   3    0   
    2  1  1  3  1   1   1   3   7   6   0 ...     3    0   2    1   0   0    1   
    3  0  0  0  0   0   0   3   2   1   1 ...     1    0   1    0   0   1    0   
    
       VBG  VBN  VBZ  
    0    0    0    0  
    1    0    1    0  
    2    1    2    1  
    3    0    1    0  
    
    [4 rows x 21 columns]
    

    In short:

    from collections import Counter
    from itertools import chain
    
    import pandas as pd
    
    from nltk import word_tokenize, pos_tag
    
    df = pd.read_csv('myfile.csv', delimiter=';')
    df.columns = ['sent', 'tag']
    
    tok_and_tag = lambda x: pos_tag(word_tokenize(x))
    
    df['lower_sent'] = df['sent'].apply(str.lower)
    df['tagged_sent'] = df['lower_sent'].apply(tok_and_tag)
    
    possible_tags = sorted(set(list(zip(*chain(*df['tagged_sent'])))[1]))
    
    def add_pos_with_zero_counts(counter, keys_to_add):
        for k in keys_to_add:
            counter[k] = counter.get(k, 0)
        return counter
    
    
    # Detailed steps.
    df['pos_counts'] = df['tagged_sent'].apply(lambda x: Counter(list(zip(*x))[1]))
    df['pos_counts_with_zero'] = df['pos_counts'].apply(lambda x: add_pos_with_zero_counts(x, possible_tags))
    df['sent_vector'] = df['pos_counts_with_zero'].apply(lambda x: [count for tag, count in sorted(x.most_common())])
    
    # All in one.
    df['sent_vector'] = df['tagged_sent'].apply(lambda x:
        [count for tag, count in sorted(
            add_pos_with_zero_counts(
                Counter(list(zip(*x))[1]), 
                        possible_tags).most_common()
             )
        ]
    )
    
    df2 = pd.DataFrame(df['sent_vector'].tolist())
    df2.columns = possible_tags