Here is my HMM model class:
class HiddenMarkovModel:
def __init__(self):
pass
def fit(self, train_tokens_tags_list):
"""
train_tokens_tags_list: array of sentences of pairs word-tag (for train)
"""
tags = [tag for sent in train_tokens_tags_list
for (word, tag) in sent]
words = [word for sent in train_tokens_tags_list
for (word, tag) in sent]
tag_num = pd.Series(data = nltk.FreqDist(tags)).sort_index()
word_num = pd.Series(data = nltk.FreqDist(words)).sort_values(ascending=False)
self.tags = tag_num.index
self.words = word_num.index
A = pd.DataFrame({'{}'.format(tag) : [0] * len(tag_num) for tag in tag_num.index}, index=tag_num.index)
B = pd.DataFrame({'{}'.format(tag) : [0] * len(word_num) for tag in tag_num.index}, index=word_num.index)
# Compure matrixes A and B from count of words and tags
# sent - sentence
# sent[i][0] - i word in this sentence, sent[i][1] - i tag in this sentence
for sent in train_tokens_tags_list:
for i in range(len(sent)):
B.loc[sent[i][0], sent[i][1]] += 1 # current i-pair word-tag
if len(sent) - 1 != i: # for last tag there is no next tag
A.loc[sent[i][1], sent[i + 1][1]] += 1 # pair tag-tag
# now to probabilities
# normalize along the row, i.e along all possible next tags
A = A.divide(A.sum(axis=1), axis=0)
# normalize along column, i.e along all possible current words
B = B / np.sum(B, axis=0)
self.A = A
self.B = B
return self
def predict(self, test_tokens_list):
"""
test_tokens_list : array of sentences of pairs word-tag (for test)
"""
predict_tags = OrderedDict({i : np.array([]) for i in range(len(test_tokens_list))})
for i_sent in range(len(test_tokens_list)):
current_sent = test_tokens_list[i_sent] # current sentence
len_sent = len(current_sent) # lenght of sentence
q = np.zeros(shape=(len_sent + 1, len(self.tags)))
q[0] = 1 # zero state (equal distribution for all s)
back_point = np.zeros(shape=(len_sent + 1, len(self.tags))) # # argmax
for t in range(len_sent):
# if we haven't met this word during training,
# we'll use the most popular word with most popular tag instead
if current_sent[t] not in self.words:
# print('this word not in dictionary', current_sent[t])
tagdict = findtags('NOUN', brown_tagged_words)
current_sent[t] = tagdict['NOUN'][0][0]
# using max we choose next tag
for i_s in range(len(self.tags)):
# i_s - index for tag
s = self.tags[i_s] # tag
# formula (1)
probability = np.max(q[t][i_s] * self.A.loc[:, s] * self.B.loc[current_sent[t], s])
q[t + 1][i_s] = np.max(probability)
# argmax formula(1)
# argmax for recreating the sequence of tags
back_point[t + 1][i_s] = (probability).reset_index()[s].idxmax() # index
back_point = back_point.astype('int')
# saving tags and changing their order for real one
back_tag = deque()
current_tag = np.argmax(q[len_sent])
for t in range(len_sent, 0, -1):
back_tag.appendleft(self.tags[current_tag])
current_tag = back_point[t, current_tag]
predict_tags[i_sent] = np.array(back_tag)
return predict_tags
It was actually almost completely written before me. I was to fill in some gaps (above all in the Viterbi algorithm where we compute q and back_point matrices). But I think I did something wrong as my model always predicts something like that:
OrderedDict([(0, array(['.', '.', '.'], dtype='<U1'))])
In the example above I gave it this sentence: [['he', 'can', 'stay']]
I've checked : all these words are in the train data.
I've printed out matrices A and B. Here they are:
A:
B:
This shows that the first part of model is working fine.
These are folmulas I had for help:
This is the data:
brown_tagged_sents = brown.tagged_sents(tagset="universal", categories='humor')
my_brown_tagged_sents = []
for sent in brown_tagged_sents:
my_brown_tagged_sents.append(list(map(lambda x: (x[0].lower(), x[1]), sent)))
my_brown_tagged_sents = np.array(my_brown_tagged_sents)
from sklearn.model_selection import train_test_split
train_sents, test_sents = train_test_split(my_brown_tagged_sents, random_state=0, test_size=0.1)
What could be wrong with my model?
Probability should look like that:
probability = q[t, :] * self.A.loc[:, s] * self.B.loc[current_sent[t], s]
instead of this:
probability = q[t][i_s] * self.A.loc[:, s] * self.B.loc[current_sent[t], s]