Tags: python, scikit-learn, nlp, nltk, pos-tagger

Building my own classifier-based POS tagger using NLTK's SklearnClassifier and ClassifierBasedPOSTagger


I'm trying to build my own classifier-based POS tagger using SklearnClassifier and ClassifierBasedPOSTagger. The code I have tried is given below.

# NOTE(review): `nltk` is referenced below (nltk.download, nltk.word_tokenize)
# but this snippet never runs `import nltk`; presumably it was imported in an
# earlier notebook cell -- confirm before running this on its own.
from nltk.corpus import treebank
nltk.download('treebank')

# Split the tagged Treebank sentences: first 3500 for training, the rest for testing.
data = treebank.tagged_sents()
train_data = data[:3500]
test_data = data[3500:]
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
from nltk.tag.sequential import ClassifierBasedPOSTagger

bnb = SklearnClassifier(BernoulliNB())
bnb_tagger = ClassifierBasedPOSTagger(train=train_data,
                                      classifier_builder=bnb.train)
# ^ The constructor call above is the statement the traceback points at:
#   training starts inside __init__ and fails in DictVectorizer with the
#   TypeError about a 'NoneType' value.

# evaluate tagger on test data and sample sentence
print(bnb_tagger.evaluate(test_data))

# see results on our previously defined sentence
# NOTE(review): `sentence` is not defined anywhere in this snippet; it is
# expected to exist already (see the comment above) -- confirm.
print(bnb_tagger.tag(nltk.word_tokenize(sentence)))

This code is yielding the following error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
C:\Users\ABDULL~1.IMR\AppData\Local\Temp/ipykernel_6580/266992580.py in <module>
      4 
      5 bnb = SklearnClassifier(BernoulliNB())
----> 6 bnb_tagger = ClassifierBasedPOSTagger(train=train_data,
      7                                       classifier_builder=bnb.train)
      8 

~\Miniconda3\envs\nlp_course\lib\site-packages\nltk\tag\sequential.py in __init__(self, feature_detector, train, classifier_builder, classifier, backoff, cutoff_prob, verbose)
    637 
    638         if train:
--> 639             self._train(train, classifier_builder, verbose)
    640 
    641     def choose_tag(self, tokens, index, history):

~\Miniconda3\envs\nlp_course\lib\site-packages\nltk\tag\sequential.py in _train(self, tagged_corpus, classifier_builder, verbose)
    673         if verbose:
    674             print("Training classifier ({} instances)".format(len(classifier_corpus)))
--> 675         self._classifier = classifier_builder(classifier_corpus)
    676 
    677     def __repr__(self):

~\Miniconda3\envs\nlp_course\lib\site-packages\nltk\classify\scikitlearn.py in train(self, labeled_featuresets)
    110 
    111         X, y = list(zip(*labeled_featuresets))
--> 112         X = self._vectorizer.fit_transform(X)
    113         y = self._encoder.fit_transform(y)
    114         self._clf.fit(X, y)

~\Miniconda3\envs\nlp_course\lib\site-packages\sklearn\feature_extraction\_dict_vectorizer.py in fit_transform(self, X, y)
    288             Feature vectors; always 2-d.
    289         
--> 290         return self._transform(X, fitting=True)
    291 
    292     def inverse_transform(self, X, dict_type=dict):

~\Miniconda3\envs\nlp_course\lib\site-packages\sklearn\feature_extraction\_dict_vectorizer.py in _transform(self, X, fitting)
    233                     if feature_name in vocab:
    234                         indices.append(vocab[feature_name])
--> 235                         values.append(self.dtype(v))
    236 
    237             indptr.append(len(indices))

TypeError: float() argument must be a string or a number, not 'NoneType'

How can I do this correctly?


Solution

  • According to a comment on this issue, the error is a consequence of a bug in scikit-learn: the _transform method of DictVectorizer (in sklearn/feature_extraction/_dict_vectorizer.py) fails when the input argument X contains features that map to None. As Tom Aarsen suggests, you can work around it by subclassing the tagger so that every feature value is converted to a string, as in the following example:

    import nltk
    from nltk.corpus import treebank
    
    from nltk.classify import SklearnClassifier
    from sklearn.naive_bayes import BernoulliNB
    from nltk.tag.sequential import ClassifierBasedPOSTagger
    
    # Make sure the Penn Treebank sample corpus is available locally.
    nltk.download('treebank')
    
    data = treebank.tagged_sents()
    # NOTE: only the first three sentences are used for training here, which is
    # why the run reports so few training instances and a very low score
    # (presumably chosen to keep the example quick -- use a much larger slice,
    # e.g. data[:3500], for a realistic tagger).
    train_data = data[:3]
    test_data = data[3:]
    
    class CustomClassifierBasedPOSTagger(ClassifierBasedPOSTagger):
        """ClassifierBasedPOSTagger variant whose feature values are always strings."""
    
        def feature_detector(self, tokens, index, history):
            """Return the parent's feature dict with every value passed through str().

            The inherited detector can emit None for some features, which
            scikit-learn's DictVectorizer cannot convert to a number; turning
            each value into a string (None becomes 'None') avoids that error.
            """
            inherited = super().feature_detector(tokens, index, history)
            stringified = {}
            for name, raw_value in inherited.items():
                stringified[name] = str(raw_value)
            return stringified
    
    # Wrap scikit-learn's BernoulliNB in NLTK's adapter; its bound `train` method
    # is handed to the tagger as the function that builds the classifier from the
    # extracted training instances.
    bnb = SklearnClassifier(BernoulliNB())
    bnb_tagger = CustomClassifierBasedPOSTagger(train=train_data,
                                                classifier_builder=bnb.train,
                                                verbose=True)  # verbose: print training progress messages
    
    sentence = "This is a sample sentence which I just made for fun."
    # Print the tagger's score on the held-out sentences.
    # NOTE(review): newer NLTK releases appear to deprecate evaluate() in favour
    # of accuracy() -- confirm against the installed version.
    print(bnb_tagger.evaluate(test_data))
    
    # Tokenize the sample sentence and print the (token, tag) pairs predicted for it.
    print(bnb_tagger.tag(nltk.word_tokenize(sentence)))
    

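    The output will look like this (the score is very low and every token is tagged NNP because this example trains on only three sentences):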

    [nltk_data] Downloading package treebank to C:\Users\Tom/nltk_data...
    [nltk_data]   Package treebank is already up-to-date!
    Constructing training corpus for classifier.
    Training classifier (58 instances)
    0.09338289371682999
    [('This', 'NNP'), ('is', 'NNP'), ('a', 'NNP'), ('sample', 'NNP'), ('sentence', 'NNP'), ('which', 'NNP'), ('I', 'NNP'), ('just', 'NNP'), ('made', 'NNP'), ('for', 'NNP'), ('fun', 'NNP'), ('.', 'NNP')]