python python-2.7 nltk naivebayes nltk-trainer

Python 2.x - How to get the result of the NLTK Naive Bayes classification through a trainSet and a testSet


I'm building a text parser to identify the types of crime described in texts. My class was built to load the texts from two CSV files (one file for training and one file for testing). Its methods pre-process the texts quickly, remove the stop words, extract the feature vector, among other things. The code follows below.

import re
import codecs
import csv
import nltk
import sklearn
from sklearn import cross_validation
import pandas as pd


# module-level state shared by the loading and feature-extraction functions
tweets = []  # (token list, label) pairs appended by carregarTextos()
caracteristicas = []  # distinct tokens of the training set; read by extracaoCaracteristicas()
testBase = []  # distinct tokens of the test set, filled by test_set()
testset = []  # (token list, label) pairs appended by test_set()

# Tweet pre-processing
def preProcessamentoText(tweet):
    """Normalize a raw tweet before tokenization.

    The text is lower-cased, URLs become the token ``URL``, ``@user``
    mentions become the token ``AT_USER``, runs of whitespace collapse to a
    single space, the ``#`` is dropped from hashtags, and leading/trailing
    single or double quotes are stripped.

    tweet -- the raw text (a string).

    Returns the normalized string.
    """
    # lower-case first; the placeholder tokens inserted below stay upper-case
    tweet = tweet.lower()

    # replace URLs (www.* or http(s)://*) with the placeholder URL
    # (raw strings keep the regex escapes from being read as string escapes)
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)

    # replace @username with the placeholder AT_USER
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)

    # collapse multiple whitespace characters into one space
    tweet = re.sub(r'[\s]+', ' ', tweet)

    # replace #work with work
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)

    # strip surrounding quote characters (whitespace is left untouched)
    tweet = tweet.strip('\'"')

    return tweet
#end

# list of stopWords
def getStopWords(stopWordListFileName):
    """Build the list of stop words used to filter tweet tokens.

    The list starts with NLTK's Portuguese stop words, then the placeholder
    tokens ``AT_USER`` and ``URL``, then every line of the given file with
    surrounding whitespace removed.

    stopWordListFileName -- path of a UTF-8 text file, one stop word per line.

    Returns a list of stop words. Errors raised while opening or decoding
    the file are propagated to the caller.
    """
    # defensive copy: the appends below must only affect our own list
    stopWords = list(nltk.corpus.stopwords.words('portuguese'))
    stopWords.append('AT_USER')
    stopWords.append('URL')

    # the with-block closes the file even when reading or decoding fails
    with codecs.open(stopWordListFileName, encoding='utf-8') as fp:
        for line in fp:
            stopWords.append(line.strip())

    return stopWords
#end

# Shorten runs of a repeated character to two occurrences. Ex.: leeeeento = leento
def removeRepeticao(s):
    """Return *s* with every run of two or more identical characters cut to exactly two.

    DOTALL makes ``.`` match newlines too, so repeated line breaks are
    shortened the same way as any other character.
    """
    return re.sub(r"(.)\1{1,}", r"\1\1", s, flags=re.DOTALL)
#end

# Feature vector
# Stop-word sets keyed by file name, filled on first use so the NLTK corpus
# and the stop-word file are not re-read for every single tweet.
_STOP_WORDS_CACHE = {}

# a token is kept only if it starts with an ASCII letter and then contains
# nothing but ASCII letters and digits
_TOKEN_PATTERN = re.compile(r"^[a-zA-Z][a-zA-Z0-9]*$")


def _getStopWordSet(stopWordListFileName):
    """Return the words from getStopWords() as a frozenset, loading each file once."""
    if stopWordListFileName not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[stopWordListFileName] = frozenset(
            getStopWords(stopWordListFileName))
    return _STOP_WORDS_CACHE[stopWordListFileName]


def getVetorCaracteristicas(tweet):
    """Turn a pre-processed tweet into its list of feature tokens.

    The text is split on whitespace; each word has repeated letters shortened
    by removeRepeticao() and surrounding punctuation (quotes, ``?``, ``,``,
    ``.``) stripped. A word is dropped when it is a stop word, when it has
    two characters or fewer, or when it does not match the token pattern
    (for example because it starts with a digit).

    tweet -- text as returned by preProcessamentoText().

    Returns the list of kept tokens, lower-cased, in order of appearance
    (duplicates are kept).
    """
    # NOTE(review): the token pattern is ASCII-only, so accented Portuguese
    # words are discarded as well -- confirm this is intended.
    featureVector = []
    stopWords = _getStopWordSet('data/stopwords_pt-BR.txt')

    for w in tweet.split():
        # shorten repeated letters
        w = removeRepeticao(w)

        # strip punctuation marks around the word
        w = w.strip('\'"?,.')

        # skip stop words, very short words and words with an invalid shape
        if w in stopWords or len(w) <= 2 or _TOKEN_PATTERN.search(w) is None:
            continue

        featureVector.append(w.lower())

    return featureVector
#end

#load trainset
def carregarTextos():
    """Load the training CSV into the module-level ``tweets`` and ``caracteristicas``.

    Each row of ``data/baseTreino.csv`` supplies a label (column 0) and a
    text (column 1). The text is pre-processed and tokenized; the pair
    (tokens, label) is appended to ``tweets`` and the tokens are added to
    ``caracteristicas``, which is de-duplicated at the end.

    Returns None; the results are stored in the module-level lists.
    """
    global caracteristicas

    # the with-block guarantees the file is closed once all rows are read
    with open('data/baseTreino.csv', 'rb') as arquivo:
        for row in csv.reader(arquivo, delimiter=',', quotechar='|'):
            # blank or truncated lines carry no label/text pair
            if len(row) < 2:
                continue

            sentimento = row[0]
            tweet = row[1]
            textoProcessado = preProcessamentoText(tweet)
            vetorCaracteristicas = getVetorCaracteristicas(textoProcessado)
            caracteristicas.extend(vetorCaracteristicas)
            tweets.append((vetorCaracteristicas, sentimento))

    # remove duplicate entries
    caracteristicas = list(set(caracteristicas))

#load testSet
def test_set():
    """Load the test CSV into the module-level ``testset`` and ``testBase``.

    Each row of ``data/baseTestes.csv`` supplies a label (column 0) and a
    text (column 1). The text is pre-processed and tokenized; the pair
    (tokens, label) is appended to ``testset`` and the tokens are added to
    ``testBase``, which is de-duplicated at the end.

    Returns None; the results are stored in the module-level lists.
    """
    global testBase

    # the with-block guarantees the file is closed once all rows are read
    with open('data/baseTestes.csv', 'rb') as arquivo:
        for row in csv.reader(arquivo, delimiter=',', quotechar='|'):
            # blank or truncated lines carry no label/text pair
            if len(row) < 2:
                continue

            sentimento = row[0]
            tweet = row[1]
            textoProcessado = preProcessamentoText(tweet)
            vetorCaracteristicas = getVetorCaracteristicas(textoProcessado)
            testBase.extend(vetorCaracteristicas)
            testset.append((vetorCaracteristicas, sentimento))

    # remove duplicate entries
    testBase = list(set(testBase))

#Extraction of characteristics
def extracaoCaracteristicas(tweet):
    """Build the boolean feature dictionary for one tokenized text.

    tweet -- iterable of tokens (as produced by getVetorCaracteristicas()).

    Returns a dict with one ``'contains(<word>)'`` key per entry of the
    module-level ``caracteristicas`` list; the value tells whether that word
    occurs among the given tokens.
    """
    presentes = frozenset(tweet)
    return dict(
        ('contains(%s)' % termo, termo in presentes)
        for termo in caracteristicas
    )

#Method to classify the text according to the feeling
def classificaTexto(tweet, classificador=None):
    """Classify one raw text and print its crime category followed by the text.

    tweet         -- raw text to classify.
    classificador -- trained classifier exposing ``classify(featureset)``;
                     when omitted, the module-level ``NBClassifier`` is used.

    Returns None; the result is printed as ``<category> - <text>``.
    """
    # fall back to the global classifier this function originally relied on
    if classificador is None:
        classificador = NBClassifier

    textoProcessado = preProcessamentoText(tweet)
    result = classificador.classify(
        extracaoCaracteristicas(getVetorCaracteristicas(textoProcessado)))

    # Labels are read from the CSV and are therefore strings ('1', '2', ...);
    # comparing them with integers never matched, so look them up as text.
    categorias = {
        '1': 'Roubo',
        '2': 'Homicídio',
        '3': 'Tráfico',
        '4': 'Crime não categorizado',
    }
    descricao = categorias.get(str(result).strip(), 'Não representa um crime')
    print(descricao + ' - ' + tweet)


# Main function
if __name__ == '__main__':
    # load the two sets (train and test)
    carregarTextos()
    test_set()

    # Extract the feature vector of all tweets in one go
    conjuntoTreino = nltk.classify.util.apply_features(extracaoCaracteristicas, tweets)
    conjuntoTeste = nltk.classify.util.apply_features(extracaoCaracteristicas, testset)

    # Train on the whole training set. This also defines the module-level
    # NBClassifier that classificaTexto() reads, and lets the trained model be
    # evaluated on the separate test CSV.
    NBClassifier = nltk.NaiveBayesClassifier.train(conjuntoTreino)
    print('test set accuracy: %s' % nltk.classify.util.accuracy(NBClassifier, conjuntoTeste))

    # Cross-validation - using scikit-learn for the folds and NLTK for the model
    cv = cross_validation.KFold(len(conjuntoTreino), n_folds=10, shuffle=False, random_state=None)
    for traincv, testcv in cv:
        # Select the samples by index. A first:last slice would, for the
        # middle folds, also cover the held-out samples lying between the
        # first and last training index, and would drop the last sample.
        treinoFold = [conjuntoTreino[int(i)] for i in traincv]
        testeFold = [conjuntoTreino[int(i)] for i in testcv]

        classifier = nltk.NaiveBayesClassifier.train(treinoFold)
        print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testeFold))

In Main I first used the plain Naive Bayes and checked its accuracy, and then the Naive Bayes with cross-validation and checked its accuracy. Now I want to run the already-trained Naive Bayes on the CSV containing the texts for the test; that is, test the classification on the test set.

My method def classificaTexto(tweet): is meant to do exactly this job, but I am not able to use it with the already-trained classifier. If I create a text such as

texto1 = 'Enviado por um seguidor: Carro roubado no conjunto Augusto Franco'
classificaTexto(texto1)

The method will do its job and classify it.

Additional information: my CSV files are in the format shown in the example below, where the number before the text represents the type of crime. It was done this way so that the method def classificaTexto(tweet): could be used.

|1|,|Enviado por um seguidor :Exclusivo.Bom dia.2 caras vestidos de palhaços ontem a noite roubaram as armas dos guardas municipais que faziam a segurança do posto médico aqui no bairro Coroa do Meio!! Polícia nas ruas a procura dos marginais !!!  Surreal isso...|,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|2|,|Enviado por um seguidor :Segundo informações acaba de acontecer um homicídio na cidade de Malhador no povoado Boqueval \,vítima de pré nome Ronaldo.|,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,

Solution

  • You just need to call the classify() method on the same classifier object that train() returned. One way to do it is by passing the object as an argument to the method:

    #Method to classify the text according to the feeling
    def classificaTexto(nbc, tweet):
        """Classify one raw text with a trained classifier and print the result.

        nbc   -- trained classifier exposing ``classify(featureset)``.
        tweet -- raw text to classify.

        Returns None; the result is printed as ``<category> - <text>``.
        """
        textoProcessado = preProcessamentoText(tweet)
        result = nbc.classify(extracaoCaracteristicas(getVetorCaracteristicas(textoProcessado)))

        # Labels are read from the CSV and are therefore strings ('1', '2', ...);
        # comparing them with integers never matched, so look them up as text.
        categorias = {
            '1': 'Roubo',
            '2': 'Homicídio',
            '3': 'Tráfico',
            '4': 'Crime não categorizado',
        }
        descricao = categorias.get(str(result).strip(), 'Não representa um crime')
        print(descricao + ' - ' + tweet)
    

    then you should be able to use it like this:

    # Main function
    if __name__ == '__main__':
        # Load both data sets; this fills the module-level lists that the
        # feature extraction below reads.
        carregarTextos()
        test_set()
    
        # Lazily map every (tokens, label) training pair to its feature dictionary
        conjuntoTreino = nltk.classify.util.apply_features(extracaoCaracteristicas, tweets)
    
        # Train the classifier and keep the returned object for later use
        NBClassifier = nltk.NaiveBayesClassifier.train(conjuntoTreino)
    
        # Classify a single raw text by handing the trained object to the function
        texto1 = 'Enviado por um seguidor: Carro roubado no conjunto Augusto Franco'
        classificaTexto(NBClassifier, texto1)
    

    UPDATE

    If you want to classify on the output of nltk.classify.util.apply_features(), you can slightly modify classificaTexto():

    def classificaTexto(nbc, data, textos=None):
        """Classify every feature set in *data* and print one line per item.

        nbc    -- trained classifier exposing ``classify(featureset)``.
        data   -- iterable of feature sets; ``(featureset, label)`` tuples are
                  accepted too, in which case the label is ignored.
        textos -- optional sequence with the original text of each item, in
                  the same order as *data*; when omitted, the item position
                  is printed instead.

        Returns None; each result is printed as ``<category> - <text>``.
        """
        # Labels are read from the CSV and are therefore strings ('1', '2', ...);
        # comparing them with integers never matched, so look them up as text.
        categorias = {
            '1': 'Roubo',
            '2': 'Homicídio',
            '3': 'Tráfico',
            '4': 'Crime não categorizado',
        }

        for indice, item in enumerate(data):
            # a labeled data set holds (featureset, label) pairs; only the
            # feature set may be handed to classify()
            features = item[0] if isinstance(item, tuple) else item
            result = nbc.classify(features)

            # the original text is not part of the feature set, so it has to
            # come from the caller (the loop used an undefined name before)
            if textos is not None:
                descricao = textos[indice]
            else:
                descricao = 'item %d' % indice

            categoria = categorias.get(str(result).strip(), 'Não representa um crime')
            print(categoria + ' - ' + descricao)
    

    and use it like this:

    # Main function
    if __name__ == '__main__':
        # Load both data sets; this fills the module-level lists that the
        # feature extraction below reads.
        carregarTextos()
        test_set()
    
        # Lazily map every item of the training and test lists to its feature dictionary
        conjuntoTreino = nltk.classify.util.apply_features(extracaoCaracteristicas, tweets)
        # NOTE(review): testset holds (tokens, label) tuples, so this presumably
        # yields (features, label) pairs rather than bare feature dicts -- confirm
        # that classificaTexto() copes with that shape.
        conjuntoTeste = nltk.classify.util.apply_features(extracaoCaracteristicas,testset)
    
        # Train the classifier and keep the returned object for later use
        NBClassifier = nltk.NaiveBayesClassifier.train(conjuntoTreino)
    
        # Classify the whole test set with the trained object
        classificaTexto(NBClassifier, conjuntoTeste)
    

    You can also use results = nbc.classify_many(data) if you want to collect all the results in a list at once.