python machine-learning naivebayes

Applying cross-validation in Naive Bayes


My dataset consists of spam and ham messages in Filipino (image: enter image description here).

I divided my dataset into 60% training, 20% testing and 20% validation.

Split data into testing, training and Validation

from sklearn.model_selection import train_test_split


# Encode the text labels as integers: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
data['label'] = data['label'].replace(label_codes)

# Stage 1: hold out 20% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    data['message'],
    data['label'],
    test_size=0.2,
    random_state=1,
)

# Stage 2: carve the validation set out of the remaining 80%;
# 0.25 x 0.8 = 0.2 of the full dataset, which leaves 60% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=1,
)

# Report the number of rows in the full dataset and in each partition.
for template, subset in (
    ('Total: {} rows', data),
    ('Train: {} rows', X_train),
    (' Test: {} rows', X_test),
    (' Validation: {} rows', X_val),
):
    print(template.format(subset.shape[0]))

Train a MultinomialNB from sklearn

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import numpy as np
# Build the classifier, fit it on the training features, then label the test set.
# NOTE(review): train_data / test_data are not defined in the code shown here —
# presumably they are the vectorised forms of X_train / X_test; confirm.
naive_bayes = MultinomialNB()
naive_bayes.fit(train_data, y_train)
predictions = naive_bayes.predict(test_data)

Evaluate the Model

from sklearn.metrics import (accuracy_score, 
                             precision_score,
                             recall_score, 
                             f1_score)
# Score the test-set predictions against the true labels.
# The results are bound to their own names (accuracy, precision, recall, f1)
# rather than to the names of the metric functions: assigning e.g.
# `accuracy_score = accuracy_score(...)` replaces the imported function with a
# float, so any later call to it fails with "object is not callable".
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

My problem is in the validation step. The warning says:

warnings.warn("Estimator fit failed. The score on this train-test"

This is how I coded my validation; I don't know if I'm doing it the right way:

 from sklearn.model_selection import cross_val_score
    
    # Run 10-fold cross-validation of a fresh MultinomialNB on the validation
    # split and collect the per-fold accuracy.
    # NOTE(review): X_val comes from splitting data['message'], i.e. raw text;
    # MultinomialNB presumably cannot fit on unvectorised strings, which would
    # explain the "Estimator fit failed" warning — confirm by vectorising first.
    mnb = MultinomialNB()
    scores = cross_val_score(
        estimator=mnb,
        X=X_val,
        y=y_val,
        scoring='accuracy',
        cv=10,
    )

    print('Cross-validation scores:{}'.format(scores))

Solution

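  • I did not get any error or warning with the code below, so it may work for you too. The key difference is that the messages are converted to numeric count features with CountVectorizer before they are passed to MultinomialNB, instead of passing the raw text.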

    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.metrics import accuracy_score
    import numpy as np
    from sklearn.metrics import (accuracy_score, 
                                 precision_score,
                                 recall_score, 
                                 f1_score)
    from sklearn.model_selection import cross_val_score
    from sklearn.feature_extraction.text import CountVectorizer
    
    # Load the sample ham/spam dataset; 'Text' is used as the input and
    # 'IsSpam' as the target below.
    df = pd.read_csv("https://raw.githubusercontent.com/jeffprosise/Machine-Learning/master/Data/ham-spam.csv")

    # Convert each message into unigram + bigram counts (English stop words
    # removed) so the classifier receives numeric features, not raw strings.
    # NOTE(review): the vocabulary is fitted on all rows before the split, so
    # test/validation rows influence the features; a Pipeline fitted on the
    # training rows only would avoid that — kept as is so the printed results
    # stay the same.
    vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english')
    x = vectorizer.fit_transform(df['Text'])
    y = df['IsSpam']

    # 60/20/20 split: first hold out 20% for testing, then take 25% of the
    # remaining 80% (0.25 x 0.8 = 0.2 of the total) for validation.
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

    # Report partition sizes. The total is read from `df`; no frame named
    # `data` exists in this script, so `data.shape[0]` raised a NameError.
    print('Total: {} rows'.format(df.shape[0]))
    print('Train: {} rows'.format(X_train.shape[0]))
    print(' Test: {} rows'.format(X_test.shape[0]))
    print(' Validation: {} rows'.format(X_val.shape[0]))

    # Fit on the training split and predict the held-out test split.
    naive_bayes = MultinomialNB().fit(X_train, y_train)
    predictions = naive_bayes.predict(X_test)

    # Test-set metrics. They are stored under their own names so the imported
    # metric functions are not overwritten by their float results.
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)

    # 10-fold cross-validation of a fresh model on the validation split.
    mnb = MultinomialNB()
    scores = cross_val_score(mnb, X_val, y_val, cv=10, scoring='accuracy')
    print('Cross-validation scores:{}'.format(scores))
    

    Result:

    Total: 1000 rows
    Train: 600 rows
     Test: 200 rows
     Validation: 200 rows
    Cross-validation scores:[1.   0.95 0.85 1.   1.   0.9  0.9  0.8  0.9  0.9 ]