Read in & clean text
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
# English stop words from NLTK.  Stored as a set so the per-token membership
# test in clean_text is a hash lookup instead of a scan over a list.
stopwords = set(nltk.corpus.stopwords.words('english'))
# Porter stemmer instance shared by every clean_text call.
ps = nltk.PorterStemmer()

# Tab-separated SMS data; the two columns are renamed to label / body_text.
# NOTE(review): read_csv is called without header=None, so the first line of
# the file is consumed as a header row -- confirm the file really has one.
data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']
def count_punct(text):
    """Return the percentage of non-space characters in *text* that are punctuation.

    Punctuation means any character in ``string.punctuation``.  The ratio is
    rounded to three decimals before being scaled to a percentage.

    Returns 0.0 for an empty or all-space string, which would otherwise
    divide by zero.
    """
    non_space = len(text) - text.count(" ")
    if non_space == 0:
        # Nothing to measure: avoid ZeroDivisionError on blank messages.
        return 0.0
    count = sum(1 for char in text if char in string.punctuation)
    return round(count / non_space, 3) * 100
# Message length, not counting space characters.
data['body_len'] = data['body_text'].map(lambda msg: len(msg.replace(" ", "")))
# Punctuation percentage of each message, as computed by count_punct.
data['punct%'] = data['body_text'].apply(count_punct)
def clean_text(text):
    """Turn a raw message into a list of stemmed, stop-word-free tokens.

    Steps:
      1. drop every character found in ``string.punctuation`` and lowercase
         the rest;
      2. split on runs of non-word characters;
      3. discard tokens present in the module-level ``stopwords`` and stem the
         remainder with the module-level Porter stemmer ``ps``.

    Returns the list of stemmed tokens.
    """
    text = "".join([char.lower() for char in text if char not in string.punctuation])
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    # NOTE: a separator at the start or end of the text leaves an empty-string
    # token in the result; that is kept so the token stream is unchanged.
    tokens = re.split(r'\W+', text)
    return [ps.stem(word) for word in tokens if word not in stopwords]
# Vectorise the messages, using clean_text as the analyzer so tokenisation,
# stop-word removal and stemming all happen inside the vectoriser.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])

# Put the two hand-built features next to the dense TF-IDF matrix.
X_features = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)
# The TF-IDF frame gets integer column labels (0, 1, ...) while the first two
# columns have string names.  scikit-learn rejects mixed-type feature names
# with a TypeError at fit time, so make every label a string here, before the
# frame is split into train and test sets.
X_features.columns = X_features.columns.astype(str)
X_features.head()
Explore RandomForestClassifier through Holdout Set
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set (test_size=0.2).  No random_state is
# passed, so the split is different on every run.
X_train, X_test, y_train, y_test = train_test_split(X_features, data['label'], test_size=0.2)
from sklearn.ensemble import RandomForestClassifier
# Forest of 50 trees, each limited to depth 20; per the scikit-learn docs,
# n_jobs=-1 asks it to use all available processors.
rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
# Fit on the training split; rf_model holds whatever rf.fit returns.
rf_model = rf.fit(X_train, y_train)
I keep getting an error in a guided exercise from an online course. A TypeError is raised on the line that assigns rf_model, and I don't know how to fix it.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
----> rf_model = rf.fit(X_train, y_train)
Then, at the end of the long traceback, it displays the following:
TypeError: Feature names are only supported if all input features have string names, but your input has ['int', 'str'] as feature name / column name types.
If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example.
Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.
How to resolve this?
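The error occurs because pd.concat joined two columns with string names ('body_len' and 'punct%') to the TF-IDF columns, which pd.DataFrame labels with integers (0, 1, 2, ...). To resolve this issue, convert all the feature names to strings using the 'astype' method.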
# Convert feature names to strings on both splits, so the names seen at
# training time match the names passed in later for prediction
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)
# Fit the RandomForestClassifier
rf_model = rf.fit(X_train, y_train)
Please note that X_test comes from the same DataFrame and therefore has the same mixed column names, so its feature names should be converted to strings in the same way before making predictions; otherwise the same error will be raised there.
Try this and see if it works for you or not.