I have a scikit-learn model that predicts passenger survival on the Titanic dataset. Its accuracy is around 0.77.
How can I make it more accurate?
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
# Load the training and test sets from the working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# NOTE: these two instances are not used below; the encoding loop creates a
# fresh LabelEncoder per column so the fitted classes are never mixed up.
le = LabelEncoder()
sc = StandardScaler()

# Fill missing values by assignment. The chained form
# `df[col].fillna(..., inplace=True)` is deprecated in recent pandas and is a
# silent no-op under copy-on-write, which would leave the NaNs in place.
train_df["Age"] = train_df["Age"].fillna(train_df["Age"].mean())
train_df["Embarked"] = train_df["Embarked"].fillna("N")
# Keep only the first character of the cabin code; "N" marks a missing cabin.
train_df["Cabin"] = train_df["Cabin"].str[:1].fillna("N")

# Integer-encode the categorical columns.
for col in ["Sex", "Embarked", "Cabin"]:
    train_df[col] = LabelEncoder().fit_transform(train_df[col])

# Drop identifier-like columns and separate the target.
x = train_df.drop(["PassengerId", "Name", "Ticket", "Survived"], axis=1)
y = train_df["Survived"]

# Fix the random seed so the reported accuracy is reproducible; without it the
# score varies from run to run and changes cannot be compared.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
dt_clf = DecisionTreeClassifier(
    max_depth=5, min_samples_leaf=1, min_samples_split=2, random_state=42
)
dt_clf.fit(x_train, y_train)
pred = dt_clf.predict(x_test)
print(metrics.accuracy_score(y_test, pred))
I filled the missing values with the mean and tried changing the scaler and the algorithm, but the accuracy did not improve.
Handling Missing Values: You've filled the missing values in 'Age' with the mean. To take it a step further, consider other ways of filling in missing values, such as the median, the mode, or even a predictive model.
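Feature Engineering: You can create some new features. For example, combine the sibling/spouse and parent/child counts into a family-size feature, and derive an "is alone" flag from it.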
Scaling and Encoding: You're currently using LabelEncoder for the categorical variables. Consider using OneHotEncoder or pd.get_dummies for better handling of categorical variables. Also, make sure to apply appropriate scaling techniques.
Feature Selection: You should select the most important features. You can use methods like recursive feature elimination (RFE), feature importance from models, or correlation matrices to drop the less important features.
Model Selection and Hyperparameter Tuning: You're using a DecisionTreeClassifier. Maybe you can try different algorithms and use cross-validation to fine-tune the hyperparameters. This will help you find the best combination for your model.
Cross-Validation and Hyperparameter Tuning: To find the best hyperparameters for your model, consider using GridSearchCV or RandomizedSearchCV.
Here is my reworked code:
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
# Load Titanic dataset from Seaborn
titanic = sns.load_dataset('titanic')
print("Dataset loaded.")

# Feature engineering
# FamilySize counts accompanying relatives (siblings/spouses + parents/children).
titanic['FamilySize'] = titanic['sibsp'] + titanic['parch']
# IsAlone is 1 when the passenger has no accompanying relatives, else 0.
titanic['IsAlone'] = (titanic['FamilySize'] == 0).astype(int)
# 'deck' is categorical, so the placeholder 'N' must be registered as a
# category before it can be used to fill the missing values.
titanic['deck'] = titanic['deck'].cat.add_categories('N').fillna('N')
print("Feature engineering completed.")

# Dropping features
dropped_features = ['alive', 'adult_male', 'embark_town', 'alone']
titanic.drop(dropped_features, axis=1, inplace=True)
print(f"Dropped less useful features: {dropped_features}")

# Preprocessing
# Numeric columns: mean-impute missing values, then standardize.
numeric_features = ['age', 'fare', 'FamilySize']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Categorical columns: fill missing values with a constant, then one-hot
# encode; categories unseen during fitting are ignored at predict time.
categorical_features = ['sex', 'deck', 'embarked', 'who', 'class']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Already-encoded 0/1 flags are passed through untouched. ColumnTransformer
# discards every column that is not listed in a transformer (its default is
# remainder='drop'), so without this entry the engineered IsAlone feature
# would never reach the classifier.
binary_features = ['IsAlone']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('bin', 'passthrough', binary_features)])
print("Preprocessing setup completed.")

# Splitting data
X = titanic.drop('survived', axis=1)
y = titanic['survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Data split into training and testing sets.")
# Candidate classifiers, each paired with the hyperparameter grid to search.
def _classifier_grid(**grid):
    """Prefix every hyperparameter name with ``classifier__``.

    The prefix is the ``<step name>__<parameter>`` form used to address a
    parameter of the pipeline step named ``classifier``.
    """
    return {f'classifier__{name}': values for name, values in grid.items()}


def _boosting_grid():
    """Return a fresh copy of the grid used by both boosting models."""
    return _classifier_grid(
        n_estimators=[100, 200],
        learning_rate=[0.01, 0.1],
        max_depth=[3, 5],
    )


models = {
    'RandomForest': {
        'model': RandomForestClassifier(random_state=42),
        'params': _classifier_grid(
            n_estimators=[100, 200],
            max_depth=[5, 10],
            min_samples_split=[2, 5],
            min_samples_leaf=[1, 2],
        ),
    },
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=1000),
        'params': _classifier_grid(
            C=[0.01, 0.1, 1, 10],
            solver=['lbfgs', 'liblinear'],
        ),
    },
    'SVM': {
        'model': SVC(),
        'params': _classifier_grid(
            C=[0.1, 1, 10],
            gamma=['scale', 'auto'],
            kernel=['linear', 'rbf'],
        ),
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': _boosting_grid(),
    },
    'XGBoost': {
        'model': XGBClassifier(random_state=42),
        'params': _boosting_grid(),
    },
}
# Function to perform grid search and return the best model
def perform_grid_search(X_train, y_train, model, params, cv=5, scoring='accuracy'):
    """Grid-search ``model`` inside a preprocessing pipeline.

    The module-level ``preprocessor`` and ``model`` are chained into a
    two-step pipeline (steps ``preprocessor`` and ``classifier``), so the
    preprocessing is re-fitted inside every cross-validation fold rather than
    on the full training set.

    Args:
        X_train: Training features, passed to ``GridSearchCV.fit``.
        y_train: Training target, passed to ``GridSearchCV.fit``.
        model: Unfitted estimator used as the ``classifier`` step.
        params: Parameter grid; keys must use the ``classifier__`` prefix.
        cv: Cross-validation setting forwarded to ``GridSearchCV``
            (default 5, the previously hard-coded value).
        scoring: Metric forwarded to ``GridSearchCV`` (default
            ``'accuracy'``, the previously hard-coded value).

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', model)])
    grid_search = GridSearchCV(pipeline, params, cv=cv, scoring=scoring)
    grid_search.fit(X_train, y_train)
    return grid_search
# Run the grid search for every candidate; values are fitted search objects.
best_models = {}
for model_name, model_info in models.items():
    print(f"Training {model_name}...")
    best_models[model_name] = perform_grid_search(X_train, y_train, model_info['model'], model_info['params'])
    print(f"{model_name} training completed.")

# Evaluate models
# Predict once per model and cache the held-out accuracy so it is not
# recomputed during model selection and reporting.
test_accuracies = {}
for model_name, model in best_models.items():
    best_model = model.best_estimator_
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    test_accuracies[model_name] = accuracy
    print(f'{model_name} Best Hyperparameters: {model.best_params_}')
    print(f'{model_name} Accuracy: {accuracy}')

# Select the best model by its cross-validated score on the training data.
# Choosing the winner by test-set accuracy would turn the test set into a
# tuning set and make the reported accuracy optimistically biased; the test
# accuracy is only reported here, never used for the choice.
best_model_name = max(best_models, key=lambda name: best_models[name].best_score_)
print(f'Best model: {best_model_name} with accuracy {test_accuracies[best_model_name]}')
With this code, my accuracy is about 0.821.