I am a novice in machine learning, working on a classification task. In my experiments I find that the loss on the training set keeps decreasing while the loss on the test set gradually increases. I am very confused. Why does this happen?
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import matplotlib.pyplot as plt
from bert_vector import BertVector
from word2vec_vector import LoadData
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim1, hidden_dim2, hidden_dim3, output_dim, dropout_rate=0.1):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, hidden_dim3)
        self.fc4 = nn.Linear(hidden_dim3, output_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)
        return x
class MLPTrain:
    def __init__(self, input_dim, hidden_dim1, hidden_dim2, hidden_dim3, output_dim,
                 lr=0.001, epochs=30, batch_size=128, dropout_rate=0.1, test_data=None, train_data=None):
        self.train_data = train_data
        self.test_data = test_data
        self.input_dim = input_dim
        self.hidden_dim1 = hidden_dim1
        self.hidden_dim2 = hidden_dim2
        self.hidden_dim3 = hidden_dim3
        self.output_dim = output_dim
        self.mlp = MLP(input_dim, hidden_dim1, hidden_dim2, hidden_dim3, output_dim, dropout_rate)
        self.criterion = nn.CrossEntropyLoss()
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def train(self):
        self.mlp.to(self.device)
        optimizer = torch.optim.Adam(self.mlp.parameters(), lr=self.lr)
        train_loss_list = []
        test_loss_list = []
        train_acc_list = []
        test_acc_list = []
        for epoch in range(self.epochs):
            self.mlp.train()
            epoch_loss = 0.0
            correct_train = 0
            total_train = 0
            # Training loop
            for i, (X, y) in enumerate(self.train_data):
                X = X.to(self.device)
                y = y.to(self.device)
                optimizer.zero_grad()
                y_pred = self.mlp(X)
                loss = self.criterion(y_pred, y)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
                # Calculate accuracy
                _, preds = torch.max(y_pred, 1)
                correct_train += (preds == y).sum().item()
                total_train += y.size(0)
            avg_train_loss = epoch_loss / len(self.train_data)
            train_loss_list.append(avg_train_loss)
            train_accuracy = correct_train / total_train * 100
            train_acc_list.append(train_accuracy)
            # Evaluate on the test set
            test_loss, test_accuracy = self.evaluate()
            # Save test loss and accuracy
            test_loss_list.append(test_loss)
            test_acc_list.append(test_accuracy)
            # Print results every epoch
            print(f"Epoch {epoch + 1}/{self.epochs} - Train Loss: {avg_train_loss:.4f}, "
                  f"Train Accuracy: {train_accuracy:.2f}%, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")
        # Plot the metrics
        self.plot_metrics(train_loss_list, test_loss_list, train_acc_list, test_acc_list)

    def evaluate(self):
        self.mlp.eval()
        with torch.no_grad():
            epoch_loss = 0.0
            correct_test = 0
            total_test = 0
            for X, y in self.test_data:
                X = X.to(self.device)
                y = y.to(self.device)
                y_pred = self.mlp(X)
                loss = self.criterion(y_pred, y)
                epoch_loss += loss.item()
                # Calculate accuracy
                _, preds = torch.max(y_pred, 1)
                correct_test += (preds == y).sum().item()
                total_test += y.size(0)
        avg_test_loss = epoch_loss / len(self.test_data)
        test_accuracy = correct_test / total_test * 100
        return avg_test_loss, test_accuracy

    def plot_metrics(self, train_loss_list, test_loss_list, train_acc_list, test_acc_list):
        # Plot loss and accuracy curves side by side
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
        # Plot loss
        ax1.plot(range(1, self.epochs + 1), train_loss_list, label='Train Loss')
        ax1.plot(range(1, self.epochs + 1), test_loss_list, label='Test Loss')
        ax1.set_xlabel('Epochs')
        ax1.set_ylabel('Loss')
        ax1.set_title('Loss Curve')
        ax1.legend()
        # Plot accuracy
        ax2.plot(range(1, self.epochs + 1), train_acc_list, label='Train Accuracy')
        ax2.plot(range(1, self.epochs + 1), test_acc_list, label='Test Accuracy')
        ax2.set_xlabel('Epochs')
        ax2.set_ylabel('Accuracy')
        ax2.set_title('Accuracy Curve')
        ax2.legend()
        plt.show()
def get_DataLoader(X, y, batch_size=128):
    X = torch.tensor(X, dtype=torch.float32)
    y = torch.tensor(y, dtype=torch.long)
    dataset = TensorDataset(X, y)
    data = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return data

def label_encoder(y):
    labelEncoder = LabelEncoder()
    y_encoded = labelEncoder.fit_transform(y)
    class_to_code = dict(zip(labelEncoder.classes_, range(len(labelEncoder.classes_))))
    return y_encoded, class_to_code
def start_train(x, y, input_dim, output_dim, lr=0.001, epochs=30, batch_size=128, dropout_rate=0.1):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    train_data = get_DataLoader(X_train, y_train, batch_size=batch_size)
    test_data = get_DataLoader(X_test, y_test, batch_size=batch_size)
    trainer = MLPTrain(
        train_data=train_data,
        test_data=test_data,
        input_dim=input_dim,
        hidden_dim1=256,
        hidden_dim2=128,
        hidden_dim3=32,
        output_dim=output_dim,
        lr=lr,
        epochs=epochs,
        batch_size=batch_size,
        dropout_rate=dropout_rate
    )
    trainer.train()
def main(llm, lr=0.001, batch_size=128, dropout_rate=0.3, epochs=50):
    if llm == "bert":
        bert = BertVector()
        X1, X2, y = bert.get_bert_vector()
        input_dim = X1.shape[1]
    else:
        word2vec = LoadData(model_type="Skip-Gram-model")
        X1, X2, y = word2vec.get_post_vector()
    X = np.concatenate((X1, X2), axis=1)
    # X = X1 + X2
    X = X2
    input_dim = X.shape[1]
    print("input dim: ", input_dim)
    y_encoded, class_to_code = label_encoder(y)
    output_dim = len(set(y_encoded))
    print("label class num: ", output_dim)
    start_train(X2, y_encoded, input_dim, output_dim, lr, epochs, batch_size, dropout_rate)
    print(class_to_code)

if __name__ == '__main__':
    llm = "word2vec"
    main(llm=llm, lr=0.001, batch_size=128, dropout_rate=0.1, epochs=100)
The above is my code. Is there any problem with it?
This is generally known as overfitting. Your model has begun to learn (“memorize”) details of the training set specifically, causing it to specialize on the training set at the expense of generalizing to the test set.
There are many ways to combat overfitting; I'd recommend reading about the phenomenon in more depth so you can understand the relevant concepts more clearly. Some ways that come to mind (a sketch of the last two follows this list):

- Get more training data, or augment the data you already have.
- Reduce the model's capacity, e.g. use fewer or narrower hidden layers than 256/128/32.
- Strengthen regularization: raise dropout_rate above 0.1, or add weight decay (L2 regularization) to the Adam optimizer.
- Use early stopping: rather than always training for 100 epochs, keep the weights from the epoch where the test loss was lowest.
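For example, here is a minimal sketch of the last two ideas (weight decay plus early stopping) applied to your setup. It assumes the MLPTrain class exactly as you posted it; the function name train_with_early_stopping and the patience parameter are mine, purely for illustration:

import copy
import torch

def train_with_early_stopping(trainer, weight_decay=1e-4, patience=5):
    """Train trainer.mlp, stopping once the test loss stops improving."""
    trainer.mlp.to(trainer.device)
    # weight_decay adds an L2 penalty on the weights, discouraging the
    # large parameter values that often accompany overfitting.
    optimizer = torch.optim.Adam(trainer.mlp.parameters(),
                                 lr=trainer.lr, weight_decay=weight_decay)
    best_loss = float("inf")
    best_state = None
    epochs_without_improvement = 0
    for epoch in range(trainer.epochs):
        trainer.mlp.train()
        for X, y in trainer.train_data:
            X, y = X.to(trainer.device), y.to(trainer.device)
            optimizer.zero_grad()
            loss = trainer.criterion(trainer.mlp(X), y)
            loss.backward()
            optimizer.step()
        test_loss, test_accuracy = trainer.evaluate()
        print(f"Epoch {epoch + 1}: test loss {test_loss:.4f}, "
              f"test accuracy {test_accuracy:.2f}%")
        if test_loss < best_loss:
            # New best epoch: remember these weights.
            best_loss = test_loss
            best_state = copy.deepcopy(trainer.mlp.state_dict())
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print(f"Stopping early at epoch {epoch + 1}")
                break
    # Restore the weights from the best epoch before returning.
    if best_state is not None:
        trainer.mlp.load_state_dict(best_state)

One caveat: strictly speaking, early stopping should monitor a separate validation split rather than the test set; otherwise your reported test metrics are no longer an unbiased estimate of generalization.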