machine-learning, classification, mlp

Why does the test set loss increase gradually during training?


I am a novice in machine learning and I am working on a classification task. In my experiments I find that the loss on the training set keeps decreasing while the loss on the test set gradually increases. I am very confused; why does this happen?

[Figure: training curves showing the training loss decreasing while the test loss rises over the epochs]

import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import matplotlib.pyplot as plt
from bert_vector import BertVector
from word2vec_vector import LoadData


class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim1, hidden_dim2, hidden_dim3, output_dim, dropout_rate=0.1):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, hidden_dim3)
        self.fc4 = nn.Linear(hidden_dim3, output_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)

        return x


class MLPTrain:
    def __init__(self, input_dim, hidden_dim1, hidden_dim2, hidden_dim3, output_dim,
                 lr=0.001, epochs=30, batch_size=128, dropout_rate=0.1, test_data=None, train_data=None):
        self.train_data = train_data
        self.test_data = test_data
        self.input_dim = input_dim
        self.hidden_dim1 = hidden_dim1
        self.hidden_dim2 = hidden_dim2
        self.hidden_dim3 = hidden_dim3
        self.output_dim = output_dim
        self.mlp = MLP(input_dim, hidden_dim1, hidden_dim2, hidden_dim3, output_dim, dropout_rate)
        self.criterion = nn.CrossEntropyLoss()
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def train(self):
        self.mlp.to(self.device)
        optimizer = torch.optim.Adam(self.mlp.parameters(), lr=self.lr)

        train_loss_list = []
        test_loss_list = []
        train_acc_list = []
        test_acc_list = []

        for epoch in range(self.epochs):
            self.mlp.train()
            epoch_loss = 0.0
            correct_train = 0
            total_train = 0

            # Training Loop
            for i, (X, y) in enumerate(self.train_data):
                X = X.to(self.device)
                y = y.to(self.device)

                optimizer.zero_grad()
                y_pred = self.mlp(X)
                loss = self.criterion(y_pred, y)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()

                # Calculate accuracy
                _, preds = torch.max(y_pred, 1)
                correct_train += (preds == y).sum().item()
                total_train += y.size(0)

            avg_train_loss = epoch_loss / len(self.train_data)
            train_loss_list.append(avg_train_loss)
            train_accuracy = correct_train / total_train * 100
            train_acc_list.append(train_accuracy)

            # Evaluate on the test set
            test_loss, test_accuracy = self.evaluate()

            # Save test loss and accuracy
            test_loss_list.append(test_loss)
            test_acc_list.append(test_accuracy)

            # Print results every epoch
            print(f"Epoch {epoch + 1}/{self.epochs} - Train Loss: {avg_train_loss:.4f}, "
                  f"Train Accuracy: {train_accuracy:.2f}%, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")

        # Plot the metrics
        self.plot_metrics(train_loss_list, test_loss_list, train_acc_list, test_acc_list)

    def evaluate(self):
        self.mlp.eval()
        with torch.no_grad():
            epoch_loss = 0.0
            correct_test = 0
            total_test = 0
            for X, y in self.test_data:
                X = X.to(self.device)
                y = y.to(self.device)
                y_pred = self.mlp(X)
                loss = self.criterion(y_pred, y)
                epoch_loss += loss.item()

                # Calculate accuracy
                _, preds = torch.max(y_pred, 1)
                correct_test += (preds == y).sum().item()
                total_test += y.size(0)

            avg_test_loss = epoch_loss / len(self.test_data)
            test_accuracy = correct_test / total_test * 100

            return avg_test_loss, test_accuracy

    def plot_metrics(self, train_loss_list, test_loss_list, train_acc_list, test_acc_list):
        # Plot loss and accuracy curves
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

        # Plot Loss
        ax1.plot(range(1, self.epochs + 1), train_loss_list, label='Train Loss')
        ax1.plot(range(1, self.epochs + 1), test_loss_list, label='Test Loss')
        ax1.set_xlabel('Epochs')
        ax1.set_ylabel('Loss')
        ax1.set_title('Loss Curve')
        ax1.legend()

        # Plot Accuracy
        ax2.plot(range(1, self.epochs + 1), train_acc_list, label='Train Accuracy')
        ax2.plot(range(1, self.epochs + 1), test_acc_list, label='Test Accuracy')
        ax2.set_xlabel('Epochs')
        ax2.set_ylabel('Accuracy')
        ax2.set_title('Accuracy Curve')
        ax2.legend()

        plt.show()


def get_DataLoader(X, y, batch_size=128):
    X = torch.tensor(X, dtype=torch.float32)
    y = torch.tensor(y, dtype=torch.long)
    dataset = TensorDataset(X, y)
    data = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return data


def label_encoder(y):
    labelEncoder = LabelEncoder()
    y_encoded = labelEncoder.fit_transform(y)

    class_to_code = dict(zip(labelEncoder.classes_, range(len(labelEncoder.classes_))))

    return y_encoded, class_to_code


def start_train(x, y, input_dim, output_dim, lr=0.001, epochs=30, batch_size=128, dropout_rate=0.1):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    train_data = get_DataLoader(X_train, y_train, batch_size=batch_size)
    test_data = get_DataLoader(X_test, y_test, batch_size=batch_size)

    trainer = MLPTrain(
        train_data=train_data,
        test_data=test_data,
        input_dim=input_dim,
        hidden_dim1=256,
        hidden_dim2=128,
        hidden_dim3=32,
        output_dim=output_dim,
        lr=lr,
        epochs=epochs,
        batch_size=batch_size,
        dropout_rate=dropout_rate
    )
    trainer.train()


def main(llm, lr=0.001, batch_size=128, dropout_rate=0.3, epochs=50):
    if llm == "bert":
        bert = BertVector()
        X1, X2, y = bert.get_bert_vector()
        input_dim = X1.shape[1]
    else:
        word2vec = LoadData(model_type="Skip-Gram-model")
        X1, X2, y = word2vec.get_post_vector()
    X = np.concatenate((X1, X2), axis=1)
    # X = X1 + X2
    X = X2  # note: this overwrites the concatenation above, so only the X2 features are used
    input_dim = X.shape[1]

    print("input dim: ", input_dim)
    y_encoded, class_to_code = label_encoder(y)
    output_dim = len(set(y_encoded))
    print("label class num: ", output_dim)

    start_train(X2, y_encoded, input_dim, output_dim, lr, epochs, batch_size, dropout_rate)

    print(class_to_code)


if __name__ == '__main__':
    llm = "word2vec"
    main(llm=llm, lr=0.001, batch_size=128, dropout_rate=0.1, epochs=100)

The above is my code. Is there anything wrong with it?


Solution

  • This is generally known as overfitting. Your model has begun to learn (“memorize”) details that are specific to the training set, so it becomes specialized to the training data at the expense of generalizing to the test set.
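
    For instance, two simple mitigations you could try directly on your setup are L2 regularization (the weight_decay argument of torch.optim.Adam) and early stopping on the held-out loss instead of always running a fixed number of epochs. The sketch below is only illustrative, not a drop-in replacement for your MLPTrain class, and the weight_decay and patience values are arbitrary placeholders you would need to tune:

import torch
import torch.nn as nn

def train_with_early_stopping(model, train_loader, val_loader, device,
                              lr=1e-3, weight_decay=1e-4, max_epochs=100, patience=5):
    """Illustrative loop: weight decay (L2 penalty) plus early stopping on the
    held-out loss. Hyperparameter values here are placeholders, not recommendations."""
    criterion = nn.CrossEntropyLoss()
    # weight_decay adds an L2 penalty, which discourages the weights from
    # growing large enough to memorize the training set
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    best_loss, best_state, stale_epochs = float("inf"), None, 0

    for epoch in range(max_epochs):
        model.train()
        for X, y in train_loader:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            loss = criterion(model(X), y)
            loss.backward()
            optimizer.step()

        # Average loss on the held-out set
        model.eval()
        total, n_batches = 0.0, 0
        with torch.no_grad():
            for X, y in val_loader:
                X, y = X.to(device), y.to(device)
                total += criterion(model(X), y).item()
                n_batches += 1
        val_loss = total / n_batches

        # Keep the best weights; stop once the held-out loss has not improved
        # for `patience` consecutive epochs
        if val_loss < best_loss:
            best_loss, stale_epochs = val_loss, 0
            best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
        else:
            stale_epochs += 1
            if stale_epochs >= patience:
                print(f"Early stopping at epoch {epoch + 1}, best held-out loss {best_loss:.4f}")
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    return model

    You could call this with your existing MLP and DataLoaders; strictly speaking you would also want to split off a separate validation set for the stopping decision so that the test set itself stays untouched.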

    There are many ways to avoid overfitting; I’d recommend reading about the phenomenon in more depth so you can understand the relevant concepts more clearly. Some ways that come to mind: