I have a base model in which the batches of training data are created manually, as follows:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import tqdm
import copy
import random
import torch
import torch.nn as nn
import torch.optim as optim
SEED = 12345
BATCH_SIZE= 5
N_EPOCHS = 100
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
dataset = load_iris()
X , y = dataset.data, dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True)
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1,1)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32).reshape(-1,1)
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(len(dataset.feature_names), 10),
            nn.LeakyReLU(),
            nn.Linear(10, 20),
            nn.LeakyReLU(),
            nn.Linear(20, 8),
            nn.LeakyReLU(),
            nn.Linear(8, 1)
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits
model = NeuralNetwork()
batch_start = torch.arange(0, len(X_train), BATCH_SIZE)
best_mse = np.inf
best_weights = None
history = []
def train(model, loss_fn, optimizer):
    model.train()
    with tqdm.tqdm(batch_start, unit="batch", mininterval=0, disable=False) as bar:
        bar.set_description(f"Epoch {epoch}")
        for start in bar:
            X_batch = X_train[start:start + BATCH_SIZE]
            y_batch = y_train[start:start + BATCH_SIZE]
            y_pred = model(X_batch)
            loss = loss_fn(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            bar.set_postfix(mse=float(loss))
def test(model, loss_fn):
    global best_mse
    global best_weights
    model.eval()
    y_pred = model(X_test)
    mse = loss_fn(y_pred, y_test)
    mse = float(mse)
    history.append(mse)
    if mse < best_mse:
        best_mse = mse
        best_weights = copy.deepcopy(model.state_dict())
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
for epoch in range(N_EPOCHS):
    train(model, loss_fn, optimizer)
    test(model, loss_fn)
model.load_state_dict(best_weights)
print("MSE: %.2f" % best_mse)
print("RMSE: %.2f" % np.sqrt(best_mse))
plt.xlabel("Epoch")
plt.ylabel("MSE")
plt.plot(history)
plt.show()
The resulting decrease of the MSE converges at MSE = 0.07 and RMSE = 0.26.
However, when I create the batches automatically using a DataLoader, with the following code:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import tqdm
import copy
import random
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
SEED = 12345
BATCH_SIZE= 5
N_EPOCHS = 100
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
dataset = load_iris()
X , y = dataset.data, dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True)
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1,1)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32).reshape(-1,1)
train_dataloader = DataLoader(list(zip(X_train, y_train)), shuffle=True, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(list(zip(X_test, y_test)), shuffle=False, batch_size=BATCH_SIZE)
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(len(dataset.feature_names), 10),
            nn.LeakyReLU(),
            nn.Linear(10, 20),
            nn.LeakyReLU(),
            nn.Linear(20, 8),
            nn.LeakyReLU(),
            nn.Linear(8, 1)
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits
model = NeuralNetwork()
batch_start = torch.arange(0, len(X_train), BATCH_SIZE)
best_mse = np.inf
best_weights = None
history = []
def train(model, loss_fn, optimizer):
    model.train()
    with tqdm.tqdm(batch_start, unit="batch", mininterval=0, disable=False) as bar:
        bar.set_description(f"Epoch {epoch}")
        for start in bar:
            for X_train_batch, y_train_batch in train_dataloader:
                y_pred = model(X_train_batch)
                loss = loss_fn(y_pred, y_train_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                bar.set_postfix(mse=float(loss))
def test(model, loss_fn):
    global best_mse
    global best_weights
    model.eval()
    y_pred = model(X_test)
    mse = loss_fn(y_pred, y_test)
    mse = float(mse)
    history.append(mse)
    if mse < best_mse:
        best_mse = mse
        best_weights = copy.deepcopy(model.state_dict())
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
for epoch in range(N_EPOCHS):
    train(model, loss_fn, optimizer)
    test(model, loss_fn)
model.load_state_dict(best_weights)
print("MSE: %.2f" % best_mse)
print("RMSE: %.2f" % np.sqrt(best_mse))
plt.xlabel("Epoch")
plt.ylabel("MSE")
plt.plot(history)
plt.show()
two things are unexpected. First, the second script, which uses a DataLoader, runs much slower than the first one. Second, the training results, now ending at MSE = 0.02 and RMSE = 0.15, are completely different.
Can anyone explain why the results of these two scripts are not identical?
Edit:
A few updates:
First, in the DataLoader version you should not nest the loop over the DataLoader inside the tqdm loop (for start in bar). Second, you should disable shuffle=True in the train DataLoader. With those updates, the following code reproduces your initial result (MSE = 0.07, RMSE = 0.26):
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import tqdm
import copy
import random
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
SEED = 12345
BATCH_SIZE= 5
N_EPOCHS = 100
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
dataset = load_iris()
X , y = dataset.data, dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True)
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1,1)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32).reshape(-1,1)
train_dataloader = DataLoader(list(zip(X_train, y_train)), shuffle=False, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(list(zip(X_test, y_test)), shuffle=False, batch_size=BATCH_SIZE)
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(len(dataset.feature_names), 10),
            nn.LeakyReLU(),
            nn.Linear(10, 20),
            nn.LeakyReLU(),
            nn.Linear(20, 8),
            nn.LeakyReLU(),
            nn.Linear(8, 1)
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits
model = NeuralNetwork()
batch_start = torch.arange(0, len(X_train), BATCH_SIZE)
best_mse = np.inf
best_weights = None
history = []
def train(model, loss_fn, optimizer):
    model.train()
    with tqdm.tqdm(batch_start, unit="batch", mininterval=0, disable=False) as bar:
        bar.set_description(f"Epoch {epoch}")
        for X_train_batch, y_train_batch in train_dataloader:
            y_pred = model(X_train_batch)
            loss = loss_fn(y_pred, y_train_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            bar.set_postfix(mse=float(loss))
def test(model, loss_fn):
    global best_mse
    global best_weights
    model.eval()
    y_pred = model(X_test)
    mse = loss_fn(y_pred, y_test)
    mse = float(mse)
    history.append(mse)
    if mse < best_mse:
        best_mse = mse
        best_weights = copy.deepcopy(model.state_dict())
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
for epoch in range(N_EPOCHS):
    train(model, loss_fn, optimizer)
    test(model, loss_fn)
model.load_state_dict(best_weights)
print("MSE: %.2f" % best_mse)
print("RMSE: %.2f" % np.sqrt(best_mse))
plt.xlabel("Epoch")
plt.ylabel("MSE")
plt.plot(history)
plt.show()
Now let's go into the details.
(1) Why is it slower: the most important reason is that your initial DataLoader code actually trains the model for many more iterations. You traverse the whole DataLoader inside a nested loop over the tqdm bar (which you originally set up for manual batching), so every "batch" of the outer loop triggers a full pass over the training set. This also explains why the results of your initial DataLoader version were better.
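For concreteness, here is a rough back-of-the-envelope comparison of the optimizer steps per epoch in the two versions (the variable names below are only illustrative, not part of your scripts):
import math

n_train = 120                                          # 80% of the 150 iris samples
batches_per_epoch = math.ceil(n_train / 5)             # 24 batches with BATCH_SIZE = 5

steps_manual = batches_per_epoch                       # manual batching: 24 optimizer steps per epoch
steps_nested = batches_per_epoch * batches_per_epoch   # nested-loop DataLoader version: 576 steps per epoch

print(steps_manual, steps_nested)                      # 24 576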
You should also be aware that a DataLoader itself comes with some overhead. The first source is auto-collation (see collate_fn in the docs: https://pytorch.org/docs/stable/data.html): the default collate function preprocesses the data recursively (e.g., it converts numpy arrays to torch.Tensor and stacks the samples into a batch, and it recurses if a sample is a nested dict). The second is multiprocessing: while you have not used it here, multiprocessing in Python can be tricky, and spawning worker subprocesses is usually expensive. This becomes especially apparent if you do not use persistent_workers in a multiprocessing DataLoader, because then the worker subprocesses are recreated every time you finish traversing the DataLoader and start again, adding extra overhead.
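As a side note, for a tiny in-memory dataset like iris the DataLoader overhead can be kept small. A minimal sketch, assuming the same X_train / y_train tensors as above (TensorDataset is a standard torch.utils.data class that lets the default collate_fn simply stack slices of tensors already in memory):
from torch.utils.data import TensorDataset, DataLoader

train_ds = TensorDataset(X_train, y_train)   # avoids zipping Python lists of per-sample tensors
train_dataloader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,                           # keep everything in the main process
)
# If worker processes were actually needed (num_workers > 0), persistent_workers=True
# would keep them alive between epochs instead of recreating them for every traversal.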
(2) The other reason why you failed to reproduce the result: your manual version simply trains on a shuffled train/test split but with a fixed batch order (the shuffling only happens during the split). In the DataLoader version you enable shuffle=True, which means that in every epoch the order, and in fact the composition, of the batches can be randomized, and that matters a lot (see for example: https://stats.stackexchange.com/questions/245502/why-should-we-shuffle-data-while-training-a-neural-network).
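You can see this directly with the sketch below (again only illustrative, reusing the X_train, y_train, and BATCH_SIZE defined above); it prints the first row of the first batch for two consecutive passes over a shuffled DataLoader, whereas with shuffle=False the two printed rows would be identical:
loader = DataLoader(list(zip(X_train, y_train)), shuffle=True, batch_size=BATCH_SIZE)
for epoch in range(2):
    X_first, _ = next(iter(loader))   # a fresh iterator reshuffles the data
    print(f"epoch {epoch}: first batch starts with {X_first[0].tolist()}")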
Now let's verify whether the order of batches matters: if you keep shuffle=True in the fixed DataLoader example above, the output becomes MSE = 0.06 and RMSE = 0.25.
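For reference, the only change to the fixed script above needed for this check is re-enabling shuffling in the training DataLoader (everything else, including the seed, stays the same):
train_dataloader = DataLoader(list(zip(X_train, y_train)), shuffle=True, batch_size=BATCH_SIZE)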