I'm training a very small NN using the HAM10000 dataset. For loading the data I'm using the DataLoader that ships with PyTorch:
import os

import torch
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import CocoDetection


class CocoDetectionWithFilenames(CocoDetection):
    def __init__(self, root: str, ann_file: str, transform=None):
        super().__init__(root, ann_file, transform)

    def get_filename(self, idx: int) -> str:
        return self.coco.loadImgs(self.ids[idx])[0]["file_name"]


def get_loaders(root: str, ann_file: str) -> tuple[CocoDetection, DataLoader, DataLoader, DataLoader]:
    transform = transforms.Compose([
        transforms.ToTensor()
    ])
    dataset = CocoDetectionWithFilenames(
        root=root,
        ann_file=ann_file,
        transform=transform
    )

    # 70/15/15 train/validation/test split
    train_size = int(0.7 * len(dataset))
    valid_size = int(0.15 * len(dataset))
    test_size = len(dataset) - train_size - valid_size
    train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
        dataset, [train_size, valid_size, test_size]
    )

    num_workers = os.cpu_count()
    train_loader = DataLoader(
        train_dataset,
        batch_size=32,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=True,
        prefetch_factor=1024
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=32,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True,
        prefetch_factor=1024
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=32,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True
    )
    return dataset, train_loader, valid_loader, test_loader
The thing is, when my training loop runs, the training itself is very fast, but the program spends 95% of the time in between epochs, probably loading the data:
def extract_bboxes(targets: list[dict]) -> list[torch.Tensor]:
    bboxes = []
    for target in targets:
        xs, ys, widths, heights = target["bbox"]
        for idx, _ in enumerate(xs):
            x1, y1, width, height = xs[idx], ys[idx], widths[idx], heights[idx]
            # Convert COCO format (x, y, width, height) to (x1, y1, x2, y2)
            x2, y2 = x1 + width, y1 + height
            bboxes.append(torch.IntTensor([x1, y1, x2, y2]))
    return bboxes
from tqdm import tqdm

num_epochs = 25
train_losses = []
val_losses = []
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}")
    for images, targets in train_loader_tqdm:
        images = images.to(device)
        bboxes = extract_bboxes(targets)
        bboxes = torch.stack(bboxes).to(device)

        optimizer.zero_grad(set_to_none=True)
        outputs = model(images)
        loss = criterion(outputs, bboxes)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    epoch_train_loss = running_loss / len(train_loader)
    train_losses.append(epoch_train_loss)
    print(f"Epoch {epoch + 1}, Loss: {epoch_train_loss}")

    model.eval()
As you can see, the training loop code is quite simple, nothing weird happening there.
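A rough way I can check where the time actually goes is to time the loader fetches separately from the forward/backward pass. Only the timing code below is new; the rest mirrors the loop above:

import time

data_time, compute_time = 0.0, 0.0
fetch_start = time.perf_counter()
for images, targets in train_loader:
    data_time += time.perf_counter() - fetch_start  # time blocked waiting on the DataLoader

    step_start = time.perf_counter()
    images = images.to(device)
    bboxes = torch.stack(extract_bboxes(targets)).to(device)
    optimizer.zero_grad(set_to_none=True)
    loss = criterion(model(images), bboxes)
    loss.backward()
    optimizer.step()
    # Note: CUDA ops are asynchronous; call torch.cuda.synchronize() here
    # if exact compute numbers are needed.
    compute_time += time.perf_counter() - step_start

    fetch_start = time.perf_counter()

print(f"waiting on data: {data_time:.1f}s, forward/backward: {compute_time:.1f}s")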
Try reducing num_workers and prefetch_factor. prefetch_factor is the number of batches loaded in advance by each worker, so with prefetch_factor=1024 and num_workers = os.cpu_count() the loader tries to keep 1024 batches queued per worker, and it may spend all of that time between epochs fetching them on every worker thread.
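For example, something like the sketch below may behave much better. The exact numbers are only a starting point, not tuned for your machine, and persistent_workers is an extra suggestion to avoid respawning the worker processes at every epoch boundary:

train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,            # a few workers is usually enough for small images
    pin_memory=True,
    prefetch_factor=2,        # the default: 2 batches per worker
    persistent_workers=True   # keep worker processes alive between epochs
)

Apply the same settings to the validation and test loaders as well.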