I am training multiple models (e.g. mobilenet and mobilenetv2) on multiple GPUs at the same time. After training and evaluating the first model, I get the error torch.cuda.OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0. I have tried various solutions, shown below.
Code
import time
import pathlib
from os.path import isfile
import torch
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
import models
from utils import *
from config import config
from data import DataLoader
# ignore ImageNet PIL EXIF UserWarning
import warnings
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)
best_acc1 = 0
def main():
    global opt, start_epoch, best_acc1
    opt = config()
    if opt.cuda and not torch.cuda.is_available():
        raise Exception('No GPU found, please run without --cuda')

    print('\n=> creating model \'{}\''.format(opt.arch))
    if opt.arch == 'shufflenet':
        model = models.__dict__[opt.arch](opt.dataset, opt.width_mult, opt.groups)
    else:
        model = models.__dict__[opt.arch](opt.dataset, opt.width_mult)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=opt.lr,
                          momentum=opt.momentum, weight_decay=opt.weight_decay,
                          nesterov=True)
    start_epoch = 0
    n_retrain = 0

    if opt.cuda:
        torch.cuda.set_device(opt.gpuids[0])
        with torch.cuda.device(opt.gpuids[0]):
            model = model.cuda()
            criterion = criterion.cuda()
            model = nn.DataParallel(model, device_ids=opt.gpuids,
                                    output_device=opt.gpuids[0])
            cudnn.benchmark = True

    # checkpoint file
    ckpt_dir = pathlib.Path('checkpoint')
    ckpt_file = ckpt_dir / opt.arch / opt.dataset / opt.ckpt

    # for resuming training
    if opt.resume:
        if isfile(ckpt_file):
            print('==> Loading Checkpoint \'{}\''.format(opt.ckpt))
            checkpoint = load_model(model, ckpt_file, opt)
            start_epoch = checkpoint['epoch']
            optimizer.load_state_dict(checkpoint['optimizer'])
            print('==> Loaded Checkpoint \'{}\' (epoch {})'.format(
                opt.ckpt, start_epoch))
        else:
            print('==> no checkpoint found \'{}\''.format(
                opt.ckpt))
            return

    # Data loading
    print('==> Load data..')
    train_loader, val_loader = DataLoader(opt.batch_size, opt.workers,
                                          opt.dataset, opt.datapath,
                                          opt.cuda)

    # for evaluation
    if opt.evaluate:
        if isfile(ckpt_file):
            print('==> Loading Checkpoint \'{}\''.format(opt.ckpt))
            checkpoint = load_model(model, ckpt_file, opt)
            start_epoch = checkpoint['epoch']
            optimizer.load_state_dict(checkpoint['optimizer'])
            print('==> Loaded Checkpoint \'{}\' (epoch {})'.format(
                opt.ckpt, start_epoch))

            # evaluate on validation set
            print('\n===> [ Evaluation ]')
            start_time = time.time()
            acc1, acc5 = validate(val_loader, model, criterion)
            save_eval(['{}-{}-{}'.format(opt.arch, opt.dataset, opt.ckpt[:-4]),
                       str(acc1)[7:-18], str(acc5)[7:-18]], opt)
            elapsed_time = time.time() - start_time
            print('====> {:.2f} seconds to evaluate this model\n'.format(
                elapsed_time))
            return
        else:
            print('==> no checkpoint found \'{}\''.format(
                opt.ckpt))
            return

    # train...
    train_time = 0.0
    validate_time = 0.0
    for epoch in range(start_epoch, opt.epochs):
        adjust_learning_rate(optimizer, epoch, opt.lr)
        print('\n==> {}/{} training'.format(opt.arch, opt.dataset))
        print('==> Epoch: {}, lr = {}'.format(
            epoch, optimizer.param_groups[0]["lr"]))

        # train for one epoch
        print('===> [ Training ]')
        start_time = time.time()
        acc1_train, acc5_train = train(train_loader,
                                       epoch=epoch, model=model,
                                       criterion=criterion, optimizer=optimizer)
        elapsed_time = time.time() - start_time
        train_time += elapsed_time
        print('====> {:.2f} seconds to train this epoch\n'.format(
            elapsed_time))

        # evaluate on validation set
        print('===> [ Validation ]')
        start_time = time.time()
        acc1_valid, acc5_valid = validate(val_loader, model, criterion)
        elapsed_time = time.time() - start_time
        validate_time += elapsed_time
        print('====> {:.2f} seconds to validate this epoch\n'.format(
            elapsed_time))

        # remember best Acc@1 and save checkpoint and summary csv file
        is_best = acc1_valid > best_acc1
        best_acc1 = max(acc1_valid, best_acc1)
        state = {'epoch': epoch + 1,
                 'model': model.state_dict(),
                 'optimizer': optimizer.state_dict()}
        summary = [epoch,
                   str(acc1_train)[7:-18], str(acc5_train)[7:-18],
                   str(acc1_valid)[7:-18], str(acc5_valid)[7:-18]]
        save_model(state, epoch, is_best, opt)
        save_summary(summary, opt)

    avg_train_time = train_time / (opt.epochs - start_epoch)
    avg_valid_time = validate_time / (opt.epochs - start_epoch)
    total_train_time = train_time + validate_time
    print('====> average training time per epoch: {:,}m {:.2f}s'.format(
        int(avg_train_time//60), avg_train_time%60))
    print('====> average validation time per epoch: {:,}m {:.2f}s'.format(
        int(avg_valid_time//60), avg_valid_time%60))
    print('====> training time: {}h {}m {:.2f}s'.format(
        int(train_time//3600), int((train_time%3600)//60), train_time%60))
    print('====> validation time: {}h {}m {:.2f}s'.format(
        int(validate_time//3600), int((validate_time%3600)//60), validate_time%60))
    print('====> total training time: {}h {}m {:.2f}s'.format(
        int(total_train_time//3600), int((total_train_time%3600)//60), total_train_time%60))


def train(train_loader, **kwargs):
    epoch = kwargs.get('epoch')
    model = kwargs.get('model')
    criterion = kwargs.get('criterion')
    optimizer = kwargs.get('optimizer')

    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(len(train_loader), batch_time, data_time,
                             losses, top1, top5, prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        if opt.cuda:
            target = target.cuda(non_blocking=True)

        # compute output
        output = model(input)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), input.size(0))
        top1.update(acc1[0], input.size(0))
        top5.update(acc5[0], input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)

        if i % opt.print_freq == 0:
            progress.print(i)

        end = time.time()

    print('====> Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
          .format(top1=top1, top5=top5))

    return top1.avg, top5.avg


def validate(val_loader, model, criterion):
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(len(val_loader), batch_time, losses, top1, top5,
                             prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (input, target) in enumerate(val_loader):
            if opt.cuda:
                target = target.cuda(non_blocking=True)

            # compute output
            output = model(input)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), input.size(0))
            top1.update(acc1[0], input.size(0))
            top5.update(acc5[0], input.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)

            if i % opt.print_freq == 0:
                progress.print(i)

            end = time.time()

        print('====> Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
              .format(top1=top1, top5=top5))

    return top1.avg, top5.avg


if __name__ == '__main__':
    start_time = time.time()
    main()
    elapsed_time = time.time() - start_time
    print('====> total time: {}h {}m {:.2f}s'.format(
        int(elapsed_time//3600), int((elapsed_time%3600)//60), elapsed_time%60))
Solutions
import gc

gc.collect()
torch.cuda.empty_cache()  # ask PyTorch's caching allocator to release unused blocks
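As far as I understand, torch.cuda.empty_cache() can only return blocks that no longer have live references, so I also considered explicitly dropping the first model before building the second one. This is only a minimal sketch of what I mean; the del line reuses the model, optimizer and criterion names from my script above, and placing it between the two runs is my assumption:

import gc
import torch

# hypothetical cleanup between models: drop every reference to the first
# model before asking the caching allocator to release its blocks
del model, optimizer, criterion   # names taken from the script above (assumption)
gc.collect()                      # collect lingering Python references
torch.cuda.empty_cache()          # return unreferenced cached blocks to the driver
torch.cuda.synchronize()          # optional: wait for pending kernels to finish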
Traceback
==> mobilenet/cifar10 training
==> Epoch: 17, lr = 0.07093217661806457
===> [ Training ]
Epoch: [17][0/9] Time 2.638 ( 2.638) Data 2.527 ( 2.527) Loss 1.1166e+00 (1.1166e+00) Acc@1 59.76 ( 59.76) Acc@5 95.52 ( 95.52)
====> Acc@1 61.468 Acc@5 95.854
====> 4.97 seconds to train this epoch
===> [ Validation ]
Test: [0/2] Time 1.674 ( 1.674) Loss 1.1883e+00 (1.1883e+00) Acc@1 57.50 ( 57.50) Acc@5 95.46 ( 95.46)
====> Acc@1 57.620 Acc@5 95.300
====> 1.84 seconds to validate this epoch
==> mobilenet/cifar10 training
==> Epoch: 18, lr = 0.06951353308570328
===> [ Training ]
Epoch: [18][0/9] Time 2.582 ( 2.582) Data 2.467 ( 2.467) Loss 1.0763e+00 (1.0763e+00) Acc@1 61.83 ( 61.83) Acc@5 96.33 ( 96.33)
====> Acc@1 62.808 Acc@5 96.350
====> 4.92 seconds to train this epoch
===> [ Validation ]
Test: [0/2] Time 1.721 ( 1.721) Loss 1.1518e+00 (1.1518e+00) Acc@1 58.51 ( 58.51) Acc@5 95.67 ( 95.67)
====> Acc@1 58.540 Acc@5 95.560
====> 1.88 seconds to validate this epoch
==> mobilenet/cifar10 training
==> Epoch: 19, lr = 0.06812326242398921
===> [ Training ]
Epoch: [19][0/9] Time 2.441 ( 2.441) Data 2.314 ( 2.314) Loss 1.0599e+00 (1.0599e+00) Acc@1 62.20 ( 62.20) Acc@5 96.34 ( 96.34)
====> Acc@1 63.502 Acc@5 96.530
====> 4.75 seconds to train this epoch
===> [ Validation ]
Test: [0/2] Time 1.664 ( 1.664) Loss 1.1191e+00 (1.1191e+00) Acc@1 59.76 ( 59.76) Acc@5 96.39 ( 96.39)
====> Acc@1 59.460 Acc@5 96.060
====> 1.83 seconds to validate this epoch
====> average training time per epoch: 0m 6.81s
====> average validation time per epoch: 0m 1.88s
====> training time: 0h 2m 16.22s
====> validation time: 0h 0m 37.55s
====> total training time: 0h 2m 53.77s
====> total time: 0h 3m 18.80s
=> creating model 'mobilenet'
==> Load data..
Files already downloaded and verified
Files already downloaded and verified
==> Loading Checkpoint '/home2/coremax/Documents/BoxMix/checkpoint/mobilenet/cifar10/ckpt_best.pth'
==> Loaded Checkpoint '/home2/coremax/Documents/BoxMix/checkpoint/mobilenet/cifar10/ckpt_best.pth' (epoch 20)
===> [ Evaluation ]
Test: [ 0/40] Time 1.680 ( 1.680) Loss 1.0908e+00 (1.0908e+00) Acc@1 64.45 ( 64.45) Acc@5 96.09 ( 96.09)
====> Acc@1 59.460 Acc@5 96.060
====> 2.21 seconds to evaluate this model
====> total time: 0h 0m 6.03s
=> creating model 'mobilenetv2'
==> Load data..
Files already downloaded and verified
Files already downloaded and verified
==> mobilenetv2/cifar10 training
==> Epoch: 0, lr = 0.1
===> [ Training ]
Traceback (most recent call last):
File "/home2/coremax/Documents/BoxMix/main.py", line 257, in <module>
main()
File "/home2/coremax/Documents/BoxMix/main.py", line 117, in main
acc1_train, acc5_train = train(train_loader,
File "/home2/coremax/Documents/BoxMix/main.py", line 187, in train
output = model(input)
File "/home2/coremax/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/home2/coremax/anaconda3/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 171, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/home2/coremax/anaconda3/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 181, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/home2/coremax/anaconda3/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 89, in parallel_apply
output.reraise()
File "/home2/coremax/anaconda3/lib/python3.9/site-packages/torch/_utils.py", line 543, in reraise
raise exception
torch.cuda.OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
I am training mobilenet on two GPUs (Tesla V100, 16 GB each) with a batch size of 6096, which is very large, yet a single model trains without problems. Only when I train multiple models such as mobilenet and mobilenetv2 at the same time do I get the replica error, and it happens in mobilenetv2. I tried the gc.collect() and torch.cuda.empty_cache() solution, but it did not work for me.
In the end I solved the problem by significantly decreasing the batch size from 6096 to 256.
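For anyone hitting the same issue who does not want to hard-code 256, a rough sketch of retrying with a smaller batch size on OOM is below. train_with_fallback, make_loader and run_one_epoch are hypothetical helpers of mine, not part of the script above; torch.cuda.OutOfMemoryError is the same exception class shown in the traceback:

import torch

def train_with_fallback(make_loader, run_one_epoch, batch_size=6096, min_batch=64):
    """Halve the batch size and retry whenever a CUDA OOM is raised.

    make_loader(batch_size) and run_one_epoch(loader) are hypothetical
    callables standing in for the DataLoader/train code in the script above.
    """
    while batch_size >= min_batch:
        try:
            loader = make_loader(batch_size)
            run_one_epoch(loader)
            return batch_size                  # this batch size fits in memory
        except torch.cuda.OutOfMemoryError:
            torch.cuda.empty_cache()           # release cached blocks before retrying
            batch_size //= 2
            print(f'OOM, retrying with batch size {batch_size}')
    raise RuntimeError('even the minimum batch size does not fit in GPU memory')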