Tags: python, pytorch, cuda

RuntimeError: NCCL Error 2: unhandled system error


I recently upgraded CUDA from 9.0 to 10.2. The upgrade itself went fine, but afterwards the demo below started failing with "RuntimeError: NCCL Error 2: unhandled system error".

I don't know why, and I couldn't find an answer on GitHub or Stack Overflow, so I hope someone can help me.

import torch
from torchvision import datasets, transforms
import torchvision
from tqdm import tqdm
 
device_ids = [0, 1]  # train on GPUs 0 and 1
BATCH_SIZE = 64
 
transform = transforms.Compose([transforms.ToTensor()])
data_train = datasets.MNIST(root = "./data/",
                            transform=transform,
                            train=True,
                            download=True)
data_test = datasets.MNIST(root="./data/",
                           transform=transform,
                           train=False)
 
data_loader_train = torch.utils.data.DataLoader(dataset=data_train,
                                                batch_size=BATCH_SIZE * len(device_ids),
                                                shuffle=True,
                                                num_workers=2)
 
data_loader_test = torch.utils.data.DataLoader(dataset=data_test,
                                               batch_size=BATCH_SIZE * len(device_ids),
                                               shuffle=True,
                                               num_workers=2)
 
 
class Model(torch.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.conv1 = torch.nn.Sequential(
            torch.nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(stride=2, kernel_size=2),
        )
        self.dense = torch.nn.Sequential(
            torch.nn.Linear(14 * 14 * 128, 1024),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=0.5),
            torch.nn.Linear(1024, 10),
        )

    def forward(self, x):
        x = self.conv1(x)
        x = x.view(-1, 14 * 14 * 128)
        x = self.dense(x)
        return x
 
 
model = Model()

# Wrap the model for multi-GPU training; replicating it across the GPUs is
# what triggers the NCCL broadcast that fails below.
model = torch.nn.DataParallel(model, device_ids=device_ids)
model = model.cuda(device=device_ids[0])
 
cost = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())
n_epochs = 50
for epoch in range(n_epochs):
    running_loss = 0.0
    running_correct = 0
    print("Epoch {}/{}".format(epoch, n_epochs))
    print("-"*10)
    for data in tqdm(data_loader_train):
        X_train, y_train = data
        
        X_train, y_train = X_train.cuda(device=device_ids[0]), y_train.cuda(device=device_ids[0])
        outputs = model(X_train)
        _,pred = torch.max(outputs.data, 1)
        optimizer.zero_grad()
        loss = cost(outputs, y_train)
 
        loss.backward()
        optimizer.step()
        running_loss += loss.data.item()
        running_correct += torch.sum(pred == y_train.data)
    testing_correct = 0
    for data in data_loader_test:
        X_test, y_test = data
        
        X_test, y_test = X_test.cuda(device=device_ids[0]), y_test.cuda(device=device_ids[0])
        outputs = model(X_test)
        _, pred = torch.max(outputs.data, 1)
        testing_correct += torch.sum(pred == y_test.data)
    print("Loss is:{:.4f}, Train Accuracy is:{:.4f}%, Test Accuracy is:{:.4f}".format(torch.true_divide(running_loss, len(data_train)),
                                                                                      torch.true_divide(100*running_correct, len(data_train)),
                                                                                      torch.true_divide(100*testing_correct, len(data_test))))
torch.save(model.state_dict(), "model_parameter.pkl")

The following is the full error output:

Epoch 0/50
----------
  0%|                                                                                                                                                                               | 0/469 [00:00<?, ?it/s]7aea7ed215cf:50693:50693 [0] NCCL INFO Bootstrap : Using eth0:172.17.0.14<0>
7aea7ed215cf:50693:50693 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation

7aea7ed215cf:50693:50693 [0] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
7aea7ed215cf:50693:50693 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.14<0>
7aea7ed215cf:50693:50693 [0] NCCL INFO Using network Socket
NCCL version 2.10.3+cuda10.2
7aea7ed215cf:50693:50809 [1] NCCL INFO Could not enable P2P between dev 1(=3e000) and dev 0(=3d000)
7aea7ed215cf:50693:50809 [1] NCCL INFO Could not enable P2P between dev 0(=3d000) and dev 1(=3e000)
7aea7ed215cf:50693:50809 [1] NCCL INFO Could not enable P2P between dev 1(=3e000) and dev 0(=3d000)
7aea7ed215cf:50693:50809 [1] NCCL INFO Could not enable P2P between dev 0(=3d000) and dev 1(=3e000)
7aea7ed215cf:50693:50808 [0] NCCL INFO Could not enable P2P between dev 1(=3e000) and dev 0(=3d000)
7aea7ed215cf:50693:50808 [0] NCCL INFO Could not enable P2P between dev 0(=3d000) and dev 1(=3e000)
7aea7ed215cf:50693:50808 [0] NCCL INFO Could not enable P2P between dev 1(=3e000) and dev 0(=3d000)
7aea7ed215cf:50693:50808 [0] NCCL INFO Could not enable P2P between dev 0(=3d000) and dev 1(=3e000)
7aea7ed215cf:50693:50809 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
7aea7ed215cf:50693:50808 [0] NCCL INFO Channel 00/02 :    0   1
7aea7ed215cf:50693:50808 [0] NCCL INFO Channel 01/02 :    0   1
7aea7ed215cf:50693:50809 [1] NCCL INFO Setting affinity for GPU 1 to 3ff003ff
7aea7ed215cf:50693:50808 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
7aea7ed215cf:50693:50808 [0] NCCL INFO Setting affinity for GPU 0 to 3ff003ff
7aea7ed215cf:50693:50809 [1] NCCL INFO Could not enable P2P between dev 1(=3e000) and dev 0(=3d000)
7aea7ed215cf:50693:50808 [0] NCCL INFO Could not enable P2P between dev 0(=3d000) and dev 1(=3e000)

7aea7ed215cf:50693:50809 [1] include/shm.h:28 NCCL WARN Call to posix_fallocate failed : No space left on device
7aea7ed215cf:50693:50809 [1] NCCL INFO include/shm.h:41 -> 2

7aea7ed215cf:50693:50809 [1] include/shm.h:48 NCCL WARN Error while creating shared memory segment nccl-shm-recv-3bd03c4f9664d387-0-0-1 (size 9637888)
7aea7ed215cf:50693:50809 [1] NCCL INFO transport/shm.cc:100 -> 2
7aea7ed215cf:50693:50809 [1] NCCL INFO transport.cc:34 -> 2
7aea7ed215cf:50693:50809 [1] NCCL INFO transport.cc:84 -> 2
7aea7ed215cf:50693:50809 [1] NCCL INFO init.cc:778 -> 2

7aea7ed215cf:50693:50808 [0] include/shm.h:28 NCCL WARN Call to posix_fallocate failed : No space left on device
7aea7ed215cf:50693:50808 [0] NCCL INFO include/shm.h:41 -> 2
7aea7ed215cf:50693:50809 [1] NCCL INFO init.cc:904 -> 2

7aea7ed215cf:50693:50808 [0] include/shm.h:48 NCCL WARN Error while creating shared memory segment nccl-shm-recv-3bd03c4f9664d387-0-1-0 (size 9637888)
7aea7ed215cf:50693:50808 [0] NCCL INFO transport/shm.cc:100 -> 2
7aea7ed215cf:50693:50808 [0] NCCL INFO transport.cc:34 -> 2
7aea7ed215cf:50693:50808 [0] NCCL INFO transport.cc:84 -> 2
7aea7ed215cf:50693:50808 [0] NCCL INFO init.cc:778 -> 2
7aea7ed215cf:50693:50809 [1] NCCL INFO group.cc:72 -> 2 [Async thread]
7aea7ed215cf:50693:50808 [0] NCCL INFO init.cc:904 -> 2
7aea7ed215cf:50693:50808 [0] NCCL INFO group.cc:72 -> 2 [Async thread]
7aea7ed215cf:50693:50693 [0] NCCL INFO init.cc:973 -> 2
  0%|                                                                                                                                                                               | 0/469 [00:03<?, ?it/s]
Traceback (most recent call last):
  File "test.py", line 73, in <module>
    outputs = model(X_train)
  File "/root/anaconda3/envs/pytorch171/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/root/anaconda3/envs/pytorch171/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 167, in forward
    replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
  File "/root/anaconda3/envs/pytorch171/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 172, in replicate
    return replicate(module, device_ids, not torch.is_grad_enabled())
  File "/root/anaconda3/envs/pytorch171/lib/python3.7/site-packages/torch/nn/parallel/replicate.py", line 91, in replicate
    param_copies = _broadcast_coalesced_reshape(params, devices, detach)
  File "/root/anaconda3/envs/pytorch171/lib/python3.7/site-packages/torch/nn/parallel/replicate.py", line 71, in _broadcast_coalesced_reshape
    tensor_copies = Broadcast.apply(devices, *tensors)
  File "/root/anaconda3/envs/pytorch171/lib/python3.7/site-packages/torch/nn/parallel/_functions.py", line 23, in forward
    outputs = comm.broadcast_coalesced(inputs, ctx.target_gpus)
  File "/root/anaconda3/envs/pytorch171/lib/python3.7/site-packages/torch/nn/parallel/comm.py", line 58, in broadcast_coalesced
    return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
RuntimeError: NCCL Error 2: unhandled system error

Solution

  • This is apparently caused by newer versions of NCCL including a data pathway that uses Linux shared memory for intra-node communication (see here). If that shared-memory filesystem is misconfigured or too small, you can hit this error in any codebase that uses NCCL. The warnings in your log ("Call to posix_fallocate failed : No space left on device" while creating a ~9.6 MB segment) show that /dev/shm ran out of space, which is especially common inside Docker containers, where /dev/shm defaults to only 64 MB.

    Your two options for fixing it are

    1. Correctly configure the Linux shared-memory tmpfs (/dev/shm), e.g. by giving a Docker container a larger --shm-size or by remounting /dev/shm with more space.
    2. Set the NCCL_SHM_DISABLE=1 environment variable to prevent NCCL from trying to use this data pathway (see the documentation here). This forces NCCL to fall back to a potentially slower pathway, such as the socket transport. A minimal sketch of both approaches follows below.
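
Purely as an illustration (a minimal sketch, assuming you are running in a container where /dev/shm is the bottleneck and that the script is launched directly), the snippet below checks how much shared memory is actually available and applies the NCCL_SHM_DISABLE workaround. The environment variable has to be set before NCCL is initialized, i.e. before the first DataParallel forward pass, so in practice it belongs at the very top of the training script or in the launching shell.

import os
import shutil

# Workaround 2: tell NCCL not to use its shared-memory transport.
# Must be set before the first NCCL call (the first DataParallel forward
# pass), so place it before any CUDA work happens.
os.environ["NCCL_SHM_DISABLE"] = "1"

# Diagnostic for workaround 1: report how big /dev/shm actually is.
# The log above shows NCCL failing to allocate ~9.6 MB segments there;
# Docker's default /dev/shm of 64 MB is easy to exhaust and can be raised
# with `docker run --shm-size=1g ...`.
total, used, free = shutil.disk_usage("/dev/shm")
print("/dev/shm: total {:.0f} MiB, free {:.0f} MiB".format(total / 2**20, free / 2**20))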