I am using a residual neural network for a classification task. Somehow adding or omitting a ReLU activation causes the autograd to fail. I would be grateful for any insights on the reason for this? It cannot make any sense of it. ReLU is not an inplace operation, is it? Error message:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
Here is the network architecture. The 3rd to last line is what causes the issue when not commented out.
class ResidualBlock(nn.Module):
def __init__(self, num_filters, kernel_size):
super(ResidualBlock, self).__init__()
self.conv1 = nn.Conv1d(num_filters, num_filters, kernel_size=kernel_size, padding='same')
self.bn1 = nn.BatchNorm1d(num_filters)
self.conv2 = nn.Conv1d(num_filters, num_filters, kernel_size=kernel_size, padding='same')
self.bn2 = nn.BatchNorm1d(num_filters)
def forward(self, x):
shortcut = x
out = F.relu(self.bn1(self.conv1(x)))
out = self.bn2(self.conv2(out))
out = F.relu(out) # causes the issue when not commented out
out += shortcut
return out
Below is a minimal working example. I am using Python 3.12 and torch 2.5.1.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
# Define the ResidualBlock
class ResidualBlock(nn.Module):
def __init__(self, num_filters, kernel_size):
super(ResidualBlock, self).__init__()
self.conv1 = nn.Conv1d(num_filters, num_filters, kernel_size=kernel_size, padding='same')
self.bn1 = nn.BatchNorm1d(num_filters)
self.conv2 = nn.Conv1d(num_filters, num_filters, kernel_size=kernel_size, padding='same')
self.bn2 = nn.BatchNorm1d(num_filters)
def forward(self, x):
shortcut = x
out = F.relu(self.bn1(self.conv1(x)))
out = self.bn2(self.conv2(out))
out = F.relu(out) # causes the issue
out += shortcut
return out
class SimpleModel(nn.Module):
def __init__(self, num_filters, kernel_size):
super(SimpleModel, self).__init__()
self.res_block = ResidualBlock(num_filters, kernel_size)
self.fc = nn.Linear(num_filters, 1)
def forward(self, x):
x = self.res_block(x)
x = x.mean(dim=2)
x = self.fc(x)
return x
torch.manual_seed(42)
num_samples = 1000
sequence_length = 32
num_filters = 16
X = torch.randn(num_samples, num_filters, sequence_length) # Random input
y = torch.sum(X, dim=(1, 2), keepdim=True) # Simple target (sum of all values)
dataset = TensorDataset(X, y)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
model = SimpleModel(num_filters=num_filters, kernel_size=3)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
epochs = 5
for epoch in range(epochs):
model.train()
epoch_loss = 0.0
for batch_X, batch_y in dataloader:
optimizer.zero_grad()
outputs = model(batch_X)
loss = criterion(outputs, batch_y)
loss.backward()
optimizer.step()
epoch_loss += loss.item()
print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/len(dataloader):.4f}")
print("Training complete!")
The inplace operation is this:
out += shortcut
The relu
needs its own output to compute its gradient! Thus you are doing an inplace operation on the output of the relu, which it needed to compute its gradient in the backwards pass.
replacing it with
out = out + shortcut
should solve your problem.
In general, try to avoid using inplace functions in pytorch (such as +=
unless you know what you are doing
more details:
if you look at the pytorch code, the backwards pass for the relu is auto-generated from the following bit of code in pytorch/tools/autograd/derivatives.yaml
- name: relu(Tensor self) -> Tensor
self: threshold_backward(grad, result, 0)
result: auto_element_wise
What this does is it
grad
result
grad
where result
is bigger than 0
and 0
otherwise.Thus it does need the output (technically, it could have stored its input instead, but this is how it is implemented)