Tags: python, machine-learning, deep-learning, pytorch, yolo

The “Forward/backward pass size” reported for my PyTorch model (YOLOv3) is far too large


I'm writing Yolov3 in Pytorch. Architecture: https://i.sstatic.net/mncjfiDs.png

Code:

class Convolutional(nn.Module):  # DBL
  """Conv2d (no bias) -> BatchNorm2d -> LeakyReLU(0.1) building block.

  Note that ``padding`` defaults to 1 regardless of ``kernel_size``; callers
  using a 1x1 kernel must pass ``padding=0`` to keep the spatial size.
  """

  def __init__(self, in_channels: int, out_channels: int, kernel_size: int, stride: int = 1, padding=1):
    super().__init__()
    # The conv bias is disabled because BatchNorm2d follows immediately.
    conv = nn.Conv2d(
        in_channels,
        out_channels,
        kernel_size,
        stride=stride,
        padding=padding,
        bias=False,
    )
    norm = nn.BatchNorm2d(out_channels)
    activation = nn.LeakyReLU(0.1)
    self._stack = nn.Sequential(conv, norm, activation)

  def forward(self, x):
    """Apply convolution, batch normalisation and activation to ``x``."""
    y = self._stack(x)
    return y

class Detection(nn.Module):
  """Detection head: 1x1 DBL -> 3x3 DBL -> plain 1x1 convolution.

  The final convolution has ``B * 5 + C`` output channels and, unlike the
  ``Convolutional`` blocks before it, keeps its bias and has no
  normalisation or activation.

  Args:
    in_channels: number of channels of the incoming feature map.
    C: presumably the number of classes -- TODO confirm against the loss.
    B: presumably the number of boxes/anchors predicted per cell -- TODO confirm.
  """

  def __init__(self, in_channels: int, C, B):
    super().__init__()
    out_channels = in_channels // 2
    self.stack = nn.Sequential(
        Convolutional(in_channels=in_channels, out_channels=out_channels, kernel_size=1, padding=0),
        Convolutional(in_channels=out_channels, out_channels=in_channels, kernel_size=3),
        nn.Conv2d(in_channels=in_channels, out_channels=((B * 5) + C), kernel_size=1, padding=0)
    )

  def forward(self, x):
    """Return the raw predictions for feature map ``x``."""
    # No shape printing here: forward() runs on every batch, so a debug
    # print would flood stdout during training.
    return self.stack(x)

class FPN(nn.Module):
  """Top-down merge step: reduce channels, upsample 2x, concatenate a skip map, fuse.

  ``skip`` must have ``out_channels`` channels, because the fusing 3x3
  convolution expects ``2 * out_channels`` input channels.
  """

  def __init__(self, in_channels, out_channels):
    super().__init__()
    # 1x1 convolution that reduces the channel count before upsampling.
    self.conv1 = Convolutional(in_channels=in_channels, out_channels=out_channels, kernel_size=1, padding=0)
    # Learned 2x upsampling (kernel 2, stride 2).
    self.up = nn.ConvTranspose2d(out_channels, out_channels, kernel_size=2, stride=2)
    # 3x3 convolution applied to the concatenated [upsampled, skip] map.
    self.conv3 = Convolutional(in_channels=(out_channels * 2), out_channels=out_channels, kernel_size=3)

  def forward(self, x, skip):
    """Upsample ``x``, concatenate it with ``skip`` along channels and fuse."""
    upsampled = self.up(self.conv1(x))
    merged = torch.cat([upsampled, skip], dim=1)
    return self.conv3(merged)


class DBLx5(nn.Module):
  """Five stacked DBL blocks alternating 1x1 and 3x3 convolutions.

  Channel flow: in -> in/2 -> in -> in/2 -> in -> in. The spatial size is
  preserved, so the output has the same shape as the input.

  Args:
    in_channels: number of channels of the input (and of the output).
  """

  def __init__(self, in_channels):
    super().__init__()
    out_channels = in_channels // 2
    # 1x1 convolutions must use padding=0: Convolutional defaults to
    # padding=1, which would grow the feature map by 2 pixels per 1x1 layer.
    self.stack = nn.Sequential(
        Convolutional(in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=1, padding=0),
        Convolutional(in_channels=out_channels, out_channels=in_channels, kernel_size=3, stride=1),
        Convolutional(in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=1, padding=0),
        Convolutional(in_channels=out_channels, out_channels=in_channels, kernel_size=3, stride=1),
        Convolutional(in_channels=in_channels, out_channels=in_channels, kernel_size=1, stride=1, padding=0)
    )

  def forward(self, x):
    """Return the stack output (not the untouched input)."""
    return self.stack(x)

class Residual(nn.Module):  # ResUnit
  """Residual unit: 1x1 bottleneck to half the channels, 3x3 back, plus identity."""

  def __init__(self, in_channels: int):
    super().__init__()
    bottleneck = in_channels // 2
    squeeze = Convolutional(in_channels=in_channels, out_channels=bottleneck, kernel_size=1, padding=0)
    expand = Convolutional(in_channels=bottleneck, out_channels=in_channels, kernel_size=3, padding=1)
    self._conv_stack = nn.Sequential(squeeze, expand)

  def forward(self, x):
    """Return ``x`` plus the output of the two-convolution branch."""
    branch = self._conv_stack(x)
    return x + branch


class Darknet(nn.Module):
  """Backbone that returns three feature maps with 256, 512 and 1024 channels.

  Every stride-2 ``Convolutional`` halves the spatial size; the residual
  units in between keep shape unchanged.
  """

  def __init__(self):
    super().__init__()
    # Stage 1: stem, three downsampling convolutions and 1 + 2 + 8 residual units.
    stage1 = [
        Convolutional(in_channels=3, out_channels=32, kernel_size=3),
        Convolutional(in_channels=32, out_channels=64, kernel_size=3, stride=2),
        Residual(64),
        Convolutional(in_channels=64, out_channels=128, kernel_size=3, stride=2),
        *[Residual(128) for _ in range(2)],
        Convolutional(in_channels=128, out_channels=256, kernel_size=3, stride=2),
        *[Residual(256) for _ in range(8)],
    ]
    # Stage 2: one downsampling convolution and 8 residual units.
    stage2 = [
        Convolutional(in_channels=256, out_channels=512, kernel_size=3, stride=2),
        *[Residual(512) for _ in range(8)],
    ]
    # Stage 3: one downsampling convolution and 4 residual units.
    stage3 = [
        Convolutional(in_channels=512, out_channels=1024, kernel_size=3, stride=2),
        *[Residual(1024) for _ in range(4)],
    ]

    self.stack_list1 = nn.Sequential(*stage1)
    self.stack_list2 = nn.Sequential(*stage2)
    self.stack_list3 = nn.Sequential(*stage3)

  def forward(self, x):
    """Run the three stages in sequence and return all three outputs."""
    feat_256 = self.stack_list1(x)
    feat_512 = self.stack_list2(feat_256)
    feat_1024 = self.stack_list3(feat_512)
    return feat_256, feat_512, feat_1024

class YOLOv3(nn.Module):
  """Three-scale detector: Darknet backbone, DBLx5/FPN neck, one Detection head per scale.

  Args:
    C: passed to every ``Detection`` head (see ``Detection``).
    B: passed to every ``Detection`` head; defaults to 2.
  """

  def __init__(self, C, B=2):
    super().__init__()
    self.darknet = Darknet()
    self.dbl5_1 = DBLx5(1024)
    self.dbl5_2 = DBLx5(512)
    self.detection_1 = Detection(1024, C, B)
    self.detection_2 = Detection(512, C, B)
    self.detection_3 = Detection(256, C, B)
    self.fpn_1 = FPN(1024, 512)
    self.fpn_2 = FPN(512, 256)

  def forward(self, x):
    """Return one prediction tensor per scale, coarsest first."""
    # Backbone feature maps, named after their channel counts.
    c256, c512, c1024 = self.darknet(x)

    # Coarsest scale.
    coarse = self.dbl5_1(c1024)

    # Middle scale: merge the coarse features with the 512-channel map.
    middle = self.dbl5_2(self.fpn_1(coarse, c512))

    # Finest scale: merge the middle features with the 256-channel map.
    fine = self.fpn_2(middle, c256)

    pred_coarse = self.detection_1(coarse)
    pred_middle = self.detection_2(middle)
    pred_fine = self.detection_3(fine)

    return pred_coarse, pred_middle, pred_fine

torchsummary reports an unrealistically large model size, and when I run the model during training to get its predictions I get a CUDA out-of-memory error. I think the problem is in Darknet-53: summarised on its own, the backbone has a normal size, but once it is used inside YOLOv3 this huge number appears. (The input image size is 3x416x416.) Summary output:

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Conv2d-1         [-1, 32, 416, 416]             864
       BatchNorm2d-2         [-1, 32, 416, 416]              64
         LeakyReLU-3         [-1, 32, 416, 416]               0
     Convolutional-4         [-1, 32, 416, 416]               0
            Conv2d-5         [-1, 64, 208, 208]          18,432
       BatchNorm2d-6         [-1, 64, 208, 208]             128
         LeakyReLU-7         [-1, 64, 208, 208]               0
     Convolutional-8         [-1, 64, 208, 208]               0
            Conv2d-9         [-1, 32, 208, 208]           2,048
      BatchNorm2d-10         [-1, 32, 208, 208]              64
        LeakyReLU-11         [-1, 32, 208, 208]               0
    Convolutional-12         [-1, 32, 208, 208]               0
           Conv2d-13         [-1, 64, 208, 208]          18,432
      BatchNorm2d-14         [-1, 64, 208, 208]             128
        LeakyReLU-15         [-1, 64, 208, 208]               0
    Convolutional-16         [-1, 64, 208, 208]               0
         Residual-17         [-1, 64, 208, 208]               0
           Conv2d-18        [-1, 128, 104, 104]          73,728
      BatchNorm2d-19        [-1, 128, 104, 104]             256
        LeakyReLU-20        [-1, 128, 104, 104]               0
    Convolutional-21        [-1, 128, 104, 104]               0
           Conv2d-22         [-1, 64, 104, 104]           8,192
      BatchNorm2d-23         [-1, 64, 104, 104]             128
        LeakyReLU-24         [-1, 64, 104, 104]               0
    Convolutional-25         [-1, 64, 104, 104]               0
           Conv2d-26        [-1, 128, 104, 104]          73,728
      BatchNorm2d-27        [-1, 128, 104, 104]             256
        LeakyReLU-28        [-1, 128, 104, 104]               0
    Convolutional-29        [-1, 128, 104, 104]               0
         Residual-30        [-1, 128, 104, 104]               0
           Conv2d-31         [-1, 64, 104, 104]           8,192
      BatchNorm2d-32         [-1, 64, 104, 104]             128
        LeakyReLU-33         [-1, 64, 104, 104]               0
    Convolutional-34         [-1, 64, 104, 104]               0
           Conv2d-35        [-1, 128, 104, 104]          73,728
      BatchNorm2d-36        [-1, 128, 104, 104]             256
        LeakyReLU-37        [-1, 128, 104, 104]               0
    Convolutional-38        [-1, 128, 104, 104]               0
         Residual-39        [-1, 128, 104, 104]               0
           Conv2d-40          [-1, 256, 52, 52]         294,912
      BatchNorm2d-41          [-1, 256, 52, 52]             512
        LeakyReLU-42          [-1, 256, 52, 52]               0
    Convolutional-43          [-1, 256, 52, 52]               0
           Conv2d-44          [-1, 128, 52, 52]          32,768
      BatchNorm2d-45          [-1, 128, 52, 52]             256
        LeakyReLU-46          [-1, 128, 52, 52]               0
    Convolutional-47          [-1, 128, 52, 52]               0
           Conv2d-48          [-1, 256, 52, 52]         294,912
      BatchNorm2d-49          [-1, 256, 52, 52]             512
        LeakyReLU-50          [-1, 256, 52, 52]               0
    Convolutional-51          [-1, 256, 52, 52]               0
         Residual-52          [-1, 256, 52, 52]               0
           Conv2d-53          [-1, 128, 52, 52]          32,768
      BatchNorm2d-54          [-1, 128, 52, 52]             256
        LeakyReLU-55          [-1, 128, 52, 52]               0
    Convolutional-56          [-1, 128, 52, 52]               0
           Conv2d-57          [-1, 256, 52, 52]         294,912
      BatchNorm2d-58          [-1, 256, 52, 52]             512
        LeakyReLU-59          [-1, 256, 52, 52]               0
    Convolutional-60          [-1, 256, 52, 52]               0
         Residual-61          [-1, 256, 52, 52]               0
           Conv2d-62          [-1, 128, 52, 52]          32,768
      BatchNorm2d-63          [-1, 128, 52, 52]             256
        LeakyReLU-64          [-1, 128, 52, 52]               0
    Convolutional-65          [-1, 128, 52, 52]               0
           Conv2d-66          [-1, 256, 52, 52]         294,912
      BatchNorm2d-67          [-1, 256, 52, 52]             512
        LeakyReLU-68          [-1, 256, 52, 52]               0
    Convolutional-69          [-1, 256, 52, 52]               0
         Residual-70          [-1, 256, 52, 52]               0
           Conv2d-71          [-1, 128, 52, 52]          32,768
      BatchNorm2d-72          [-1, 128, 52, 52]             256
        LeakyReLU-73          [-1, 128, 52, 52]               0
    Convolutional-74          [-1, 128, 52, 52]               0
           Conv2d-75          [-1, 256, 52, 52]         294,912
      BatchNorm2d-76          [-1, 256, 52, 52]             512
        LeakyReLU-77          [-1, 256, 52, 52]               0
    Convolutional-78          [-1, 256, 52, 52]               0
         Residual-79          [-1, 256, 52, 52]               0
           Conv2d-80          [-1, 128, 52, 52]          32,768
      BatchNorm2d-81          [-1, 128, 52, 52]             256
        LeakyReLU-82          [-1, 128, 52, 52]               0
    Convolutional-83          [-1, 128, 52, 52]               0
           Conv2d-84          [-1, 256, 52, 52]         294,912
      BatchNorm2d-85          [-1, 256, 52, 52]             512
        LeakyReLU-86          [-1, 256, 52, 52]               0
    Convolutional-87          [-1, 256, 52, 52]               0
         Residual-88          [-1, 256, 52, 52]               0
           Conv2d-89          [-1, 128, 52, 52]          32,768
      BatchNorm2d-90          [-1, 128, 52, 52]             256
        LeakyReLU-91          [-1, 128, 52, 52]               0
    Convolutional-92          [-1, 128, 52, 52]               0
           Conv2d-93          [-1, 256, 52, 52]         294,912
      BatchNorm2d-94          [-1, 256, 52, 52]             512
        LeakyReLU-95          [-1, 256, 52, 52]               0
    Convolutional-96          [-1, 256, 52, 52]               0
         Residual-97          [-1, 256, 52, 52]               0
           Conv2d-98          [-1, 128, 52, 52]          32,768
      BatchNorm2d-99          [-1, 128, 52, 52]             256
       LeakyReLU-100          [-1, 128, 52, 52]               0
   Convolutional-101          [-1, 128, 52, 52]               0
          Conv2d-102          [-1, 256, 52, 52]         294,912
     BatchNorm2d-103          [-1, 256, 52, 52]             512
       LeakyReLU-104          [-1, 256, 52, 52]               0
   Convolutional-105          [-1, 256, 52, 52]               0
        Residual-106          [-1, 256, 52, 52]               0
          Conv2d-107          [-1, 128, 52, 52]          32,768
     BatchNorm2d-108          [-1, 128, 52, 52]             256
       LeakyReLU-109          [-1, 128, 52, 52]               0
   Convolutional-110          [-1, 128, 52, 52]               0
          Conv2d-111          [-1, 256, 52, 52]         294,912
     BatchNorm2d-112          [-1, 256, 52, 52]             512
       LeakyReLU-113          [-1, 256, 52, 52]               0
   Convolutional-114          [-1, 256, 52, 52]               0
        Residual-115          [-1, 256, 52, 52]               0
          Conv2d-116          [-1, 512, 26, 26]       1,179,648
     BatchNorm2d-117          [-1, 512, 26, 26]           1,024
       LeakyReLU-118          [-1, 512, 26, 26]               0
   Convolutional-119          [-1, 512, 26, 26]               0
          Conv2d-120          [-1, 256, 26, 26]         131,072
     BatchNorm2d-121          [-1, 256, 26, 26]             512
       LeakyReLU-122          [-1, 256, 26, 26]               0
   Convolutional-123          [-1, 256, 26, 26]               0
          Conv2d-124          [-1, 512, 26, 26]       1,179,648
     BatchNorm2d-125          [-1, 512, 26, 26]           1,024
       LeakyReLU-126          [-1, 512, 26, 26]               0
   Convolutional-127          [-1, 512, 26, 26]               0
        Residual-128          [-1, 512, 26, 26]               0
          Conv2d-129          [-1, 256, 26, 26]         131,072
     BatchNorm2d-130          [-1, 256, 26, 26]             512
       LeakyReLU-131          [-1, 256, 26, 26]               0
   Convolutional-132          [-1, 256, 26, 26]               0
          Conv2d-133          [-1, 512, 26, 26]       1,179,648
     BatchNorm2d-134          [-1, 512, 26, 26]           1,024
       LeakyReLU-135          [-1, 512, 26, 26]               0
   Convolutional-136          [-1, 512, 26, 26]               0
        Residual-137          [-1, 512, 26, 26]               0
          Conv2d-138          [-1, 256, 26, 26]         131,072
     BatchNorm2d-139          [-1, 256, 26, 26]             512
       LeakyReLU-140          [-1, 256, 26, 26]               0
   Convolutional-141          [-1, 256, 26, 26]               0
          Conv2d-142          [-1, 512, 26, 26]       1,179,648
     BatchNorm2d-143          [-1, 512, 26, 26]           1,024
       LeakyReLU-144          [-1, 512, 26, 26]               0
   Convolutional-145          [-1, 512, 26, 26]               0
        Residual-146          [-1, 512, 26, 26]               0
          Conv2d-147          [-1, 256, 26, 26]         131,072
     BatchNorm2d-148          [-1, 256, 26, 26]             512
       LeakyReLU-149          [-1, 256, 26, 26]               0
   Convolutional-150          [-1, 256, 26, 26]               0
          Conv2d-151          [-1, 512, 26, 26]       1,179,648
     BatchNorm2d-152          [-1, 512, 26, 26]           1,024
       LeakyReLU-153          [-1, 512, 26, 26]               0
   Convolutional-154          [-1, 512, 26, 26]               0
        Residual-155          [-1, 512, 26, 26]               0
          Conv2d-156          [-1, 256, 26, 26]         131,072
     BatchNorm2d-157          [-1, 256, 26, 26]             512
       LeakyReLU-158          [-1, 256, 26, 26]               0
   Convolutional-159          [-1, 256, 26, 26]               0
          Conv2d-160          [-1, 512, 26, 26]       1,179,648
     BatchNorm2d-161          [-1, 512, 26, 26]           1,024
       LeakyReLU-162          [-1, 512, 26, 26]               0
   Convolutional-163          [-1, 512, 26, 26]               0
        Residual-164          [-1, 512, 26, 26]               0
          Conv2d-165          [-1, 256, 26, 26]         131,072
     BatchNorm2d-166          [-1, 256, 26, 26]             512
       LeakyReLU-167          [-1, 256, 26, 26]               0
   Convolutional-168          [-1, 256, 26, 26]               0
          Conv2d-169          [-1, 512, 26, 26]       1,179,648
     BatchNorm2d-170          [-1, 512, 26, 26]           1,024
       LeakyReLU-171          [-1, 512, 26, 26]               0
   Convolutional-172          [-1, 512, 26, 26]               0
        Residual-173          [-1, 512, 26, 26]               0
          Conv2d-174          [-1, 256, 26, 26]         131,072
     BatchNorm2d-175          [-1, 256, 26, 26]             512
       LeakyReLU-176          [-1, 256, 26, 26]               0
   Convolutional-177          [-1, 256, 26, 26]               0
          Conv2d-178          [-1, 512, 26, 26]       1,179,648
     BatchNorm2d-179          [-1, 512, 26, 26]           1,024
       LeakyReLU-180          [-1, 512, 26, 26]               0
   Convolutional-181          [-1, 512, 26, 26]               0
        Residual-182          [-1, 512, 26, 26]               0
          Conv2d-183          [-1, 256, 26, 26]         131,072
     BatchNorm2d-184          [-1, 256, 26, 26]             512
       LeakyReLU-185          [-1, 256, 26, 26]               0
   Convolutional-186          [-1, 256, 26, 26]               0
          Conv2d-187          [-1, 512, 26, 26]       1,179,648
     BatchNorm2d-188          [-1, 512, 26, 26]           1,024
       LeakyReLU-189          [-1, 512, 26, 26]               0
   Convolutional-190          [-1, 512, 26, 26]               0
        Residual-191          [-1, 512, 26, 26]               0
          Conv2d-192         [-1, 1024, 13, 13]       4,718,592
     BatchNorm2d-193         [-1, 1024, 13, 13]           2,048
       LeakyReLU-194         [-1, 1024, 13, 13]               0
   Convolutional-195         [-1, 1024, 13, 13]               0
          Conv2d-196          [-1, 512, 13, 13]         524,288
     BatchNorm2d-197          [-1, 512, 13, 13]           1,024
       LeakyReLU-198          [-1, 512, 13, 13]               0
   Convolutional-199          [-1, 512, 13, 13]               0
          Conv2d-200         [-1, 1024, 13, 13]       4,718,592
     BatchNorm2d-201         [-1, 1024, 13, 13]           2,048
       LeakyReLU-202         [-1, 1024, 13, 13]               0
   Convolutional-203         [-1, 1024, 13, 13]               0
        Residual-204         [-1, 1024, 13, 13]               0
          Conv2d-205          [-1, 512, 13, 13]         524,288
     BatchNorm2d-206          [-1, 512, 13, 13]           1,024
       LeakyReLU-207          [-1, 512, 13, 13]               0
   Convolutional-208          [-1, 512, 13, 13]               0
          Conv2d-209         [-1, 1024, 13, 13]       4,718,592
     BatchNorm2d-210         [-1, 1024, 13, 13]           2,048
       LeakyReLU-211         [-1, 1024, 13, 13]               0
   Convolutional-212         [-1, 1024, 13, 13]               0
        Residual-213         [-1, 1024, 13, 13]               0
          Conv2d-214          [-1, 512, 13, 13]         524,288
     BatchNorm2d-215          [-1, 512, 13, 13]           1,024
       LeakyReLU-216          [-1, 512, 13, 13]               0
   Convolutional-217          [-1, 512, 13, 13]               0
          Conv2d-218         [-1, 1024, 13, 13]       4,718,592
     BatchNorm2d-219         [-1, 1024, 13, 13]           2,048
       LeakyReLU-220         [-1, 1024, 13, 13]               0
   Convolutional-221         [-1, 1024, 13, 13]               0
        Residual-222         [-1, 1024, 13, 13]               0
          Conv2d-223          [-1, 512, 13, 13]         524,288
     BatchNorm2d-224          [-1, 512, 13, 13]           1,024
       LeakyReLU-225          [-1, 512, 13, 13]               0
   Convolutional-226          [-1, 512, 13, 13]               0
          Conv2d-227         [-1, 1024, 13, 13]       4,718,592
     BatchNorm2d-228         [-1, 1024, 13, 13]           2,048
       LeakyReLU-229         [-1, 1024, 13, 13]               0
   Convolutional-230         [-1, 1024, 13, 13]               0
        Residual-231         [-1, 1024, 13, 13]               0
         Darknet-232  [[-1, 256, 52, 52], [-1, 512, 26, 26], [-1, 1024, 13, 13]]               0
          Conv2d-233          [-1, 512, 15, 15]         524,288
     BatchNorm2d-234          [-1, 512, 15, 15]           1,024
       LeakyReLU-235          [-1, 512, 15, 15]               0
   Convolutional-236          [-1, 512, 15, 15]               0
          Conv2d-237         [-1, 1024, 15, 15]       4,718,592
     BatchNorm2d-238         [-1, 1024, 15, 15]           2,048
       LeakyReLU-239         [-1, 1024, 15, 15]               0
   Convolutional-240         [-1, 1024, 15, 15]               0
          Conv2d-241          [-1, 512, 17, 17]         524,288
     BatchNorm2d-242          [-1, 512, 17, 17]           1,024
       LeakyReLU-243          [-1, 512, 17, 17]               0
   Convolutional-244          [-1, 512, 17, 17]               0
          Conv2d-245         [-1, 1024, 17, 17]       4,718,592
     BatchNorm2d-246         [-1, 1024, 17, 17]           2,048
       LeakyReLU-247         [-1, 1024, 17, 17]               0
   Convolutional-248         [-1, 1024, 17, 17]               0
          Conv2d-249         [-1, 1024, 19, 19]       1,048,576
     BatchNorm2d-250         [-1, 1024, 19, 19]           2,048
       LeakyReLU-251         [-1, 1024, 19, 19]               0
   Convolutional-252         [-1, 1024, 19, 19]               0
           DBLx5-253         [-1, 1024, 13, 13]               0
          Conv2d-254          [-1, 512, 13, 13]         524,288
     BatchNorm2d-255          [-1, 512, 13, 13]           1,024
       LeakyReLU-256          [-1, 512, 13, 13]               0
   Convolutional-257          [-1, 512, 13, 13]               0
 ConvTranspose2d-258          [-1, 512, 26, 26]       1,049,088
          Conv2d-259          [-1, 512, 26, 26]       4,718,592
     BatchNorm2d-260          [-1, 512, 26, 26]           1,024
       LeakyReLU-261          [-1, 512, 26, 26]               0
   Convolutional-262          [-1, 512, 26, 26]               0
             FPN-263          [-1, 512, 26, 26]               0
          Conv2d-264          [-1, 256, 28, 28]         131,072
     BatchNorm2d-265          [-1, 256, 28, 28]             512
       LeakyReLU-266          [-1, 256, 28, 28]               0
   Convolutional-267          [-1, 256, 28, 28]               0
          Conv2d-268          [-1, 512, 28, 28]       1,179,648
     BatchNorm2d-269          [-1, 512, 28, 28]           1,024
       LeakyReLU-270          [-1, 512, 28, 28]               0
   Convolutional-271          [-1, 512, 28, 28]               0
          Conv2d-272          [-1, 256, 30, 30]         131,072
     BatchNorm2d-273          [-1, 256, 30, 30]             512
       LeakyReLU-274          [-1, 256, 30, 30]               0
   Convolutional-275          [-1, 256, 30, 30]               0
          Conv2d-276          [-1, 512, 30, 30]       1,179,648
     BatchNorm2d-277          [-1, 512, 30, 30]           1,024
       LeakyReLU-278          [-1, 512, 30, 30]               0
   Convolutional-279          [-1, 512, 30, 30]               0
          Conv2d-280          [-1, 512, 32, 32]         262,144
     BatchNorm2d-281          [-1, 512, 32, 32]           1,024
       LeakyReLU-282          [-1, 512, 32, 32]               0
   Convolutional-283          [-1, 512, 32, 32]               0
           DBLx5-284          [-1, 512, 26, 26]               0
          Conv2d-285          [-1, 256, 26, 26]         131,072
     BatchNorm2d-286          [-1, 256, 26, 26]             512
       LeakyReLU-287          [-1, 256, 26, 26]               0
   Convolutional-288          [-1, 256, 26, 26]               0
 ConvTranspose2d-289          [-1, 256, 52, 52]         262,400
          Conv2d-290          [-1, 256, 52, 52]       1,179,648
     BatchNorm2d-291          [-1, 256, 52, 52]             512
       LeakyReLU-292          [-1, 256, 52, 52]               0
   Convolutional-293          [-1, 256, 52, 52]               0
             FPN-294          [-1, 256, 52, 52]               0
          Conv2d-295          [-1, 512, 13, 13]         524,288
     BatchNorm2d-296          [-1, 512, 13, 13]           1,024
       LeakyReLU-297          [-1, 512, 13, 13]               0
   Convolutional-298          [-1, 512, 13, 13]               0
          Conv2d-299         [-1, 1024, 13, 13]       4,718,592
     BatchNorm2d-300         [-1, 1024, 13, 13]           2,048
       LeakyReLU-301         [-1, 1024, 13, 13]               0
   Convolutional-302         [-1, 1024, 13, 13]               0
          Conv2d-303           [-1, 18, 13, 13]          18,450
       Detection-304           [-1, 18, 13, 13]               0
          Conv2d-305          [-1, 256, 26, 26]         131,072
     BatchNorm2d-306          [-1, 256, 26, 26]             512
       LeakyReLU-307          [-1, 256, 26, 26]               0
   Convolutional-308          [-1, 256, 26, 26]               0
          Conv2d-309          [-1, 512, 26, 26]       1,179,648
     BatchNorm2d-310          [-1, 512, 26, 26]           1,024
       LeakyReLU-311          [-1, 512, 26, 26]               0
   Convolutional-312          [-1, 512, 26, 26]               0
          Conv2d-313           [-1, 18, 26, 26]           9,234
       Detection-314           [-1, 18, 26, 26]               0
          Conv2d-315          [-1, 128, 52, 52]          32,768
     BatchNorm2d-316          [-1, 128, 52, 52]             256
       LeakyReLU-317          [-1, 128, 52, 52]               0
   Convolutional-318          [-1, 128, 52, 52]               0
          Conv2d-319          [-1, 256, 52, 52]         294,912
     BatchNorm2d-320          [-1, 256, 52, 52]             512
       LeakyReLU-321          [-1, 256, 52, 52]               0
   Convolutional-322          [-1, 256, 52, 52]               0
          Conv2d-323           [-1, 18, 52, 52]           4,626
       Detection-324           [-1, 18, 52, 52]               0
================================================================
Total params: 69,802,262
Trainable params: 69,802,262
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 1.98
Forward/backward pass size (MB): 316329755939.75
Params size (MB): 266.27
Estimated Total Size (MB): 316329756208.00
--------------------------------------------------------------------------

I don't understand what is going wrong; the model cannot really be that large. The error:

OutOfMemoryError: CUDA out of memory. Tried to allocate 170.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 7.06 MiB is free. Process 5914 has 14.74 GiB memory in use. Of the allocated memory 14.53 GiB is allocated by PyTorch, and 88.94 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.

I don’t understand why the model is so huge.


Solution

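  • I tried to reproduce the error and got the same issue. The issue comes from torchsummary, so you might want to open an issue there. It appears to estimate the forward/backward size by multiplying together every number in each layer's recorded output shape. `Darknet` returns three tensors, so its recorded shape is a list of three shapes and all of their dimensions get multiplied (256·52·52 × 512·26·26 × 1024·13·13 ≈ 4.1e16 values, which at 2 × 4 bytes each gives the ≈3.2e11 MB shown). The number is an artefact of the summary, not the real memory footprint.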

    However, I am able to run a forward and backward pass through the model without any issue using a GPU with 8GB of VRAM (and a batch size of one).

    If you still run into memory issues, try reducing your batch size.

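    I took the liberty of cleaning up your code. Note that in this version `DBLx5.forward` returns the output of its stack (your version returned the input `x`), and `Convolutional` no longer pads by default, so 1x1 convolutions keep the spatial size: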

    import torch
    from torch import nn, Tensor
    
    
    class Convolutional(nn.Module):
        """Conv2d (no bias) -> BatchNorm2d -> LeakyReLU(0.1) building block."""

        def __init__(self, in_channels: int, out_channels: int, kernel_size: int, stride: int = 1, padding: int = 0) -> None:
            super().__init__()

            # The conv bias is disabled because BatchNorm2d follows immediately.
            conv = nn.Conv2d(
                in_channels,
                out_channels,
                kernel_size,
                stride=stride,
                padding=padding,
                bias=False,
            )
            norm = nn.BatchNorm2d(out_channels)
            activation = nn.LeakyReLU(0.1)
            self.stack = nn.Sequential(conv, norm, activation)

        def forward(self, x: Tensor) -> Tensor:
            """Apply convolution, batch normalisation and activation to ``x``."""
            y = self.stack(x)
            return y
    
    
    class Detection(nn.Module):
        """Detection head: 1x1 DBL -> 3x3 DBL -> plain 1x1 convolution with ``5 * B + C`` channels."""

        def __init__(self, in_channels: int, C: int, B: int) -> None:
            super().__init__()

            hidden = in_channels // 2
            squeeze = Convolutional(in_channels, hidden, 1)
            expand = Convolutional(hidden, in_channels, 3, padding=1)
            # Final projection keeps its bias and has no normalisation/activation.
            predict = nn.Conv2d(in_channels, 5 * B + C, 1)
            self.stack = nn.Sequential(squeeze, expand, predict)

        def forward(self, x: Tensor) -> Tensor:
            """Return the raw predictions for feature map ``x``."""
            predictions = self.stack(x)
            return predictions
    
    
    class FPN(nn.Module):
        """Top-down merge step: reduce channels, upsample 2x, concatenate a skip map, fuse.

        ``skip`` must have ``out_channels`` channels, because the fusing 3x3
        convolution expects ``2 * out_channels`` input channels.
        """

        def __init__(self, in_channels: int, out_channels: int) -> None:
            super().__init__()

            # 1x1 convolution that reduces the channel count before upsampling.
            self.conv1 = Convolutional(in_channels, out_channels, 1)
            # Learned 2x upsampling (kernel 2, stride 2).
            self.up = nn.ConvTranspose2d(out_channels, out_channels, 2, stride=2)
            # 3x3 convolution applied to the concatenated [upsampled, skip] map.
            self.conv3 = Convolutional(2 * out_channels, out_channels, 3, padding=1)

        def forward(self, x: Tensor, skip: Tensor) -> Tensor:
            """Upsample ``x``, concatenate it with ``skip`` along channels and fuse."""
            upsampled = self.up(self.conv1(x))
            merged = torch.cat((upsampled, skip), 1)
            fused = self.conv3(merged)
            return fused
    
    
    class DBLx5(nn.Module):
        """Five stacked DBL blocks alternating 1x1 and 3x3 convolutions.

        Channel flow: in -> in/2 -> in -> in/2 -> in -> in, with the spatial
        size preserved (1x1 convs unpadded, 3x3 convs padded by 1).
        """

        def __init__(self, in_channels: int) -> None:
            super().__init__()

            half = in_channels // 2
            blocks = [
                Convolutional(in_channels, half, 1),
                Convolutional(half, in_channels, 3, padding=1),
                Convolutional(in_channels, half, 1),
                Convolutional(half, in_channels, 3, padding=1),
                Convolutional(in_channels, in_channels, 1),
            ]
            self.stack = nn.Sequential(*blocks)

        def forward(self, x: Tensor) -> Tensor:
            """Return the output of the five-block stack."""
            out = self.stack(x)
            return out
    
    
    class Residual(nn.Module):
        """Residual unit: 1x1 bottleneck to half the channels, 3x3 back, plus identity."""

        def __init__(self, in_channels: int) -> None:
            super().__init__()

            bottleneck = in_channels // 2
            squeeze = Convolutional(in_channels, bottleneck, 1)
            expand = Convolutional(bottleneck, in_channels, 3, padding=1)
            self.stack = nn.Sequential(squeeze, expand)

        def forward(self, x: Tensor) -> Tensor:
            """Return the branch output added to the input ``x``."""
            branch = self.stack(x)
            return branch + x
    
    
    class Darknet(nn.Module):
        """Backbone that returns three feature maps with 256, 512 and 1024 channels."""

        @staticmethod
        def _stage(in_channels: int, out_channels: int, num_residuals: int) -> list:
            """Build one stride-2 downsampling convolution followed by residual units."""
            layers = [Convolutional(in_channels, out_channels, 3, padding=1, stride=2)]
            for _ in range(num_residuals):
                layers.append(Residual(out_channels))
            return layers

        def __init__(self) -> None:
            super().__init__()

            # Stem convolution followed by stages with 1, 2 and 8 residual units.
            first = [Convolutional(3, 32, 3, padding=1)]
            first += self._stage(32, 64, 1)
            first += self._stage(64, 128, 2)
            first += self._stage(128, 256, 8)
            self.stack1 = nn.Sequential(*first)

            self.stack2 = nn.Sequential(*self._stage(256, 512, 8))

            self.stack3 = nn.Sequential(*self._stage(512, 1024, 4))

        def forward(self, x: Tensor) -> tuple[Tensor, Tensor, Tensor]:
            """Run the three stages in sequence and return all three outputs."""
            feat_256 = self.stack1(x)
            feat_512 = self.stack2(feat_256)
            feat_1024 = self.stack3(feat_512)

            return feat_256, feat_512, feat_1024
    
    
    class YOLOv3(nn.Module):
        """Three-scale detector: Darknet backbone, DBLx5/FPN neck, one Detection head per scale.

        ``C`` and ``B`` are forwarded unchanged to every ``Detection`` head.
        """

        def __init__(self, C: int, B: int = 2) -> None:
            super().__init__()

            # Backbone and the two DBLx5 neck blocks.
            self.darknet = Darknet()
            self.dbl5_1 = DBLx5(1024)
            self.dbl5_2 = DBLx5(512)

            # One head per scale, coarsest (1024 channels) first.
            self.detection_1 = Detection(1024, C, B)
            self.detection_2 = Detection(512, C, B)
            self.detection_3 = Detection(256, C, B)

            # Top-down merge steps between neighbouring scales.
            self.fpn_1 = FPN(1024, 512)
            self.fpn_2 = FPN(512, 256)

        def forward(self, x: Tensor) -> tuple[Tensor, Tensor, Tensor]:
            """Return one prediction tensor per scale, coarsest first."""
            # Backbone feature maps, named after their channel counts.
            c256, c512, c1024 = self.darknet(x)

            coarse = self.dbl5_1(c1024)

            # Merge coarse features with the 512-channel backbone map.
            middle = self.dbl5_2(self.fpn_1(coarse, c512))

            # Merge middle features with the 256-channel backbone map.
            fine = self.fpn_2(middle, c256)

            pred_coarse = self.detection_1(coarse)
            pred_middle = self.detection_2(middle)
            pred_fine = self.detection_3(fine)

            return pred_coarse, pred_middle, pred_fine