I'm implementing YOLOv3 in PyTorch. Architecture: https://i.sstatic.net/mncjfiDs.png
Code:
class Convolutional(nn.Module):  # DBL
    """Conv2d -> BatchNorm2d -> LeakyReLU(0.1) unit (the "DBL" block).

    The convolution is created without a bias term.

    Note that ``padding`` defaults to 1 regardless of ``kernel_size``: a
    ``kernel_size=1`` call that does not pass ``padding=0`` makes each spatial
    dimension of the output 2 larger than the input.
    """

    def __init__(self, in_channels: int, out_channels: int, kernel_size: int, stride: int = 1, padding=1):
        super().__init__()
        # Build the three layers first, then chain them; the order inside the
        # Sequential fixes the state_dict keys (_stack.0, _stack.1, _stack.2).
        conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            bias=False,
        )
        norm = nn.BatchNorm2d(out_channels)
        activation = nn.LeakyReLU(0.1)
        self._stack = nn.Sequential(conv, norm, activation)

    def forward(self, x):
        """Apply convolution, batch normalisation and LeakyReLU to ``x``."""
        result = self._stack(x)
        return result
class Detection(nn.Module):
    """Detection head: 1x1 DBL -> 3x3 DBL -> plain 1x1 convolution.

    The first block halves the channel count, the second restores it, and the
    final (biased, un-normalised) convolution emits ``B * 5 + C`` channels.

    Args:
        in_channels: channel count of the incoming feature map.
        C: presumably the number of classes -- confirm against the loss code.
        B: presumably the number of boxes/anchors predicted per cell.
    """

    def __init__(self, in_channels: int, C, B):
        super().__init__()
        out_channels = in_channels // 2
        # NOTE(review): YOLOv3 heads usually emit B * (5 + C) channels (one
        # class vector per anchor); B * 5 + C is kept here so the output shape
        # callers rely on does not change -- confirm which is intended.
        self.stack = nn.Sequential(
            Convolutional(in_channels=in_channels, out_channels=out_channels, kernel_size=1, padding=0),
            Convolutional(in_channels=out_channels, out_channels=in_channels, kernel_size=3),
            nn.Conv2d(in_channels=in_channels, out_channels=((B * 5) + C), kernel_size=1, padding=0),
        )

    def forward(self, x):
        """Return the raw prediction map for ``x``.

        The debug ``print`` of the output shape that used to run on every
        forward pass has been removed.
        """
        return self.stack(x)
class FPN(nn.Module):
    """Upsample a feature map and fuse it with a skip connection.

    Pipeline: 1x1 DBL (``in_channels`` -> ``out_channels``), transposed
    convolution with kernel 2 / stride 2, concatenation with ``skip`` along
    the channel axis, then a 3x3 DBL that maps ``2 * out_channels`` back to
    ``out_channels``. ``skip`` therefore has to carry ``out_channels``
    channels.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        merged_channels = out_channels * 2
        self.conv1 = Convolutional(in_channels, out_channels, kernel_size=1, padding=0)
        self.up = nn.ConvTranspose2d(out_channels, out_channels, kernel_size=2, stride=2)
        self.conv3 = Convolutional(merged_channels, out_channels, kernel_size=3)

    def forward(self, x, skip):
        """Reduce and upsample ``x``, concatenate with ``skip``, then fuse."""
        upsampled = self.up(self.conv1(x))
        merged = torch.cat([upsampled, skip], dim=1)
        return self.conv3(merged)
class DBLx5(nn.Module):
    """Five stacked DBL blocks alternating 1x1 and 3x3 convolutions.

    Channel flow: in -> in/2 -> in -> in/2 -> in -> in. The output has the
    same channel count and spatial size as the input.

    Args:
        in_channels: channel count of the incoming (and outgoing) feature map.
    """

    def __init__(self, in_channels):
        super().__init__()
        out_channels = in_channels // 2
        # Padding is spelled out on every layer: Convolutional defaults to
        # padding=1, which on a 1x1 kernel grows the map by 2 pixels per layer
        # (13 -> 15 -> 17 -> 19) instead of preserving its size.
        self.stack = nn.Sequential(
            Convolutional(in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=1, padding=0),
            Convolutional(in_channels=out_channels, out_channels=in_channels, kernel_size=3, stride=1, padding=1),
            Convolutional(in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=1, padding=0),
            Convolutional(in_channels=out_channels, out_channels=in_channels, kernel_size=3, stride=1, padding=1),
            Convolutional(in_channels=in_channels, out_channels=in_channels, kernel_size=1, stride=1, padding=0),
        )

    def forward(self, x):
        """Run ``x`` through the five blocks and return the result.

        Previously the stack output was computed and then discarded, with the
        untouched input ``x`` returned instead.
        """
        return self.stack(x)
class Residual(nn.Module):  # ResUnit
    """Residual unit: 1x1 DBL bottleneck followed by a 3x3 DBL, plus identity.

    The 1x1 block halves the channel count and the 3x3 block (padding 1)
    restores it, so the branch output can be added to the input.
    """

    def __init__(self, in_channels: int):
        super().__init__()
        bottleneck = in_channels // 2
        squeeze = Convolutional(in_channels=in_channels, out_channels=bottleneck, kernel_size=1, padding=0)
        expand = Convolutional(in_channels=bottleneck, out_channels=in_channels, kernel_size=3, padding=1)
        self._conv_stack = nn.Sequential(squeeze, expand)

    def forward(self, x):
        """Return ``x`` plus the output of the two-convolution branch."""
        branch = self._conv_stack(x)
        return x + branch
class Darknet(nn.Module):
    """Backbone made of three sequential stages of DBL and residual units.

    ``forward`` returns the output of every stage (256, 512 and 1024
    channels respectively) so later layers can use them as skip connections.
    """

    def __init__(self):
        super().__init__()

        # Stage 1: stem plus 1 + 2 + 8 = 11 residual units, ending at 256 channels.
        stage1 = [
            Convolutional(in_channels=3, out_channels=32, kernel_size=3),
            Convolutional(in_channels=32, out_channels=64, kernel_size=3, stride=2),
            Residual(64),
            Convolutional(in_channels=64, out_channels=128, kernel_size=3, stride=2),
            *[Residual(128) for _ in range(2)],
            Convolutional(in_channels=128, out_channels=256, kernel_size=3, stride=2),
            *[Residual(256) for _ in range(8)],
        ]

        # Stage 2: stride-2 convolution to 512 channels plus 8 residual units.
        stage2 = [
            Convolutional(in_channels=256, out_channels=512, kernel_size=3, stride=2),
            *[Residual(512) for _ in range(8)],
        ]

        # Stage 3: stride-2 convolution to 1024 channels plus 4 residual units.
        stage3 = [
            Convolutional(in_channels=512, out_channels=1024, kernel_size=3, stride=2),
            *[Residual(1024) for _ in range(4)],
        ]

        self.stack_list1 = nn.Sequential(*stage1)
        self.stack_list2 = nn.Sequential(*stage2)
        self.stack_list3 = nn.Sequential(*stage3)

    def forward(self, x):
        """Return the feature maps produced by stage 1, stage 2 and stage 3."""
        features = []
        for stage in (self.stack_list1, self.stack_list2, self.stack_list3):
            x = stage(x)
            features.append(x)
        return tuple(features)
class YOLOv3(nn.Module):
    """Darknet backbone with two FPN merges and three detection heads.

    Args:
        C: forwarded to every ``Detection`` head (presumably the class count).
        B: forwarded to every ``Detection`` head (presumably boxes per cell).
    """

    def __init__(self, C, B=2):
        super().__init__()
        self.darknet = Darknet()
        # DBLx5 blocks applied before each detection/merge step.
        self.dbl5_1 = DBLx5(in_channels=1024)
        self.dbl5_2 = DBLx5(in_channels=512)
        # One head per scale, keyed by the channel count of its input.
        self.detection_1 = Detection(in_channels=1024, C=C, B=B)
        self.detection_2 = Detection(in_channels=512, C=C, B=B)
        self.detection_3 = Detection(in_channels=256, C=C, B=B)
        # Top-down merges: 1024 -> 512 channels, then 512 -> 256 channels.
        self.fpn_1 = FPN(in_channels=1024, out_channels=512)
        self.fpn_2 = FPN(in_channels=512, out_channels=256)

    def forward(self, x):
        """Return the three detection maps, deepest backbone stage first."""
        # Backbone features with 256, 512 and 1024 channels respectively.
        feat_256, feat_512, feat_1024 = self.darknet(x)

        deep = self.dbl5_1(feat_1024)
        middle = self.dbl5_2(self.fpn_1(deep, feat_512))
        shallow = self.fpn_2(middle, feat_256)

        return (
            self.detection_1(deep),
            self.detection_2(middle),
            self.detection_3(shallow),
        )
torchsummary reports an unrealistically large model size, and when I run training to obtain the model's predictions I get a CUDA out-of-memory error. I think the problem is Darknet-53: summarized on its own, Darknet has a normal size, but once it is used inside YOLOv3 this huge number appears. (The input image size is 3x416x416.) Summary output:
----------------------------------------------------------------
Layer (type) Output Shape Param #
================================================================
Conv2d-1 [-1, 32, 416, 416] 864
BatchNorm2d-2 [-1, 32, 416, 416] 64
LeakyReLU-3 [-1, 32, 416, 416] 0
Convolutional-4 [-1, 32, 416, 416] 0
Conv2d-5 [-1, 64, 208, 208] 18,432
BatchNorm2d-6 [-1, 64, 208, 208] 128
LeakyReLU-7 [-1, 64, 208, 208] 0
Convolutional-8 [-1, 64, 208, 208] 0
Conv2d-9 [-1, 32, 208, 208] 2,048
BatchNorm2d-10 [-1, 32, 208, 208] 64
LeakyReLU-11 [-1, 32, 208, 208] 0
Convolutional-12 [-1, 32, 208, 208] 0
Conv2d-13 [-1, 64, 208, 208] 18,432
BatchNorm2d-14 [-1, 64, 208, 208] 128
LeakyReLU-15 [-1, 64, 208, 208] 0
Convolutional-16 [-1, 64, 208, 208] 0
Residual-17 [-1, 64, 208, 208] 0
Conv2d-18 [-1, 128, 104, 104] 73,728
BatchNorm2d-19 [-1, 128, 104, 104] 256
LeakyReLU-20 [-1, 128, 104, 104] 0
Convolutional-21 [-1, 128, 104, 104] 0
Conv2d-22 [-1, 64, 104, 104] 8,192
BatchNorm2d-23 [-1, 64, 104, 104] 128
LeakyReLU-24 [-1, 64, 104, 104] 0
Convolutional-25 [-1, 64, 104, 104] 0
Conv2d-26 [-1, 128, 104, 104] 73,728
BatchNorm2d-27 [-1, 128, 104, 104] 256
LeakyReLU-28 [-1, 128, 104, 104] 0
Convolutional-29 [-1, 128, 104, 104] 0
Residual-30 [-1, 128, 104, 104] 0
Conv2d-31 [-1, 64, 104, 104] 8,192
BatchNorm2d-32 [-1, 64, 104, 104] 128
LeakyReLU-33 [-1, 64, 104, 104] 0
Convolutional-34 [-1, 64, 104, 104] 0
Conv2d-35 [-1, 128, 104, 104] 73,728
BatchNorm2d-36 [-1, 128, 104, 104] 256
LeakyReLU-37 [-1, 128, 104, 104] 0
Convolutional-38 [-1, 128, 104, 104] 0
Residual-39 [-1, 128, 104, 104] 0
Conv2d-40 [-1, 256, 52, 52] 294,912
BatchNorm2d-41 [-1, 256, 52, 52] 512
LeakyReLU-42 [-1, 256, 52, 52] 0
Convolutional-43 [-1, 256, 52, 52] 0
Conv2d-44 [-1, 128, 52, 52] 32,768
BatchNorm2d-45 [-1, 128, 52, 52] 256
LeakyReLU-46 [-1, 128, 52, 52] 0
Convolutional-47 [-1, 128, 52, 52] 0
Conv2d-48 [-1, 256, 52, 52] 294,912
BatchNorm2d-49 [-1, 256, 52, 52] 512
LeakyReLU-50 [-1, 256, 52, 52] 0
Convolutional-51 [-1, 256, 52, 52] 0
Residual-52 [-1, 256, 52, 52] 0
Conv2d-53 [-1, 128, 52, 52] 32,768
BatchNorm2d-54 [-1, 128, 52, 52] 256
LeakyReLU-55 [-1, 128, 52, 52] 0
Convolutional-56 [-1, 128, 52, 52] 0
Conv2d-57 [-1, 256, 52, 52] 294,912
BatchNorm2d-58 [-1, 256, 52, 52] 512
LeakyReLU-59 [-1, 256, 52, 52] 0
Convolutional-60 [-1, 256, 52, 52] 0
Residual-61 [-1, 256, 52, 52] 0
Conv2d-62 [-1, 128, 52, 52] 32,768
BatchNorm2d-63 [-1, 128, 52, 52] 256
LeakyReLU-64 [-1, 128, 52, 52] 0
Convolutional-65 [-1, 128, 52, 52] 0
Conv2d-66 [-1, 256, 52, 52] 294,912
BatchNorm2d-67 [-1, 256, 52, 52] 512
LeakyReLU-68 [-1, 256, 52, 52] 0
Convolutional-69 [-1, 256, 52, 52] 0
Residual-70 [-1, 256, 52, 52] 0
Conv2d-71 [-1, 128, 52, 52] 32,768
BatchNorm2d-72 [-1, 128, 52, 52] 256
LeakyReLU-73 [-1, 128, 52, 52] 0
Convolutional-74 [-1, 128, 52, 52] 0
Conv2d-75 [-1, 256, 52, 52] 294,912
BatchNorm2d-76 [-1, 256, 52, 52] 512
LeakyReLU-77 [-1, 256, 52, 52] 0
Convolutional-78 [-1, 256, 52, 52] 0
Residual-79 [-1, 256, 52, 52] 0
Conv2d-80 [-1, 128, 52, 52] 32,768
BatchNorm2d-81 [-1, 128, 52, 52] 256
LeakyReLU-82 [-1, 128, 52, 52] 0
Convolutional-83 [-1, 128, 52, 52] 0
Conv2d-84 [-1, 256, 52, 52] 294,912
BatchNorm2d-85 [-1, 256, 52, 52] 512
LeakyReLU-86 [-1, 256, 52, 52] 0
Convolutional-87 [-1, 256, 52, 52] 0
Residual-88 [-1, 256, 52, 52] 0
Conv2d-89 [-1, 128, 52, 52] 32,768
BatchNorm2d-90 [-1, 128, 52, 52] 256
LeakyReLU-91 [-1, 128, 52, 52] 0
Convolutional-92 [-1, 128, 52, 52] 0
Conv2d-93 [-1, 256, 52, 52] 294,912
BatchNorm2d-94 [-1, 256, 52, 52] 512
LeakyReLU-95 [-1, 256, 52, 52] 0
Convolutional-96 [-1, 256, 52, 52] 0
Residual-97 [-1, 256, 52, 52] 0
Conv2d-98 [-1, 128, 52, 52] 32,768
BatchNorm2d-99 [-1, 128, 52, 52] 256
LeakyReLU-100 [-1, 128, 52, 52] 0
Convolutional-101 [-1, 128, 52, 52] 0
Conv2d-102 [-1, 256, 52, 52] 294,912
BatchNorm2d-103 [-1, 256, 52, 52] 512
LeakyReLU-104 [-1, 256, 52, 52] 0
Convolutional-105 [-1, 256, 52, 52] 0
Residual-106 [-1, 256, 52, 52] 0
Conv2d-107 [-1, 128, 52, 52] 32,768
BatchNorm2d-108 [-1, 128, 52, 52] 256
LeakyReLU-109 [-1, 128, 52, 52] 0
Convolutional-110 [-1, 128, 52, 52] 0
Conv2d-111 [-1, 256, 52, 52] 294,912
BatchNorm2d-112 [-1, 256, 52, 52] 512
LeakyReLU-113 [-1, 256, 52, 52] 0
Convolutional-114 [-1, 256, 52, 52] 0
Residual-115 [-1, 256, 52, 52] 0
Conv2d-116 [-1, 512, 26, 26] 1,179,648
BatchNorm2d-117 [-1, 512, 26, 26] 1,024
LeakyReLU-118 [-1, 512, 26, 26] 0
Convolutional-119 [-1, 512, 26, 26] 0
Conv2d-120 [-1, 256, 26, 26] 131,072
BatchNorm2d-121 [-1, 256, 26, 26] 512
LeakyReLU-122 [-1, 256, 26, 26] 0
Convolutional-123 [-1, 256, 26, 26] 0
Conv2d-124 [-1, 512, 26, 26] 1,179,648
BatchNorm2d-125 [-1, 512, 26, 26] 1,024
LeakyReLU-126 [-1, 512, 26, 26] 0
Convolutional-127 [-1, 512, 26, 26] 0
Residual-128 [-1, 512, 26, 26] 0
Conv2d-129 [-1, 256, 26, 26] 131,072
BatchNorm2d-130 [-1, 256, 26, 26] 512
LeakyReLU-131 [-1, 256, 26, 26] 0
Convolutional-132 [-1, 256, 26, 26] 0
Conv2d-133 [-1, 512, 26, 26] 1,179,648
BatchNorm2d-134 [-1, 512, 26, 26] 1,024
LeakyReLU-135 [-1, 512, 26, 26] 0
Convolutional-136 [-1, 512, 26, 26] 0
Residual-137 [-1, 512, 26, 26] 0
Conv2d-138 [-1, 256, 26, 26] 131,072
BatchNorm2d-139 [-1, 256, 26, 26] 512
LeakyReLU-140 [-1, 256, 26, 26] 0
Convolutional-141 [-1, 256, 26, 26] 0
Conv2d-142 [-1, 512, 26, 26] 1,179,648
BatchNorm2d-143 [-1, 512, 26, 26] 1,024
LeakyReLU-144 [-1, 512, 26, 26] 0
Convolutional-145 [-1, 512, 26, 26] 0
Residual-146 [-1, 512, 26, 26] 0
Conv2d-147 [-1, 256, 26, 26] 131,072
BatchNorm2d-148 [-1, 256, 26, 26] 512
LeakyReLU-149 [-1, 256, 26, 26] 0
Convolutional-150 [-1, 256, 26, 26] 0
Conv2d-151 [-1, 512, 26, 26] 1,179,648
BatchNorm2d-152 [-1, 512, 26, 26] 1,024
LeakyReLU-153 [-1, 512, 26, 26] 0
Convolutional-154 [-1, 512, 26, 26] 0
Residual-155 [-1, 512, 26, 26] 0
Conv2d-156 [-1, 256, 26, 26] 131,072
BatchNorm2d-157 [-1, 256, 26, 26] 512
LeakyReLU-158 [-1, 256, 26, 26] 0
Convolutional-159 [-1, 256, 26, 26] 0
Conv2d-160 [-1, 512, 26, 26] 1,179,648
BatchNorm2d-161 [-1, 512, 26, 26] 1,024
LeakyReLU-162 [-1, 512, 26, 26] 0
Convolutional-163 [-1, 512, 26, 26] 0
Residual-164 [-1, 512, 26, 26] 0
Conv2d-165 [-1, 256, 26, 26] 131,072
BatchNorm2d-166 [-1, 256, 26, 26] 512
LeakyReLU-167 [-1, 256, 26, 26] 0
Convolutional-168 [-1, 256, 26, 26] 0
Conv2d-169 [-1, 512, 26, 26] 1,179,648
BatchNorm2d-170 [-1, 512, 26, 26] 1,024
LeakyReLU-171 [-1, 512, 26, 26] 0
Convolutional-172 [-1, 512, 26, 26] 0
Residual-173 [-1, 512, 26, 26] 0
Conv2d-174 [-1, 256, 26, 26] 131,072
BatchNorm2d-175 [-1, 256, 26, 26] 512
LeakyReLU-176 [-1, 256, 26, 26] 0
Convolutional-177 [-1, 256, 26, 26] 0
Conv2d-178 [-1, 512, 26, 26] 1,179,648
BatchNorm2d-179 [-1, 512, 26, 26] 1,024
LeakyReLU-180 [-1, 512, 26, 26] 0
Convolutional-181 [-1, 512, 26, 26] 0
Residual-182 [-1, 512, 26, 26] 0
Conv2d-183 [-1, 256, 26, 26] 131,072
BatchNorm2d-184 [-1, 256, 26, 26] 512
LeakyReLU-185 [-1, 256, 26, 26] 0
Convolutional-186 [-1, 256, 26, 26] 0
Conv2d-187 [-1, 512, 26, 26] 1,179,648
BatchNorm2d-188 [-1, 512, 26, 26] 1,024
LeakyReLU-189 [-1, 512, 26, 26] 0
Convolutional-190 [-1, 512, 26, 26] 0
Residual-191 [-1, 512, 26, 26] 0
Conv2d-192 [-1, 1024, 13, 13] 4,718,592
BatchNorm2d-193 [-1, 1024, 13, 13] 2,048
LeakyReLU-194 [-1, 1024, 13, 13] 0
Convolutional-195 [-1, 1024, 13, 13] 0
Conv2d-196 [-1, 512, 13, 13] 524,288
BatchNorm2d-197 [-1, 512, 13, 13] 1,024
LeakyReLU-198 [-1, 512, 13, 13] 0
Convolutional-199 [-1, 512, 13, 13] 0
Conv2d-200 [-1, 1024, 13, 13] 4,718,592
BatchNorm2d-201 [-1, 1024, 13, 13] 2,048
LeakyReLU-202 [-1, 1024, 13, 13] 0
Convolutional-203 [-1, 1024, 13, 13] 0
Residual-204 [-1, 1024, 13, 13] 0
Conv2d-205 [-1, 512, 13, 13] 524,288
BatchNorm2d-206 [-1, 512, 13, 13] 1,024
LeakyReLU-207 [-1, 512, 13, 13] 0
Convolutional-208 [-1, 512, 13, 13] 0
Conv2d-209 [-1, 1024, 13, 13] 4,718,592
BatchNorm2d-210 [-1, 1024, 13, 13] 2,048
LeakyReLU-211 [-1, 1024, 13, 13] 0
Convolutional-212 [-1, 1024, 13, 13] 0
Residual-213 [-1, 1024, 13, 13] 0
Conv2d-214 [-1, 512, 13, 13] 524,288
BatchNorm2d-215 [-1, 512, 13, 13] 1,024
LeakyReLU-216 [-1, 512, 13, 13] 0
Convolutional-217 [-1, 512, 13, 13] 0
Conv2d-218 [-1, 1024, 13, 13] 4,718,592
BatchNorm2d-219 [-1, 1024, 13, 13] 2,048
LeakyReLU-220 [-1, 1024, 13, 13] 0
Convolutional-221 [-1, 1024, 13, 13] 0
Residual-222 [-1, 1024, 13, 13] 0
Conv2d-223 [-1, 512, 13, 13] 524,288
BatchNorm2d-224 [-1, 512, 13, 13] 1,024
LeakyReLU-225 [-1, 512, 13, 13] 0
Convolutional-226 [-1, 512, 13, 13] 0
Conv2d-227 [-1, 1024, 13, 13] 4,718,592
BatchNorm2d-228 [-1, 1024, 13, 13] 2,048
LeakyReLU-229 [-1, 1024, 13, 13] 0
Convolutional-230 [-1, 1024, 13, 13] 0
Residual-231 [-1, 1024, 13, 13] 0
Darknet-232 [[-1, 256, 52, 52], [-1, 512, 26, 26], [-1, 1024, 13, 13]] 0
Conv2d-233 [-1, 512, 15, 15] 524,288
BatchNorm2d-234 [-1, 512, 15, 15] 1,024
LeakyReLU-235 [-1, 512, 15, 15] 0
Convolutional-236 [-1, 512, 15, 15] 0
Conv2d-237 [-1, 1024, 15, 15] 4,718,592
BatchNorm2d-238 [-1, 1024, 15, 15] 2,048
LeakyReLU-239 [-1, 1024, 15, 15] 0
Convolutional-240 [-1, 1024, 15, 15] 0
Conv2d-241 [-1, 512, 17, 17] 524,288
BatchNorm2d-242 [-1, 512, 17, 17] 1,024
LeakyReLU-243 [-1, 512, 17, 17] 0
Convolutional-244 [-1, 512, 17, 17] 0
Conv2d-245 [-1, 1024, 17, 17] 4,718,592
BatchNorm2d-246 [-1, 1024, 17, 17] 2,048
LeakyReLU-247 [-1, 1024, 17, 17] 0
Convolutional-248 [-1, 1024, 17, 17] 0
Conv2d-249 [-1, 1024, 19, 19] 1,048,576
BatchNorm2d-250 [-1, 1024, 19, 19] 2,048
LeakyReLU-251 [-1, 1024, 19, 19] 0
Convolutional-252 [-1, 1024, 19, 19] 0
DBLx5-253 [-1, 1024, 13, 13] 0
Conv2d-254 [-1, 512, 13, 13] 524,288
BatchNorm2d-255 [-1, 512, 13, 13] 1,024
LeakyReLU-256 [-1, 512, 13, 13] 0
Convolutional-257 [-1, 512, 13, 13] 0
ConvTranspose2d-258 [-1, 512, 26, 26] 1,049,088
Conv2d-259 [-1, 512, 26, 26] 4,718,592
BatchNorm2d-260 [-1, 512, 26, 26] 1,024
LeakyReLU-261 [-1, 512, 26, 26] 0
Convolutional-262 [-1, 512, 26, 26] 0
FPN-263 [-1, 512, 26, 26] 0
Conv2d-264 [-1, 256, 28, 28] 131,072
BatchNorm2d-265 [-1, 256, 28, 28] 512
LeakyReLU-266 [-1, 256, 28, 28] 0
Convolutional-267 [-1, 256, 28, 28] 0
Conv2d-268 [-1, 512, 28, 28] 1,179,648
BatchNorm2d-269 [-1, 512, 28, 28] 1,024
LeakyReLU-270 [-1, 512, 28, 28] 0
Convolutional-271 [-1, 512, 28, 28] 0
Conv2d-272 [-1, 256, 30, 30] 131,072
BatchNorm2d-273 [-1, 256, 30, 30] 512
LeakyReLU-274 [-1, 256, 30, 30] 0
Convolutional-275 [-1, 256, 30, 30] 0
Conv2d-276 [-1, 512, 30, 30] 1,179,648
BatchNorm2d-277 [-1, 512, 30, 30] 1,024
LeakyReLU-278 [-1, 512, 30, 30] 0
Convolutional-279 [-1, 512, 30, 30] 0
Conv2d-280 [-1, 512, 32, 32] 262,144
BatchNorm2d-281 [-1, 512, 32, 32] 1,024
LeakyReLU-282 [-1, 512, 32, 32] 0
Convolutional-283 [-1, 512, 32, 32] 0
DBLx5-284 [-1, 512, 26, 26] 0
Conv2d-285 [-1, 256, 26, 26] 131,072
BatchNorm2d-286 [-1, 256, 26, 26] 512
LeakyReLU-287 [-1, 256, 26, 26] 0
Convolutional-288 [-1, 256, 26, 26] 0
ConvTranspose2d-289 [-1, 256, 52, 52] 262,400
Conv2d-290 [-1, 256, 52, 52] 1,179,648
BatchNorm2d-291 [-1, 256, 52, 52] 512
LeakyReLU-292 [-1, 256, 52, 52] 0
Convolutional-293 [-1, 256, 52, 52] 0
FPN-294 [-1, 256, 52, 52] 0
Conv2d-295 [-1, 512, 13, 13] 524,288
BatchNorm2d-296 [-1, 512, 13, 13] 1,024
LeakyReLU-297 [-1, 512, 13, 13] 0
Convolutional-298 [-1, 512, 13, 13] 0
Conv2d-299 [-1, 1024, 13, 13] 4,718,592
BatchNorm2d-300 [-1, 1024, 13, 13] 2,048
LeakyReLU-301 [-1, 1024, 13, 13] 0
Convolutional-302 [-1, 1024, 13, 13] 0
Conv2d-303 [-1, 18, 13, 13] 18,450
Detection-304 [-1, 18, 13, 13] 0
Conv2d-305 [-1, 256, 26, 26] 131,072
BatchNorm2d-306 [-1, 256, 26, 26] 512
LeakyReLU-307 [-1, 256, 26, 26] 0
Convolutional-308 [-1, 256, 26, 26] 0
Conv2d-309 [-1, 512, 26, 26] 1,179,648
BatchNorm2d-310 [-1, 512, 26, 26] 1,024
LeakyReLU-311 [-1, 512, 26, 26] 0
Convolutional-312 [-1, 512, 26, 26] 0
Conv2d-313 [-1, 18, 26, 26] 9,234
Detection-314 [-1, 18, 26, 26] 0
Conv2d-315 [-1, 128, 52, 52] 32,768
BatchNorm2d-316 [-1, 128, 52, 52] 256
LeakyReLU-317 [-1, 128, 52, 52] 0
Convolutional-318 [-1, 128, 52, 52] 0
Conv2d-319 [-1, 256, 52, 52] 294,912
BatchNorm2d-320 [-1, 256, 52, 52] 512
LeakyReLU-321 [-1, 256, 52, 52] 0
Convolutional-322 [-1, 256, 52, 52] 0
Conv2d-323 [-1, 18, 52, 52] 4,626
Detection-324 [-1, 18, 52, 52] 0
================================================================
Total params: 69,802,262
Trainable params: 69,802,262
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 1.98
Forward/backward pass size (MB): 316329755939.75
Params size (MB): 266.27
Estimated Total Size (MB): 316329756208.00
--------------------------------------------------------------------------
I don't understand what is wrong. A model cannot possibly be this big. Error:
OutOfMemoryError: CUDA out of memory. Tried to allocate 170.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 7.06 MiB is free. Process 5914 has 14.74 GiB memory in use. Of the allocated memory 14.53 GiB is allocated by PyTorch, and 88.94 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.
I don't understand why the reported model size is so huge.
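I tried to reproduce the problem and got the same output. The bogus size comes from torchsummary, not from your model: Darknet returns a tuple of three tensors, and torchsummary's size estimate multiplies together every number in that nested output shape (the `Darknet-232` row), which produces the absurd forward/backward figure.
You might want to open an issue there.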
However, I am able to run a forward and backward pass through the model without any issue using a GPU with 8GB of VRAM (and a batch size of one).
If you still encounter memory issues, try reducing your batch size.
I took the liberty of cleaning up your code:
import torch
from torch import nn, Tensor
class Convolutional(nn.Module):
    """Bias-free Conv2d followed by BatchNorm2d and LeakyReLU(0.1).

    Args:
        in_channels: channels of the input feature map.
        out_channels: channels produced by the convolution.
        kernel_size: side length of the square kernel.
        stride: convolution stride (default 1).
        padding: zero padding on each side (default 0, i.e. none).
    """

    def __init__(self, in_channels: int, out_channels: int, kernel_size: int, stride: int = 1, padding: int = 0) -> None:
        super().__init__()
        layers = [
            nn.Conv2d(
                in_channels,
                out_channels,
                kernel_size,
                stride=stride,
                padding=padding,
                bias=False,
            ),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(0.1),
        ]
        self.stack = nn.Sequential(*layers)

    def forward(self, x: Tensor) -> Tensor:
        """Apply convolution, batch normalisation and activation to ``x``."""
        activated = self.stack(x)
        return activated
class Detection(nn.Module):
    """Detection head: 1x1 DBL -> 3x3 DBL -> plain 1x1 convolution.

    The head halves the channels, restores them, and finally projects to
    ``5 * B + C`` output channels without changing the spatial size.

    Args:
        in_channels: channels of the incoming feature map.
        C: presumably the number of classes -- confirm against the loss code.
        B: presumably the number of boxes/anchors predicted per cell.
    """

    def __init__(self, in_channels: int, C: int, B: int) -> None:
        super().__init__()
        hidden = in_channels // 2
        prediction_channels = 5 * B + C
        reduce = Convolutional(in_channels, hidden, 1)
        expand = Convolutional(hidden, in_channels, 3, padding=1)
        project = nn.Conv2d(in_channels, prediction_channels, 1)
        self.stack = nn.Sequential(reduce, expand, project)

    def forward(self, x: Tensor) -> Tensor:
        """Return the raw prediction map for ``x``."""
        predictions = self.stack(x)
        return predictions
class FPN(nn.Module):
    """Upsample a feature map and fuse it with a skip connection.

    A 1x1 DBL maps ``in_channels`` to ``out_channels``, a transposed
    convolution (kernel 2, stride 2) upsamples, the result is concatenated
    with ``skip`` on the channel axis, and a 3x3 DBL reduces the
    ``2 * out_channels`` concatenation back to ``out_channels``.
    """

    def __init__(self, in_channels: int, out_channels: int) -> None:
        super().__init__()
        concat_channels = 2 * out_channels
        self.conv1 = Convolutional(in_channels, out_channels, kernel_size=1)
        self.up = nn.ConvTranspose2d(out_channels, out_channels, kernel_size=2, stride=2)
        self.conv3 = Convolutional(concat_channels, out_channels, kernel_size=3, padding=1)

    def forward(self, x: Tensor, skip: Tensor) -> Tensor:
        """Reduce and upsample ``x``, concatenate with ``skip``, then fuse."""
        upsampled = self.up(self.conv1(x))
        fused_input = torch.cat((upsampled, skip), 1)
        return self.conv3(fused_input)
class DBLx5(nn.Module):
    """Five DBL blocks alternating 1x1 and 3x3 kernels.

    Channel flow: in -> in/2 -> in -> in/2 -> in -> in. The 1x1 layers use no
    padding and the 3x3 layers use padding 1, so the spatial size is
    preserved.
    """

    def __init__(self, in_channels: int) -> None:
        super().__init__()
        half = in_channels // 2
        # (input channels, output channels, kernel size) for each block.
        layer_specs = (
            (in_channels, half, 1),
            (half, in_channels, 3),
            (in_channels, half, 1),
            (half, in_channels, 3),
            (in_channels, in_channels, 1),
        )
        # kernel // 2 gives padding 0 for 1x1 kernels and 1 for 3x3 kernels.
        blocks = [
            Convolutional(c_in, c_out, kernel, padding=kernel // 2)
            for c_in, c_out, kernel in layer_specs
        ]
        self.stack = nn.Sequential(*blocks)

    def forward(self, x: Tensor) -> Tensor:
        """Run ``x`` through the five blocks."""
        processed = self.stack(x)
        return processed
class Residual(nn.Module):
    """Residual unit: 1x1 DBL bottleneck, 3x3 DBL, plus identity shortcut."""

    def __init__(self, in_channels: int) -> None:
        super().__init__()
        bottleneck = in_channels // 2
        squeeze = Convolutional(in_channels, bottleneck, kernel_size=1)
        expand = Convolutional(bottleneck, in_channels, kernel_size=3, padding=1)
        self.stack = nn.Sequential(squeeze, expand)

    def forward(self, x: Tensor) -> Tensor:
        """Return the branch output added to the unchanged input."""
        branch = self.stack(x)
        return branch + x
class Darknet(nn.Module):
    """Backbone of three sequential stages built from DBL and residual units.

    ``forward`` returns the output of every stage (256, 512 and 1024
    channels respectively).
    """

    def __init__(self) -> None:
        super().__init__()

        # Stage 1: stem and three stride-2 reductions, ending at 256 channels.
        first = [
            Convolutional(3, 32, 3, padding=1),
            Convolutional(32, 64, 3, padding=1, stride=2),
            Residual(64),
            Convolutional(64, 128, 3, padding=1, stride=2),
        ]
        first += [Residual(128) for _ in range(2)]
        first.append(Convolutional(128, 256, 3, padding=1, stride=2))
        first += [Residual(256) for _ in range(8)]
        self.stack1 = nn.Sequential(*first)

        # Stage 2: stride-2 reduction to 512 channels plus 8 residual units.
        second = [Convolutional(256, 512, 3, padding=1, stride=2)]
        second += [Residual(512) for _ in range(8)]
        self.stack2 = nn.Sequential(*second)

        # Stage 3: stride-2 reduction to 1024 channels plus 4 residual units.
        third = [Convolutional(512, 1024, 3, padding=1, stride=2)]
        third += [Residual(1024) for _ in range(4)]
        self.stack3 = nn.Sequential(*third)

    def forward(self, x: Tensor) -> tuple[Tensor, Tensor, Tensor]:
        """Return the feature maps produced by stage 1, stage 2 and stage 3."""
        shallow = self.stack1(x)
        middle = self.stack2(shallow)
        deep = self.stack3(middle)
        return shallow, middle, deep
class YOLOv3(nn.Module):
    """Darknet backbone with two FPN merges and three detection heads.

    Args:
        C: forwarded to every ``Detection`` head (presumably the class count).
        B: forwarded to every ``Detection`` head (presumably boxes per cell).
    """

    def __init__(self, C: int, B: int = 2) -> None:
        super().__init__()
        self.darknet = Darknet()
        # DBLx5 blocks applied before each detection/merge step.
        self.dbl5_1 = DBLx5(in_channels=1024)
        self.dbl5_2 = DBLx5(in_channels=512)
        # One head per scale, keyed by the channel count of its input.
        self.detection_1 = Detection(in_channels=1024, C=C, B=B)
        self.detection_2 = Detection(in_channels=512, C=C, B=B)
        self.detection_3 = Detection(in_channels=256, C=C, B=B)
        # Top-down merges: 1024 -> 512 channels, then 512 -> 256 channels.
        self.fpn_1 = FPN(in_channels=1024, out_channels=512)
        self.fpn_2 = FPN(in_channels=512, out_channels=256)

    def forward(self, x: Tensor) -> tuple[Tensor, Tensor, Tensor]:
        """Return the three detection maps, deepest backbone stage first."""
        # Backbone features with 256, 512 and 1024 channels respectively.
        feat_256, feat_512, feat_1024 = self.darknet(x)

        deep = self.dbl5_1(feat_1024)
        middle = self.dbl5_2(self.fpn_1(deep, feat_512))
        shallow = self.fpn_2(middle, feat_256)

        return (
            self.detection_1(deep),
            self.detection_2(middle),
            self.detection_3(shallow),
        )