I'm trying to train Faster R-CNN on my own images, which are in COCO format; the image size is 512×512.
I've tested the dataloader separately and it works: it prints the batch images and bounding-box details (see the sanity-check sketch right after the dataloader code below).
I've also tried printing the loss inside the network, and it does print `batch_mean`, but right after that the error below occurs.
```python
# Imports as used below
import torch
from torch import optim
from torch.utils.data import DataLoader
from torchvision import datasets, models, ops
from torchvision.transforms import v2
import lightning as L

img_process = v2.Compose(
    [
        v2.ToTensor(),
        v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)


class SCocoDetection(datasets.CocoDetection):
    def __init__(
        self,
        image_directory_path: str,
        annotation_path: str,
        train: bool = True,
        image_processor=None,
    ):
        super().__init__(image_directory_path, annotation_path)
        self.image_processor = image_processor

    def __getitem__(self, idx):
        image, annotations = super().__getitem__(idx)
        images, targets = [], []
        image_id = self.ids[idx]
        for ann in annotations:
            bbox = ann['bbox']
            # keep only small boxes: area <= 0.1% of the 512x512 image
            # small = (bbox[:, 2] * bbox[:, 3]) <= (image.size[1] * image.size[0] * 0.001)
            small = (bbox[2] * bbox[3]) <= (512 * 512 * 0.001)
            # print(small)
            if small:
                bbox = torch.tensor(bbox).unsqueeze(0).float()
                boxes = ops.box_convert(bbox, in_fmt='xywh', out_fmt='xyxy')
                boxes = boxes.float()
                if (boxes[0][0] < boxes[0][2]) and (boxes[0][1] < boxes[0][3]):
                    output_dict = self.image_processor({"image": image, "boxes": boxes})
                    images.append(output_dict['image'])
                    targets.append({
                        'boxes': output_dict['boxes'],
                        'labels': torch.ones(len(boxes), dtype=int),
                    })
                else:
                    print(f"Invalid box : {boxes}")
        # print(f"image_id : {image_id} , idx : {idx} , targets : {targets}")
        return images, targets


TRAIN_DATASET = SCocoDetection(
    image_directory_path='047/v2_coco_train/images',
    annotation_path='047/v2_coco_train/result.json',
    image_processor=img_process,
    train=True)
VAL_DATASET = SCocoDetection(
    image_directory_path='047/v2_coco_test/images',
    annotation_path='047/v2_coco_test/result.json',
    image_processor=img_process,
    train=False)

print("Number of training examples:", len(TRAIN_DATASET))
print("Number of validation examples:", len(VAL_DATASET))
# print("Number of test examples:", len(TEST_DATASET))


def collate_fn(batch):
    # keep per-sample lists of images/targets instead of stacking them into one tensor
    return tuple(zip(*batch))


TRAIN_DATALOADER = DataLoader(dataset=TRAIN_DATASET, collate_fn=collate_fn, batch_size=2, shuffle=True)
VAL_DATALOADER = DataLoader(dataset=VAL_DATASET, collate_fn=collate_fn, batch_size=4, shuffle=True)
```
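The dataloader check mentioned at the top was nothing more than pulling one collated batch and printing the image tensors and box details, roughly like this (a sketch of the check, not part of the failing run):

```python
# Sanity check (sketch): grab one collated batch and print per-sample image shapes and targets.
imgs, annots = next(iter(TRAIN_DATALOADER))
for img_list, target_list in zip(imgs, annots):
    for img, target in zip(img_list, target_list):
        print(img.shape, target['boxes'], target['labels'])
```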
```python
class CocoDNN(L.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = models.detection.fasterrcnn_mobilenet_v3_large_fpn(weights="DEFAULT")

    def forward(self, images, targets=None):
        return self.model(images, targets)

    def training_step(self, batch, batch_idx):
        imgs, annot = batch
        print(f"Batch : {batch_idx}")
        batch_losses = []
        for img_b, annot_b in zip(imgs, annot):
            print(len(img_b), len(annot_b))
            if len(img_b) == 0:
                continue
            # in training mode the torchvision detection model returns a dict of losses
            loss_dict = self.model(img_b, annot_b)
            losses = sum(loss for loss in loss_dict.values())
            # print(losses)
            batch_losses.append(losses)
        batch_mean = torch.mean(torch.stack(batch_losses))
        # print(batch_mean)
        self.log('train_loss', batch_mean)
        return batch_mean  # return the loss so Lightning runs backward on it

    def configure_optimizers(self):
        return optim.SGD(self.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0005)


dnn = CocoDNN()
trainer = L.Trainer(limit_train_batches=100, max_epochs=1)
trainer.fit(model=dnn, train_dataloaders=TRAIN_DATALOADER)
```
### Error messages and logs
```
{
"name": "RuntimeError",
"message": "view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.",
"stack": "---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[192], line 3
1 dnn = CocoDNN()
2 trainer = L.Trainer(limit_train_batches=100, max_epochs=1)
----> 3 trainer.fit(model=dnn, train_dataloaders=TRAIN_DATALOADER)
File site-packages/lightning/pytorch/trainer/trainer.py:538, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
536 self.state.status = TrainerStatus.RUNNING
537 self.training = True
--> 538 call._call_and_handle_interrupt(
539 self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
540 )
File site-packages/lightning/pytorch/trainer/call.py:47, in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
45 if trainer.strategy.launcher is not None:
46 return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
---> 47 return trainer_fn(*args, **kwargs)
49 except _TunerExitException:
50 _call_teardown_hook(trainer)
File site-packages/lightning/pytorch/trainer/trainer.py:574, in Trainer._fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
567 assert self.state.fn is not None
568 ckpt_path = self._checkpoint_connector._select_ckpt_path(
569 self.state.fn,
570 ckpt_path,
571 model_provided=True,
572 model_connected=self.lightning_module is not None,
573 )
--> 574 self._run(model, ckpt_path=ckpt_path)
576 assert self.state.stopped
577 self.training = False
File site-packages/lightning/pytorch/trainer/trainer.py:981, in Trainer._run(self, model, ckpt_path)
976 self._signal_connector.register_signal_handlers()
978 # ----------------------------
979 # RUN THE TRAINER
980 # ----------------------------
--> 981 results = self._run_stage()
983 # ----------------------------
984 # POST-Training CLEAN UP
985 # ----------------------------
986 log.debug(f\"{self.__class__.__name__}: trainer tearing down\")
File site-packages/lightning/pytorch/trainer/trainer.py:1025, in Trainer._run_stage(self)
1023 self._run_sanity_check()
1024 with torch.autograd.set_detect_anomaly(self._detect_anomaly):
-> 1025 self.fit_loop.run()
1026 return None
1027 raise RuntimeError(f\"Unexpected state {self.state}\")
File site-packages/lightning/pytorch/loops/fit_loop.py:205, in _FitLoop.run(self)
203 try:
204 self.on_advance_start()
--> 205 self.advance()
206 self.on_advance_end()
207 self._restarting = False
File site-packages/lightning/pytorch/loops/fit_loop.py:363, in _FitLoop.advance(self)
361 with self.trainer.profiler.profile(\"run_training_epoch\"):
362 assert self._data_fetcher is not None
--> 363 self.epoch_loop.run(self._data_fetcher)
File site-packages/lightning/pytorch/loops/training_epoch_loop.py:140, in _TrainingEpochLoop.run(self, data_fetcher)
138 while not self.done:
139 try:
--> 140 self.advance(data_fetcher)
141 self.on_advance_end(data_fetcher)
142 self._restarting = False
File site-packages/lightning/pytorch/loops/training_epoch_loop.py:250, in _TrainingEpochLoop.advance(self, data_fetcher)
247 with trainer.profiler.profile(\"run_training_batch\"):
248 if trainer.lightning_module.automatic_optimization:
249 # in automatic optimization, there can only be one optimizer
--> 250 batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
251 else:
252 batch_output = self.manual_optimization.run(kwargs)
File site-packages/lightning/pytorch/loops/optimization/automatic.py:190, in _AutomaticOptimization.run(self, optimizer, batch_idx, kwargs)
183 closure()
185 # ------------------------------
186 # BACKWARD PASS
187 # ------------------------------
188 # gradient update with accumulated gradients
189 else:
--> 190 self._optimizer_step(batch_idx, closure)
192 result = closure.consume_result()
193 if result.loss is None:
File site-packages/lightning/pytorch/loops/optimization/automatic.py:268, in _AutomaticOptimization._optimizer_step(self, batch_idx, train_step_and_backward_closure)
265 self.optim_progress.optimizer.step.increment_ready()
267 # model hook
--> 268 call._call_lightning_module_hook(
269 trainer,
270 \"optimizer_step\",
271 trainer.current_epoch,
272 batch_idx,
273 optimizer,
274 train_step_and_backward_closure,
275 )
277 if not should_accumulate:
278 self.optim_progress.optimizer.step.increment_completed()
File site-packages/lightning/pytorch/trainer/call.py:167, in _call_lightning_module_hook(trainer, hook_name, pl_module, *args, **kwargs)
164 pl_module._current_fx_name = hook_name
166 with trainer.profiler.profile(f\"[LightningModule]{pl_module.__class__.__name__}.{hook_name}\"):
--> 167 output = fn(*args, **kwargs)
169 # restore current_fx when nested context
170 pl_module._current_fx_name = prev_fx_name
File site-packages/lightning/pytorch/core/module.py:1306, in LightningModule.optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure)
1275 def optimizer_step(
1276 self,
1277 epoch: int,
(...)
1280 optimizer_closure: Optional[Callable[[], Any]] = None,
1281 ) -> None:
1282 r\"\"\"Override this method to adjust the default way the :class:`~lightning.pytorch.trainer.trainer.Trainer` calls
1283 the optimizer.
1284
(...)
1304
1305 \"\"\"
-> 1306 optimizer.step(closure=optimizer_closure)
File site-packages/lightning/pytorch/core/optimizer.py:153, in LightningOptimizer.step(self, closure, **kwargs)
150 raise MisconfigurationException(\"When `optimizer.step(closure)` is called, the closure should be callable\")
152 assert self._strategy is not None
--> 153 step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
155 self._on_after_step()
157 return step_output
File site-packages/lightning/pytorch/strategies/strategy.py:238, in Strategy.optimizer_step(self, optimizer, closure, model, **kwargs)
236 # TODO(fabric): remove assertion once strategy's optimizer_step typing is fixed
237 assert isinstance(model, pl.LightningModule)
--> 238 return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
File site-packages/lightning/pytorch/plugins/precision/precision.py:122, in Precision.optimizer_step(self, optimizer, model, closure, **kwargs)
120 \"\"\"Hook to run the optimizer step.\"\"\"
121 closure = partial(self._wrap_closure, model, optimizer, closure)
--> 122 return optimizer.step(closure=closure, **kwargs)
File site-packages/torch/optim/optimizer.py:487, in Optimizer.profile_hook_step.<locals>.wrapper(*args, **kwargs)
482 else:
483 raise RuntimeError(
484 f\"{func} must return None or a tuple of (new_args, new_kwargs), but got {result}.\"
485 )
--> 487 out = func(*args, **kwargs)
488 self._optimizer_step_code()
490 # call optimizer step post hooks
File site-packages/torch/optim/optimizer.py:91, in _use_grad_for_differentiable.<locals>._use_grad(self, *args, **kwargs)
89 torch.set_grad_enabled(self.defaults[\"differentiable\"])
90 torch._dynamo.graph_break()
---> 91 ret = func(self, *args, **kwargs)
92 finally:
93 torch._dynamo.graph_break()
File site-packages/torch/optim/sgd.py:112, in SGD.step(self, closure)
110 if closure is not None:
111 with torch.enable_grad():
--> 112 loss = closure()
114 for group in self.param_groups:
115 params: List[Tensor] = []
File site-packages/lightning/pytorch/plugins/precision/precision.py:108, in Precision._wrap_closure(self, model, optimizer, closure)
95 def _wrap_closure(
96 self,
97 model: \"pl.LightningModule\",
98 optimizer: Steppable,
99 closure: Callable[[], Any],
100 ) -> Any:
101 \"\"\"This double-closure allows makes sure the ``closure`` is executed before the ``on_before_optimizer_step``
102 hook is called.
103
(...)
106
107 \"\"\"
--> 108 closure_result = closure()
109 self._after_closure(model, optimizer)
110 return closure_result
File site-packages/lightning/pytorch/loops/optimization/automatic.py:144, in Closure.__call__(self, *args, **kwargs)
142 @override
143 def __call__(self, *args: Any, **kwargs: Any) -> Optional[Tensor]:
--> 144 self._result = self.closure(*args, **kwargs)
145 return self._result.loss
File site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)
File site-packages/lightning/pytorch/loops/optimization/automatic.py:138, in Closure.closure(self, *args, **kwargs)
135 self._zero_grad_fn()
137 if self._backward_fn is not None and step_output.closure_loss is not None:
--> 138 self._backward_fn(step_output.closure_loss)
140 return step_output
File site-packages/lightning/pytorch/loops/optimization/automatic.py:239, in _AutomaticOptimization._make_backward_fn.<locals>.backward_fn(loss)
238 def backward_fn(loss: Tensor) -> None:
--> 239 call._call_strategy_hook(self.trainer, \"backward\", loss, optimizer)
File site-packages/lightning/pytorch/trainer/call.py:319, in _call_strategy_hook(trainer, hook_name, *args, **kwargs)
316 return None
318 with trainer.profiler.profile(f\"[Strategy]{trainer.strategy.__class__.__name__}.{hook_name}\"):
--> 319 output = fn(*args, **kwargs)
321 # restore current_fx when nested context
322 pl_module._current_fx_name = prev_fx_name
File site-packages/lightning/pytorch/strategies/strategy.py:212, in Strategy.backward(self, closure_loss, optimizer, *args, **kwargs)
209 assert self.lightning_module is not None
210 closure_loss = self.precision_plugin.pre_backward(closure_loss, self.lightning_module)
--> 212 self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, *args, **kwargs)
214 closure_loss = self.precision_plugin.post_backward(closure_loss, self.lightning_module)
215 self.post_backward(closure_loss)
File site-packages/lightning/pytorch/plugins/precision/precision.py:72, in Precision.backward(self, tensor, model, optimizer, *args, **kwargs)
52 @override
53 def backward( # type: ignore[override]
54 self,
(...)
59 **kwargs: Any,
60 ) -> None:
61 r\"\"\"Performs the actual backpropagation.
62
63 Args:
(...)
70
71 \"\"\"
---> 72 model.backward(tensor, *args, **kwargs)
File site-packages/lightning/pytorch/core/module.py:1101, in LightningModule.backward(self, loss, *args, **kwargs)
1099 self._fabric.backward(loss, *args, **kwargs)
1100 else:
-> 1101 loss.backward(*args, **kwargs)
File site-packages/torch/_tensor.py:581, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
571 if has_torch_function_unary(self):
572 return handle_torch_function(
573 Tensor.backward,
574 (self,),
(...)
579 inputs=inputs,
580 )
--> 581 torch.autograd.backward(
582 self, gradient, retain_graph, create_graph, inputs=inputs
583 )
File site-packages/torch/autograd/__init__.py:347, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
342 retain_graph = create_graph
344 # The reason we repeat the same comment below is that
345 # some Python versions print out the first line of a multi-line function
346 # calls in the traceback and some print out the last line
--> 347 _engine_run_backward(
348 tensors,
349 grad_tensors_,
350 retain_graph,
351 create_graph,
352 inputs,
353 allow_unreachable=True,
354 accumulate_grad=True,
355 )
File site-packages/torch/autograd/graph.py:825, in _engine_run_backward(t_outputs, *args, **kwargs)
823 unregister_hooks = _register_logging_hooks_on_whole_graph(t_outputs)
824 try:
--> 825 return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
826 t_outputs, *args, **kwargs
827 ) # Calls into the C++ engine to run the backward pass
828 finally:
829 if attach_logging_hooks:
RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead."
}
```
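For what it's worth, the message itself just means that somewhere in the backward pass `.view(...)` is being called on a tensor whose memory is no longer laid out contiguously, which apparently only happens on the MPS code path here. A minimal, standalone snippet (unrelated to the model, just to illustrate the message) that raises the same error:

```python
import torch

x = torch.randn(2, 3, 4).permute(0, 2, 1)  # permute makes the tensor non-contiguous
print(x.reshape(2, 12).shape)              # ok: reshape copies when it has to
print(x.view(2, 12).shape)                 # RuntimeError: view size is not compatible ...
```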
### Environment

- PyTorch Lightning Version: 2.4.0
- PyTorch Version: 2.5.1
- Python version: 3.11
- OS: macOS
- CUDA/cuDNN version:
- GPU models and configuration: MPS
- How you installed Lightning (`conda`, `pip`, source): pip
For me, this happened on a Mac M3 with MPS acceleration. When I tested on the CPU it worked fine (see the workaround sketch below).
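Until this is fixed upstream, the simple workaround is to keep the trainer off MPS; one way to do that (assuming the `CocoDNN`/dataloader setup above) is:

```python
# Workaround: force the CPU accelerator instead of letting Lightning pick MPS.
trainer = L.Trainer(accelerator="cpu", limit_train_batches=100, max_epochs=1)
trainer.fit(model=dnn, train_dataloaders=TRAIN_DATALOADER)
```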
Issue opened: https://github.com/pytorch/vision/issues/8706#issuecomment-2455363423