
RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 128 but got size 256. I always get a "mismatch error", and always by a factor of 2

I have this code:

import logging
import os
import sys
import tempfile
from glob import glob

import torch
from torch.cuda.amp import autocast, GradScaler
from PIL import Image
from torch.utils.tensorboard import SummaryWriter

import monai
from import create_test_image_2d, list_data_collate, decollate_batch, DataLoader
from monai.inferers import sliding_window_inference
from monai.metrics import DiceMetric
from monai.transforms import (
from monai.visualize import plot_2d_or_3d_image

# Train and periodically validate a 2D MONAI UNet on paired image/segmentation data.
#
# NOTE(review): this function is an excerpt from a forum post and several
# statements are visibly truncated (unclosed `Compose(`, `UNet(`, `DataLoader(`
# calls, dataset constructors reduced to `=, transform=...`). It does not parse
# as shown; the comments below describe only what the visible lines establish.
# NOTE(review): `tempdir` is not referenced anywhere in the visible body.
def main(tempdir):
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Check and convert data format only once
    # NOTE(review): check_and_convert_format, updated_raw_dict and
    # new_analyzed_dict are not defined in this excerpt -- presumably globals
    # defined elsewhere; confirm.
    converted_raw_dict, converted_analyzed_dict = check_and_convert_format(updated_raw_dict, new_analyzed_dict)

    # Check if the dictionaries contain the same length or not, then create train val test:
    if len(converted_raw_dict) != len(converted_analyzed_dict):
        raise ValueError("The lengths of converted_raw_dict and converted_analyzed_dict do not match.")

    # NOTE(review): num_images is assigned here and again a few lines below.
    num_images = len(converted_raw_dict)

    # Calculate the number of images for training, validation, and test, e.g., using an 80-10-10 split
    raw_image_list = list(converted_raw_dict.items())
    # NOTE(review): analyzed_image_list is never read in the visible code; the
    # segmentations are looked up by name in converted_analyzed_dict instead.
    analyzed_image_list = list(converted_analyzed_dict.items())

    # Calculate the number of images for training, validation, and test
    train_percentage = 0.8
    val_percentage = 0.1
    num_images = len(raw_image_list)
    num_train_images = int(num_images * train_percentage)
    num_val_images = int(num_images * val_percentage)

    # Select images for training, validation, and test
    # Each entry pairs a raw image with the segmentation stored under the same
    # key; the test split takes whatever remains after train and val.
    train_files = [{"img": image, "seg": converted_analyzed_dict[name]} for name, image in raw_image_list[:num_train_images]]
    val_files = [{"img": image, "seg": converted_analyzed_dict[name]} for name, image in raw_image_list[num_train_images:num_train_images + num_val_images]]
    test_files = [{"img": image, "seg": converted_analyzed_dict[name]} for name, image in raw_image_list[num_train_images + num_val_images:]]

    # define transforms for image and segmentation
    # NOTE(review): both Compose(...) calls below are missing their opening
    # list bracket and closing parenthesis in this excerpt.
    train_transforms = Compose(
            EnsureChannelFirstd(keys=["img", "seg"], channel_dim=-1),  # Use channel_dim=-1 for NumPy arrays
            ScaleIntensityd(keys=["img", "seg"]),
            DivisiblePadd(keys=["img", "seg"],k=16),
    val_transforms = Compose(
            EnsureChannelFirstd(keys=["img", "seg"], channel_dim=-1),  # Use channel_dim=-1 for NumPy arrays
            ScaleIntensityd(keys=["img", "seg"]),
            DivisiblePadd(keys=["img", "seg"],k=16),

    # define dataset, data loader
    # NOTE(review): the dataset constructor is truncated here (and for
    # train_ds / val_ds below); only the `transform=` argument survives.
    check_ds =, transform=train_transforms)
    # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training
    # NOTE(review): the comment above does not match the visible code, which
    # uses batch_size=1 and no RandCropByPosNegLabeld transform.
    check_loader = DataLoader(check_ds, batch_size=1, num_workers=1, collate_fn=list_data_collate)
    # Pull one batch and print its shapes as a sanity check.
    check_data = monai.utils.misc.first(check_loader)
    print(check_data["img"].shape, check_data["seg"].shape)

    # create a training data loader
    train_ds =, transform=train_transforms)
    # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training
    # NOTE(review): the DataLoader(...) arguments for train_loader are missing
    # from this excerpt.
    train_loader = DataLoader(
    # create a validation data loader
    val_ds =, transform=val_transforms)
    val_loader = DataLoader(val_ds, batch_size=1, num_workers=1, collate_fn=list_data_collate)
    dice_metric = DiceMetric(include_background=True, reduction="mean", get_not_nans=False)
    # Post-processing applied to each validation output: sigmoid, then
    # threshold at 0.5.
    post_trans = Compose([Activations(sigmoid=True), AsDiscrete(threshold=0.5)])
    # create UNet, DiceLoss and Adam optimizer
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Print the shape of your input image for debugging
    #input_image_shape = train_files[0]["img"].shape
    #print("Input image shape:", input_image_shape)

    # NOTE(review): the UNet(...) call is truncated -- only `channels` and
    # `strides` are visible; the remaining arguments, the closing parenthesis
    # and any `.to(device)` are missing from this excerpt.
    model = monai.networks.nets.UNet(
        channels=(16, 32, 64, 128, 256),
        strides=(2, 2, 2, 2),
    loss_function = monai.losses.DiceLoss(sigmoid=True)
    optimizer = torch.optim.Adam(model.parameters(), 1e-3)

    # start a typical PyTorch training
    val_interval = 5
    best_metric = -1
    best_metric_epoch = -1
    # NOTE(review): epoch_loss_values and metric_values are created but never
    # appended to in the visible code.
    epoch_loss_values = list()
    metric_values = list()
    writer = SummaryWriter()
    # Define the number of mini-batches to accumulate gradients over
    accumulation_steps = 4  # You can adjust this value based on your GPU memory capacity
    # NOTE(review): scaler is created but not used in the visible loop, and no
    # backward pass, optimizer.step() or optimizer.zero_grad() appears there
    # either -- as shown, the model's weights are never updated.
    scaler = GradScaler()
    for epoch in range(10):
        print("-" * 10)
        print(f"epoch {epoch + 1}/{10}")
        epoch_loss = 0
        step = 0
        accumulated_loss = 0  # Initialize accumulated loss
        for batch_data in train_loader:
            step += 1
            inputs, labels = batch_data["img"].to(device), batch_data["seg"].to(device)
            with autocast():
              outputs = model(inputs)
              loss = loss_function(outputs, labels)
              # Scale the loss so that accumulation_steps mini-batches sum to
              # one full-batch loss.
              loss /= accumulation_steps


            # NOTE(review): on the first step that hits this boundary,
            # accumulated_loss is still the int 0, so `.item()` raises
            # AttributeError. The accumulation line at the end of this branch
            # sits inside the `if`, so it runs only on boundary steps --
            # possibly an indentation artifact of the paste; confirm intent.
            if step % accumulation_steps == 0:
                epoch_loss += accumulated_loss.item()  # Accumulated loss for logging
                accumulated_loss = 0  # Reset accumulated loss

                accumulated_loss += loss  # Accumulate the loss

            # Loop-invariant value, recomputed on every iteration.
            epoch_len = len(train_ds) // (train_loader.batch_size * accumulation_steps)
            print(f"{step}/{epoch_len}, train_loss: {loss.item():.4f}")
            writer.add_scalar("train_loss", loss.item(), epoch_len * epoch + step)

        # Handle any remaining accumulated loss
        # NOTE(review): if fewer than accumulation_steps batches were seen,
        # accumulated_loss is still the int 0 and `.item()` fails here too.
        if step % accumulation_steps != 0:
            epoch_loss += accumulated_loss.item()

        epoch_loss /= step
        print(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}")

        # Run validation every val_interval epochs.
        if (epoch + 1) % val_interval == 0:
            with torch.no_grad():
                val_images = None
                val_labels = None
                val_outputs = None
                for val_data in val_loader:
                    val_images, val_labels = val_data["img"].to(device), val_data["seg"].to(device)
                    # Validation runs the model over 64x64 windows, four
                    # windows per forward pass.
                    roi_size = (64, 64)
                    sw_batch_size = 4
                    val_outputs = sliding_window_inference(val_images, roi_size, sw_batch_size, model)
                    val_outputs = [post_trans(i) for i in decollate_batch(val_outputs)]
                    # compute metric for current iteration
                    dice_metric(y_pred=val_outputs, y=val_labels)
                # aggregate the final mean dice result
                metric = dice_metric.aggregate().item()
                # reset the status for next validation round
                # NOTE(review): no dice_metric.reset() call is visible after
                # the comment above; it may have been lost in extraction.
                if metric > best_metric:
                    best_metric = metric
                    best_metric_epoch = epoch + 1
                    # NOTE(review): the next line is the tail of a truncated
                    # call, presumably saving the model weights to this
                    # filename -- confirm.
          , "best_metric_model_segmentation2d_dict.pth")
                    print("saved new best metric model")
                    # NOTE(review): the enclosing print( and the closing
                    # parentheses of this format call are missing.
                    "current epoch: {} current mean dice: {:.4f} best mean dice: {:.4f} at epoch {}".format(
                        epoch + 1, metric, best_metric, best_metric_epoch
                writer.add_scalar("val_mean_dice", metric, epoch + 1)
                # Print the shape of inputs, labels, and outputs
                # NOTE(review): inputs/labels/outputs are the tensors left over
                # from the last training batch, not the validation tensors.
                print("Input shape:", inputs.shape)
                print("Label shape:", labels.shape)
                print("Output shape:", outputs.shape)

                # plot the last model output as GIF image in TensorBoard with the corresponding image and label
                plot_2d_or_3d_image(val_images, epoch + 1, writer, index=0, tag="image")
                plot_2d_or_3d_image(val_labels, epoch + 1, writer, index=0, tag="label")
                plot_2d_or_3d_image(val_outputs, epoch + 1, writer, index=0, tag="output")

    print(f"train completed, best_metric: {best_metric:.4f} at epoch: {best_metric_epoch}")

# Script entry point: creates a temporary directory that is removed when the
# with-block exits.
# NOTE(review): the body of the with-block is missing from this excerpt --
# presumably a call to main(tempdir); confirm.
if __name__ == "__main__":
    with tempfile.TemporaryDirectory() as tempdir:

I have these tensors for my network: torch.Size([1, 1536, 1152]) torch.Size([1, 1536, 1152]).

But I always get the runtime error:

RuntimeError: Sizes of tensors must match except in dimension 1.
Expected size 128 but got size 256 for tensor number 1 in the list.

If I change my channel sizes, say by multiplying them by 2, I get "Expected size 256 but got size 512" instead. I am pretty sure I am making an obvious mistake, but I could not find what it is specifically.

I have been trying to test a simple UNet using MONAI and Google Colab, but I am getting a tensor shape mismatch error. I just tried to train a network into overfitting on a small number of images so that I know I am on the right track, but I am running into either syntax-related or network-architecture-related issues and could not fix them. I changed channels=(16, 32, 64, 128, 256) into channels=(32, 64, 128, 256, 512) and other multiples, but got the same RuntimeError, just with different values for the actual and expected sizes.


  • It is a problem with the dimensions of the input data.

    inputs and labels should not have three dimensions; the 2D network expects four (batch, channel, height, width).

    The shapes should be torch.Size([1, 1, 1536, 1152]) torch.Size([1, 1, 1536, 1152]) if the batch size is 1,

    or torch.Size([10, 1, 1536, 1152]) torch.Size([10, 1, 1536, 1152]) if the batch size is 10.

    You forgot the batch dimension as the first dimension.

    The code below works fine:

    import logging
    import os
    import sys
    import tempfile
    from glob import glob
    import torch
    from torch.cuda.amp import autocast, GradScaler
    from PIL import Image
    from torch.utils.tensorboard import SummaryWriter
    import monai
    # Minimal shape check from the answer: feeds all-zero 4-D tensors of shape
    # (1, 1, 1536, 1152) through the UNet and the Dice loss instead of reading
    # batches from a data loader.
    # NOTE(review): the UNet(...) call below is truncated in this excerpt (only
    # `channels` and `strides` are visible, with no closing parenthesis), so
    # the snippet does not parse as shown. `tempdir` is unused in the visible
    # body.
    def main(tempdir):
        logging.basicConfig(stream=sys.stdout, level=logging.INFO)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = monai.networks.nets.UNet(
            channels=(16, 32, 64, 128, 256),
            strides=(2, 2, 2, 2),
        loss_function = monai.losses.DiceLoss(sigmoid=True)
        optimizer = torch.optim.Adam(model.parameters(), 1e-3)
        # start a typical PyTorch training
        # NOTE(review): val_interval, best_metric, best_metric_epoch,
        # epoch_loss_values, metric_values, writer and scaler are set up but
        # not used in the visible code.
        val_interval = 5
        best_metric = -1
        best_metric_epoch = -1
        epoch_loss_values = list()
        metric_values = list()
        writer = SummaryWriter()
        # Define the number of mini-batches to accumulate gradients over
        accumulation_steps = 4  # You can adjust this value based on your GPU memory capacity
        scaler = GradScaler()
        for epoch in range(10):
            print("-" * 10)
            print(f"epoch {epoch + 1}/{10}")
            epoch_loss = 0
            step = 0
            accumulated_loss = 0  # Initialize accumulated loss
            # for batch_data in train_loader:
            # `if True:` stands in for the data-loader loop above so that the
            # body runs exactly once per epoch.
            if True:
                step += 1
                # inputs, labels = batch_data["img"].to(device), batch_data["seg"].to(device)
                # Dummy batch whose shape is spelled out explicitly as four
                # dimensions.
                inputs = torch.zeros((1, 1, 1536, 1152)).to(device)
                labels = torch.zeros((1, 1, 1536, 1152)).to(device)
                with autocast():
                  outputs = model(inputs)
                  loss = loss_function(outputs, labels)
                  loss /= accumulation_steps
    # Script entry point: creates a temporary directory that is removed when
    # the with-block exits.
    # NOTE(review): the body of the with-block is missing from this excerpt --
    # presumably a call to main(tempdir); confirm.
    if __name__ == "__main__":
        with tempfile.TemporaryDirectory() as tempdir: