tensorflow, keras, deep-learning, vgg-net, siamese-network

Shape mismatch with VGG16 in Keras: expected ndim=4, found ndim=2, shape received [None, None]


In trying to learn Keras and deep learning, I want to create an image matting algorithm that uses an architecture similar to a modified autoencoder: it takes two image inputs (a source image and a user-generated trimap) and produces one image output (the alpha values of the image foreground). The encoder part (for both inputs) is simple feature extraction using a pre-trained VGG16. I want to train the decoder using the low-res alphamatting.com dataset.

Running the attached code produces an error: ValueError: Input 0 of layer block1_conv1 is incompatible with the layer: expected ndim=4, found ndim=2. Full shape received: [None, None]

I'm having trouble understanding this error. I verified that my twin_gen generator is producing image batches of shape (22, 256, 256, 3) for both inputs, so I would guess that I have somehow built the model wrong, but I don't see where the error is. Can anyone shed some light on why I'm seeing this error?

import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv2DTranspose, Concatenate, BatchNormalization, Input
from tensorflow.keras.preprocessing.image import ImageDataGenerator


def DeConvBlock(input, num_output):
    x = Conv2DTranspose(num_output, kernel_size=3, strides=2, activation='relu', padding='same')(input)
    x = BatchNormalization()(x)
    x = Conv2DTranspose(num_output, kernel_size=3, strides=1, activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = Conv2DTranspose(num_output, kernel_size=3, strides=1, activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    return x


img_input = Input((256, 256, 3))
img_vgg16 = VGG16(include_top=False, weights='imagenet')
img_vgg16._name = 'img_vgg16'
img_vgg16.trainable = False


tm_input = Input((256, 256, 3))
tm_vgg16 = VGG16(include_top=False, weights='imagenet')
tm_vgg16._name = 'tm_vgg16'
tm_vgg16.trainable = False

img_vgg16 = img_vgg16(img_input)
tm_vgg16 = tm_vgg16(tm_input)
x = Concatenate()([img_vgg16, tm_vgg16])
x = DeConvBlock(x, 512)
x = DeConvBlock(x, 256)
x = DeConvBlock(x, 128)
x = DeConvBlock(x, 64)
x = DeConvBlock(x, 32)
x = Conv2DTranspose(1, kernel_size=3, strides=1, activation='sigmoid', padding='same')(x)


m = Model(inputs=[img_input, tm_input], outputs=x)
m.summary()
m.compile(optimizer='adam', loss='mean_squared_error')

gen = ImageDataGenerator(width_shift_range=0.1, rotation_range=30, height_shift_range=0.1, horizontal_flip=True, validation_split=0.2, preprocessing_function=preprocess_input)
SEED = 49


def twin_gen(generator, subset):
    gen_img = generator.flow_from_directory('./data', classes=['input_training_lowres'], seed=SEED, shuffle=False, subset=subset, color_mode='rgb')
    gen_map = generator.flow_from_directory('./data/trimap_training_lowres', classes=['Trimap1'], seed=SEED, shuffle=False, subset=subset, color_mode='rgb')
    gen_truth = generator.flow_from_directory('./data', classes=['gt_training_lowres'], seed=SEED, shuffle=False, subset=subset, color_mode='rgb')

    while True:
        img = gen_img.__next__()
        tm = gen_map.__next__()
        gt = gen_truth.__next__()
        yield [[img, tm], gt]


train_gen = twin_gen(gen, 'training')
val_gen = twin_gen(gen, 'validation')


checkpoint_filepath = 'checkpoint'
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='auto',
    save_freq='epoch',
    save_best_only=True)


r = m.fit(train_gen, validation_data=val_gen, epochs=10, callbacks=[checkpoint])

Solution

  • First, you didn't specify the input shape of VGG16, and since you set include_top=False, the default input shape will be (None, None, 3) for the channels_last case.

    PS: You can check the source code of keras.applications.VGG16 and keras.applications.imagenet_utils.obtain_input_shape for details.
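
    A quick way to confirm this without reading the source (a minimal sketch; weights=None just skips the weight download, and the printed shapes are for the channels_last case):

    from tensorflow.keras.applications.vgg16 import VGG16

    # no input_shape given: the graph is built with fully dynamic spatial dimensions
    base = VGG16(include_top=False, weights=None)
    print(base.input_shape)    # (None, None, None, 3)
    print(base.output_shape)   # (None, None, None, 512)

    # explicit input_shape: concrete sizes propagate through all five pooling stages
    fixed = VGG16(include_top=False, weights=None, input_shape=(256, 256, 3))
    print(fixed.output_shape)  # (None, 8, 8, 512)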

    You can see the None output shapes by calling model.summary():

    __________________________________________________________________________________________________
    Layer (type)                    Output Shape         Param #     Connected to                     
    ==================================================================================================
    input_1 (InputLayer)            [(None, 256, 256, 3) 0                                            
    __________________________________________________________________________________________________
    input_3 (InputLayer)            [(None, 256, 256, 3) 0                                            
    __________________________________________________________________________________________________
    img_vgg16 (Functional)          (None, None, None, 5 14714688    input_1[0][0]                    
    __________________________________________________________________________________________________
    tm_vgg16 (Functional)           (None, None, None, 5 14714688    input_3[0][0]                    
    __________________________________________________________________________________________________
    concatenate (Concatenate)       (None, 8, 8, 1024)   0           img_vgg16[0][0]                  
                                                                     tm_vgg16[0][0]                   
    __________________________________________________________________________________________________

    To fix this, you can simply set input_shape=(256, 256, 3) when constructing VGG16; calling model.summary() will now give you:

    __________________________________________________________________________________________________
    Layer (type)                    Output Shape         Param #     Connected to
    ==================================================================================================
    input_1 (InputLayer)            [(None, 256, 256, 3) 0
    __________________________________________________________________________________________________
    input_3 (InputLayer)            [(None, 256, 256, 3) 0
    __________________________________________________________________________________________________
    img_vgg16 (Functional)          (None, 8, 8, 512)    14714688    input_1[0][0]
    __________________________________________________________________________________________________
    tm_vgg16 (Functional)           (None, 8, 8, 512)    14714688    input_3[0][0]
    __________________________________________________________________________________________________
    concatenate (Concatenate)       (None, 8, 8, 1024)   0           img_vgg16[0][0]
                                                                     tm_vgg16[0][0]
    __________________________________________________________________________________________________

    The main cause of the error is that when you call __next__(), it returns a tuple of two arrays, (data, labels), with shapes ((batch_size, 256, 256, 3), (batch_size, 1)), but we really only want the first one.
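
    For example, one batch from gen_img looks like this (a minimal sketch):

    batch = gen_img.__next__()
    imgs, labels = batch            # tuple of (images, one-hot labels)
    print(imgs.shape)               # (batch_size, 256, 256, 3)
    print(labels.shape)             # (batch_size, 1) -- not needed here
    img = batch[0]                  # keep only the image array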

    Also, the data generator should yield a tuple, not a list; otherwise no gradients will be provided for any variable, because fit() expects the generator to return (inputs, targets).
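
    Concretely, for the last line of twin_gen:

    # wrong: fit() treats a list as a single nested input, so no targets reach the loss
    yield [[img, tm], gt]
    # right: fit() unpacks a tuple as (inputs, targets)
    yield ([img, tm], gt)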

    You also have another problem: your model's output shape is (batch_size, 256, 256, 1), but your gen_truth elements have shape (batch_size, 256, 256, 3) because you load the gen_truth images with color_mode='rgb'. To match the model's output shape, load gen_truth with color_mode='grayscale' if you have grayscale images, or load it with color_mode='rgba' and take the last channel if you want the alpha values (I'm just guessing from the description in your question, but you should get the idea).
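
    If your ground-truth files really do store the matte in an alpha channel, extracting it inside twin_gen could look like this (an assumption based on your description, not something verified against the dataset):

    gt = gen_truth.__next__()[0]    # (batch_size, 256, 256, 4) with color_mode='rgba'
    gt = gt[..., -1:]               # keep only the alpha channel -> (batch_size, 256, 256, 1)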

    Example code that runs without any problems:

    import tensorflow as tf
    from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
    from tensorflow.keras.models import Model
    from tensorflow.keras.layers import Conv2DTranspose, Concatenate, BatchNormalization, Input
    from tensorflow.keras.preprocessing.image import ImageDataGenerator
    
    def DeConvBlock(input, num_output):
        x = Conv2DTranspose(num_output, kernel_size=3, strides=2, activation='relu', padding='same')(input)
        x = BatchNormalization()(x)
        x = Conv2DTranspose(num_output, kernel_size=3, strides=1, activation='relu', padding='same')(x)
        x = BatchNormalization()(x)
        x = Conv2DTranspose(num_output, kernel_size=3, strides=1, activation='relu', padding='same')(x)
        x = BatchNormalization()(x)
        return x
    
    img_input = Input((256, 256, 3))
    img_vgg16 = VGG16(include_top=False, input_shape=(256, 256, 3), weights='imagenet')  # explicit input_shape instead of the (None, None, 3) default
    img_vgg16._name = 'img_vgg16'
    img_vgg16.trainable = False
    
    tm_input = Input((256, 256, 3))
    tm_vgg16 = VGG16(include_top=False, input_shape=(256, 256, 3), weights='imagenet')  # same fix for the trimap branch
    tm_vgg16._name = 'tm_vgg16'
    tm_vgg16.trainable = False
    
    img_vgg16 = img_vgg16(img_input)
    tm_vgg16 = tm_vgg16(tm_input)
    x = Concatenate()([img_vgg16, tm_vgg16])
    x = DeConvBlock(x, 512)
    x = DeConvBlock(x, 256)
    x = DeConvBlock(x, 128)
    x = DeConvBlock(x, 64)
    x = DeConvBlock(x, 32)
    x = Conv2DTranspose(1, kernel_size=3, strides=1, activation='sigmoid', padding='same')(x)
    
    m = Model(inputs=[img_input, tm_input], outputs=x)
    m.summary()
    m.compile(optimizer='adam', loss='mse')
    
    gen = ImageDataGenerator(width_shift_range=0.1, rotation_range=30, height_shift_range=0.1, horizontal_flip=True, validation_split=0.2, preprocessing_function=preprocess_input)
    SEED = 49
    
    def twin_gen(generator, subset):
        gen_img = generator.flow_from_directory('./data', classes=['input_training_lowres'], seed=SEED, shuffle=False, subset=subset, color_mode='rgb')
        gen_map = generator.flow_from_directory('./data/trimap_training_lowres', classes=['Trimap1'], seed=SEED, shuffle=False, subset=subset, color_mode='rgb')
        # ground truth is loaded as single-channel images to match the model's (..., 1) output
        gen_truth = generator.flow_from_directory('./data', classes=['gt_training_lowres'], seed=SEED, shuffle=False, subset=subset, color_mode='grayscale')

        while True:
            # each __next__() returns (images, labels); keep only the image array
            img = gen_img.__next__()[0]
            tm = gen_map.__next__()[0]
            gt = gen_truth.__next__()[0]
            # yield a tuple so fit() unpacks it as (inputs, targets)
            yield ([img, tm], gt)
    
    train_gen = twin_gen(gen, 'training')
    
    r = m.fit(train_gen, steps_per_epoch=5, epochs=3)
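
    If you want the validation split from your original code back, the same generator works for the 'validation' subset; because twin_gen loops forever, fit() also needs validation_steps (the step counts here are placeholders, not tuned values):

    val_gen = twin_gen(gen, 'validation')
    r = m.fit(train_gen, steps_per_epoch=5, epochs=3,
              validation_data=val_gen, validation_steps=2)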