python tensorflow keras tensorflow2.0 dcgan

How to ensure TensorFlow Generator upsampling process creates seed with full coverage of random noise?


I'm working on adapting the code from the TensorFlow 2.0 DCGAN tutorial (https://www.tensorflow.org/beta/tutorials/generative/dcgan) to a spectrogram of audio signals. I'm using librosa's chroma_cqt to convert the raw audio data into a WxHx2 matrix, and using that as the input. When I attempt to create the seed matrix by upscaling random noise, the result I'm getting is alternating bands, along the time axis, of random noise and 0s, plus a thin black bar on the top (see image: barred noise).

I've adapted the original tutorial code to work with various sized images with good results for the seed image and the ultimate output, but the same principles are not leading me anywhere with the 3-dimensional data. How can I ensure that I'm making a seed with appropriate coverage, and not carrying the issue over into actually training the model?

from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

tf.__version__

import numpy as np
import os
from tensorflow.keras import layers
import librosa
import librosa.display

import matplotlib.pyplot as plt

# Set CUDA_VISIBLE_DEVICES to '-1' (presumably to hide GPUs so TensorFlow runs on the CPU).
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Sample rate handed to librosa's specshow further down; true division makes it the float 22050.0.
sr = 44100/2
# Output directory for the saved figure: the current working directory at start-up.
sample_path = os.getcwd()


def make_generator_model():
    """Build the generator: a Sequential model that upsamples a noise vector.

    A length-361 input is projected by a Dense layer, reshaped to
    (2, 7, 19, 128), and passed through three Conv3DTranspose layers. The
    asserts pin the output shape after each stage, ending at
    (None, 2, 252, 361, 1) with a tanh activation on the last layer.

    Returns:
        The assembled, untrained tf.keras.Sequential model.
    """
    model = tf.keras.Sequential()
    # Project the noise to exactly as many values as the Reshape below consumes.
    model.add(layers.Dense(2*7*19*128, use_bias=False, dtype='float32', input_shape=(361,)))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())

    model.add(layers.Reshape((2 ,7, 19, 128)))
    assert model.output_shape == (None,2, 7, 19, 128) # Note: None is the batch size

    # NOTE(review): on the second axis the kernel extent (5) is smaller than the
    # stride (6), so the kernel cannot reach every output position along it.
    model.add(layers.Conv3DTranspose(128, (2, 5, 5), strides=(1, 6, 1), padding='same', use_bias=False))
    assert model.output_shape == (None, 2, 42, 19, 128)
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())

    # NOTE(review): on the third axis the kernel extent (5) is smaller than the
    # stride (19), leaving output positions along it that no kernel tap covers.
    model.add(layers.Conv3DTranspose(128, (2, 5, 5), strides=(1, 3, 19), padding='same', use_bias=False))
    assert model.output_shape == (None, 2, 126, 361, 128)
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())

    # Final layer collapses to a single channel; here every kernel extent is
    # at least as large as the matching stride.
    model.add(layers.Conv3DTranspose(1, (2, 5, 5), strides=(1, 2, 1), padding='same', use_bias=False, activation='tanh'))
    assert model.output_shape == (None, 2, 252, 361, 1)

    return model


# Build the generator and push one random noise vector through it in
# inference mode (training=False).
generator = make_generator_model()
noise = tf.random.normal([1, 361])
generated_audio = generator(noise, training=False)


# Assemble the complex matrix D from the two planes of the first (only) batch
# item: plane 0 supplies the real part and plane 1 the imaginary part. The
# trailing singleton channel axis is dropped. Casting to float64 first keeps
# the complex128 result the original element-by-element loop produced, and it
# avoids np.complex, which has been removed from NumPy.
planes = np.asarray(generated_audio[0], dtype=np.float64)
D = planes[0, ..., 0] + 1j * planes[1, ..., 0]


# Render the magnitude (in dB relative to the peak) without axes, save it
# next to the working directory, then display it.
librosa.display.specshow(librosa.amplitude_to_db(np.abs(D), ref=np.max),
                         sr=sr, x_axis='time', y_axis='cqt_note')
plt.axis('off')
# os.path.join picks the right separator on every platform instead of a
# hard-coded backslash.
plt.savefig(os.path.join(sample_path, 'image_at_epoch_fuzz.png'))
plt.show()


print(D.shape)

I'm outputting a visual representation of the audio noise, which should look like an image that is completely fuzzy. Instead I'm getting alternating noise and big black vertical bars and that thin black bar on the top.

The question is ultimately what rules do I need to follow to match the generator seed, kernel size, and strides? Can someone provide an example of how to programmatically ensure there isn't a mismatch in the strides and kernel size for a given number of layers?


Solution

  • The problem was ultimately about the relationship between the convolution kernel_size and the strides (for a better explanation of each term, see the Conv3DTranspose section here: https://keras.io/layers/convolutional/). The dense layer was fine to start with. In the original code, the kernel_size of the following Conv3DTranspose layers doesn't cover the stride: in the height direction for the first layer (5 < 6) and in the width direction for the second layer (5 < 19):

    model.add(layers.Conv3DTranspose(128, (2, 5, 5), strides=(1, 6, 1), padding='same', use_bias=False))
    
    model.add(layers.Conv3DTranspose(128, (2, 5, 5), strides=(1, 3, 19), padding='same', use_bias=False))
    

    The problem is fixed by making sure that each dimension of kernel_size is at least as large as the corresponding strides dimension (here they are set equal). Here is the fixed-up code:

    from __future__ import absolute_import, division, print_function, unicode_literals
    
    import tensorflow as tf
    
    tf.__version__
    
    import numpy as np
    import os
    from tensorflow.keras import layers
    import librosa
    import librosa.display
    
    import matplotlib.pyplot as plt
    
    # Set CUDA_VISIBLE_DEVICES to '-1' (presumably to hide GPUs so TensorFlow runs on the CPU).
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    
    # Sample rate handed to librosa's specshow further down; true division makes it the float 22050.0.
    sr = 44100/2
    # Output directory for the saved figure: the current working directory at start-up.
    sample_path = os.getcwd()
    
    
    def make_generator_model():
        """Build the generator: a Sequential model that upsamples a noise vector.

        A length-50 input is projected by a Dense layer, reshaped to
        (2, 7, 19, 128), and upsampled by three Conv3DTranspose layers whose
        kernel_size equals their strides, so every output position is covered
        by a kernel tap. The last layer emits one channel through tanh.

        Returns:
            The assembled, untrained tf.keras.Sequential model.
        """
        model = tf.keras.Sequential()
        # Project the noise to exactly as many values as the Reshape below consumes.
        model.add(layers.Dense(2*7*19*128, use_bias=False, dtype='float64', input_shape=(50,)))
        model.add(layers.BatchNormalization())
        model.add(layers.LeakyReLU())

        model.add(layers.Reshape((2, 7, 19, 128)))
        # Expected output shape: (None, 2, 7, 19, 128); None is the batch size.

        model.add(layers.Conv3DTranspose(128, (1, 6, 1), strides=(1, 6, 1), padding='same', use_bias=False))
        # Expected output shape: (None, 2, 42, 19, 128)
        model.add(layers.BatchNormalization())
        # Fixed: the layer class is LeakyReLU; `LeakyReL` raised AttributeError.
        model.add(layers.LeakyReLU())

        model.add(layers.Conv3DTranspose(128, (1, 3, 19), strides=(1, 3, 19), padding='same', use_bias=False))
        # Expected output shape: (None, 2, 126, 361, 128)
        model.add(layers.BatchNormalization())
        model.add(layers.LeakyReLU())

        model.add(layers.Conv3DTranspose(1, (1, 2, 1), strides=(1, 2, 1), padding='same', use_bias=False, activation='tanh'))
        # Expected output shape: (None, 2, 252, 361, 1)

        return model
    
    
    # Build the generator and push one random noise vector through it in
    # inference mode (training=False).
    generator = make_generator_model()
    noise = tf.random.normal([1, 50])
    generated_audio = generator(noise, training=False)


    # Assemble the complex matrix D from the two planes of the first (only)
    # batch item: plane 0 supplies the real part and plane 1 the imaginary
    # part. The trailing singleton channel axis is dropped. Casting to float64
    # first keeps the complex128 result the original element-by-element loop
    # produced, and it avoids np.complex, which has been removed from NumPy.
    planes = np.asarray(generated_audio[0], dtype=np.float64)
    D = planes[0, ..., 0] + 1j * planes[1, ..., 0]


    # Render the magnitude (in dB relative to the peak) without axes, save it
    # next to the working directory, then display it.
    librosa.display.specshow(librosa.amplitude_to_db(np.abs(D), ref=np.max),
                             sr=sr, x_axis='time', y_axis='cqt_note')
    plt.axis('off')
    # os.path.join picks the right separator on every platform instead of a
    # hard-coded backslash.
    plt.savefig(os.path.join(sample_path, 'image_at_epoch_fuzz.png'))
    plt.show()


    print(D.shape)
    

    result: properly upsampled noise image