I'm working on adapting the code from the TensorFlow 2.0 DCGAN tutorial (https://www.tensorflow.org/beta/tutorials/generative/dcgan) to spectrograms of audio signals. I'm using librosa chroma_cqt to convert the raw audio data into a WxHx2 matrix, and using that as the input. When I attempt to create the seed matrix by upscaling random noise, the result I get is alternating bands (in time-space) of random noise and 0s, plus a thin black bar on the top (see image).
I've adapted the original tutorial code to work with various sized images, with good results for the seed image and the ultimate output, but the same principles are not getting me anywhere with the 3-dimensional data. How can I ensure that I'm making a seed with appropriate coverage, and that the issue won't persist when I actually train the model?
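For context, the input is built roughly along these lines (a minimal sketch, not my exact pipeline; shown with librosa.cqt, whose complex output matches the real/imaginary channel split used in the code below, and 'sample.wav' is a placeholder file name):
import librosa
import numpy as np
sr = 44100 / 2
y, _ = librosa.load('sample.wav', sr=sr)  # placeholder audio file
C = librosa.cqt(y, sr=sr)                 # complex-valued array, shape (n_bins, n_frames)
X = np.stack([C.real, C.imag])            # two channels: real and imaginary parts
print(X.shape)                            # (2, n_bins, n_frames)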
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
tf.__version__
import numpy as np
import os
from tensorflow.keras import layers
import librosa
import librosa.display
import matplotlib.pyplot as plt
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
sr = 44100/2
sample_path = os.getcwd()
def make_generator_model():
    model = tf.keras.Sequential()
    model.add(layers.Dense(2*7*19*128, use_bias=False, dtype='float32', input_shape=(361,)))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    model.add(layers.Reshape((2, 7, 19, 128)))
    assert model.output_shape == (None, 2, 7, 19, 128)  # Note: None is the batch size
    model.add(layers.Conv3DTranspose(128, (2, 5, 5), strides=(1, 6, 1), padding='same', use_bias=False))
    assert model.output_shape == (None, 2, 42, 19, 128)
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    model.add(layers.Conv3DTranspose(128, (2, 5, 5), strides=(1, 3, 19), padding='same', use_bias=False))
    assert model.output_shape == (None, 2, 126, 361, 128)
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    model.add(layers.Conv3DTranspose(1, (2, 5, 5), strides=(1, 2, 1), padding='same', use_bias=False, activation='tanh'))
    assert model.output_shape == (None, 2, 252, 361, 1)
    return model
generator = make_generator_model()
noise = tf.random.normal([1, 361])
generated_audio = generator(noise, training=False)
D = []
# Recombine the two channels into a complex spectrogram:
# channel 0 holds the real parts, channel 1 the imaginary parts.
for x in range(len(generated_audio[0][0])):
    this_line = []
    for y in range(len(generated_audio[0][0][x])):
        this_line.append(complex(float(generated_audio[0][0][x][y]),
                                 float(generated_audio[0][1][x][y])))
    D.append(this_line)
D = np.asarray(D)
librosa.display.specshow(librosa.amplitude_to_db(np.abs(D), ref=np.max),
sr=sr, x_axis='time', y_axis='cqt_note')
plt.axis('off')
plt.savefig(os.path.join(sample_path, 'image_at_epoch_fuzz.png'))
plt.show()
print(D.shape)
I'm outputting a visual representation of the audio noise, which should look like a completely fuzzy image. Instead I'm getting alternating bands of noise and big black vertical bars, plus that thin black bar on the top.
The question is ultimately: what rules do I need to follow to match the generator seed, kernel size, and strides? Can someone provide an example of how to programmatically ensure there isn't a mismatch between the strides and the kernel size for a given number of layers?
So the problem was ultimately about the relationship between the convolution kernel_size and the strides (for a better explanation of each term, see the Conv3DTranspose section here: https://keras.io/layers/convolutional/). The dense layer was fine to start with. In the original code, in the following Conv3DTranspose lines the kernel_size doesn't cover the stride in the height direction (5 < 6) or the width direction (5 < 19):
model.add(layers.Conv3DTranspose(128, (2, 5, 5), strides=(1, 6, 1), padding='same', use_bias=False))
model.add(layers.Conv3DTranspose(128, (2, 5, 5), strides=(1, 3, 19), padding='same', use_bias=False))
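To catch this programmatically (the second part of the question), a small check along these lines works before each layer is added; this is just a sketch, and the helper name is my own:
def check_kernel_covers_stride(kernel_size, strides):
    # With padding='same', any dimension where the stride exceeds the kernel
    # leaves output positions that no kernel tap can reach, which is exactly
    # what produces the banded 0s in the seed image.
    for dim, (k, s) in enumerate(zip(kernel_size, strides)):
        if k < s:
            raise ValueError(f'dim {dim}: kernel_size {k} < stride {s}')

check_kernel_covers_stride((1, 6, 1), (1, 6, 1))  # passes
check_kernel_covers_stride((2, 5, 5), (1, 6, 1))  # raises: 5 < 6 in dim 1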
The problem is fixed by making sure each kernel_size dimension is at least as large as the corresponding strides dimension. Here is the fixed-up code:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
tf.__version__
import numpy as np
import os
from tensorflow.keras import layers
import librosa
import librosa.display
import matplotlib.pyplot as plt
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
sr = 44100/2
sample_path = os.getcwd()
def make_generator_model():
    model = tf.keras.Sequential()
    model.add(layers.Dense(2*7*19*128, use_bias=False, dtype='float64', input_shape=(50,)))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    model.add(layers.Reshape((2, 7, 19, 128)))
    #assert model.output_shape == (None, 2, 7, 19, 128)  # Note: None is the batch size
    model.add(layers.Conv3DTranspose(128, (1, 6, 1), strides=(1, 6, 1), padding='same', use_bias=False))
    #assert model.output_shape == (None, 2, 42, 19, 128)
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    model.add(layers.Conv3DTranspose(128, (1, 3, 19), strides=(1, 3, 19), padding='same', use_bias=False))
    #assert model.output_shape == (None, 2, 126, 361, 128)
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    model.add(layers.Conv3DTranspose(1, (1, 2, 1), strides=(1, 2, 1), padding='same', use_bias=False, activation='tanh'))
    #assert model.output_shape == (None, 2, 252, 361, 1)
    return model
generator = make_generator_model()
noise = tf.random.normal([1, 50])
generated_audio = generator(noise, training=False)
D = []
# Recombine the two channels into a complex spectrogram:
# channel 0 holds the real parts, channel 1 the imaginary parts.
for x in range(len(generated_audio[0][0])):
    this_line = []
    for y in range(len(generated_audio[0][0][x])):
        this_line.append(complex(float(generated_audio[0][0][x][y]),
                                 float(generated_audio[0][1][x][y])))
    D.append(this_line)
D = np.asarray(D)
librosa.display.specshow(librosa.amplitude_to_db(np.abs(D), ref=np.max),
sr=sr, x_axis='time', y_axis='cqt_note')
plt.axis('off')
plt.savefig(os.path.join(sample_path, 'image_at_epoch_fuzz.png'))
plt.show()
print(D.shape)
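More generally, the mismatch can be avoided by construction: derive each kernel_size from the strides when adding an upsampling block. A sketch (the helper name is my own; kernel_size equal to strides is one valid choice, and any per-dimension kernel >= stride also satisfies the rule):
def add_upsample_block(model, filters, strides, activation=None):
    # kernel_size == strides guarantees every output position is covered
    # exactly once, so no banded 0s can appear in the upsampled output.
    model.add(layers.Conv3DTranspose(filters, kernel_size=strides,
                                     strides=strides, padding='same',
                                     use_bias=False, activation=activation))
    if activation is None:
        model.add(layers.BatchNormalization())
        model.add(layers.LeakyReLU())

# The three fixed layers above could then be written as:
# add_upsample_block(model, 128, (1, 6, 1))
# add_upsample_block(model, 128, (1, 3, 19))
# add_upsample_block(model, 1, (1, 2, 1), activation='tanh')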