I want to classify audio files using a CNN. The usual approach is to convert them to spectrograms and then train on those.
This works fine and is described in TensorFlow's tutorial, for example.
However, I am now looking for a solution that does not compute the spectrograms in advance of training, but instead computes them inside the model itself, through a layer.
Why?
Because I would like to tune the spectrogram settings via hyperparameter optimization, or, ideally, have the network itself learn the best parameters.
I have built a small example. The first part is taken directly from the TensorFlow tutorial.
import os
import pathlib
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers
from tensorflow.keras import models
DATASET_PATH = 'data/mini_speech_commands'
data_dir = pathlib.Path(DATASET_PATH)
if not data_dir.exists():
    tf.keras.utils.get_file(
        'mini_speech_commands.zip',
        origin="http://storage.googleapis.com/download.tensorflow.org/data/mini_speech_commands.zip",
        extract=True,
        cache_dir='.', cache_subdir='data')
filenames = tf.io.gfile.glob(str(data_dir) + '/*/*')
filenames = tf.random.shuffle(filenames)
num_samples = len(filenames)
print('Number of total examples:', num_samples)
print('Example file tensor:', filenames[0])
commands = np.array(tf.io.gfile.listdir(str(data_dir)))
commands = commands[commands != 'README.md']
print('Commands:', commands)
train_files = filenames[:6400]
val_files = filenames[6400: 6400 + 800]
test_files = filenames[-800:]
print('Training set size', len(train_files))
print('Validation set size', len(val_files))
print('Test set size', len(test_files))
def decode_audio(audio_binary):
    audio, _ = tf.audio.decode_wav(contents=audio_binary)
    return tf.squeeze(audio, axis=-1)

def get_label(file_path):
    parts = tf.strings.split(
        input=file_path,
        sep=os.path.sep)
    return parts[-2]

def get_waveform_and_label(file_path):
    label = get_label(file_path)
    audio_binary = tf.io.read_file(file_path)
    waveform = decode_audio(audio_binary)
    return waveform, label
AUTOTUNE = tf.data.AUTOTUNE
files_ds = tf.data.Dataset.from_tensor_slices(train_files)
waveform_ds = files_ds.map(
    map_func=get_waveform_and_label,
    num_parallel_calls=AUTOTUNE)
I tried to create a subclass of layers.Layer that does what I want.
from scipy import signal

class SpectrogramTransform(layers.Layer):
    def __init__(self, fs=52000, nperseg=64, noverlap=32, nfft=16000):
        super(SpectrogramTransform, self).__init__()
        self.fs = fs
        self.nperseg = nperseg
        self.noverlap = noverlap
        self.nfft = nfft

    def calculate_spectrogram(self, inputs):
        _, _, Sxx = signal.spectrogram(
            x=inputs,
            fs=self.fs,
            nfft=self.nfft,
            nperseg=self.nperseg,
            noverlap=self.noverlap,
            mode="magnitude",
        )
        return Sxx

    def call(self, inputs):
        # convert tensor to numpy
        sess = tf.compat.v1.InteractiveSession()
        inputs = inputs.eval(session=sess)
        Sxx = self.calculate_spectrogram(inputs)
        # convert numpy back to tensor
        Sxx = tf.convert_to_tensor(Sxx, dtype=tf.float32)
        return Sxx
test_file = tf.io.read_file(DATASET_PATH+'/down/0a9f9af7_nohash_0.wav')
test_audio, _ = tf.audio.decode_wav(contents=test_file)
print(test_audio.shape)
num_labels = len(commands)
model = models.Sequential([
    layers.Input(shape=test_audio.shape),
    SpectrogramTransform(),
    layers.Conv2D(32, 3, activation='relu'),
    layers.Conv2D(64, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Dropout(0.25),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(num_labels),
])
There seems to be something wrong with the dimensions, at the very least, but apparently there is even more wrong :)
I also don't get a very meaningful error message:
Call arguments received by layer "spectrogram_transform" (type SpectrogramTransform): • inputs=tf.Tensor(shape=(None, 13654, 1), dtype=float32)
Does anyone have an idea, or has anyone already done something similar? Thank you very much in advance.
Thank you @djinn, that didn't fix my problem, but it's generally quite a good suggestion for improvement.
However, I have now found a solution myself. I guess the problem was the conversion between TF tensors and NumPy arrays. Now I do the whole spectrogram calculation directly on the tensors; see the documentation of tfio.audio.spectrogram.
Here is how I changed my class. I also made some small changes to my dataset, but they are not that important here; if anyone is interested, let me know and I'll add them.
import tensorflow_io as tfio

class SpectrogramTransform(layers.Layer):
    def __init__(self, window=512, stride=256, nfft=16000):
        super(SpectrogramTransform, self).__init__()
        self.window = window
        self.stride = stride
        self.nfft = nfft

    def call(self, waveform):
        return tfio.audio.spectrogram(
            input=waveform, nfft=self.nfft, window=self.window, stride=self.stride)
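For completeness, a minimal sketch of how such a layer could be plugged into a model (not exactly my setup: it assumes fixed-length waveforms of 16000 samples, batched input, and only illustrative window/stride/nfft values; tf.expand_dims adds the channel axis that Conv2D expects):

import tensorflow as tf
import tensorflow_io as tfio
from tensorflow.keras import layers, models

num_labels = 8  # e.g. the eight classes of mini_speech_commands

model = models.Sequential([
    layers.Input(shape=(16000,)),                            # raw waveform, fixed length assumed
    SpectrogramTransform(window=512, stride=256, nfft=512),  # illustrative values
    layers.Lambda(lambda x: tf.expand_dims(x, axis=-1)),     # add a channel axis for Conv2D
    layers.Conv2D(32, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(num_labels),
])
model.summary()

With the spectrogram computed inside the model, window, stride and nfft can now be exposed as hyperparameters to a tuner such as KerasTuner, which was the original motivation.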