I am classifying audio signals into emotion classes using a model from Hugging Face that only accepts 8 seconds of audio, so I split each recording into 8-second files.
I have split the file 'A' into a1, a2, a3, a4, a5, a6, a7, a8. I used the model to classify each piece, but now I need to find the overall class of audio file 'A' by combining (e.g., averaging) the predictions for a1–a8. How can I do this? Please help.
# Split every source .wav into <= 7 s segments (the HuggingFace model
# Ridzuan/Audio_Emotion_Classifier accepts at most 8 s of audio), export the
# segments, then classify each exported segment and collect its emotion label.
names = []
for file in tqdm(Path("D:/program/SER_DATA_sample/exg/").glob("**/*.wav")):
    # File stem, e.g. "A" from "A.wav".
    name = os.path.basename(file).split('.')[0]
    names.append(name)  # BUG FIX: original did names.append(names) — appended the list to itself

    audio = AudioSegment.from_file(file)
    length_audio = len(audio)  # pydub lengths are in milliseconds
    print("Length of Audio File", length_audio)

    threshold = 7000  # segment length in ms (7 s, safely under the 8 s model limit)
    print(length_audio / threshold)  # expected number of segments

    start = 0
    counter = 0
    while start < length_audio:
        # pydub slicing clamps past the end, so the final segment may be shorter.
        segment = audio[start:start + threshold]  # renamed: no longer shadows the loop var `file`
        filename = f'RESAMPLE/{counter}{name}.wav'
        segment.export(filename, format="wav")
        print(segment)
        counter += 1
        start += threshold

# Build the DataFrame once, after the loop (original rebuilt it every iteration).
names_df = pd.DataFrame(names)

file_path = 'C:/Users/XTEND/PycharmProjects/DATA_EVAL/RESAMPLE/'
dir_list = os.listdir(file_path)
print(len(dir_list))  # number of exported segment files

emo_df = []  # predicted emotion label for each segment
paths = []   # full path of each segment, in the same order
for count, fname in enumerate(dir_list, start=1):
    audio_data = file_path + fname
    paths.append(audio_data)
    print(count, audio_data)

    # NOTE(review): the trimmed audio below is never used — prepare_test
    # reloads from the path. Kept for parity with the original; confirm
    # whether prepare_test is supposed to receive `data` instead.
    data_ori, sample_rate = librosa.load(audio_data)
    data, _ = librosa.effects.trim(data_ori)

    test = prepare_test(audio_data)
    pred = classifier.predict(test)
    pred_df = pd.DataFrame(pred.T,
                           index=['anger', 'happiness', 'neutral', 'sadness', 'surprised'],
                           columns=['Scores'])
    print(pred_df)

    emo = pred_df['Scores'].idxmax()  # label with the highest score
    print(emo)
    emo_df.append(emo)

# Build the paths DataFrame once, after the loop.
audio_path_df = pd.DataFrame(paths, columns=['Path'])
I found the solution by experimenting with the code, so I am posting it here for anyone who can use it.
# For each input .wav: split it into 7 s chunks, classify every chunk that is
# at least 3 s long, and take the majority-vote emotion as the overall class
# of the original file.
cnt = 0
emo_max_list = []  # one overall (majority-vote) emotion per input file
file_path = []     # path of every input file, in processing order
for file in tqdm(Path("path/ of/ input/ audio/data/").glob("**/*.wav")):
    name = os.path.basename(file).split('.')[0]
    cnt += 1
    file_path.append(file)

    myaudio = AudioSegment.from_file(file)
    chunk_length_ms = 7000  # 7 s, safely under the model's 8 s input limit
    chunks = make_chunks(myaudio, chunk_length_ms)
    # (Removed the dead `if chunk_length_ms < 1000: os.remove(myaudio)` branch:
    # the condition is constant-false, and os.remove cannot take an AudioSegment.)

    emo_list = []    # predicted emotion of each exported chunk of this file
    chunk_path = []  # paths of the exported chunk files
    for i, chunk in enumerate(chunks):
        duration_seconds = len(chunk) / 1000  # pydub lengths are in ms
        if duration_seconds >= 3:
            # BUG FIX: dropped the spurious `.format(i)` on the f-string.
            chunk_name = f'path/to/store/audio/chunks/{i}{name}.wav'
            chunk_path.append(chunk_name)
            print("exporting", cnt, chunk_name)
            chunk.export(chunk_name, format="wav")

            # BUG FIX: predict only on chunks that were actually exported —
            # the original also predicted on skipped chunks, reusing a stale
            # (or undefined) chunk_name.
            test = prepare_test(chunk_name)
            pred = classifier.predict(test)
            pred_df = pd.DataFrame(pred.T,
                                   index=['anger', 'happiness', 'neutral', 'sadness', 'surprised'],
                                   columns=['Scores'])
            print(pred_df)
            emo = pred_df['Scores'].idxmax()  # label with the highest score
            print(emo)
            emo_list.append(emo)
        else:
            # Message now matches the actual 3 s threshold tested above.
            print(f"Skipping chunk {i} - duration less than 3 seconds")

    chunks_path = pd.DataFrame(chunk_path, columns=['path'])
    emo_list_df = pd.DataFrame(emo_list, columns=['emotion'])
    chunk_pred = pd.concat([chunks_path, emo_list_df], axis=1)

    # Overall emotion = most frequent chunk-level prediction.
    # BUG FIX: original was `max(set(emo_list), key=emo_list.count()` — a
    # missing ')' and a called key; the key must be the bound method itself.
    if emo_list:  # guard: a file whose every chunk is < 3 s yields no predictions
        emo_max = max(set(emo_list), key=emo_list.count)
        print(emo_max)
        emo_max_list.append(emo_max)

file_path_df = pd.DataFrame(file_path)
What I did was split the audio files into chunks as shown above, passing along only the chunks longer than the minimum length (here 3 seconds, or whatever length you want). Then I ran the prediction on each chunk one by one, which produces a list of predicted emotions, and took the most frequent emotion in that list as the overall class of the file.