Tags: python, list, loops, machine-learning, audio

Split an audio file into chunks, skip the chunks shorter than a desired duration, and predict the emotion of the entire audio file


I am classifying audio signals into emotion classes using a model from Hugging Face (Ridzuan/Audio_Emotion_Classifier) which only takes in 8 seconds of audio, so I split the audio into 8-second files.

I have split the file 'A' into a1, a2, a3, a4, a5, a6, a7, a8. I used the model to classify each chunk, but now I need to find the overall class of audio file 'A' by taking the mean of the predictions for files a1-a8. How can I do this?

import os
from pathlib import Path

import librosa
import pandas as pd
from pydub import AudioSegment
from tqdm import tqdm

names = []
# resize the audio files into <8 second chunks for prediction using Ridzuan/Audio_Emotion_Classifier (Hugging Face)
for file in tqdm(Path("D:/program/SER_DATA_sample/exg/").glob("**/*.wav")):
    name = os.path.basename(file).split('.')[0]
    names.append(name)
    names_df = pd.DataFrame(names)

    audio_path = 'C:/Users/XTEND/PycharmProjects/DATA_EVAL/RESAMPLE/'
    # label = os.path.basename(audio_path).split('_')[1].split('.')[0]

    audio = AudioSegment.from_file(file)
    length_audio = len(audio)  # pydub lengths are in milliseconds
    print("Length of Audio File (ms):", length_audio)

    start = 0
    # chunk length in milliseconds; 7000 ms = 7 s, under the model's 8 s limit
    threshold = 7000
    end = 0
    counter = 0
    num_split = length_audio/threshold
    print(num_split)

    while start < len(audio):
        end += threshold
        print(start, end)
        segment = audio[start:end]  # slice the next 7 s window (pydub slices by ms)
        filename = f'RESAMPLE/{counter}{name}.wav'
        segment.export(filename, format="wav")
        print(segment)
        counter += 1
        start += threshold

    # count the exported chunk files (once, after the split loop)
    file_path = 'C:/Users/XTEND/PycharmProjects/DATA_EVAL/RESAMPLE/'
    # file_path = 'D:/program/XTEND_AUDIO_DATASET/ps/'
    dir_list = os.listdir(file_path)
    number_files = len(dir_list)
    print(number_files)

    emo_df = []
    paths = []
    count = 0

    for i in dir_list:
        audio_data = file_path + i
        paths.append(audio_data)
        audio_path_df = pd.DataFrame(paths, columns=['Path'])
        count += 1
        print(count, audio_data)

        data_ori, sample_rate = librosa.load(audio_data)
        data, _ = librosa.effects.trim(data_ori)  # trim leading/trailing silence

        test = prepare_test(audio_data)
        pred = classifier.predict(test)
        pred_df = pd.DataFrame(pred.T, index=['anger', 'happiness', 'neutral', 'sadness', 'surprised'],
                               columns=['Scores'])
        print(pred_df)
        emo = pred_df['Scores'].idxmax()  # emotion with the highest score
        print(emo)
        emo_df.append(emo)
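
For the aggregation step the question asks about, one option (a minimal sketch; it assumes `classifier.predict` returns one score per emotion class, as in the code above) is to collect the raw `pred` vector for each chunk and take the argmax of their mean:

import numpy as np

EMOTIONS = ['anger', 'happiness', 'neutral', 'sadness', 'surprised']

def overall_emotion(chunk_scores):
    # chunk_scores: one 1-D score vector per chunk, e.g. pred.ravel()
    # collected inside the prediction loop above
    mean_scores = np.mean(np.vstack(chunk_scores), axis=0)  # mean over all chunks
    return EMOTIONS[int(np.argmax(mean_scores))]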

Solution

  • I found the solution by experimenting with the code, so I am posting it for anyone who can use it.

        import os
        from pathlib import Path

        import pandas as pd
        from pydub import AudioSegment
        from pydub.utils import make_chunks
        from tqdm import tqdm

        cnt = 0
        emo_max_list = []
        file_path = []
        for file in tqdm(Path("path/ of/ input/ audio/data/").glob("**/*.wav")):
            name = os.path.basename(file).split('.')[0]
            cnt += 1

            file_path.append(file)
            file_path_df = pd.DataFrame(file_path)

            myaudio = AudioSegment.from_file(file)
            chunk_length_ms = 7000
            chunks = make_chunks(myaudio, chunk_length_ms)

            # export the individual chunks as wav files and predict each one
            emo_list = []
            chunk_path = []
            for i, chunk in enumerate(chunks):
                duration_seconds = len(chunk) / 1000  # pydub lengths are in ms
                if duration_seconds < 3:
                    # skip chunks shorter than the desired minimum duration
                    print(f"Skipping chunk {i} - duration less than 3 seconds")
                    continue

                chunk_name = f'path/to/store/audio/chunks/{i}{name}.wav'
                chunk_path.append(chunk_name)
                print("exporting", cnt, chunk_name)
                chunk.export(chunk_name, format="wav")

                # predict the emotion of the chunk
                test = prepare_test(chunk_name)
                pred = classifier.predict(test)
                pred_df = pd.DataFrame(pred.T, index=['anger', 'happiness', 'neutral', 'sadness', 'surprised'],
                                       columns=['Scores'])
                print(pred_df)
                emo = pred_df['Scores'].idxmax()
                print(emo)
                emo_list.append(emo)

            chunks_path = pd.DataFrame(chunk_path, columns=['path'])
            emo_list_df = pd.DataFrame(emo_list, columns=['emotion'])
            chunk_pred = pd.concat([chunks_path, emo_list_df], axis=1)

            # overall emotion of each file: the most frequent chunk-level prediction
            if emo_list:  # guard against files where every chunk was skipped
                emo_max = max(set(emo_list), key=emo_list.count)
                print(emo_max)
                emo_max_list.append(emo_max)
      
    

    What I did was split the audio files into chunks according to this, then pass on only the chunks longer than the minimum length you want (3 seconds here). I then ran the prediction chunk by chunk, which produces a list of predicted emotions, and took the most frequent emotion in that list as the overall class of the file.
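
    The same majority vote can also be written with collections.Counter (an alternative sketch, not from the original answer):

        from collections import Counter

        emo_list = ['anger', 'anger', 'neutral', 'sadness']  # example chunk predictions
        emo_max, n = Counter(emo_list).most_common(1)[0]  # most frequent label and its count
        print(emo_max, n)  # anger 2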