I want to increase the volume of a specific frequency in a WAV file, making it louder (more audible) than the rest of the frequencies.
What I've done so far (or at least I believe so) is find the specific frequency I want to boost. I am stuck at the part where I have to increase its gain: it is adding an annoying buzzing sound to my audio.
Here is the code:
import array
from pydub import AudioSegment
from pydub.generators import Sine
import numpy as np
from scipy.fft import fft
import sezone.fmanagement as file_management
import math
def find_frequencies_in_segment(segment: AudioSegment) -> tuple:
    """Run an FFT over the segment's raw samples.

    Returns a ``(freqs, fft_result)`` pair: the frequency of each FFT bin in
    Hz (positive and negative, per ``fftfreq``'s layout) and the complex FFT
    coefficients, both of length ``len(samples)``.

    NOTE(review): ``get_array_of_samples`` interleaves channels for stereo
    audio — presumably this is only ever called on mono segments; confirm.
    """
    raw_data = np.array(segment.get_array_of_samples())
    fft_result = fft(raw_data)
    # Bin spacing is set by the sample period 1/frame_rate.
    freqs = np.fft.fftfreq(len(fft_result), d=1/segment.frame_rate)
    return (freqs, fft_result)
def create_audio_sample() -> AudioSegment:
    """Build a short two-tone test clip: 500 ms of 60 Hz followed by 125 ms of 100 Hz.

    The result is attenuated by 20 dB so that a later gain boost on one of the
    tones is easy to hear against the rest of the clip.
    """
    low_tone = Sine(60).to_audio_segment(duration=500)
    medium_tone = Sine(100).to_audio_segment(duration=125)
    combined: AudioSegment = low_tone + medium_tone
    # Quieter baseline makes any post-processing volume change obvious.
    return combined - 20
def calc_scale_factor(original_sample: AudioSegment, fft_result: array) -> float:
max_original_amp = np.array(original_sample.get_array_of_samples()).max()
max_fft_amp = np.array(fft_result).max()
return max_original_amp/max_fft_amp
def find_indexes_with_bandwidth(samples, amplitude: int, bandwith: int) -> list:
    """Return the indexes of samples whose value is within +/- ``bandwith`` of ``amplitude``.

    Fixed the return annotation (a plain list is returned, not ``np.ndarray``)
    and replaced the manual index loop with a comprehension.

    NOTE(review): this matches instantaneous sample VALUES, not frequency
    content — samples of any frequency that happen to pass through the target
    amplitude are also selected, which is a likely source of artifacts
    downstream.

    :param samples: sequence of sample values
    :param amplitude: target value to match
    :param bandwith: half-width of the acceptance window around each sample
    :return: list of matching indexes, in ascending order
    """
    return [i for i, value in enumerate(samples)
            if value - bandwith <= amplitude <= value + bandwith]
def original_sample_index_for_frequency(audio: AudioSegment, frequency: int) -> list:
    """Find sample indexes in ``audio`` attributed to ``frequency`` (in Hz).

    Strategy: take the FFT, read off the magnitude of the bin matching
    ``frequency``, rescale that magnitude into the raw sample range, then
    collect every raw sample whose value is close to it.

    NOTE(review): matching time-domain samples by amplitude does not isolate a
    frequency — every sinusoid sweeps through every amplitude below its peak,
    so unrelated samples get selected and later gain changes introduce
    discontinuities (the buzzing). A band-pass/peaking filter would be the
    robust approach.
    """
    # Apply FFT to get the frequencies
    (freqs, fft_result) = find_frequencies_in_segment(audio)
    # spectral magnitude
    amps = np.abs(fft_result)
    # Bins whose (floored) center frequency equals the target.
    filtered_indexes = [i for i in range(len(freqs)) if math.floor(freqs[i]) == frequency]
    # Magnitude of the desired frequency, from the spectral magnitude returned
    # by the FFT. NOTE(review): raises IndexError if no bin matched — confirm
    # callers only pass frequencies resolvable at this FFT length.
    freq_amp = amps[filtered_indexes[0]]
    # Rescale the FFT magnitude into the original sample-amplitude range.
    original_amp = freq_amp * calc_scale_factor(audio, amps)
    original_samples = audio.get_array_of_samples()
    # +/- 2 tolerance around the rescaled amplitude.
    indexes = find_indexes_with_bandwidth(original_samples, original_amp, 2)
    # The indexes in the original sample array attributed to the target frequency.
    return indexes
def apply_gain_pydub(audio: AudioSegment, indexes) -> AudioSegment:
    """Boost each listed sample by +20 dB by splicing pydub slices together.

    ``indexes`` must be sorted ascending. Unchanged stretches between boosted
    samples are copied as-is; each boosted sample is sliced out, amplified,
    and appended in order. The result is also exported as a WAV file.

    Bug fixed: pydub's ``get_sample_slice(start, end)`` end bound is
    EXCLUSIVE. The original passed ``i - 1`` (and ``len - 1`` for the tail),
    silently dropping one sample per splice — audible as clicks/buzz.

    :param audio: source segment
    :param indexes: sorted sample indexes to boost
    :return: the re-spliced segment
    """
    result = audio.get_sample_slice(0, 0)  # empty accumulator segment
    processed_index = -1
    for i in indexes:
        # Unchanged run since the last boosted sample: samples
        # [processed_index + 1, i) — end bound exclusive, so pass i, not i - 1.
        result += audio.get_sample_slice(processed_index + 1, i)
        # The single sample to boost, amplified by +20 dB.
        result += audio.get_sample_slice(i, i + 1).apply_gain(20)
        processed_index = i
    # Remaining tail after the last boosted sample; full length, not len - 1,
    # again because the end bound is exclusive.
    result += audio.get_sample_slice(processed_index + 1, len(audio.get_array_of_samples()))
    result.export("../output/new_audio_pydub.wav", format="wav")
    return result
def apply_gain_manually(audio: AudioSegment, indexes) -> AudioSegment:
    """Boost each listed sample by +20 dB directly on the raw sample array.

    Fixes: the rebuilt segment now mirrors the source's ``sample_width`` and
    ``channels`` instead of hard-coding 16-bit mono, and boosted values are
    clamped to the signed sample range — writing an out-of-range value into
    the backing ``array`` would otherwise raise ``OverflowError`` (or wrap,
    sounding like noise). The result is also exported as a WAV file.

    :param audio: source segment
    :param indexes: sample indexes to boost
    :return: a new segment with the boosted samples
    """
    original_samples = audio.get_array_of_samples()
    # +20 dB is a linear factor of 10; constant, so hoisted out of the loop.
    gain = 10 ** (20 / 20.0)
    # Signed range for the segment's sample width (e.g. +/-2**15 for 16-bit).
    limit = 2 ** (8 * audio.sample_width - 1)
    for i in indexes:
        boosted = int(original_samples[i] * gain)
        original_samples[i] = max(-limit, min(limit - 1, boosted))
    reconstructed_audio = AudioSegment(original_samples.tobytes(),
                                       frame_rate=audio.frame_rate,
                                       sample_width=audio.sample_width,
                                       channels=audio.channels)
    reconstructed_audio.export("../output/new_audio_manually.wav", format="wav")
    return reconstructed_audio
def main():
    """Generate the test clip, boost its 60 Hz component two ways, and report peaks."""
    def _peak(segment):
        # Largest raw sample value in the segment.
        return np.array(segment.get_array_of_samples()).max()

    audio = create_audio_sample()
    audio.export("../output/audio_sample.wav", format="wav")
    # Locate the sample indexes attributed to the 60 Hz tone.
    indexes = original_sample_index_for_frequency(audio, 60)
    # Apply the same boost via two independent code paths for comparison.
    audio_pydub = apply_gain_pydub(audio, indexes)
    audio_manually = apply_gain_manually(audio, indexes)
    # Sanity-check that no output exceeds the 16-bit sample ceiling.
    print("maximum possible value: ", (2 ** 15) - 1)# 16 bits
    print("audio max: ", _peak(audio))
    print("audio_pydub max: ", _peak(audio_pydub))
    print("audio_manually max: ", _peak(audio_manually))
    # Both outputs still carry a buzzing artifact after processing.
if __name__ == '__main__':
    # Pad the console output with blank lines around the run.
    print("\n")
    main()
    print("\n")
I want to understand what is causing this buzzing sound. As I am still learning the basics of audio processing, any background or context information would be helpful.
The code that you have does filtering in the frequency/FFT domain. This can easily lead to artifacts, as you have experienced. Instead, you should try to do the filtering in the original time/waveform domain. This avoids domain conversions and is much less vulnerable to strange artifacts.
The general filter that boosts a specific frequency range is called a bandpass filter. This can be designed with https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.iirdesign.html
For very narrow bandpass filters one can use a peak filter / resonant bandpass filter. These can be designed with https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.iirpeak.html