azure audio speech-recognition web-audio-api azure-speech

Web Audio API preprocessing not improving Azure Speech SDK recognition accuracy for real-time meeting transcription


I'm working on a real-time speech-to-text application where microphone input is processed through Web Audio API before being sent to Azure Speech SDK. The main issue is that some audio content is being skipped or not recognized, which I suspect is due to noise issues. The problems occur especially when:

- Using microphone input
- Using speakers at low volume
- There is background noise

To try to fix this, I implemented audio preprocessing using Web Audio API:

// Audio Preprocessing Implementation
// Capture the microphone and expose it as a source node in a Web Audio graph.
const audioContext = new AudioContext();
const mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
const source = audioContext.createMediaStreamSource(mediaStream);

// Gain stage for dynamic gain adjustment; starts at 5x and is later
// modified by adjustMicGain().
const gainNode = audioContext.createGain();
gainNode.gain.value = 5;

// Level meter. It is connected directly to the source, so it observes the
// microphone signal before the gain stage is applied.
const analyser = audioContext.createAnalyser();
analyser.fftSize = 512; // frequencyBinCount is fftSize / 2, i.e. 256 bins
source.connect(analyser);

// Dynamic gain adjustment based on input levels
/**
 * Per-frame gain control for the microphone path.
 *
 * Averages the analyser's byte frequency data and steps `gainNode.gain.value`
 * up when the average is low (larger steps and higher ceilings the quieter the
 * input) or down by 0.2 (never below 4) when the average exceeds 45. Averages
 * from 35 through 45 leave the gain untouched. The function then re-queues
 * itself with `requestAnimationFrame`.
 *
 * Uses `analyser` and `gainNode` from the enclosing scope.
 */
function adjustMicGain(): void {
  const spectrum = new Uint8Array(analyser.frequencyBinCount);
  analyser.getByteFrequencyData(spectrum);

  let total = 0;
  for (const magnitude of spectrum) {
    total += magnitude;
  }
  const avgVolume = total / spectrum.length;

  // Each row: [exclusive upper bound on avgVolume, gain increment, gain ceiling].
  // Rows are ordered quietest first; only the first matching row applies.
  const boostSteps = [
    [5, 1.0, 15],
    [15, 0.8, 12],
    [25, 0.5, 10],
    [35, 0.3, 8],
  ];
  const gainParam = gainNode.gain;
  const boost = boostSteps.find(([limit]) => avgVolume < limit);

  if (boost !== undefined) {
    const [, increment, ceiling] = boost;
    gainParam.value = Math.min(gainParam.value + increment, ceiling);
  } else if (avgVolume > 45) {
    gainParam.value = Math.max(gainParam.value - 0.2, 4);
  }

  requestAnimationFrame(adjustMicGain);
}

// Start the self-rescheduling gain adjustment loop.
adjustMicGain();

// Bandpass filter for noise reduction: centred on 2500 Hz with Q = 1.5.
const biquadFilter = audioContext.createBiquadFilter();
biquadFilter.type = "bandpass";
biquadFilter.frequency.setValueAtTime(2500, audioContext.currentTime);
biquadFilter.Q.setValueAtTime(1.5, audioContext.currentTime);

// Processing chain: source -> gain -> bandpass -> MediaStream destination.
// connect() returns the node it was given, which is what allows the chaining.
const destination = audioContext.createMediaStreamDestination();
source.connect(gainNode).connect(biquadFilter).connect(destination);

// Hand the processed MediaStream to the Speech SDK as its audio input.
const audioConfig = SpeechSDK.AudioConfig.fromStreamInput(destination.stream);

Questions

Is my Web Audio API implementation correct for handling these issues? What other techniques/solutions are commonly used to improve speech recognition quality when dealing with:

- Microphone input
- Low volume audio
- Background noise
- Skipped/missed audio content

Are there better alternatives to Web Audio API for audio preprocessing in speech recognition applications?

Environment

- Speech Recognition: Azure Speech SDK
- Audio input: Both microphone and speaker audio


Solution

  • The dynamic gain adjustment logic in the given code is well thought out, but the gain changes might still be too abrupt and could amplify noise in low-volume conditions. Smooth each change with `setTargetAtTime` instead of assigning `gain.value` directly:

    // Fragment: `targetGain` is whatever gain the adjustment logic has chosen.
    // Arguments are (target value, start time, time constant in seconds); the
    // gain approaches the target exponentially instead of jumping to it.
    gainNode.gain.setTargetAtTime(targetGain, audioContext.currentTime, 0.1);
    

    Use MediaRecorder to record the processed audio and inspect it for issues like clipping or distortion.

    // Record a short sample of the processed stream and play it back so that
    // clipping or distortion introduced by the audio graph can be heard.
    const SAMPLE_DURATION_MS = 5000;
    const recorder = new MediaRecorder(destination.stream);
    recorder.ondataavailable = (e) => {
      // Nothing to play if the recorder produced an empty chunk.
      if (e.data.size === 0) {
        return;
      }
      const audioURL = URL.createObjectURL(e.data);
      const audio = new Audio(audioURL);
      // Release the blob URL once playback is finished.
      audio.onended = () => URL.revokeObjectURL(audioURL);
      // play() returns a promise that rejects when autoplay is blocked.
      audio.play().catch((err) => console.error("Playback failed:", err));
    };
    recorder.start();
    // start() was called without a timeslice, so the recording is delivered as
    // a single blob only after the recorder stops; stop it explicitly.
    setTimeout(() => recorder.stop(), SAMPLE_DURATION_MS);
    

    Modified Code:

    // Capture the microphone and expose it as a source node in a Web Audio graph.
    const audioContext = new AudioContext();
    const mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
    const source = audioContext.createMediaStreamSource(mediaStream);

    // Gain stage; starts at 5x and is adjusted at runtime by adjustMicGain().
    const gainNode = audioContext.createGain();
    gainNode.gain.value = 5;

    // High-pass at 100 Hz (presumably to drop low-frequency rumble and hum).
    const highpassFilter = audioContext.createBiquadFilter();
    highpassFilter.type = "highpass";
    highpassFilter.frequency.setValueAtTime(100, audioContext.currentTime);

    // Band-pass centred on 1000 Hz with Q = 0.9, i.e. a lower centre and a
    // lower Q than the 2500 Hz / 1.5 filter in the question.
    const bandpassFilter = audioContext.createBiquadFilter();
    bandpassFilter.type = "bandpass";
    bandpassFilter.frequency.setValueAtTime(1000, audioContext.currentTime);
    bandpassFilter.Q.setValueAtTime(0.9, audioContext.currentTime);

    // Level meter read by adjustMicGain().
    const analyser = audioContext.createAnalyser();
    analyser.fftSize = 256;
    /**
     * Nudges the microphone gain toward a usable level once per animation frame.
     *
     * Reads the latest time-domain samples from `analyser`, derives a peak level
     * on a 0-255 scale, and schedules a smoothed gain change on `gainNode`: the
     * gain is raised by 0.5 (capped at 10) when the level is below 50 and lowered
     * by 0.5 (floored at 2) when it is above 200. Otherwise it is left as is.
     *
     * Uses `analyser`, `gainNode` and `audioContext` from the enclosing scope and
     * re-queues itself with `requestAnimationFrame`. Browsers throttle animation
     * frames in background tabs, so adjustments may pause while the page is hidden.
     */
    function adjustMicGain() {
      // getByteTimeDomainData fills up to fftSize samples, so size the buffer
      // to the full analysis window.
      const samples = new Uint8Array(analyser.fftSize);
      analyser.getByteTimeDomainData(samples);

      // Time-domain bytes are unsigned and centred on 128 (silence), so their
      // plain average stays near 128 at any loudness and can never cross the
      // thresholds below. Measure the largest excursion from 128 instead and
      // rescale it to the 0-255 range those thresholds were written for.
      let peakDeviation = 0;
      for (const sample of samples) {
        peakDeviation = Math.max(peakDeviation, Math.abs(sample - 128));
      }
      const level = Math.min(peakDeviation * 2, 255);

      const currentGain = gainNode.gain.value;
      let targetGain = currentGain;
      if (level < 50) {
        targetGain = Math.min(currentGain + 0.5, 10);
      } else if (level > 200) {
        targetGain = Math.max(currentGain - 0.5, 2);
      }

      // Approach the target exponentially (time constant 0.1 s) rather than
      // jumping, which avoids audible steps in the output.
      gainNode.gain.setTargetAtTime(targetGain, audioContext.currentTime, 0.1);
      requestAnimationFrame(adjustMicGain);
    }
    
    // Start the self-rescheduling gain adjustment loop. The graph is wired up
    // just below, so the very first reading is taken before any audio flows.
    adjustMicGain();

    // Processing chain:
    //   source -> high-pass -> band-pass -> gain -> analyser -> destination
    // The analyser sits after the gain node, so adjustMicGain() measures the
    // level that is actually sent on to the recogniser. connect() returns the
    // node it was given, which is what allows the chaining.
    const destination = audioContext.createMediaStreamDestination();
    source
    .connect(highpassFilter)
    .connect(bandpassFilter)
    .connect(gainNode)
    .connect(analyser)
    .connect(destination);

    // Hand the processed MediaStream to the Speech SDK as its audio input.
    const audioConfig = SpeechSDK.AudioConfig.fromStreamInput(destination.stream);
    

    Result:

    sample Output

    Console log:

    [Wed Feb 19 2025 15:48:30 GMT+0530 (India Standard Time)] "GET /" 200 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"  
    [Wed Feb 19 2025 15:48:30 GMT+0530 (India Standard Time)] "GET /favicon.ico" 200 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"  
    [Wed Feb 19 2025 15:48:30 GMT+0530 (India Standard Time)] "GET /index.js" 200 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"  
    [Wed Feb 19 2025 15:49:27 GMT+0530 (India Standard Time)] "GET /" 200 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"  
    [Wed Feb 19 2025 15:49:27 GMT+0530 (India Standard Time)] "GET /favicon.ico" 200 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"  
    [Wed Feb 19 2025 15:49:27 GMT+0530 (India Standard Time)] "GET /index.js" 200 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"