Tags: azure, stream, speech-to-text

Using the Azure Speech-to-Text service with a MemoryStream as input, but getting the error "NOMATCH: Speech could not be recognized"


I'm using the Microsoft.CognitiveServices.Speech speech-to-text service, passing the input as a MemoryStream (received from a custom API) instead of reading from a file. However, I get the error "NOMATCH: Speech could not be recognized". The same code works when I read the audio from disk and pass it as a FileStream. Here is the code I'm using:

    /// <summary>
    /// Performs one-shot speech recognition over a raw PCM audio stream with the Azure Speech SDK.
    /// </summary>
    /// <remarks>
    /// The stream must contain 16 kHz, 16-bit, mono PCM samples. If the actual audio uses a
    /// different sample rate (e.g. a 48 kHz WAV), the service returns ResultReason.NoMatch.
    /// </remarks>
    /// <param name="audioStream">Stream of PCM audio (16 kHz, 16-bit, mono).</param>
    /// <returns>The recognized text, or null when nothing was recognized or an error occurred.</returns>
    public static async Task<string> RecognizeSpeechFromStreamAsync(Stream audioStream)
    {
        try
        {
            byte channels = 1;
            byte bitsPerSample = 16;
            uint samplesPerSecond = 16000; // must match the real sample rate of the audio, or recognition yields NOMATCH
            var audioFormat = AudioStreamFormat.GetWaveFormatPCM(samplesPerSecond, bitsPerSample, channels);

            // ContosoAudioStream and AudioConfig are IDisposable; dispose them deterministically
            // instead of leaking them until garbage collection (the original never disposed them).
            using (var contosoStream = new ContosoAudioStream(audioStream))
            using (var audioConfig = AudioConfig.FromStreamInput(contosoStream, audioFormat))
            {
                var speechConfig = SpeechConfig.FromSubscription(speechKey, speechRegion);
                speechConfig.SpeechRecognitionLanguage = "en-US";

                using (var speechRecognizer = new SpeechRecognizer(speechConfig, audioConfig))
                {
                    Console.WriteLine("Starting speech recognition from stream...");
                    var speechRecognitionResult = await speechRecognizer.RecognizeOnceAsync();

                    if (speechRecognitionResult.Reason == ResultReason.RecognizedSpeech)
                    {
                        Console.WriteLine($"RECOGNIZED: Text={speechRecognitionResult.Text}");
                        return speechRecognitionResult.Text;
                    }
                    else if (speechRecognitionResult.Reason == ResultReason.NoMatch)
                    {
                        Console.WriteLine($"NOMATCH: Speech could not be recognized.");
                        return null; // Or an appropriate error message
                    }
                    else if (speechRecognitionResult.Reason == ResultReason.Canceled)
                    {
                        var cancellation = CancellationDetails.FromResult(speechRecognitionResult);
                        Console.WriteLine($"CANCELED: Reason={cancellation.Reason}");

                        if (cancellation.Reason == CancellationReason.Error)
                        {
                            Console.WriteLine($"CANCELED: ErrorCode={cancellation.ErrorCode}");
                            Console.WriteLine($"CANCELED: ErrorDetails={cancellation.ErrorDetails}");
                            Console.WriteLine($"CANCELED: Did you set the speech resource key and region values?");
                            // Consider throwing an exception here to propagate the error
                        }
                        return null; // Or an appropriate error message
                    }
                    else
                    {
                        Console.WriteLine($"Unexpected result reason: {speechRecognitionResult.Reason}");
                        return null; // Or an appropriate error message
                    }
                }
            }
        }
        catch (Exception ex)
        {
            Console.Error.WriteLine($"Exception during speech recognition: {ex.Message}");
            return null; // Or throw the exception, depending on your error handling strategy
        }
    }
}

/// <summary>
/// Adapts an arbitrary .NET <see cref="Stream"/> to the Speech SDK's pull-audio callback,
/// delivering raw audio bytes on demand in chunks of at most <c>chunkSize</c> bytes.
/// </summary>
public class ContosoAudioStream : PullAudioInputStreamCallback
{
    private readonly BinaryReader _reader;
    private readonly int _chunkSize;

    /// <param name="audioStream">Source of raw audio bytes; ownership is taken (closed via <see cref="Close"/>).</param>
    /// <param name="chunkSize">Upper bound on bytes returned per <see cref="Read"/> call.</param>
    public ContosoAudioStream(Stream audioStream, int chunkSize = 1024)
    {
        _reader = new BinaryReader(audioStream);
        _chunkSize = chunkSize;
    }

    /// <summary>
    /// Fills <paramref name="buffer"/> with up to min(<paramref name="size"/>, chunk size) bytes.
    /// Returning 0 signals end of audio to the SDK.
    /// </summary>
    public override int Read(byte[] buffer, uint size)
    {
        try
        {
            // Read directly into the SDK-provided buffer: no per-call temp allocation/copy.
            // Note: BinaryReader.Read/ReadBytes return fewer bytes (eventually 0) at end of
            // stream rather than throwing EndOfStreamException, so the original's
            // catch (EndOfStreamException) was dead code; 0 here is the end-of-audio signal.
            int bytesToRead = (int)Math.Min(size, (uint)_chunkSize);
            return _reader.Read(buffer, 0, bytesToRead);
        }
        catch (Exception ex)
        {
            Console.Error.WriteLine($"Error reading from stream: {ex.Message}");
            return 0;
        }
    }

    /// <summary>Closes the underlying reader (and its wrapped stream) when the SDK is done.</summary>
    public override void Close()
    {
        _reader?.Close();
        Console.WriteLine("ContosoAudioStream closed.");
    }
}

Solution

  • error "NOMATCH: Speech could not be recognized"

    I got the same error when I tried with a WAV file with a sample rate of 48,000 Hz.

    Use the command below to check the sample rate of your WAV file.

    ffmpeg -i <path/to/.wav file>
    

    So, to resolve the issue, I converted my WAV file to 16,000 Hz using the command below and successfully got the speech to text output.

    ffmpeg -i "<path/to/.wav file>" -ar 16000 -ac 1 -sample_fmt s16 "<path/to/converted.wav file>"
    

(Screenshot: ffmpeg output confirming the WAV file was converted to 16,000 Hz mono.)

    Code :

    using Microsoft.CognitiveServices.Speech;
    using Microsoft.CognitiveServices.Speech.Audio;
    
    /// <summary>
    /// Sample console app: loads a WAV file into a MemoryStream and runs one-shot
    /// Azure speech recognition over it. The audio must be 16 kHz, 16-bit, mono PCM.
    /// </summary>
    class Program
    {
        private static string speechKey = "<SpeechKey>";
        // Fixed placeholder: this is the Azure region (e.g. "eastus"), not the key.
        private static string speechRegion = "<SpeechRegion>";

        static async Task Main(string[] args)
        {
            string filePath = "<path/to/.wav file>";
            try
            {
                if (!File.Exists(filePath))
                {
                    Console.WriteLine("Error: Audio file not found.");
                    return;
                }
                byte[] audioData = File.ReadAllBytes(filePath);
                using (var memoryStream = new MemoryStream(audioData))
                {
                    string resultText = await RecognizeSpeechFromStreamAsync(memoryStream);
                    Console.WriteLine($"Recognition Result: {resultText}");
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine($"Exception: {ex.Message}");
            }
        }

        /// <summary>
        /// Performs one-shot recognition over a raw PCM stream (16 kHz, 16-bit, mono).
        /// Audio with a different sample rate yields ResultReason.NoMatch.
        /// </summary>
        /// <param name="audioStream">Stream of PCM audio bytes.</param>
        /// <returns>The recognized text, or null on no-match/cancellation/error.</returns>
        public static async Task<string> RecognizeSpeechFromStreamAsync(Stream audioStream)
        {
            try
            {
                byte channels = 1;
                byte bitsPerSample = 16;
                uint samplesPerSecond = 16000;
                var audioFormat = AudioStreamFormat.GetWaveFormatPCM(samplesPerSecond, bitsPerSample, channels);
                // Both wrappers are IDisposable; dispose them deterministically
                // (the original leaked them until garbage collection).
                using (var contosoStream = new ContosoAudioStream(audioStream))
                using (var audioConfig = AudioConfig.FromStreamInput(contosoStream, audioFormat))
                {
                    var speechConfig = SpeechConfig.FromSubscription(speechKey, speechRegion);
                    speechConfig.SpeechRecognitionLanguage = "en-US";

                    using (var speechRecognizer = new SpeechRecognizer(speechConfig, audioConfig))
                    {
                        Console.WriteLine("Starting speech recognition from stream...");
                        var speechRecognitionResult = await speechRecognizer.RecognizeOnceAsync();
                        if (speechRecognitionResult.Reason == ResultReason.RecognizedSpeech)
                        {
                            Console.WriteLine($"RECOGNIZED: Text={speechRecognitionResult.Text}");
                            return speechRecognitionResult.Text;
                        }
                        else if (speechRecognitionResult.Reason == ResultReason.NoMatch)
                        {
                            Console.WriteLine($"NOMATCH: Speech could not be recognized.");
                            return null;
                        }
                        else if (speechRecognitionResult.Reason == ResultReason.Canceled)
                        {
                            var cancellation = CancellationDetails.FromResult(speechRecognitionResult);
                            Console.WriteLine($"CANCELED: Reason={cancellation.Reason}");

                            if (cancellation.Reason == CancellationReason.Error)
                            {
                                Console.WriteLine($"CANCELED: ErrorCode={cancellation.ErrorCode}");
                                Console.WriteLine($"CANCELED: ErrorDetails={cancellation.ErrorDetails}");
                                Console.WriteLine($"CANCELED: Did you set the speech resource key and region values?");
                            }
                            return null;
                        }
                        else
                        {
                            Console.WriteLine($"Unexpected result reason: {speechRecognitionResult.Reason}");
                            return null;
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                Console.Error.WriteLine($"Exception during speech recognition: {ex.Message}");
                return null;
            }
        }
    }
    
    /// <summary>
    /// Pull-mode audio callback that hands bytes from a wrapped <see cref="Stream"/>
    /// to the Speech SDK, at most <c>chunkSize</c> bytes per request.
    /// </summary>
    public class ContosoAudioStream : PullAudioInputStreamCallback
    {
        private readonly BinaryReader _reader;
        private readonly int _chunkSize;

        public ContosoAudioStream(Stream audioStream, int chunkSize = 1024)
        {
            _chunkSize = chunkSize;
            _reader = new BinaryReader(audioStream);
        }

        /// <summary>
        /// Copies up to min(<paramref name="size"/>, chunk size) bytes into
        /// <paramref name="buffer"/>; returning 0 signals end of audio to the SDK.
        /// </summary>
        public override int Read(byte[] buffer, uint size)
        {
            int requested = (int)Math.Min(size, _chunkSize);
            try
            {
                byte[] data = _reader.ReadBytes(requested);
                data.CopyTo(buffer, 0);
                return data.Length;
            }
            catch (EndOfStreamException)
            {
                return 0;
            }
            catch (Exception ex)
            {
                Console.Error.WriteLine($"Error reading from stream: {ex.Message}");
                return 0;
            }
        }

        /// <summary>Releases the underlying reader once the SDK is finished with the stream.</summary>
        public override void Close()
        {
            _reader?.Close();
            Console.WriteLine("ContosoAudioStream closed.");
        }
    }
    

    Output :

(Screenshot: console output showing the recognized text from the converted WAV file.)