python azure speech-recognition speech-to-text

Unable to convert Speech to Text using Azure Speech-to-Text service


I'm using the code below to convert speech to text with the Azure Speech-to-Text service. I want to convert my audio files into text. Here is the code:

import  os
import  azure.cognitiveservices.speech  as  speechsdk



def  recognize_from_microphone():
    """Run one recognize_once_async() call on a fixed audio file and print the outcome.

    Despite the name, no microphone is involved: the audio source is the
    file passed to AudioConfig(filename=...).
    """

    # The example is described as requiring environment variables named
    # "SPEECH_KEY" and "SPEECH_REGION"; the code below, however, passes
    # my_key and my_region, which are not defined in this snippet.

    speech_config = speechsdk.SpeechConfig(subscription=my_key, region=my_region)

    speech_config.speech_recognition_language="en-US"

  
    # NOTE: the input file is an .mp4 file.
    audio_config = speechsdk.audio.AudioConfig(filename="C:\\Users\\DELL\\Desktop\\flowlly.com\\demo\\003. Class 3 - Monolith, Microservices, gRPC, Webhooks.mp4")

    # NOTE: the traceback further down shows this constructor call failing
    # with SPXERR_INVALID_HEADER.
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

  

    # Start the asynchronous recognition and block until its result is available.
    speech_recognition_result = speech_recognizer.recognize_once_async().get()

  

    # Report the outcome according to the result's reason code.
    if speech_recognition_result.reason == speechsdk.ResultReason.RecognizedSpeech:

        print("Recognized: {}".format(speech_recognition_result.text))

    elif  speech_recognition_result.reason == speechsdk.ResultReason.NoMatch:
        
        print("No speech could be recognized: {}".format(speech_recognition_result.no_match_details))

    elif  speech_recognition_result.reason == speechsdk.ResultReason.Canceled:

        cancellation_details = speech_recognition_result.cancellation_details

        print("Speech Recognition canceled: {}".format(cancellation_details.reason))

        # Only an Error cancellation carries error details worth printing.
        if  cancellation_details.reason == speechsdk.CancellationReason.Error:

            print("Error details: {}".format(cancellation_details.error_details))

            print("Did you set the speech resource key and region values?")

  
# Entry point: run the recognition.
recognize_from_microphone()

But I'm getting this error when trying to run the transcriber:

 File "C:\Users\DELL\Desktop\flowlly.com\demo\transcriber.py", line 48, in <module>
    recognize_from_microphone()
  File "C:\Users\DELL\Desktop\flowlly.com\demo\transcriber.py", line 18, in recognize_from_microphone
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\DELL\AppData\Local\Programs\Python\Python312\Lib\site-packages\azure\cognitiveservices\speech\speech.py", line 1006, in __init__
    _call_hr_fn(
  File "C:\Users\DELL\AppData\Local\Programs\Python\Python312\Lib\site-packages\azure\cognitiveservices\speech\interop.py", line 62, in _call_hr_fn
    _raise_if_failed(hr)
  File "C:\Users\DELL\AppData\Local\Programs\Python\Python312\Lib\site-packages\azure\cognitiveservices\speech\interop.py", line 55, in _raise_if_failed
    __try_get_error(_spx_handle(hr))
  File "C:\Users\DELL\AppData\Local\Programs\Python\Python312\Lib\site-packages\azure\cognitiveservices\speech\interop.py", line 50, in __try_get_error
    raise RuntimeError(message)
RuntimeError: Exception with error code:
[CALL STACK BEGIN]

    > pal_string_to_wstring
    - pal_string_to_wstring
    - pal_string_to_wstring
    - pal_string_to_wstring
    - pal_string_to_wstring
    - pal_string_to_wstring
    - pal_string_to_wstring
    - pal_string_to_wstring
    - pal_string_to_wstring
    - pal_string_to_wstring
    - pal_string_to_wstring
    - recognizer_create_speech_recognizer_from_config
    - recognizer_create_speech_recognizer_from_config

[CALL STACK END]

Exception with an error code: 0xa (SPXERR_INVALID_HEADER)

I have installed the SDK, but it's not working. What should I do now?


Solution

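  • When reading from a file, the format supported by default in the Azure Speech-to-Text service is WAV (16 kHz or 8 kHz, 16-bit, mono PCM). The file in the question is an .mp4, so its header cannot be parsed as WAV, which is what SPXERR_INVALID_HEADER reports. Convert the audio to WAV first (for example: ffmpeg -i input.mp4 -vn -acodec pcm_s16le -ac 1 -ar 16000 output.wav) and pass the resulting .wav file to AudioConfig, as in the code below.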

    import os
    import azure.cognitiveservices.speech as speechsdk
    
    def _has_wav_header(path):
        """Return True when the file at *path* starts with a RIFF/WAVE header."""
        with open(path, "rb") as audio_file:
            header = audio_file.read(12)
        # A WAV file is a RIFF container: bytes 0-3 are "RIFF", bytes 8-11 are "WAVE".
        return header[:4] == b"RIFF" and header[8:12] == b"WAVE"


    def recognize_from_audio_file(audio_file_path=r"C:\Users\samplest 3.wav"):
        """Recognize speech from a WAV file and print the outcome.

        The subscription key and region are read from the SPEECH_KEY and
        SPEECH_REGION environment variables; when a variable is not set, the
        placeholder value is used and must be replaced with a real one.

        Note that recognize_once_async() returns after a single utterance. For
        long recordings, the SDK's continuous recognition API is needed to
        transcribe the whole file.

        Args:
            audio_file_path: Path to the WAV file to transcribe (16 kHz or
                8 kHz, 16-bit, mono PCM). Paths that do not point to an
                existing file are handed to the SDK unchanged, so the SDK
                reports the problem as before.

        Returns:
            None. The recognition outcome is printed to standard output.
        """
        # Passing a non-WAV file (e.g. .mp4) makes the SpeechRecognizer
        # constructor fail with the opaque SPXERR_INVALID_HEADER error, so
        # reject such input up front with an actionable message.
        if os.path.isfile(audio_file_path) and not _has_wav_header(audio_file_path):
            print("Unsupported audio file: {} is not a WAV (RIFF/WAVE) file. "
                  "Convert it to 16 kHz or 8 kHz, 16-bit, mono PCM WAV first.".format(audio_file_path))
            return

        # Prefer credentials from the environment so real keys stay out of the
        # source; the placeholders keep the snippet usable by editing it in place.
        my_key = os.environ.get("SPEECH_KEY", "YourSubscriptionKey")
        my_region = os.environ.get("SPEECH_REGION", "YourRegion")

        speech_config = speechsdk.SpeechConfig(subscription=my_key, region=my_region)
        speech_config.speech_recognition_language = "en-US"

        audio_config = speechsdk.audio.AudioConfig(filename=audio_file_path)

        speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

        # Start the asynchronous recognition and block until its result is available.
        speech_recognition_result = speech_recognizer.recognize_once_async().get()

        if speech_recognition_result.reason == speechsdk.ResultReason.RecognizedSpeech:
            print("Recognized: {}".format(speech_recognition_result.text))
        elif speech_recognition_result.reason == speechsdk.ResultReason.NoMatch:
            print("No speech could be recognized: {}".format(speech_recognition_result.no_match_details))
        elif speech_recognition_result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = speech_recognition_result.cancellation_details
            print("Speech Recognition canceled: {}".format(cancellation_details.reason))
            # Only an Error cancellation carries error details worth printing.
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(cancellation_details.error_details))
                print("Did you set the speech resource key and region values?")
    
    # Entry point: run the recognition.
    recognize_from_audio_file()
    
    

    Output: enter image description here