javascript, microphone, azure-speech, transcription, speaker-diarization

How do I build an Azure Speech program in JavaScript/HTML that transcribes and diarizes audio in real time? I can't find working examples


Specifically, I am trying to build an application that runs from a single HTML/JavaScript file, recognizes speech input from a microphone, transcribes it, and assigns each utterance to a speaker, continuously until I hit Stop.

I have working code that transcribes microphone input well enough. However, after I changed the config to enable diarization, it still only transcribes the text and never identifies the speaker.

After I tweaked it to diarize, I wrote this:

<!DOCTYPE html>
<html>

<head>
    <title>Speech Sample</title>
    <meta charset="utf-8" />
    <script type="text/javascript" src="./difflib-browser.js"></script>
</head>

<body style="font-family:'Helvetica Neue',Helvetica,Arial,sans-serif; font-size:13px;">
    <div id="warning">
        <h1 style="font-weight:500;">Speech Recognition Speech SDK not found
            (microsoft.cognitiveservices.speech.sdk.bundle.js missing).</h1>
    </div>
    <div id="content" style="display:none">
        <table>
            <tr>
                <td></td>
                <td>
                    <h2 style="font-weight:500;">Microsoft Cognitive Services Speech SDK</h2>
                    <h3 style="font-weight:500;">Javascript Browser Sample</h3>
                </td>
            </tr>
            <tr>
                <td align="right"><a href="https://www.microsoft.com/cognitive-services/sign-up"
                        target="_blank">Subscription</a>:</td>
                <td><input id="key" type="text" size="60" placeholder="required: speech subscription key"></td>
            </tr>
            <tr>
                <td align="right">Region:</td>
                <td align="left">
                    <select id="regionOptions">
                        <option value="westus" selected="selected">West US</option>
                        <option value="westus2">West US 2</option>
                        <option value="eastus">East US</option>
                        <option value="eastus2">East US 2</option>
                        <option value="eastasia">East Asia</option>
                        <option value="southeastasia">South East Asia</option>
                        <option value="centralindia">Central India</option>
                        <option value="northeurope">North Europe</option>
                        <option value="westeurope">West Europe</option>
                    </select>
                </td>
            </tr>
            <tr>
                <td align="right">Recognition language:</td>
                <td align="left">
                    <select id="languageOptions">
                        <option value="en-US" selected="selected">English - US</option>
                        <!-- Add other languages as desired -->
                    </select>
                </td>
            </tr>
            <tr>
                <td align="right"><b></b></td>
                <td>
                    <button id="scenarioStartButton">Start</button>
                    <button id="scenarioStopButton" disabled="disabled">Stop</button>
                </td>
            </tr>
            <tr>
                <td align="right">Results:</td>
                <td align="left">
                    <textarea id="phraseDiv" style="display: inline-block;width:500px;height:200px"></textarea>
                </td>
            </tr>
            <tr>
                <td align="right">Events:</td>
                <td align="left">
                    <textarea id="statusDiv"
                        style="display: inline-block;width:500px;height:200px;overflow: scroll;white-space: nowrap;">
                    </textarea>
                </td>
            </tr>
        </table>
    </div>

    <!-- Speech SDK REFERENCE -->
    <script src="https://aka.ms/csspeech/jsbrowserpackageraw"></script>

    <!-- Speech SDK presence check -->
    <script>
        /**
         * Reveals the sample UI once the Speech SDK bundle is available.
         *
         * When `window.SpeechSDK` is missing nothing happens, so the
         * "SDK not found" warning stays on screen and the content stays hidden.
         *
         * @param {function} onComplete - Invoked with `window.SpeechSDK` after the UI is shown.
         */
        function Initialize(onComplete) {
            const sdk = window.SpeechSDK;
            if (!sdk) {
                return;
            }

            const contentPanel = document.getElementById('content');
            contentPanel.style.display = 'block';

            const warningPanel = document.getElementById('warning');
            warningPanel.style.display = 'none';

            onComplete(sdk);
        }
    </script>

    <script>
        // Page-level state shared by the functions in this script.
        // The declarations are kept as `var` so they behave exactly as before
        // for every other function that reads or assigns them.
        var SpeechSDK;
        var phraseDiv, statusDiv;
        var key, authorizationToken;
        var regionOptions;
        var recognizer;

        // Once the DOM is parsed, look up the page elements and wire the Start/Stop buttons.
        document.addEventListener("DOMContentLoaded", function () {
            // Declared locally: assigning to undeclared names would create implicit globals.
            const scenarioStartButton = document.getElementById('scenarioStartButton');
            const scenarioStopButton = document.getElementById('scenarioStopButton');

            phraseDiv = document.getElementById("phraseDiv");
            statusDiv = document.getElementById("statusDiv");
            key = document.getElementById("key");
            regionOptions = document.getElementById("regionOptions");

            scenarioStartButton.addEventListener("click", function () {
                doContinuousRecognition();

                // Toggle only after the call returns, so an exception while starting
                // leaves the buttons in their previous state. The Stop button is
                // disabled in the markup and must be enabled here to be usable.
                scenarioStartButton.disabled = true;
                scenarioStopButton.disabled = false;
            });

            scenarioStopButton.addEventListener("click", function () {
                scenarioStartButton.disabled = false;
                scenarioStopButton.disabled = true;

                if (recognizer) {
                    recognizer.stopContinuousRecognitionAsync();
                }
            });
        });

        /**
         * Builds the audio input configuration used for recognition.
         *
         * @returns {*} The value returned by `SpeechSDK.AudioConfig.fromDefaultMicrophoneInput()`.
         */
        function getAudioConfig() {
            const { AudioConfig } = SpeechSDK;
            const microphoneInput = AudioConfig.fromDefaultMicrophoneInput();
            return microphoneInput;
        }

        /**
         * Builds the speech configuration from the key and region entered on the page.
         *
         * Reads the subscription key from the `key` input and the region from the
         * `regionOptions` select, then sets the speaker-diarization property.
         *
         * @returns {*} The configured speech config, or `undefined` when no
         *     subscription key has been entered (the caller checks for this).
         */
        function getSpeechConfig() {
            // Pasted keys often carry stray whitespace; a blank key cannot authenticate.
            const subscriptionKey = key.value.trim();
            if (subscriptionKey === "") {
                console.warn("Speech subscription key is missing.");
                return undefined;
            }

            const speechConfig = SpeechSDK.SpeechConfig.fromSubscription(subscriptionKey, regionOptions.value);
            speechConfig.setProperty(SpeechSDK.PropertyId.SpeechServiceConnection_EnableSpeakerDiarization, "true"); // Enable speaker diarization
            console.log("Speaker diarization enabled."); // Log confirmation
            return speechConfig;
        }

        /**
         * Handles a final recognition result: logs it, records the result reason in
         * the events box and appends the recognized text to the results box.
         *
         * @param {*} sender - The object that raised the event (unused).
         * @param {*} recognitionEventArgs - Event arguments; `result` carries
         *     `reason`, `text` and, when present, `speakerId`.
         */
        function onRecognized(sender, recognitionEventArgs) {
            const result = recognitionEventArgs.result;
            console.log(result); // Log the entire result for debugging

            // Both boxes are <textarea> elements, so append through `.value`:
            // `innerHTML` would parse the recognized text as markup (e.g. "&").
            // Each event goes on its own line.
            statusDiv.value += `(recognized) Reason: ${SpeechSDK.ResultReason[result.reason]}\r\n`;

            // Results without text would otherwise print the literal "undefined".
            if (result.text) {
                const speakerId = result.speakerId ? ` [Speaker ID: ${result.speakerId}]` : '';
                phraseDiv.value += `${result.text}${speakerId}\r\n`;
            }

            // Scroll after appending so the newest line is the one brought into view.
            phraseDiv.scrollTop = phraseDiv.scrollHeight;
        }

        /**
         * Starts continuous recognition from the default microphone.
         *
         * Builds the audio and speech configs, replaces any recognizer left over
         * from an earlier start, and begins recognizing. Returns without doing
         * anything when either config is unavailable.
         */
        function doContinuousRecognition() {
            const audioConfig = getAudioConfig();
            const speechConfig = getSpeechConfig();
            if (!audioConfig || !speechConfig) return;

            // Pressing Start again must not leave the previous recognizer alive.
            if (recognizer) {
                recognizer.close();
                recognizer = undefined;
            }

            recognizer = new SpeechSDK.SpeechRecognizer(speechConfig, audioConfig);
            recognizer.recognized = onRecognized;

            // Surface cancellations (for example a rejected key) instead of dropping them.
            recognizer.canceled = function (sender, cancellationEventArgs) {
                console.error(`Recognition canceled: ${cancellationEventArgs.errorDetails}`);
            };

            recognizer.startContinuousRecognitionAsync(
                undefined,
                function (error) {
                    console.error(`Could not start recognition: ${error}`);
                }
            );
        }

        // Keep a reference to the SDK object that Initialize passes to its callback.
        Initialize(async (speechSdk) => {
            SpeechSDK = speechSdk;
        });
    </script>
</body>

</html>

Solution

  • Below is sample code that uses Azure Speech Recognition with speaker diarization.

    Use ConversationTranscriber instead of SpeechRecognizer for diarization support.

    let transcriber = new SpeechSDK.ConversationTranscriber(speechConfig, audioConfig);
    

    Extract and display the speaker ID like this:

    var speakerId = result.speakerId ? ` [Speaker ID: ${result.speakerId}]` : '';
    

    Use ConversationTranscriber instead of SpeechRecognizer for diarization support.

    let transcriber = new SpeechSDK.ConversationTranscriber(speechConfig, audioConfig);
    

    Refer to the Microsoft documentation: Real-time diarization quickstart - Speech service in Azure AI services

    
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <title>Real-Time Speech Diarization</title>
        <meta charset="UTF-8">
    </head>
    <body>
        <h2>Azure Speech Recognition with Speaker Diarization</h2>
        <button id="startButton">Start</button>
        <button id="stopButton" disabled>Stop</button>
        <br><br>
        <textarea id="output" rows="10" cols="80" readonly></textarea>
        
        <script src="https://aka.ms/csspeech/jsbrowserpackageraw"></script>
        <script>
            // SDK objects; assigned inside initializeDiarization().
            let speechConfig;
            let audioConfig;
            let transcriber;

            // Page controls, looked up once by element id.
            const [startButton, stopButton, output] = ["startButton", "stopButton", "output"]
                .map((elementId) => document.getElementById(elementId));
    
            /**
             * Creates the conversation transcriber and wires it to the Start/Stop buttons.
             *
             * Every transcribed result that carries text is appended to the output box
             * as `[speaker]: text`; results without a speaker id are labelled
             * "Unknown Speaker".
             *
             * @param {string} [speechKey] - Azure Speech resource key.
             * @param {string} [speechRegion] - Azure region of the Speech resource.
             * @returns {Promise<void>} Resolves once the transcriber and handlers are set up.
             */
            async function initializeDiarization(
                speechKey = "YOUR_AZURE_SPEECH_KEY",  // Replace with your Azure Speech Key
                speechRegion = "YOUR_AZURE_REGION"    // Replace with your Azure Region
            ) {
                speechConfig = SpeechSDK.SpeechConfig.fromSubscription(speechKey, speechRegion);
                speechConfig.setProperty(SpeechSDK.PropertyId.SpeechServiceConnection_EnableSpeakerDiarization, "true");

                audioConfig = SpeechSDK.AudioConfig.fromDefaultMicrophoneInput();
                transcriber = new SpeechSDK.ConversationTranscriber(speechConfig, audioConfig);

                transcriber.sessionStarted = (s, e) => console.log("Session started:", e.sessionId);
                transcriber.sessionStopped = (s, e) => console.log("Session stopped:", e.sessionId);
                transcriber.canceled = (s, e) => console.error("Canceled:", e.errorDetails);

                transcriber.transcribed = (s, e) => {
                    const text = e.result.text;
                    // Results without text would only add "[Unknown Speaker]: " noise lines.
                    if (!text) {
                        return;
                    }
                    const speakerId = e.result.speakerId || "Unknown Speaker";
                    output.value += `[${speakerId}]: ${text}\n`;
                    output.scrollTop = output.scrollHeight; // Auto-scroll output
                };

                // Single place that decides which button is clickable.
                const setRunning = (isRunning) => {
                    startButton.disabled = isRunning;
                    stopButton.disabled = !isRunning;
                };

                startButton.addEventListener("click", () => {
                    setRunning(true);
                    transcriber.startTranscribingAsync(
                        () => console.log("Diarization started."),
                        (err) => {
                            // Starting failed, so return the UI to the idle state.
                            console.error("Could not start diarization:", err);
                            setRunning(false);
                        }
                    );
                });

                stopButton.addEventListener("click", () => {
                    // Block repeated clicks while the stop request is in flight.
                    stopButton.disabled = true;
                    transcriber.stopTranscribingAsync(
                        () => {
                            console.log("Diarization stopped.");
                            setRunning(false);
                        },
                        (err) => {
                            console.error("Could not stop diarization:", err);
                            setRunning(false);
                        }
                    );
                });
            }
    
            // After the DOM is parsed, set up diarization if the SDK bundle loaded;
            // otherwise tell the user the SDK is missing.
            document.addEventListener("DOMContentLoaded", () => {
                const sdkLoaded = Boolean(window.SpeechSDK);
                if (!sdkLoaded) {
                    alert("Speech SDK not found!");
                    return;
                }
                initializeDiarization();
            });
        </script>
    </body>
    </html>
    
    
    

    Output:

    Output