Specifically, I am trying to build an HTML/JavaScript application that recognizes speech input from a microphone, transcribes it, and assigns it to a speaker, continuously until I hit Stop.
I have working code that transcribes microphone input well enough, but after I tweaked the config to enable diarization, it still only transcribed the text without identifying any speaker.
After I tweaked it to diarize, I wrote this:
<!DOCTYPE html>
<html>
<head>
  <title>Speech Sample</title>
  <meta charset="utf-8" />
  <script type="text/javascript" src="./difflib-browser.js"></script>
</head>
<body style="font-family:'Helvetica Neue',Helvetica,Arial,sans-serif; font-size:13px;">
  <div id="warning">
    <h1 style="font-weight:500;">Speech Recognition Speech SDK not found
      (microsoft.cognitiveservices.speech.sdk.bundle.js missing).</h1>
  </div>
  <div id="content" style="display:none">
    <table>
      <tr>
        <td></td>
        <td>
          <h2 style="font-weight:500;">Microsoft Cognitive Services Speech SDK</h2>
          <h3 style="font-weight:500;">Javascript Browser Sample</h3>
        </td>
      </tr>
      <tr>
        <td align="right"><a href="https://www.microsoft.com/cognitive-services/sign-up"
          target="_blank">Subscription</a>:</td>
        <td><input id="key" type="text" size="60" placeholder="required: speech subscription key"></td>
      </tr>
      <tr>
        <td align="right">Region:</td>
        <td align="left">
          <select id="regionOptions">
            <option value="westus" selected="selected">West US</option>
            <option value="westus2">West US 2</option>
            <option value="eastus">East US</option>
            <option value="eastus2">East US 2</option>
            <option value="eastasia">East Asia</option>
            <option value="southeastasia">South East Asia</option>
            <option value="centralindia">Central India</option>
            <option value="northeurope">North Europe</option>
            <option value="westeurope">West Europe</option>
          </select>
        </td>
      </tr>
      <tr>
        <td align="right">Recognition language:</td>
        <td align="left">
          <select id="languageOptions">
            <option value="en-US" selected="selected">English - US</option>
            <!-- Add other languages as desired -->
          </select>
        </td>
      </tr>
      <tr>
        <td align="right"><b></b></td>
        <td>
          <button id="scenarioStartButton">Start</button>
          <button id="scenarioStopButton" disabled="disabled">Stop</button>
        </td>
      </tr>
      <tr>
        <td align="right">Results:</td>
        <td align="left">
          <textarea id="phraseDiv" style="display: inline-block;width:500px;height:200px"></textarea>
        </td>
      </tr>
      <tr>
        <td align="right">Events:</td>
        <td align="left">
          <textarea id="statusDiv"
            style="display: inline-block;width:500px;height:200px;overflow: scroll;white-space: nowrap;">
          </textarea>
        </td>
      </tr>
    </table>
  </div>
  <!-- Speech SDK REFERENCE -->
  <script src="https://aka.ms/csspeech/jsbrowserpackageraw"></script>
  <!-- Speech SDK presence check -->
  <script>
    function Initialize(onComplete) {
      if (!!window.SpeechSDK) {
        document.getElementById('content').style.display = 'block';
        document.getElementById('warning').style.display = 'none';
        onComplete(window.SpeechSDK);
      }
    }
  </script>
  <script>
    var SpeechSDK;
    var phraseDiv, statusDiv;
    var key, authorizationToken;
    var regionOptions;
    var recognizer;

    document.addEventListener("DOMContentLoaded", function () {
      scenarioStartButton = document.getElementById('scenarioStartButton');
      scenarioStopButton = document.getElementById('scenarioStopButton');
      phraseDiv = document.getElementById("phraseDiv");
      statusDiv = document.getElementById("statusDiv");
      key = document.getElementById("key");
      regionOptions = document.getElementById("regionOptions");

      scenarioStartButton.addEventListener("click", function () {
        doContinuousRecognition();
      });
      scenarioStopButton.addEventListener("click", function() {
        if (recognizer) {
          recognizer.stopContinuousRecognitionAsync();
        }
      });
    });

    function getAudioConfig() {
      return SpeechSDK.AudioConfig.fromDefaultMicrophoneInput();
    }

    function getSpeechConfig() {
      var speechConfig = SpeechSDK.SpeechConfig.fromSubscription(key.value, regionOptions.value);
      speechConfig.setProperty(SpeechSDK.PropertyId.SpeechServiceConnection_EnableSpeakerDiarization, "true"); // Enable speaker diarization
      console.log("Speaker diarization enabled."); // Log confirmation
      return speechConfig;
    }

    function onRecognized(sender, recognitionEventArgs) {
      var result = recognitionEventArgs.result;
      console.log(result); // Log the entire result for debugging
      phraseDiv.scrollTop = phraseDiv.scrollHeight;
      var speakerId = result.speakerId ? ` [Speaker ID: ${result.speakerId}]` : '';
      statusDiv.innerHTML += `(recognized) Reason: ${SpeechSDK.ResultReason[result.reason]}`;
      phraseDiv.innerHTML += `${result.text}${speakerId}\r\n`;
    }

    function doContinuousRecognition() {
      var audioConfig = getAudioConfig();
      var speechConfig = getSpeechConfig();
      if (!audioConfig || !speechConfig) return;
      recognizer = new SpeechSDK.SpeechRecognizer(speechConfig, audioConfig);
      recognizer.recognized = onRecognized;
      recognizer.startContinuousRecognitionAsync();
    }

    Initialize(async function (speechSdk) {
      SpeechSDK = speechSdk;
    });
  </script>
</body>
</html>
Below is sample code that uses Azure Speech Recognition with Speaker Diarization.
Use ConversationTranscriber instead of SpeechRecognizer for diarization support:

let transcriber = new SpeechSDK.ConversationTranscriber(speechConfig, audioConfig);

Extract and display the speaker ID like this:

var speakerId = result.speakerId ? ` [Speaker ID: ${result.speakerId}]` : '';
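For context, here is a minimal sketch (not part of the original answer) of how the question's doContinuousRecognition could be adapted to ConversationTranscriber, reusing the existing getAudioConfig/getSpeechConfig helpers and the phraseDiv element; the transcriber variable is assumed to replace the recognizer global, and the Stop handler would then call stopTranscribingAsync instead of stopContinuousRecognitionAsync.

var transcriber; // assumed replacement for the recognizer global

function doContinuousRecognition() {
  var audioConfig = getAudioConfig();
  var speechConfig = getSpeechConfig();
  if (!audioConfig || !speechConfig) return;
  // ConversationTranscriber performs diarization; SpeechRecognizer does not.
  transcriber = new SpeechSDK.ConversationTranscriber(speechConfig, audioConfig);
  // "transcribed" fires for finalized phrases and carries a speakerId.
  transcriber.transcribed = function (s, e) {
    var speakerId = e.result.speakerId ? ` [Speaker ID: ${e.result.speakerId}]` : '';
    phraseDiv.innerHTML += `${e.result.text}${speakerId}\r\n`;
    phraseDiv.scrollTop = phraseDiv.scrollHeight;
  };
  transcriber.startTranscribingAsync();
}

// In the Stop button handler:
// transcriber.stopTranscribingAsync();

Note that transcribed only fires for finalized phrases, so text and speaker labels appear once each phrase is complete rather than while you are still speaking.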
Refer to this MS doc: Real-time diarization quickstart - Speech service in Azure AI services.
<!DOCTYPE html>
<html lang="en">
<head>
  <title>Real-Time Speech Diarization</title>
  <meta charset="UTF-8">
</head>
<body>
  <h2>Azure Speech Recognition with Speaker Diarization</h2>
  <button id="startButton">Start</button>
  <button id="stopButton" disabled>Stop</button>
  <br><br>
  <textarea id="output" rows="10" cols="80" readonly></textarea>

  <script src="https://aka.ms/csspeech/jsbrowserpackageraw"></script>
  <script>
    let speechConfig, audioConfig, transcriber;

    const startButton = document.getElementById("startButton");
    const stopButton = document.getElementById("stopButton");
    const output = document.getElementById("output");

    async function initializeDiarization() {
      const speechKey = "YOUR_AZURE_SPEECH_KEY"; // Replace with your Azure Speech Key
      const speechRegion = "YOUR_AZURE_REGION";  // Replace with your Azure Region

      speechConfig = SpeechSDK.SpeechConfig.fromSubscription(speechKey, speechRegion);
      speechConfig.setProperty(SpeechSDK.PropertyId.SpeechServiceConnection_EnableSpeakerDiarization, "true");
      audioConfig = SpeechSDK.AudioConfig.fromDefaultMicrophoneInput();

      transcriber = new SpeechSDK.ConversationTranscriber(speechConfig, audioConfig);

      transcriber.sessionStarted = (s, e) => console.log("Session started:", e.sessionId);
      transcriber.sessionStopped = (s, e) => console.log("Session stopped:", e.sessionId);
      transcriber.canceled = (s, e) => console.error("Canceled:", e.errorDetails);

      transcriber.transcribed = (s, e) => {
        const speakerId = e.result.speakerId || "Unknown Speaker";
        const text = e.result.text;
        output.value += `[${speakerId}]: ${text}\n`;
        output.scrollTop = output.scrollHeight; // Auto-scroll output
      };

      startButton.addEventListener("click", () => {
        startButton.disabled = true;
        stopButton.disabled = false;
        transcriber.startTranscribingAsync();
        console.log("Diarization started.");
      });

      stopButton.addEventListener("click", () => {
        transcriber.stopTranscribingAsync();
        startButton.disabled = false;
        stopButton.disabled = true;
        console.log("Diarization stopped.");
      });
    }

    document.addEventListener("DOMContentLoaded", () => {
      if (!!window.SpeechSDK) {
        initializeDiarization();
      } else {
        alert("Speech SDK not found!");
      }
    });
  </script>
</body>
</html>
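A practical note for running either page: microphone capture via the browser generally requires a secure context (HTTPS or http://localhost), and opening the .html file directly from disk may not be allowed in every browser, so serving the page from a local web server is usually the most reliable setup for fromDefaultMicrophoneInput.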
Output: each finalized phrase is appended to the textarea prefixed with its speaker label, for example [Guest-1]: ..., with labels such as Guest-1 and Guest-2 assigned as the service distinguishes the speakers.