javascript, webkit, webspeech-api, webkitspeechrecognition

How can I detect when the user is talking with the SpeechRecognition API?


I am trying to detect when the user pauses and then starts talking again, without stopping recognition. This is the code I wrote to detect when the user talks and to output the transcript to the page:

// Shared dictation state. These remain `var` globals because the handlers
// below and startDictation() read and write them.
// Finalized transcript so far. It starts as '' so that `+=` in onresult can
// never produce the literal prefix "undefined".
var final_transcript = '';
// True between the recognizer's onstart and onend events.
var recognizing = false;

// Only set up recognition when the prefixed constructor is available;
// otherwise the hoisted `recognition` variable stays undefined.
if ('webkitSpeechRecognition' in window) {
  var recognition = new webkitSpeechRecognition();

  // Request continuous recognition and interim (non-final) results.
  recognition.continuous = true;
  recognition.interimResults = true;

  recognition.onstart = function() {
    recognizing = true;
  };

  // Report recognition errors to the console.
  recognition.onerror = function(event) {
    console.log(event.error);
  };

  recognition.onend = function() {
    recognizing = false;
  };

  // Runs each time results are delivered: appends finalized text to
  // final_transcript and rebuilds the interim text from scratch.
  recognition.onresult = function(event) {
    var interim_transcript = '';
    // Only results from event.resultIndex onward are processed here.
    for (var i = event.resultIndex; i < event.results.length; ++i) {
      // Use the first alternative of each result.
      var transcript = event.results[i][0].transcript;
      if (event.results[i].isFinal) {
        // Finalized text is accumulated across events.
        final_transcript += transcript;
      } else {
        // Interim text is shown but not stored.
        interim_transcript += transcript;
      }
    }
    final_transcript = capitalize(final_transcript);
    // NOTE(review): final_span / interim_span are not declared in this script;
    // presumably they resolve to the elements with those ids -- confirm.
    final_span.innerHTML = linebreak(final_transcript);
    interim_span.innerHTML = linebreak(interim_transcript);
  };
}

// Patterns for a blank line (two consecutive newlines) and a single newline.
var two_line = /\n\n/g;
var one_line = /\n/g;

/**
 * Converts newlines in a transcript to HTML markup.
 *
 * @param {string} s - Text that may contain newline characters.
 * @returns {string} The text with every "\n\n" turned into "<p></p>" and
 *   every remaining "\n" turned into "<br>".
 */
function linebreak(s) {
  // Double newlines must be handled first so they are not consumed as two
  // separate single line breaks.
  var withParagraphs = s.replace(two_line, '<p></p>');
  var withLineBreaks = withParagraphs.replace(one_line, '<br>');
  return withLineBreaks;
}

/**
 * Upper-cases the first character of a string.
 *
 * @param {string} s - Text to capitalize; may be empty.
 * @returns {string} `s` with its first character upper-cased and the rest
 *   left untouched.
 */
function capitalize(s) {
  var head = s.charAt(0);
  var tail = s.slice(1);
  return head.toUpperCase() + tail;
}

/**
 * Toggles dictation: stops the recognizer if it is currently running,
 * otherwise clears the stored transcript and starts a new English session.
 *
 * Does nothing (apart from logging a warning) when no recognizer exists,
 * which is the case when `webkitSpeechRecognition` is unavailable.
 *
 * @param {Event} [event] - Triggering event; not used by this function.
 */
function startDictation(event) {
  // `recognition` is only created when the browser supports it; without this
  // guard the calls below would throw a TypeError on `undefined`.
  if (typeof recognition === 'undefined' || recognition === null) {
    console.warn('Speech recognition is not available in this browser.');
    return;
  }
  if (recognizing) {
    recognition.stop();
    return;
  }
  // Start each new session from an empty transcript.
  final_transcript = '';
  recognition.lang = 'en';
  recognition.start();
}
//startDictation();
<!-- Transcript output area: the script's onresult handler writes finalized text into final_span and in-progress text into interim_span. -->
<div id="results">
        <span id="final_span" class="final"></span>
        <span id="interim_span" class="interim"></span>

    </div>

So is there a way to detect when the user pauses, and how long the pause lasted?


Solution

  • This is not a complete answer, but it's a start.

    https://jsfiddle.net/persianturtle/7uygdyy1/1/

    I'm not sure if the onspeechstart event only fires once, or if it would fire again if I was in a less noisy area.

    If it fires multiple times, this becomes very easy to do since we can store speech start and speech end times and figure out the periods of silence from there.

    If onspeechstart only fires once, then you can probably find a way to estimate the average speech duration for a given piece of text and figure out the silences from there.

    There is, however, an event that fires each time words are captured: onresult.

    So the basic idea is to define an array of activity that you can push speech data into, and then analyze periods of silence afterwards.

    Code:

    // Log of recognition events, stored as "<label>:<timestamp>" strings in
    // the order the events fired.
    let activity = [];

    // Every result event is logged and recorded under the "Ended" label.
    recognition.onresult = (event) => {
      console.log(event);
      activity.push(`Ended:${event.timeStamp}`);
    };

    // Each speechstart event is recorded under the "Started" label.
    recognition.onspeechstart = (event) => {
      activity.push(`Started:${event.timeStamp}`);
    };

    // When recognition ends, dump the collected log for inspection.
    recognition.onend = (event) => {
      console.log(activity);
    };