node.jswebsockettwiliospeechgoogle-speech-api

Google Speech-To-Text v2 doesn't accept audio in Node.JS


I've been trying for couple days now to migrate to Google STT V2 using Node.JS. In v1 everything worked perfectly. I have created a recognizer and created a script using https://github.com/GoogleCloudPlatform/nodejs-docs-samples/blob/main/speech/transcribeStreaming.v2.js

My point is to transcribe audio coming from Twilio phone call and I'm using websockets for Twilio to connect to a WSS and stream audio data, which I pass to Google streamingRecognition. My code looks like this:

const speech = require('@google-cloud/speech').v2;
const fs = require('fs');

const client = new speech.SpeechClient({
  keyFilename: './googlecreds.json',
  apiEndpoint: 'eu-speech.googleapis.com'
});

const recognizerName = "projects/12345678910/locations/eu/recognizers/name";

const recognitionConfig = {
  audoDecodingConfig: {},
};

const streamingConfig = {
  config: recognitionConfig,
};

const configRequest = {
  recognizer: recognizerName,
  streamingConfig: streamingConfig,
};

const express = require('express');
const bodyParser = require('body-parser');
const app = express();
app.use(bodyParser.urlencoded({ extended: true }));

// Load your key and certificate
const privateKey = fs.readFileSync('location', 'utf8');
const certificate = fs.readFileSync('location', 'utf8');
const ca = fs.readFileSync('location', 'utf8');

const credentials = {
  key: privateKey,
  cert: certificate,
  ca: ca
};

//wss
const WebSocket = require('ws');
const https = require('https');
const server = https.createServer(credentials, app);
const wss = new WebSocket.Server({ 
  server: server, 
  path: '/stream',
});

wss.on("connection", async function connection(ws) {
    let recognizeStream = null;
    ws.on("message", function incoming(message) {
        const msg = JSON.parse(message);
        switch (msg.event) {
            case "start":
                recognizeStream = client
                ._streamingRecognize()
                .on('data', response => {
                  const {results} = response;
                  console.log(results[0].alternatives[0].transcript);
                })
                .on('error', err => {
                  console.error(err.message);
                })
                recognizeStream.write(configRequest);
                break;
            case "media":
                // Write the raw media data to the recognize stream
                recognizeStream.write({audio: msg.media.payload});
                break;
            case "stop":
                // Stop the recognize stream
                recognizeStream.end();
                break;
        }
    });
});

app.post('/voice', (req, res) => {
  twiml = `
<Response>
    <Say>talk now</Say>
    <Connect>
        <Stream url="wss://my.domain.com/stream"/>
    </Connect>
    <Pause length="60"/>
</Response>
`
  res.type('text/xml');
  res.send(twiml);
});


const port = process.env.PORT || 8080;
server.listen(port, '0.0.0.0', () => {
  console.log(`Server running on port ${port}`);
});

Stream is connected, config writes without an error. I can log received msg.media.payload from Twilio in my "media" case, but writing it to recognizeStream does nothing, I get no answers. I'm not sure what to do anymore.


Solution

  • Working on the same feature. Was able to resolve. Two fixes:

    1 config

    const recognitionConfig = {
      explicitDecodingConfig: {
        encoding: 'MULAW',
        sampleRateHertz: 8000,
        audioChannelCount: 1
      }
    }
    

    2 buffer conversion

    const buffer = Buffer.from(msg.media.payload, 'base64')
    recognizeStream?.write({ audio: buffer })