node.js reactjs openai-api recorder openai-whisper

Whisper API from a recorded audio blob


I am creating a transcriber using the OpenAI Whisper API in Node.js and React. I want the user to be able to record an audio file in the browser and transcribe their recording. I am doing this by saving the buffer data of the recorded audio blob into an mp3 file, then passing fs.createReadStream(recorded_audio_file.mp3) into the createTranscription() API call, which returns a 400 error. When I record an audio file using the Windows recorder and input that file instead, the API call works just fine. Here is my recorder component in React:

import React, { useState, useEffect, useRef } from "react";

import Microphone from "./Microphone/Microphone";
const TSST = () => {
  const BASE_URL = process.env.REACT_APP_SERVER_URL || "http://localhost:5000";

  const mediaRecorder = useRef(null);
  const [stream, setStream] = useState(null);
  const [audioChunks, setAudioChunks] = useState([]);
  const [audio, setAudio] = useState(null);
  const [audioFile, setAudioFile] = useState(null);
  const [transcription, setTranscription] = useState("");
  const [audioBlob, setAudioBlob] = useState("");
  const [audioBuffer, setAudioBuffer] = useState("");

  useEffect(() => {
    const initializeMediaRecorder = async () => {
      if ("MediaRecorder" in window) {
        try {
            const streamData = await navigator.mediaDevices.getUserMedia({ audio: true });
            setStream(streamData);
        } catch (err) {
            console.log(err.message);
        }
      } else {
          console.log("The MediaRecorder API is not supported in your browser.");
      }
    }

    initializeMediaRecorder();
  }, [])

  const handleStartRecording = () => {
    const media = new MediaRecorder(stream, { type: "audio/mp3" });

    mediaRecorder.current = media;
    mediaRecorder.current.start();

    let chunks = [];
    mediaRecorder.current.ondataavailable = (e) => {
       chunks.push(e.data);
    };
    setAudioChunks(chunks);
  }
  const handleStopRecording = () => {
    mediaRecorder.current.stop();
    mediaRecorder.current.onstop = () => {
      const audioBlob = new Blob(audioChunks, { type: "audio/mp3" });
      const audioUrl = URL.createObjectURL(audioBlob);

      setAudioBlob(audioBlob)
      setAudio(audioUrl);
      setAudioChunks([]);

      let file = new File([audioBlob], "recorded_audio.mp3", { type: "audio/mp3", lastModified: new Date().getTime() });
      let container = new DataTransfer();
      container.items.add(file);
      document.getElementById("audioFile").files = container.files;
      setAudioFile(container.files[0]);

      console.log(file);
    };
  }

  const handleSubmitRecording = async () => {
    try {
      // Assuming you have an audio blob called 'audioBlob'

      // Convert the audio blob to a base64 string
      const reader = new FileReader();
      reader.onloadend = async () => {
        const base64String = reader.result.split(',')[1]; // Extract base64 data from the result
        const res = await fetch(`${BASE_URL}/api/openai/transcriber`, {
          method: "POST",
          headers: {
            "Content-Type": "application/json",
          },
          body: JSON.stringify({ audioBuffer: base64String, lang: "en" })
        })
        const data = await res.json();
        setTranscription(data.chatResponse);
      };
      reader.readAsDataURL(audioBlob);

    } catch (error) {
      console.log(error);

    } finally {
    }
  }

    return (
      <div className="h-[calc(100vh-73px)] flex justify-center items-center">
        <div className="w-[40%] flex justify-between items-center">
          <div className="flex flex-col">
            <Microphone startFunction={ handleStartRecording } stopFunction={ handleStopRecording } />
            <button onClick={handleStartRecording} className="w-fit my-10 p-5 bg-gray-200 rounded-lg">Start Recording</button>
            <button onClick={handleStopRecording} className="w-fit mb-10 p-5 bg-gray-200 rounded-lg">Stop Recording</button>

            <audio className="mb-10" src={audio} controls></audio>
            <input id="audioFile" type="file" onChange={ (e) => {setAudioFile(e.target.files[0])}}/>
          </div>
          
          <div>
            <button className="p-10 bg-yellow-500 rounded-xl" onClick={ handleSubmitRecording } >Submit</button>
          </div>
        </div>

        <div className="w-[40%] flex justify-center items-center">
          <textarea value={transcription} readOnly className="w-[60%] aspect-square resize-none shadow-lg shadow-black"></textarea>
        </div>
      </div>
    );
};
export default TSST;
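As a side note, the MediaRecorder option key is mimeType, not type, so { type: "audio/mp3" } is silently ignored; and even mimeType: "audio/mp3" is unsupported in most browsers, so the recorded blob is usually WebM/Opus despite the .mp3 filename. A quick diagnostic sketch (separate from the component above) to see what the browser can actually produce:

// Diagnostic sketch: list which audio containers this browser's MediaRecorder
// supports. "audio/mp3" is usually absent; Chrome and Firefox typically
// report audio/webm variants instead.
["audio/mp3", "audio/mpeg", "audio/webm", "audio/webm;codecs=opus", "audio/ogg;codecs=opus"]
  .forEach((mimeType) => console.log(mimeType, MediaRecorder.isTypeSupported(mimeType)));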

Here is the API handler:

export const transcribe = async (req, res) => {
    // const lang = JSON.parse(req.body.json).lang;
    // const audioBuffer = req.file;
    const { audioBuffer, lang} = req.body;

    const audioData = Buffer.from(audioBuffer, 'base64');

    const fileName = "test.mp3";
    const filePath = `./audio/${fileName}`;

    const writableStream = fs.createWriteStream(filePath); // Replace with your desired file path and extension
    writableStream.write(audioData);

    const readStream = fs.createReadStream(filePath);

    readStream.on('data', (data) => {
        console.log('Read stream data:', data);
    });

    try {
        const whisperRes = await openai.createTranscription(
            readStream,
            "whisper-1",
        )

        const chatResponse = whisperRes.data.text;
        console.log(chatResponse)

        res.status(200).json({ chatResponse: chatResponse });
    } catch (error) {
        //console.log(error);
        res.status(500).json({ message: error });
    }
}

And here is the server:

import express from "express";
import cors from "cors";
import * as dotenv from "dotenv";
import mongoose from "mongoose";
import multer from "multer";

import { dalle, chatGPT, summarize, translate, transcribe } from "./api/openai.js";
import { getImages, postImage } from "./api/imageShowcase.js";
import { login, signup } from "./api/user.js";

dotenv.config();

const app = express();
const upload = multer();
const storage = multer.memoryStorage();
const uploadMiddleware = multer({ storage: storage });

app.use(cors());
app.use(express.json({limit: '50mb'}));

const atlasURL = process.env.MONGODB_URL;    
const PORT = process.env.PORT || 5000;

mongoose.connect(atlasURL)
    .then(() => app.listen(PORT, () => console.log(`Successfully connected to port ${PORT}`)))
    .catch(error => console.log("There was an error: ", error));

app.get("/", async (req, res) => {
    res.send("Server is RUNNING");
})

app.post("/api/openai/transcriber",(req, res) => transcribe(req, res));

The saved mp3 file is working just fine. The API key is correct. When I record my own mp3 using the Windows recorder and use createReadStream on that file, it works just fine. The saved file data is a buffer of the form

I tried changing the way I save the file and using different formatting methods for the buffer (binary, hex, base64). I tried uploading the buffer directly to the Whisper API, using axios to post to the API URL directly, wrapping the saving of the mp3 file in a promise before calling createReadStream, making a Readable out of the buffer directly, and a lot of other little changes. I have viewed all the similar questions and answers, to no avail.


Solution

  • Just call the transcribeAudio function (defined below) in the try/catch of your transcribe function.

    Also, make sure you are able to create the .mp3 file locally and try to play it. Sometimes the audio file itself is not correct, which causes problems when the code runs.

    try {
        const whisperRes = await transcribeAudio(readStream);

        const chatResponse = whisperRes.data.text;
        console.log(chatResponse);

        res.status(200).json({ chatResponse: chatResponse });
    } catch (error) {
        res.status(500).json({ message: error });
    }
    
    import FormData from "form-data";
    import axios from "axios";
    
    const transcribeAudio = async (file) => {
      let data = new FormData();
    
      // "file" is the read stream created in transcribe(); form-data
      // infers the filename and content type from the stream's path.
      data.append("file", file);
      data.append("model", "whisper-1");
      data.append("language", "en");
    
      let config = {
        method: "post",
        maxBodyLength: Infinity,
        url: "https://api.openai.com/v1/audio/transcriptions",
        headers: {
          Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
          // data.getHeaders() supplies the multipart Content-Type with the
          // correct boundary, so it must not be set by hand.
          ...data.getHeaders(),
        },
        data: data,
      };
    
      try {
        const response = await axios.request(config);
        return { data: response.data };
      } catch (error) {
        // Log the response body (useful for diagnosing 400s) and rethrow so
        // the caller's catch block sends the 500 response.
        console.log(error.response ? error.response.data : error.message);
        throw error;
      }
    };
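
    One more thing worth checking: in transcribe, the read stream is created immediately after writableStream.write(), so the file may still be empty when it is read. A minimal sketch of waiting for the write to flush first, using a hypothetical writeAudioFile helper around Node's stream "finish" event:

    // Sketch: make sure the mp3 is fully on disk before reading it back.
    // end() writes the remaining data and closes the stream; "finish"
    // fires once everything has been flushed.
    const writeAudioFile = (filePath, buffer) =>
      new Promise((resolve, reject) => {
        const ws = fs.createWriteStream(filePath);
        ws.on("finish", resolve);
        ws.on("error", reject);
        ws.end(buffer);
      });
    
    // inside transcribe(), before creating the read stream:
    // await writeAudioFile(filePath, audioData);
    // const readStream = fs.createReadStream(filePath);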