I'm building a Flutter application that uses Azure Speech Services for voice commands. When I say "open", the microphone should start listening (indicated by the mic button turning red), but this isn't happening. Here's my complete implementation:
AudioRecorder Provider:
final isListeningProvider = StateProvider<bool>((ref) => false);
final audioRecorderProvider = Provider<AudioRecorder>((ref) => AudioRecorder(ref));
class AudioRecorder {
final FlutterSoundRecorder _recorder = FlutterSoundRecorder();
bool _isInitialized = false;
String? _path;
final Ref _ref;
AudioRecorder(this._ref);
bool get isListening => _ref.read(isListeningProvider);
Future<void> init() async {
if (!_isInitialized) {
final status = await Permission.microphone.request();
if (status != PermissionStatus.granted) {
throw RecordingPermissionException('Microphone permission not granted');
}
await _recorder.openRecorder();
_isInitialized = true;
}
}
Future<void> startListening(String command) async {
if (!_isInitialized) await init();
if (command.toLowerCase() == "open") {
try {
final dir = await getTemporaryDirectory();
_path = '${dir.path}/audio_${DateTime.now().millisecondsSinceEpoch}.aac';
await _recorder.startRecorder(
toFile: _path,
codec: Codec.aacADTS,
);
_ref.read(isListeningProvider.notifier).state = true;
} catch (e) {
debugPrint('Error starting recording: $e');
}
}
}
Future<String?> stopListening() async {
try {
if (_recorder.isRecording) {
await _recorder.stopRecorder();
_ref.read(isListeningProvider.notifier).state = false;
return _path;
}
return null;
} catch (e) {
debugPrint('Error stopping recording: $e');
return null;
}
}
Future<void> start() async {
if (!_isInitialized) await init();
try {
final dir = await getTemporaryDirectory();
_path = '${dir.path}/audio_${DateTime.now().millisecondsSinceEpoch}.aac';
await _recorder.startRecorder(
toFile: _path,
codec: Codec.aacADTS,
);
_ref.read(isListeningProvider.notifier).state = true;
} catch (e) {
debugPrint('Error recording audio: $e');
}
}
Future<String?> stop() async {
try {
if (_recorder.isRecording) {
await _recorder.stopRecorder();
_ref.read(isListeningProvider.notifier).state = false;
return _path;
}
return null;
} catch (e) {
debugPrint('Error stopping recording: $e');
return null;
}
}
Future<bool> isRecording() async {
return _recorder.isRecording;
}
Future<void> dispose() async {
if (_isInitialized) {
await _recorder.closeRecorder();
_isInitialized = false;
}
}
}
Voice Command State and Provider:
class VoiceCommandState {
final bool isListening;
final String? lastCommand;
final String? error;
final bool isProcessing;
VoiceCommandState({
this.isListening = false,
this.lastCommand,
this.error,
this.isProcessing = false,
});
VoiceCommandState copyWith({
bool? isListening,
String? lastCommand,
String? error,
bool? isProcessing,
}) {
return VoiceCommandState(
isListening: isListening ?? this.isListening,
lastCommand: lastCommand ?? this.lastCommand,
error: error ?? this.error,
isProcessing: isProcessing ?? this.isProcessing,
);
}
}
class VoiceCommandNotifier extends StateNotifier<VoiceCommandState> {
final AudioRecorder _recorder;
final TranslationRepository _repository;
final Ref _ref;
VoiceCommandNotifier(this._recorder, this._repository, this._ref)
: super(VoiceCommandState());
Future<void> processVoiceCommand(String command) async {
try {
final commandLower = command.toLowerCase();
if (commandLower == "open") {
// First update prompt screen state
_ref.read(promptScreenProvider.notifier).setListening(true);
// Start recording first
try {
await _recorder.startListening(command);
// Only update state after successful start of listening
state = state.copyWith(
isListening: true,
lastCommand: command,
isProcessing: false
);
} catch (e) {
// If recording fails, update both states accordingly
_ref.read(promptScreenProvider.notifier).setListening(false);
state = state.copyWith(
isListening: false,
error: e.toString(),
isProcessing: false
);
throw e; // Re-throw to be caught by outer try-catch
}
} else if (commandLower == "stop") {
if (state.isListening) {
try {
final audioPath = await _recorder.stopListening();
_ref.read(promptScreenProvider.notifier).setListening(false);
if (audioPath != null) {
state = state.copyWith(isProcessing: true);
final text = await _repository.processAudioInput(audioPath);
_ref.read(promptScreenProvider.notifier).updateText(text);
state = state.copyWith(
isListening: false,
lastCommand: text,
isProcessing: false
);
} else {
state = state.copyWith(
isListening: false,
error: "Failed to get audio path",
isProcessing: false
);
}
} catch (e) {
state = state.copyWith(
isListening: false,
error: e.toString(),
isProcessing: false
);
}
}
}
} catch (e) {
state = state.copyWith(
isListening: false,
error: e.toString(),
isProcessing: false
);
}
}
Future<void> handleSpeechRecognition(String audioPath) async {
try {
final text = await _repository.processAudioInput(audioPath);
if (text.toLowerCase() == "open") {
await processVoiceCommand("open");
} else if (text.toLowerCase() == "stop") {
await processVoiceCommand("stop");
}
} catch (e) {
state = state.copyWith(
isListening: false,
error: e.toString(),
isProcessing: false
);
}
}
}
final voiceCommandProvider = StateNotifierProvider<VoiceCommandNotifier, VoiceCommandState>((ref) {
return VoiceCommandNotifier(
ref.watch(audioRecorderProvider),
ref.watch(translationRepositoryProvider),
ref,
);
});
Prompt Screen Implementation:
final isListeningProvider = StateProvider<bool>((ref) => false);
class PromptScreen extends ConsumerStatefulWidget {
const PromptScreen({super.key});
@override
ConsumerState<PromptScreen> createState() => _PromptScreenState();
}
class _PromptScreenState extends ConsumerState<PromptScreen> {
late final TextEditingController _textController;
late final AudioRecorder _recorder;
@override
void initState() {
super.initState();
_textController = TextEditingController();
_recorder = ref.read(audioRecorderProvider);
_initializeRecorder();
}
Future<void> _initializeRecorder() async {
try {
await _recorder.init();
} catch (e) {
debugPrint('Recorder init error: $e');
}
}
void _handleVoiceCommand(VoiceCommandState state) {
if (!mounted) return;
setState(() {}); // Force UI update
if (state.lastCommand?.toLowerCase() == "open") {
_startVoiceRecording();
} else if (state.lastCommand?.toLowerCase() == "stop") {
_stopVoiceRecording();
}
if (state.error != null) {
ScaffoldMessenger.of(context)
.showSnackBar(SnackBar(content: Text(state.error!)));
}
}
Future<void> _startVoiceRecording() async {
try {
await _recorder.startListening("open");
ref.read(isListeningProvider.notifier).state = true;
final currentState = ref.read(voiceCommandProvider);
ref.read(voiceCommandProvider.notifier).state =
currentState.copyWith(isListening: true);
} catch (e) {
debugPrint('Recording start error: $e');
}
}
Future<void> _stopVoiceRecording() async {
try {
final path = await _recorder.stopListening();
if (path != null) {
final text = await ref
.read(translationRepositoryProvider)
.processAudioInput(path);
_textController.text = text;
}
} catch (e) {
debugPrint('Recording stop error: $e');
} finally {
ref.read(isListeningProvider.notifier).state = false;
final currentState = ref.read(voiceCommandProvider);
ref.read(voiceCommandProvider.notifier).state =
currentState.copyWith(isListening: false);
}
}
@override
void dispose() {
_recorder.dispose();
_textController.dispose();
super.dispose();
}
@override
Widget build(BuildContext context) {
final voiceState = ref.watch(voiceCommandProvider);
// Add listener for voice commands
ref.listen<VoiceCommandState>(voiceCommandProvider, (_, state) {
if (!mounted) return;
_handleVoiceCommand(state);
});
return Scaffold(
// ... scaffold code
Row(
children: [
Expanded(
child: ElevatedButton(
onPressed: () async {
// Make onPressed async
if (_textController.text.isNotEmpty) {
// Play sound before navigation
await ref
.read(translationRepositoryProvider)
.playUISound('start_conversation');
// Navigate after sound plays
if (mounted) {
// Check if widget is still mounted
Navigator.pushNamed(
context,
'/conversation',
arguments: _textController.text,
).then((_) => _textController.clear());
}
}
},
style: ElevatedButton.styleFrom(
backgroundColor: const Color.fromARGB(255, 61, 62, 63),
minimumSize: const Size(double.infinity, 50),
),
child: const Text('start conversation',
style: TextStyle(color: Colors.white)),
),
),
const SizedBox(width: 16),
Consumer(
builder: (context, ref, child) {
final voiceState = ref.watch(voiceCommandProvider);
return ElevatedButton(
onPressed: () => _toggleRecording(voiceState.isListening),
style: ElevatedButton.styleFrom(
backgroundColor:
voiceState.isListening ? Colors.red : Colors.white,
shape: const CircleBorder(),
padding: const EdgeInsets.all(16),
),
child: const Icon(Icons.mic, size: 28),
);
},
),
],
),
],
),
),
);
}
Future<void> _toggleRecording(bool isCurrentlyListening) async {
if (isCurrentlyListening) {
// Play sound before stopping
await ref.read(translationRepositoryProvider).playUISound('mic_off');
await _stopVoiceRecording();
} else {
// Play sound before starting
await ref.read(translationRepositoryProvider).playUISound('mic_on');
await _startVoiceRecording();
}
}
}
Backend Speech Service (Python/FastAPI):
class SpeechService:
def __init__(self):
self.speech_key = os.getenv("AZURE_SPEECH_KEY")
self.speech_region = os.getenv("AZURE_SPEECH_REGION")
if not self.speech_key or not self.speech_region:
raise ValueError("Azure Speech credentials not found")
self.speech_config = speechsdk.SpeechConfig(
subscription=self.speech_key,
region=self.speech_region
)
self.speech_config.speech_recognition_language = "en-EN"
# Initialize speech recognizer for general audio processing
self.recognizer = sr.Recognizer()
self.recognizer.energy_threshold = 300
self.recognizer.dynamic_energy_threshold = True
# Define wake words/commands
self.WAKE_WORDS = {
"open": "START_RECORDING",
"stop": "STOP_RECORDING"
}
# Audio format configuration
self.supported_formats = [".wav", ".aac", ".mp3", ".ogg", ".mp4", ".m4a"]
self.valid_mime_types = [
"audio/wav", "audio/aac", "audio/mpeg", "audio/ogg",
"audio/mp4", "audio/x-m4a"
]
self.translation_service = TranslationService()
async def process_command(self, audio_path: str) -> str:
"""Process audio for wake word detection using Azure Speech Services"""
working_path = audio_path
converted_path = None
try:
# Convert to WAV if needed
if not working_path.lower().endswith(".wav"):
converted_path = await self._convert_to_wav(working_path)
working_path = converted_path
# Set up Azure speech recognition
audio_config = speechsdk.AudioConfig(filename=working_path)
speech_recognizer = speechsdk.SpeechRecognizer(
speech_config=self.speech_config,
audio_config=audio_config
)
# Use promise for async recognition
done = False
recognized_text = None
def handle_result(evt):
nonlocal done, recognized_text
if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
recognized_text = evt.result.text.lower().strip()
done = True
speech_recognizer.recognized.connect(handle_result)
# Start recognition
speech_recognizer.start_continuous_recognition()
# Wait for result with timeout
timeout = 5 # 5 seconds timeout
start_time = asyncio.get_event_loop().time()
while not done:
if asyncio.get_event_loop().time() - start_time > timeout:
speech_recognizer.stop_continuous_recognition()
raise HTTPException(
status_code=408,
detail="Recognition timeout"
)
await asyncio.sleep(0.1)
speech_recognizer.stop_continuous_recognition()
# Check if recognized text matches any wake words
if recognized_text in self.WAKE_WORDS:
return recognized_text
return "UNKNOWN_COMMAND"
except Exception as e:
logger.error(f"Command processing error: {str(e)}")
raise HTTPException(
status_code=500,
detail=f"Command processing failed: {str(e)}"
)
finally:
# Cleanup temporary files
await self._cleanup_temp_files(converted_path)
Expected behavior:
Saying "open" should start recording and turn the mic button red; after "stop", the recorded speech should be transcribed and translated into the prompt field, e.g. "esto es una prueba" coming back as "this is a test".
Actual behavior:
Saying "open" never starts listening and the mic button never turns red. I suspect there is an issue with the state management or with how the voice commands are being processed, but I can't figure out where the problem lies.
The code below shows the implementation logic I'm after; it's what I currently use, but it relies on Picovoice, which is relatively expensive for extensive use. I'm including the Picovoice version to demonstrate the intended logic; I want to replace it with Azure, since Azure is more cost-effective, and I'd like to learn how to implement the same flow with it (a rough sketch of what I'm aiming for is at the end of this post).
import 'package:flutter/cupertino.dart';
import 'package:flutter/material.dart';
import 'package:flutter_riverpod/flutter_riverpod.dart';
import 'package:porcupine_flutter/porcupine.dart';
import 'package:porcupine_flutter/porcupine_error.dart';
import 'package:porcupine_flutter/porcupine_manager.dart';
import 'package:speech_to_text/speech_to_text.dart' as stt;
import '../../domain/repositories/translation_repository.dart';
import '../providers/audio_recorder_provider.dart';
import '../providers/voice_command_provider.dart';
import '../widgets/voice_command_status_inficator.dart';
final isListeningProvider = StateProvider<bool>((ref) => false);
class PromptScreen extends ConsumerStatefulWidget {
const PromptScreen({super.key});
@override
ConsumerState<PromptScreen> createState() => _PromptScreenState();
}
class _PromptScreenState extends ConsumerState<PromptScreen> {
late final TextEditingController _textController;
late final AudioRecorder _recorder;
late PorcupineManager _porcupineManager;
late stt.SpeechToText _speech;
bool _isWakeWordMode = true;
@override
void initState() {
super.initState();
_textController = TextEditingController();
_recorder = ref.read(audioRecorderProvider);
_speech = stt.SpeechToText();
_initializeRecorder();
_initPorcupine();
}
Future<void> _initializeRecorder() async {
try {
await _recorder.init();
} catch (e) {
debugPrint('Recorder init error: $e');
}
}
void _initPorcupine() async {
try {
_porcupineManager = await PorcupineManager.fromBuiltInKeywords(
'PICOVOICE_API_KEY',
[BuiltInKeyword.JARVIS, BuiltInKeyword.ALEXA],
_wakeWordCallback,
);
await _porcupineManager.start();
debugPrint("Porcupine initialized successfully");
} on PorcupineException catch (err) {
debugPrint("Failed to initialize Porcupine: ${err.message}");
}
}
Future<void> _startConversation() async {
if (_textController.text.isNotEmpty) {
await ref.read(translationRepositoryProvider).playUISound('start_conversation');
if (mounted) {
Navigator.pushNamed(
context,
'/conversation',
arguments: _textController.text,
).then((_) => _textController.clear());
}
}
}
void _wakeWordCallback(int keywordIndex) async {
if (!mounted) return;
// JARVIS detected
if (keywordIndex == 0 && _isWakeWordMode) {
await _startVoiceRecording();
_isWakeWordMode = false;
}
// ALEXA detected
else if (keywordIndex == 1 && !_isWakeWordMode) {
await _stopVoiceRecording();
_isWakeWordMode = true;
// Automatically start conversation after stopping recording
if (_textController.text.isNotEmpty) {
await _startConversation();
}
}
}
void _handleVoiceCommand(VoiceCommandState state) {
if (!mounted) return;
setState(() {});
if (state.error != null) {
ScaffoldMessenger.of(context)
.showSnackBar(SnackBar(content: Text(state.error!)));
}
}
Future<void> _startVoiceRecording() async {
try {
await ref.read(translationRepositoryProvider).playUISound('mic_on');
await _recorder.startListening("open");
ref.read(isListeningProvider.notifier).state = true;
final currentState = ref.read(voiceCommandProvider);
ref.read(voiceCommandProvider.notifier).state =
currentState.copyWith(isListening: true);
} catch (e) {
debugPrint('Recording start error: $e');
}
}
Future<void> _stopVoiceRecording() async {
try {
await ref.read(translationRepositoryProvider).playUISound('mic_off');
final path = await _recorder.stopListening();
if (path != null) {
var text = await ref
.read(translationRepositoryProvider)
.processAudioInput(path);
// Filter out wake words from the recognized text
text = text.replaceAll(RegExp(r'\b(?:jarvis|alexa)\b', caseSensitive: false), '').trim();
// Only update text if there's actual content after filtering
if (text.isNotEmpty) {
_textController.text = text;
}
}
} catch (e) {
debugPrint('Recording stop error: $e');
} finally {
ref.read(isListeningProvider.notifier).state = false;
final currentState = ref.read(voiceCommandProvider);
ref.read(voiceCommandProvider.notifier).state =
currentState.copyWith(isListening: false);
}
}
@override
void dispose() {
_porcupineManager.delete();
_recorder.dispose();
_textController.dispose();
super.dispose();
}
@override
Widget build(BuildContext context) {
final voiceState = ref.watch(voiceCommandProvider);
ref.listen<VoiceCommandState>(voiceCommandProvider, (_, state) {
if (!mounted) return;
_handleVoiceCommand(state);
});
return Scaffold(
backgroundColor: const Color(0xFF000000),
appBar: CupertinoNavigationBar(
backgroundColor: const Color(0xFF1C1C1E),
border: null,
middle: const Text('AI Chat Assistant',
style: TextStyle(
color: Colors.white,
fontSize: 17,
fontWeight: FontWeight.w600)),
trailing: CupertinoButton(
padding: EdgeInsets.zero,
child: const Icon(CupertinoIcons.gear,
color: CupertinoColors.systemGrey, size: 28),
onPressed: () => Navigator.pushNamed(context, '/settings'),
),
),
body: Padding(
padding: const EdgeInsets.all(16.0),
child: Column(
children: [
VoiceCommandStatusIndicator(
isListening: voiceState.isListening,
),
Text(
_isWakeWordMode
? 'Say "Jarvis" to start listening'
: 'Say "Alexa" to stop listening and start conversation',
style: const TextStyle(color: Colors.white, fontSize: 14),
),
const SizedBox(height: 12),
Expanded(
child: Align(
alignment: Alignment.topLeft,
child: CupertinoTextField(
controller: _textController,
maxLines: null,
style: const TextStyle(color: Colors.white, fontSize: 17),
placeholder: 'write your prompt here',
placeholderStyle: const TextStyle(
color: CupertinoColors.placeholderText, fontSize: 17),
decoration: BoxDecoration(
color: const Color(0xFF2C2C2E),
borderRadius: BorderRadius.circular(12),
border: Border.all(
color: const Color(0xFF3A3A3C),
width: 0.5,
),
),
padding: const EdgeInsets.all(16),
),
),
),
const SizedBox(height: 20),
Row(
children: [
Expanded(
child: ElevatedButton(
onPressed: _startConversation,
style: ElevatedButton.styleFrom(
backgroundColor: const Color.fromARGB(255, 61, 62, 63),
minimumSize: const Size(double.infinity, 50),
),
child: const Text('start conversation',
style: TextStyle(color: Colors.white)),
),
),
const SizedBox(width: 16),
Consumer(
builder: (context, ref, child) {
final voiceState = ref.watch(voiceCommandProvider);
return ElevatedButton(
onPressed: () => _toggleRecording(voiceState.isListening),
style: ElevatedButton.styleFrom(
backgroundColor:
voiceState.isListening ? Colors.red : Colors.white,
shape: const CircleBorder(),
padding: const EdgeInsets.all(16),
),
child: const Icon(Icons.mic, size: 28, color: Colors.black,),
);
},
),
],
),
],
),
),
);
}
Future<void> _toggleRecording(bool isCurrentlyListening) async {
if (isCurrentlyListening) {
await _stopVoiceRecording();
_isWakeWordMode = true;
} else {
await _startVoiceRecording();
_isWakeWordMode = false;
}
}
}
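To make the goal concrete, this is roughly the Flutter-side flow I would like to end up with once Porcupine is removed: record a short chunk, send it to my backend, and let the result of SpeechService.process_command drive the same start/stop logic that the wake-word callback drives today. This is only a sketch of the intent, not working code — the /speech/command endpoint, the 'file' multipart field, and the 'command' JSON key are placeholders, not my actual API.

import 'dart:convert';
import 'package:http/http.dart' as http;

/// Sends a recorded audio chunk to the backend and returns the detected
/// command ("open", "stop" or "UNKNOWN_COMMAND"), assuming the route is a
/// thin wrapper around SpeechService.process_command. The endpoint URL,
/// multipart field name, and response key are placeholders.
Future<String> detectCommand(String audioPath) async {
  final request = http.MultipartRequest(
    'POST',
    Uri.parse('https://my-backend.example.com/speech/command'),
  );
  request.files.add(await http.MultipartFile.fromPath('file', audioPath));

  final response = await http.Response.fromStream(await request.send());
  if (response.statusCode != 200) {
    throw Exception('Wake-word request failed: ${response.statusCode}');
  }
  return jsonDecode(response.body)['command'] as String;
}

/// Intended replacement for the Porcupine callback: feed each chunk through
/// detectCommand and reuse the existing VoiceCommandNotifier logic.
Future<void> handleAudioChunk(String audioPath, VoiceCommandNotifier notifier) async {
  final command = await detectCommand(audioPath);
  if (command == 'open' || command == 'stop') {
    await notifier.processVoiceCommand(command);
  }
}

Is this a reasonable way to structure the Azure-based replacement, or is there a better pattern for wake-word detection with Azure Speech Services?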