flutter, azure, speech-recognition, riverpod, state-management

Flutter: Voice command 'open' not activating microphone listening state with Azure Speech Services


I'm building a Flutter application that uses Azure Speech Services for voice commands. When I say "open", the microphone should start listening (indicated by turning red), but it's not working correctly. Here's my complete implementation:

AudioRecorder Provider:


final isListeningProvider = StateProvider<bool>((ref) => false);

final audioRecorderProvider = Provider<AudioRecorder>((ref) => AudioRecorder(ref));

class AudioRecorder {
  final FlutterSoundRecorder _recorder = FlutterSoundRecorder();
  bool _isInitialized = false;
  String? _path;
  final Ref _ref;

  AudioRecorder(this._ref);

  bool get isListening => _ref.read(isListeningProvider);

  Future<void> init() async {
    if (!_isInitialized) {
      final status = await Permission.microphone.request();
      if (status != PermissionStatus.granted) {
        throw RecordingPermissionException('Microphone permission not granted');
      }
      await _recorder.openRecorder();
      _isInitialized = true;
    }
  }

  Future<void> startListening(String command) async {
    if (!_isInitialized) await init();
    
    if (command.toLowerCase() == "open") {
      try {
        final dir = await getTemporaryDirectory();
        _path = '${dir.path}/audio_${DateTime.now().millisecondsSinceEpoch}.aac';
        await _recorder.startRecorder(
          toFile: _path,
          codec: Codec.aacADTS,
        );
        _ref.read(isListeningProvider.notifier).state = true;
      } catch (e) {
        debugPrint('Error starting recording: $e');
      }
    }
  }

  Future<String?> stopListening() async {
    try {
      if (_recorder.isRecording) {
        await _recorder.stopRecorder();
        _ref.read(isListeningProvider.notifier).state = false;
        return _path;
      }
      return null;
    } catch (e) {
      debugPrint('Error stopping recording: $e');
      return null;
    }
  }

  Future<void> start() async {
    if (!_isInitialized) await init();
    try {
      final dir = await getTemporaryDirectory();
      _path = '${dir.path}/audio_${DateTime.now().millisecondsSinceEpoch}.aac';
      await _recorder.startRecorder(
        toFile: _path,
        codec: Codec.aacADTS,
      );
      _ref.read(isListeningProvider.notifier).state = true;
    } catch (e) {
      debugPrint('Error recording audio: $e');
    }
  }

  Future<String?> stop() async {
    try {
      if (_recorder.isRecording) {
        await _recorder.stopRecorder();
        _ref.read(isListeningProvider.notifier).state = false;
        return _path;
      }
      return null;
    } catch (e) {
      debugPrint('Error stopping recording: $e');
      return null;
    }
  }

  Future<bool> isRecording() async {
    return _recorder.isRecording;
  }

  Future<void> dispose() async {
    if (_isInitialized) {
      await _recorder.closeRecorder();
      _isInitialized = false;
    }
  }
}

Voice Command State and Provider:


class VoiceCommandState {
  final bool isListening;
  final String? lastCommand;
  final String? error;
  final bool isProcessing; 

  VoiceCommandState({
    this.isListening = false,
    this.lastCommand,
    this.error,
    this.isProcessing = false,
  });

  
  VoiceCommandState copyWith({
    bool? isListening,
    String? lastCommand,
    String? error,
    bool? isProcessing,
  }) {
    return VoiceCommandState(
      isListening: isListening ?? this.isListening,
      lastCommand: lastCommand ?? this.lastCommand,
      error: error ?? this.error,
      isProcessing: isProcessing ?? this.isProcessing,
    );
  }
}

class VoiceCommandNotifier extends StateNotifier<VoiceCommandState> {
  final AudioRecorder _recorder;
  final TranslationRepository _repository;
  final Ref _ref;

  VoiceCommandNotifier(this._recorder, this._repository, this._ref)
      : super(VoiceCommandState());

  Future<void> processVoiceCommand(String command) async {
    try {
      final commandLower = command.toLowerCase();
      
      if (commandLower == "open") {
        // First update prompt screen state
        _ref.read(promptScreenProvider.notifier).setListening(true);
        
        // Start recording first
        try {
          await _recorder.startListening(command);
          // Only update state after successful start of listening
          state = state.copyWith(
            isListening: true,
            lastCommand: command,
            isProcessing: false
          );
        } catch (e) {
          // If recording fails, update both states accordingly
          _ref.read(promptScreenProvider.notifier).setListening(false);
          state = state.copyWith(
            isListening: false,
            error: e.toString(),
            isProcessing: false
          );
          rethrow; // Re-throw to be caught by outer try-catch
        }
      } else if (commandLower == "stop") {
        if (state.isListening) {
          try {
            final audioPath = await _recorder.stopListening();
            _ref.read(promptScreenProvider.notifier).setListening(false);
            
            if (audioPath != null) {
              state = state.copyWith(isProcessing: true);
              final text = await _repository.processAudioInput(audioPath);
              _ref.read(promptScreenProvider.notifier).updateText(text);
              
              state = state.copyWith(
                isListening: false,
                lastCommand: text,
                isProcessing: false
              );
            } else {
              state = state.copyWith(
                isListening: false,
                error: "Failed to get audio path",
                isProcessing: false
              );
            }
          } catch (e) {
            state = state.copyWith(
              isListening: false,
              error: e.toString(),
              isProcessing: false
            );
          }
        }
      }
    } catch (e) {
      state = state.copyWith(
        isListening: false,
        error: e.toString(),
        isProcessing: false
      );
    }
  }

  Future<void> handleSpeechRecognition(String audioPath) async {
    try {
      final text = await _repository.processAudioInput(audioPath);
      if (text.toLowerCase() == "open") {
        await processVoiceCommand("open");
      } else if (text.toLowerCase() == "stop") {
        await processVoiceCommand("stop");
      }
    } catch (e) {
      state = state.copyWith(
        isListening: false,
        error: e.toString(),
        isProcessing: false
      );
    }
  }
}

final voiceCommandProvider = StateNotifierProvider<VoiceCommandNotifier, VoiceCommandState>((ref) {
  return VoiceCommandNotifier(
    ref.watch(audioRecorderProvider),
    ref.watch(translationRepositoryProvider),
    ref,
  );
});


Prompt Screen Implementation:

final isListeningProvider = StateProvider<bool>((ref) => false);

class PromptScreen extends ConsumerStatefulWidget {
  const PromptScreen({super.key});

  @override
  ConsumerState<PromptScreen> createState() => _PromptScreenState();
}

class _PromptScreenState extends ConsumerState<PromptScreen> {
  late final TextEditingController _textController;
  late final AudioRecorder _recorder;

  @override
  void initState() {
    super.initState();
    _textController = TextEditingController();
    _recorder = ref.read(audioRecorderProvider);

    _initializeRecorder();
  }

  Future<void> _initializeRecorder() async {
    try {
      await _recorder.init();
    } catch (e) {
      debugPrint('Recorder init error: $e');
    }
  }

  void _handleVoiceCommand(VoiceCommandState state) {
    if (!mounted) return;
    setState(() {}); // Force UI update

    if (state.lastCommand?.toLowerCase() == "open") {
      _startVoiceRecording();
    } else if (state.lastCommand?.toLowerCase() == "stop") {
      _stopVoiceRecording();
    }

    if (state.error != null) {
      ScaffoldMessenger.of(context)
          .showSnackBar(SnackBar(content: Text(state.error!)));
    }
  }

  Future<void> _startVoiceRecording() async {
    try {
      await _recorder.startListening("open");
      ref.read(isListeningProvider.notifier).state = true;
      final currentState = ref.read(voiceCommandProvider);
      ref.read(voiceCommandProvider.notifier).state =
          currentState.copyWith(isListening: true);
    } catch (e) {
      debugPrint('Recording start error: $e');
    }
  }

  Future<void> _stopVoiceRecording() async {
    try {
      final path = await _recorder.stopListening();
      if (path != null) {
        final text = await ref
            .read(translationRepositoryProvider)
            .processAudioInput(path);
        _textController.text = text;
      }
    } catch (e) {
      debugPrint('Recording stop error: $e');
    } finally {
      ref.read(isListeningProvider.notifier).state = false;
      final currentState = ref.read(voiceCommandProvider);
      ref.read(voiceCommandProvider.notifier).state =
          currentState.copyWith(isListening: false);
    }
  }

  @override
  void dispose() {
    _recorder.dispose();
    _textController.dispose();
    super.dispose();
  }

  @override
  Widget build(BuildContext context) {
    final voiceState = ref.watch(voiceCommandProvider);

    // Add listener for voice commands
    ref.listen<VoiceCommandState>(voiceCommandProvider, (_, state) {
      if (!mounted) return;
      _handleVoiceCommand(state);
    });


    return Scaffold(
      // ... scaffold code
        Row(
              children: [
                Expanded(
                  child: ElevatedButton(
                    onPressed: () async {
                      // Make onPressed async
                      if (_textController.text.isNotEmpty) {
                        // Play sound before navigation
                        await ref
                            .read(translationRepositoryProvider)
                            .playUISound('start_conversation');

                        // Navigate after sound plays
                        if (mounted) {
                          // Check if widget is still mounted
                          Navigator.pushNamed(
                            context,
                            '/conversation',
                            arguments: _textController.text,
                          ).then((_) => _textController.clear());
                        }
                      }
                    },
                    style: ElevatedButton.styleFrom(
                      backgroundColor: const Color.fromARGB(255, 61, 62, 63),
                      minimumSize: const Size(double.infinity, 50),
                    ),
                    child: const Text('start conversation',
                        style: TextStyle(color: Colors.white)),
                  ),
                ),
                const SizedBox(width: 16),
                Consumer(
                  builder: (context, ref, child) {
                    final voiceState = ref.watch(voiceCommandProvider);
                    return ElevatedButton(
                      onPressed: () => _toggleRecording(voiceState.isListening),
                      style: ElevatedButton.styleFrom(
                        backgroundColor:
                            voiceState.isListening ? Colors.red : Colors.white,
                        shape: const CircleBorder(),
                        padding: const EdgeInsets.all(16),
                      ),
                      child: const Icon(Icons.mic, size: 28),
                    );
                  },
                ),
              ],
            ),
          ],
        ),
      ),
    );
  }

  Future<void> _toggleRecording(bool isCurrentlyListening) async {
    if (isCurrentlyListening) {
      // Play sound before stopping
      await ref.read(translationRepositoryProvider).playUISound('mic_off');
      await _stopVoiceRecording();
    } else {
      // Play sound before starting
      await ref.read(translationRepositoryProvider).playUISound('mic_on');
      await _startVoiceRecording();
    }
  }
}

Backend Speech Service (Python/FastAPI):



class SpeechService:
    def __init__(self):
        self.speech_key = os.getenv("AZURE_SPEECH_KEY")
        self.speech_region = os.getenv("AZURE_SPEECH_REGION")
        
        if not self.speech_key or not self.speech_region:
            raise ValueError("Azure Speech credentials not found")
            
        self.speech_config = speechsdk.SpeechConfig(
            subscription=self.speech_key,
            region=self.speech_region
        )
        self.speech_config.speech_recognition_language = "en-EN"
        
        # Initialize speech recognizer for general audio processing
        self.recognizer = sr.Recognizer()
        self.recognizer.energy_threshold = 300
        self.recognizer.dynamic_energy_threshold = True
        
        # Define wake words/commands
        self.WAKE_WORDS = {
            "open": "START_RECORDING",
            "stop": "STOP_RECORDING"
        }
        
        # Audio format configuration
        self.supported_formats = [".wav", ".aac", ".mp3", ".ogg", ".mp4", ".m4a"]
        self.valid_mime_types = [
            "audio/wav", "audio/aac", "audio/mpeg", "audio/ogg",
            "audio/mp4", "audio/x-m4a"
        ]
        
        self.translation_service = TranslationService()

    async def process_command(self, audio_path: str) -> str:
        """Process audio for wake word detection using Azure Speech Services"""
        working_path = audio_path
        converted_path = None
        
        try:
            # Convert to WAV if needed
            if not working_path.lower().endswith(".wav"):
                converted_path = await self._convert_to_wav(working_path)
                working_path = converted_path

            # Set up Azure speech recognition
            audio_config = speechsdk.AudioConfig(filename=working_path)
            speech_recognizer = speechsdk.SpeechRecognizer(
                speech_config=self.speech_config,
                audio_config=audio_config
            )

            # Use promise for async recognition
            done = False
            recognized_text = None

            def handle_result(evt):
                nonlocal done, recognized_text
                if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                    recognized_text = evt.result.text.lower().strip()
                done = True

            speech_recognizer.recognized.connect(handle_result)
            
            # Start recognition
            speech_recognizer.start_continuous_recognition()
            
            # Wait for result with timeout
            timeout = 5  # 5 seconds timeout
            start_time = asyncio.get_event_loop().time()
            
            while not done:
                if asyncio.get_event_loop().time() - start_time > timeout:
                    speech_recognizer.stop_continuous_recognition()
                    raise HTTPException(
                        status_code=408,
                        detail="Recognition timeout"
                    )
                await asyncio.sleep(0.1)
            
            speech_recognizer.stop_continuous_recognition()

            # Check if recognized text matches any wake words
            if recognized_text in self.WAKE_WORDS:
                return recognized_text
            
            return "UNKNOWN_COMMAND"

        except Exception as e:
            logger.error(f"Command processing error: {str(e)}")
            raise HTTPException(
                status_code=500,
                detail=f"Command processing failed: {str(e)}"
            )
        finally:
            # Cleanup temporary files
            await self._cleanup_temp_files(converted_path)
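
For context, processAudioInput on the Flutter side is expected to upload the recorded file to this FastAPI service and return the recognized text. A minimal sketch of what I mean, with the base URL, route, and form field name as placeholders rather than my actual repository code:

import 'package:http/http.dart' as http;

// Illustrative only: how the recorded audio could be posted to the backend.
// The address, route, and field name below are placeholders.
const String _baseUrl = 'http://10.0.2.2:8000'; // hypothetical backend address

Future<String> processAudioInput(String audioPath) async {
  final uri = Uri.parse('$_baseUrl/speech/command'); // hypothetical route
  final request = http.MultipartRequest('POST', uri)
    ..files.add(await http.MultipartFile.fromPath('file', audioPath));

  final streamed = await request.send();
  final response = await http.Response.fromStream(streamed);
  if (response.statusCode != 200) {
    throw Exception('Speech service error: ${response.statusCode}');
  }
  return response.body; // recognized text / command returned by the backend
}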

Expected behavior:

  1. When I say "open", the microphone should start listening (turn red)
  2. The mic should stay in listening state until I say "stop"
  3. While in listening state, it should perform speech recognition

For example, the recognized Spanish speech should come back translated, something like:

"esto es una prueba" → "this is a test"


Actual behavior:

When I say "open", the microphone never enters the listening state and the button never turns red. I suspect there might be an issue with the state management or with how the voice commands are being processed, but I can't figure out where the problem lies.
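
To narrow this down, a Riverpod observer that logs every provider change might help me see whether the listening state ever actually flips; a minimal sketch (assuming Riverpod 2.x, and not something that's already wired into my app):

import 'package:flutter/foundation.dart';
import 'package:flutter_riverpod/flutter_riverpod.dart';

// Hypothetical debugging aid: logs every provider update so I can check whether
// isListeningProvider / voiceCommandProvider ever transition to "listening".
class StateLogger extends ProviderObserver {
  @override
  void didUpdateProvider(
    ProviderBase<Object?> provider,
    Object? previousValue,
    Object? newValue,
    ProviderContainer container,
  ) {
    debugPrint('${provider.name ?? provider.runtimeType}: '
        '$previousValue -> $newValue');
  }
}

// Usage: ProviderScope(observers: [StateLogger()], child: MyApp())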


Solution

  • This is the correct implementation logic. I'm currently using Picovoice for the wake words, which is relatively expensive for extensive use, so I aim to replace it with Azure, which is more cost-effective, and I'd still like to learn how to implement the same thing with Azure. For now, I'll demonstrate the working logic with Picovoice.

    prompt_screen.dart

    import 'package:flutter/cupertino.dart';
    import 'package:flutter/material.dart';
    import 'package:flutter_riverpod/flutter_riverpod.dart';
    import 'package:porcupine_flutter/porcupine.dart';
    import 'package:porcupine_flutter/porcupine_error.dart';
    import 'package:porcupine_flutter/porcupine_manager.dart';
    import 'package:speech_to_text/speech_to_text.dart' as stt;
    import '../../domain/repositories/translation_repository.dart';
    import '../providers/audio_recorder_provider.dart';
    import '../providers/voice_command_provider.dart';
    import '../widgets/voice_command_status_inficator.dart';
    
    final isListeningProvider = StateProvider<bool>((ref) => false);
    
    class PromptScreen extends ConsumerStatefulWidget {
      const PromptScreen({super.key});
    
      @override
      ConsumerState<PromptScreen> createState() => _PromptScreenState();
    }
    
    class _PromptScreenState extends ConsumerState<PromptScreen> {
      late final TextEditingController _textController;
      late final AudioRecorder _recorder;
      late PorcupineManager _porcupineManager;
      late stt.SpeechToText _speech;
      bool _isWakeWordMode = true;
    
      @override
      void initState() {
        super.initState();
        _textController = TextEditingController();
        _recorder = ref.read(audioRecorderProvider);
        _speech = stt.SpeechToText();
    
        _initializeRecorder();
        _initPorcupine();
      }
    
      Future<void> _initializeRecorder() async {
        try {
          await _recorder.init();
        } catch (e) {
          debugPrint('Recorder init error: $e');
        }
      }
    
      void _initPorcupine() async {
        try {
          _porcupineManager = await PorcupineManager.fromBuiltInKeywords(
            'PICOVOICE_API_KEY',
            [BuiltInKeyword.JARVIS, BuiltInKeyword.ALEXA],
            _wakeWordCallback,
          );
          await _porcupineManager.start();
          debugPrint("Porcupine initialized successfully");
        } on PorcupineException catch (err) {
          debugPrint("Failed to initialize Porcupine: ${err.message}");
        }
      }
    
      Future<void> _startConversation() async {
        if (_textController.text.isNotEmpty) {
          await ref.read(translationRepositoryProvider).playUISound('start_conversation');
    
          if (mounted) {
            Navigator.pushNamed(
              context,
              '/conversation',
              arguments: _textController.text,
            ).then((_) => _textController.clear());
          }
        }
      }
    
      void _wakeWordCallback(int keywordIndex) async {
        if (!mounted) return;
    
        // JARVIS detected
        if (keywordIndex == 0 && _isWakeWordMode) {
          await _startVoiceRecording();
          _isWakeWordMode = false;
        }
        // ALEXA detected
        else if (keywordIndex == 1 && !_isWakeWordMode) {
          await _stopVoiceRecording();
          _isWakeWordMode = true;
          
          // Automatically start conversation after stopping recording
          if (_textController.text.isNotEmpty) {
            await _startConversation();
          }
        }
      }
    
      void _handleVoiceCommand(VoiceCommandState state) {
        if (!mounted) return;
        setState(() {});
    
        if (state.error != null) {
          ScaffoldMessenger.of(context)
              .showSnackBar(SnackBar(content: Text(state.error!)));
        }
      }
    
      Future<void> _startVoiceRecording() async {
        try {
          await ref.read(translationRepositoryProvider).playUISound('mic_on');
          await _recorder.startListening("open");
          ref.read(isListeningProvider.notifier).state = true;
          final currentState = ref.read(voiceCommandProvider);
          ref.read(voiceCommandProvider.notifier).state =
              currentState.copyWith(isListening: true);
        } catch (e) {
          debugPrint('Recording start error: $e');
        }
      }
    
      Future<void> _stopVoiceRecording() async {
        try {
          await ref.read(translationRepositoryProvider).playUISound('mic_off');
          final path = await _recorder.stopListening();
          if (path != null) {
            var text = await ref
                .read(translationRepositoryProvider)
                .processAudioInput(path);
    
            // Filter out wake words from the recognized text
            text = text.replaceAll(RegExp(r'\b(?:jarvis|alexa)\b', caseSensitive: false), '').trim();
    
            // Only update text if there's actual content after filtering
            if (text.isNotEmpty) {
              _textController.text = text;
            }
          }
        } catch (e) {
          debugPrint('Recording stop error: $e');
        } finally {
          ref.read(isListeningProvider.notifier).state = false;
          final currentState = ref.read(voiceCommandProvider);
          ref.read(voiceCommandProvider.notifier).state =
              currentState.copyWith(isListening: false);
        }
      }
    
      @override
      void dispose() {
        _porcupineManager.delete();
        _recorder.dispose();
        _textController.dispose();
        super.dispose();
      }
    
      @override
      Widget build(BuildContext context) {
        final voiceState = ref.watch(voiceCommandProvider);
    
        ref.listen<VoiceCommandState>(voiceCommandProvider, (_, state) {
          if (!mounted) return;
          _handleVoiceCommand(state);
        });
    
        return Scaffold(
          backgroundColor: const Color(0xFF000000),
          appBar: CupertinoNavigationBar(
            backgroundColor: const Color(0xFF1C1C1E),
            border: null,
            middle: const Text('AI Chat Assistant',
                style: TextStyle(
                    color: Colors.white,
                    fontSize: 17,
                    fontWeight: FontWeight.w600)),
            trailing: CupertinoButton(
              padding: EdgeInsets.zero,
              child: const Icon(CupertinoIcons.gear,
                  color: CupertinoColors.systemGrey, size: 28),
              onPressed: () => Navigator.pushNamed(context, '/settings'),
            ),
          ),
          body: Padding(
            padding: const EdgeInsets.all(16.0),
            child: Column(
              children: [
                VoiceCommandStatusIndicator(
                  isListening: voiceState.isListening,
                ),
                Text(
                  _isWakeWordMode 
                    ? 'Say "Jarvis" to start listening'
                    : 'Say "Alexa" to stop listening and start conversation',
                  style: const TextStyle(color: Colors.white, fontSize: 14),
                ),
                const SizedBox(height: 12),
                Expanded(
                  child: Align(
                    alignment: Alignment.topLeft,
                    child: CupertinoTextField(
                      controller: _textController,
                      maxLines: null,
                      style: const TextStyle(color: Colors.white, fontSize: 17),
                      placeholder: 'write your prompt here',
                      placeholderStyle: const TextStyle(
                          color: CupertinoColors.placeholderText, fontSize: 17),
                      decoration: BoxDecoration(
                        color: const Color(0xFF2C2C2E),
                        borderRadius: BorderRadius.circular(12),
                        border: Border.all(
                          color: const Color(0xFF3A3A3C),
                          width: 0.5,
                        ),
                      ),
                      padding: const EdgeInsets.all(16),
                    ),
                  ),
                ),
                const SizedBox(height: 20),
                Row(
                  children: [
                    Expanded(
                      child: ElevatedButton(
                        onPressed: _startConversation,
                        style: ElevatedButton.styleFrom(
                          backgroundColor: const Color.fromARGB(255, 61, 62, 63),
                          minimumSize: const Size(double.infinity, 50),
                        ),
                        child: const Text('start conversation',
                            style: TextStyle(color: Colors.white)),
                      ),
                    ),
                    const SizedBox(width: 16),
                    Consumer(
                      builder: (context, ref, child) {
                        final voiceState = ref.watch(voiceCommandProvider);
                        return ElevatedButton(
                          onPressed: () => _toggleRecording(voiceState.isListening),
                          style: ElevatedButton.styleFrom(
                            backgroundColor:
                                voiceState.isListening ? Colors.red : Colors.white,
                            shape: const CircleBorder(),
                            padding: const EdgeInsets.all(16),
                          ),
                          child: const Icon(Icons.mic, size: 28, color: Colors.black,),
                        );
                      },
                    ),
                  ],
                ),
              ],
            ),
          ),
        );
      }
    
      Future<void> _toggleRecording(bool isCurrentlyListening) async {
        if (isCurrentlyListening) {
          await _stopVoiceRecording();
          _isWakeWordMode = true;
        } else {
          await _startVoiceRecording();
          _isWakeWordMode = false;
        }
      }
    }