#!/usr/bin/env python3
"""
Voice Processing Server for Maix Duino Voice Assistant

Purpose and usage:
    This server runs on Heimdall (10.1.10.71) and handles:
    - Audio stream reception from Maix Duino
    - Speech-to-text using Whisper
    - Intent recognition and Home Assistant API calls
    - Text-to-speech using Piper
    - Audio response streaming back to device

Path: /home/alan/voice-assistant/voice_server.py

Requirements:
    - whisper (already installed)
    - piper-tts
    - flask
    - requests
    - python-dotenv

Usage:
    python3 voice_server.py [--host HOST] [--port PORT] [--ha-url URL]
"""

import os
import sys
import argparse
import tempfile
import wave
import io
import re
import threading
import queue
import time
from pathlib import Path
from typing import Optional, Dict, Any, Tuple

import whisper
import requests
from flask import Flask, request, jsonify, send_file
from werkzeug.exceptions import BadRequest

# Try to load environment variables
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    print("Warning: python-dotenv not installed. Using environment variables only.")

# Try to import Mycroft Precise
PRECISE_AVAILABLE = False
try:
    from precise_runner import PreciseEngine, PreciseRunner
    import pyaudio
    PRECISE_AVAILABLE = True
except ImportError:
    print("Warning: Mycroft Precise not installed. Wake word detection disabled.")
    print("Install with: pip install mycroft-precise pyaudio")

# Configuration
DEFAULT_HOST = "0.0.0.0"
DEFAULT_PORT = 5000
DEFAULT_WHISPER_MODEL = "medium"
DEFAULT_HA_URL = os.getenv("HA_URL", "http://homeassistant.local:8123")
DEFAULT_HA_TOKEN = os.getenv("HA_TOKEN", "")
DEFAULT_PRECISE_MODEL = os.getenv("PRECISE_MODEL", "")
DEFAULT_PRECISE_SENSITIVITY = float(os.getenv("PRECISE_SENSITIVITY", "0.5"))
DEFAULT_PRECISE_ENGINE = "/usr/local/bin/precise-engine"

# Initialize Flask app
app = Flask(__name__)
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16MB max audio file

# Global variables for loaded models
whisper_model = None        # lazily-loaded Whisper model (see load_whisper_model)
ha_client = None            # HomeAssistantClient, set up in main()
precise_runner = None       # PreciseRunner instance when wake word is enabled
precise_enabled = False
wake_word_queue = queue.Queue()  # Queue for wake word detections


class HomeAssistantClient:
    """Client for interacting with the Home Assistant REST API."""

    def __init__(self, base_url: str, token: str):
        """
        Args:
            base_url: Home Assistant base URL (trailing slash is stripped).
            token: Long-lived access token used as a Bearer credential.
        """
        self.base_url = base_url.rstrip('/')
        self.token = token
        # A Session reuses the connection and carries auth headers on every call.
        self.session = requests.Session()
        self.session.headers.update({
            'Authorization': f'Bearer {token}',
            'Content-Type': 'application/json'
        })

    def get_state(self, entity_id: str) -> Optional[Dict[str, Any]]:
        """Get the state of an entity, or None on any request failure."""
        try:
            response = self.session.get(f'{self.base_url}/api/states/{entity_id}')
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            print(f"Error getting state for {entity_id}: {e}")
            return None

    def call_service(self, domain: str, service: str, entity_id: str, **kwargs) -> bool:
        """Call a Home Assistant service; returns True on HTTP success."""
        try:
            data = {'entity_id': entity_id}
            data.update(kwargs)
            response = self.session.post(
                f'{self.base_url}/api/services/{domain}/{service}',
                json=data
            )
            response.raise_for_status()
            return True
        except requests.RequestException as e:
            print(f"Error calling service {domain}.{service}: {e}")
            return False

    def turn_on(self, entity_id: str, **kwargs) -> bool:
        """Turn on an entity (domain inferred from the entity ID)."""
        domain = entity_id.split('.')[0]
        return self.call_service(domain, 'turn_on', entity_id, **kwargs)

    def turn_off(self, entity_id: str, **kwargs) -> bool:
        """Turn off an entity (domain inferred from the entity ID)."""
        domain = entity_id.split('.')[0]
        return self.call_service(domain, 'turn_off', entity_id, **kwargs)

    def toggle(self, entity_id: str, **kwargs) -> bool:
        """Toggle an entity (domain inferred from the entity ID)."""
        domain = entity_id.split('.')[0]
        return self.call_service(domain, 'toggle', entity_id, **kwargs)


class IntentParser:
    """Simple pattern-based intent recognition."""

    # Intent patterns (can be expanded or replaced with ML-based NLU)
    PATTERNS = {
        'turn_on': [
            r'turn on (the )?(.+)',
            r'switch on (the )?(.+)',
            r'enable (the )?(.+)',
        ],
        'turn_off': [
            r'turn off (the )?(.+)',
            r'switch off (the )?(.+)',
            r'disable (the )?(.+)',
        ],
        'toggle': [
            r'toggle (the )?(.+)',
        ],
        'get_state': [
            r'what(?:\'s| is) (the )?(.+)',
            r'how is (the )?(.+)',
            r'status of (the )?(.+)',
        ],
        'get_temperature': [
            r'what(?:\'s| is) the temperature',
            r'how (?:warm|cold|hot) is it',
        ],
    }

    # Entity name mapping (friendly names to entity IDs)
    ENTITY_MAP = {
        'living room light': 'light.living_room',
        'living room lights': 'light.living_room',
        'bedroom light': 'light.bedroom',
        'bedroom lights': 'light.bedroom',
        'kitchen light': 'light.kitchen',
        'kitchen lights': 'light.kitchen',
        'all lights': 'group.all_lights',
        'temperature': 'sensor.temperature',
        'thermostat': 'climate.thermostat',
    }

    def parse(self, text: str) -> Optional[Tuple[str, str, Dict[str, Any]]]:
        """
        Parse text into intent, entity, and parameters.

        Returns:
            (intent, entity_id, params) or None if no match
        """
        text = text.lower().strip()

        for intent, patterns in self.PATTERNS.items():
            for pattern in patterns:
                match = re.match(pattern, text, re.IGNORECASE)
                if match:
                    # Extract entity name from match groups.
                    # BUG FIX: the optional "(the )?" group captures "the "
                    # with a trailing space; strip BEFORE the article check,
                    # otherwise "the" was selected as the entity name and the
                    # ENTITY_MAP lookup always failed for phrases like
                    # "turn on the kitchen light".
                    entity_name = None
                    for group in match.groups():
                        if group:
                            candidate = group.lower().strip()
                            if candidate and candidate not in ('the', 'a', 'an'):
                                entity_name = candidate
                                break

                    # Map entity name to entity ID
                    entity_id = None
                    if entity_name:
                        entity_id = self.ENTITY_MAP.get(entity_name)

                    # For get_temperature, use default sensor
                    if intent == 'get_temperature':
                        entity_id = self.ENTITY_MAP.get('temperature')

                    if entity_id:
                        return (intent, entity_id, {})

        return None


def load_whisper_model(model_name: str = DEFAULT_WHISPER_MODEL):
    """Load the Whisper model once and cache it in the module global."""
    global whisper_model
    if whisper_model is None:
        print(f"Loading Whisper model: {model_name}")
        whisper_model = whisper.load_model(model_name)
        print("Whisper model loaded successfully")
    return whisper_model


def transcribe_audio(audio_file_path: str) -> Optional[str]:
    """Transcribe an audio file using Whisper; returns None on failure."""
    try:
        model = load_whisper_model()
        result = model.transcribe(audio_file_path)
        return result['text'].strip()
    except Exception as e:
        print(f"Error transcribing audio: {e}")
        return None


def generate_tts(text: str) -> Optional[bytes]:
    """
    Generate speech from text using Piper TTS.

    TODO: Implement Piper TTS integration.
    For now, returns None - implement based on Piper installation.
    """
    # Placeholder for TTS implementation
    print(f"TTS requested for: {text}")
    # You'll need to add Piper TTS integration here
    # Example command: piper --model <model> --output_file <file> < text
    return None


def on_wake_word_detected():
    """
    Callback when Mycroft Precise detects the wake word.

    Called by the Precise runner; pushes a detection record onto
    wake_word_queue so the main application (or the /wake-word/detections
    endpoint) can pick it up and start processing the user's command.
    """
    print("Wake word detected by Precise!")
    wake_word_queue.put({
        'timestamp': time.time(),
        'source': 'precise'
    })


def start_precise_listener(model_path: str, sensitivity: float = 0.5,
                           engine_path: str = DEFAULT_PRECISE_ENGINE):
    """
    Start Mycroft Precise wake word detection.

    Args:
        model_path: Path to .net model file
        sensitivity: Detection threshold (0.0-1.0, default 0.5)
        engine_path: Path to precise-engine binary

    Returns:
        PreciseRunner instance if successful, None otherwise
    """
    global precise_runner, precise_enabled

    if not PRECISE_AVAILABLE:
        print("Error: Mycroft Precise not available")
        return None

    # Verify model exists
    if not os.path.exists(model_path):
        print(f"Error: Precise model not found: {model_path}")
        return None

    # Verify engine exists
    if not os.path.exists(engine_path):
        print(f"Error: precise-engine not found: {engine_path}")
        print("Download from: https://github.com/MycroftAI/mycroft-precise/releases")
        return None

    try:
        # Create Precise engine
        engine = PreciseEngine(engine_path, model_path)

        # Create runner with callback
        precise_runner = PreciseRunner(
            engine,
            sensitivity=sensitivity,
            on_activation=on_wake_word_detected
        )

        # Start listening
        precise_runner.start()
        precise_enabled = True

        print("Precise listening started:")
        print(f"  Model: {model_path}")
        print(f"  Sensitivity: {sensitivity}")
        print(f"  Engine: {engine_path}")

        return precise_runner
    except Exception as e:
        print(f"Error starting Precise: {e}")
        return None


def stop_precise_listener():
    """Stop Mycroft Precise wake word detection (no-op if never started)."""
    global precise_runner, precise_enabled
    if precise_runner:
        try:
            precise_runner.stop()
            precise_enabled = False
            print("Precise listener stopped")
        except Exception as e:
            print(f"Error stopping Precise: {e}")


def record_audio_after_wake(duration: int = 5) -> Optional[bytes]:
    """
    Record audio after the wake word is detected.

    Args:
        duration: Maximum recording duration in seconds

    Returns:
        WAV audio data or None

    Note:
        This is for server-side wake word detection where the server is also
        doing audio capture. For Maix Duino client-side wake detection, audio
        comes from the client.
    """
    if not PRECISE_AVAILABLE:
        return None

    try:
        # Audio settings
        CHUNK = 1024
        FORMAT = pyaudio.paInt16
        CHANNELS = 1
        RATE = 16000

        p = pyaudio.PyAudio()
        # FIX: ensure the PyAudio instance and stream are released even if
        # stream.read() raises, so a failed recording doesn't leak the device.
        try:
            stream = p.open(
                format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK
            )
            try:
                print(f"Recording for {duration} seconds...")
                frames = []
                for _ in range(0, int(RATE / CHUNK * duration)):
                    data = stream.read(CHUNK)
                    frames.append(data)
            finally:
                stream.stop_stream()
                stream.close()

            sample_width = p.get_sample_size(FORMAT)
        finally:
            p.terminate()

        # Convert to WAV
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(sample_width)
            wf.setframerate(RATE)
            wf.writeframes(b''.join(frames))

        return wav_buffer.getvalue()
    except Exception as e:
        print(f"Error recording audio: {e}")
        return None


def execute_intent(intent: str, entity_id: str, params: Dict[str, Any]) -> str:
    """Execute an intent against Home Assistant and return spoken-response text."""
    if intent == 'turn_on':
        success = ha_client.turn_on(entity_id)
        if success:
            entity_name = entity_id.split('.')[-1].replace('_', ' ')
            return f"Turned on {entity_name}"
        else:
            return "Sorry, I couldn't turn that on"

    elif intent == 'turn_off':
        success = ha_client.turn_off(entity_id)
        if success:
            entity_name = entity_id.split('.')[-1].replace('_', ' ')
            return f"Turned off {entity_name}"
        else:
            return "Sorry, I couldn't turn that off"

    elif intent == 'toggle':
        success = ha_client.toggle(entity_id)
        if success:
            entity_name = entity_id.split('.')[-1].replace('_', ' ')
            return f"Toggled {entity_name}"
        else:
            return "Sorry, I couldn't toggle that"

    elif intent in ['get_state', 'get_temperature']:
        state = ha_client.get_state(entity_id)
        if state:
            entity_name = entity_id.split('.')[-1].replace('_', ' ')
            value = state.get('state', 'unknown')
            unit = state.get('attributes', {}).get('unit_of_measurement', '')
            return f"The {entity_name} is {value} {unit}".strip()
        else:
            return "Sorry, I couldn't get that information"

    return "I didn't understand that command"


# Flask routes

@app.route('/health', methods=['GET'])
def health():
    """Health check endpoint."""
    return jsonify({
        'status': 'healthy',
        'whisper_loaded': whisper_model is not None,
        'ha_connected': ha_client is not None,
        'precise_enabled': precise_enabled,
        'precise_available': PRECISE_AVAILABLE
    })


@app.route('/wake-word/status', methods=['GET'])
def wake_word_status():
    """Get wake word detection status."""
    return jsonify({
        'enabled': precise_enabled,
        'available': PRECISE_AVAILABLE,
        'model': DEFAULT_PRECISE_MODEL if precise_enabled else None,
        'sensitivity': DEFAULT_PRECISE_SENSITIVITY if precise_enabled else None
    })


@app.route('/wake-word/detections', methods=['GET'])
def wake_word_detections():
    """
    Get recent wake word detections (non-blocking).

    Returns any wake word detections in the queue.
    Used for testing and monitoring.
    """
    detections = []
    try:
        # Drain the queue without blocking; Empty can still be raised if
        # another consumer races us between empty() and get_nowait().
        while not wake_word_queue.empty():
            detections.append(wake_word_queue.get_nowait())
    except queue.Empty:
        pass

    return jsonify({
        'detections': detections,
        'count': len(detections)
    })


@app.route('/transcribe', methods=['POST'])
def transcribe():
    """
    Transcribe an audio file.

    Expects: WAV audio file in request body
    Returns: JSON with transcribed text
    """
    if 'audio' not in request.files:
        raise BadRequest('No audio file provided')

    audio_file = request.files['audio']

    # Save to temporary file (delete=False so Whisper can reopen it by path)
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
        audio_file.save(temp_file.name)
        temp_path = temp_file.name

    try:
        # Transcribe
        text = transcribe_audio(temp_path)
        if text:
            return jsonify({
                'success': True,
                'text': text
            })
        else:
            return jsonify({
                'success': False,
                'error': 'Transcription failed'
            }), 500
    finally:
        # Clean up temp file
        if os.path.exists(temp_path):
            os.remove(temp_path)


@app.route('/process', methods=['POST'])
def process():
    """
    Process a complete voice command.

    Expects: WAV audio file in request body
    Returns: JSON with response and audio file
    """
    if 'audio' not in request.files:
        raise BadRequest('No audio file provided')

    audio_file = request.files['audio']

    # Save to temporary file
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
        audio_file.save(temp_file.name)
        temp_path = temp_file.name

    try:
        # Step 1: Transcribe
        text = transcribe_audio(temp_path)
        if not text:
            return jsonify({
                'success': False,
                'error': 'Transcription failed'
            }), 500

        print(f"Transcribed: {text}")

        # Step 2: Parse intent
        parser = IntentParser()
        intent_result = parser.parse(text)

        if not intent_result:
            response_text = "I didn't understand that command"
        else:
            intent, entity_id, params = intent_result
            print(f"Intent: {intent}, Entity: {entity_id}")

            # Step 3: Execute intent
            response_text = execute_intent(intent, entity_id, params)

        print(f"Response: {response_text}")

        # Step 4: Generate TTS (placeholder for now)
        # audio_response = generate_tts(response_text)

        return jsonify({
            'success': True,
            'transcription': text,
            'response': response_text,
            # 'audio_available': audio_response is not None
        })
    finally:
        # Clean up temp file
        if os.path.exists(temp_path):
            os.remove(temp_path)


@app.route('/tts', methods=['POST'])
def tts():
    """
    Generate TTS audio.

    Expects: JSON with 'text' field
    Returns: WAV audio file
    """
    data = request.get_json()
    if not data or 'text' not in data:
        raise BadRequest('No text provided')

    text = data['text']

    # Generate TTS
    audio_data = generate_tts(text)
    if audio_data:
        return send_file(
            io.BytesIO(audio_data),
            mimetype='audio/wav',
            as_attachment=True,
            download_name='response.wav'
        )
    else:
        return jsonify({
            'success': False,
            'error': 'TTS generation not implemented yet'
        }), 501


def main():
    """Parse CLI arguments, initialize clients/models, and run the server."""
    parser = argparse.ArgumentParser(
        description="Voice Processing Server for Maix Duino Voice Assistant"
    )
    parser.add_argument('--host', default=DEFAULT_HOST,
                        help=f'Server host (default: {DEFAULT_HOST})')
    parser.add_argument('--port', type=int, default=DEFAULT_PORT,
                        help=f'Server port (default: {DEFAULT_PORT})')
    parser.add_argument('--whisper-model', default=DEFAULT_WHISPER_MODEL,
                        help=f'Whisper model to use (default: {DEFAULT_WHISPER_MODEL})')
    parser.add_argument('--ha-url', default=DEFAULT_HA_URL,
                        help=f'Home Assistant URL (default: {DEFAULT_HA_URL})')
    parser.add_argument('--ha-token', default=DEFAULT_HA_TOKEN,
                        help='Home Assistant long-lived access token')
    parser.add_argument('--enable-precise', action='store_true',
                        help='Enable Mycroft Precise wake word detection')
    parser.add_argument('--precise-model', default=DEFAULT_PRECISE_MODEL,
                        help='Path to Precise .net model file')
    parser.add_argument('--precise-sensitivity', type=float,
                        default=DEFAULT_PRECISE_SENSITIVITY,
                        help='Precise sensitivity threshold (0.0-1.0, default: 0.5)')
    parser.add_argument('--precise-engine', default=DEFAULT_PRECISE_ENGINE,
                        help=f'Path to precise-engine binary (default: {DEFAULT_PRECISE_ENGINE})')
    args = parser.parse_args()

    # Validate HA configuration
    if not args.ha_token:
        print("Warning: No Home Assistant token provided!")
        print("Set HA_TOKEN environment variable or use --ha-token")
        print("Commands will not execute without authentication.")

    # Initialize global clients
    global ha_client
    ha_client = HomeAssistantClient(args.ha_url, args.ha_token)

    # Load Whisper model
    print(f"Starting voice processing server on {args.host}:{args.port}")
    load_whisper_model(args.whisper_model)

    # Start Precise if enabled
    if args.enable_precise:
        if not PRECISE_AVAILABLE:
            print("Error: --enable-precise specified but Mycroft Precise not installed")
            print("Install with: pip install mycroft-precise pyaudio")
            sys.exit(1)

        if not args.precise_model:
            print("Error: --enable-precise requires --precise-model")
            sys.exit(1)

        print("\nStarting Mycroft Precise wake word detection...")
        precise_result = start_precise_listener(
            args.precise_model,
            args.precise_sensitivity,
            args.precise_engine
        )

        if not precise_result:
            print("Error: Failed to start Precise listener")
            sys.exit(1)

        print("\nWake word detection active!")
        print("The server will detect wake words and queue them for processing.")
        print("Use /wake-word/detections endpoint to check for detections.\n")

    # Start Flask server
    try:
        app.run(host=args.host, port=args.port, debug=False)
    except KeyboardInterrupt:
        print("\nShutting down...")
        if args.enable_precise:
            stop_precise_listener()
        sys.exit(0)


if __name__ == '__main__':
    main()