minerva/scripts/voice_server.py
pyr0ball 173f7f37d4 feat: import mycroft-precise work as Minerva foundation
Ports prior voice assistant research and prototypes from devl/Devops
into the Minerva repo. Includes:

- docs/: architecture, wake word guides, ESP32-S3 spec, hardware buying guide
- scripts/: voice_server.py, voice_server_enhanced.py, setup scripts
- hardware/maixduino/: edge device scripts with WiFi credentials scrubbed
  (replaced hardcoded password with secrets.py pattern)
- config/.env.example: server config template
- .gitignore: excludes .env, secrets.py, model blobs, ELF firmware
- CLAUDE.md: Minerva product context and connection to cf-voice roadmap
2026-04-06 22:21:12 -07:00

700 lines
21 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Voice Processing Server for Maix Duino Voice Assistant
Purpose and usage:
This server runs on Heimdall (10.1.10.71) and handles:
- Audio stream reception from Maix Duino
- Speech-to-text using Whisper
- Intent recognition and Home Assistant API calls
- Text-to-speech using Piper
- Audio response streaming back to device
Path: /home/alan/voice-assistant/voice_server.py
Requirements:
- whisper (already installed)
- piper-tts
- flask
- requests
- python-dotenv
Usage:
python3 voice_server.py [--host HOST] [--port PORT] [--ha-url URL]
"""
import argparse
import io
import os
import queue
import re
import sys
import tempfile
import threading
import time
import wave
from pathlib import Path
from typing import Optional, Dict, Any, Tuple

import requests
import whisper
from flask import Flask, request, jsonify, send_file
from werkzeug.exceptions import BadRequest
# Optionally load a .env file so settings like HA_TOKEN can live outside the
# shell environment; fall back to plain environment variables if unavailable.
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    print("Warning: python-dotenv not installed. Using environment variables only.")
# Optional wake-word stack: server-side detection is disabled (but the rest of
# the server still works) when Mycroft Precise / PyAudio are not installed.
PRECISE_AVAILABLE = False
try:
    from precise_runner import PreciseEngine, PreciseRunner
    import pyaudio
    PRECISE_AVAILABLE = True
except ImportError:
    print("Warning: Mycroft Precise not installed. Wake word detection disabled.")
    print("Install with: pip install mycroft-precise pyaudio")
# Configuration: environment variables override the hardcoded fallbacks.
DEFAULT_HOST = "0.0.0.0"
DEFAULT_PORT = 5000
DEFAULT_WHISPER_MODEL = "medium"
# Home Assistant endpoint and long-lived access token.
DEFAULT_HA_URL = os.getenv("HA_URL", "http://homeassistant.local:8123")
DEFAULT_HA_TOKEN = os.getenv("HA_TOKEN", "")
# Mycroft Precise wake-word settings: .net model path, detection threshold
# (0.0-1.0), and the location of the precise-engine binary.
DEFAULT_PRECISE_MODEL = os.getenv("PRECISE_MODEL", "")
DEFAULT_PRECISE_SENSITIVITY = float(os.getenv("PRECISE_SENSITIVITY", "0.5"))
DEFAULT_PRECISE_ENGINE = "/usr/local/bin/precise-engine"
# Initialize Flask app
app = Flask(__name__)
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB max audio file
# Global variables for loaded models
whisper_model = None  # lazily loaded by load_whisper_model()
ha_client = None  # HomeAssistantClient, initialized in main()
precise_runner = None  # PreciseRunner instance while wake-word detection runs
precise_enabled = False  # True once the Precise listener has started
wake_word_queue = queue.Queue() # Queue for wake word detections
class HomeAssistantClient:
    """Thin wrapper around the Home Assistant REST API.

    Keeps one authenticated `requests.Session` for connection reuse. Every
    request carries a bounded timeout so a hung Home Assistant instance
    cannot stall the voice pipeline indefinitely (previously requests could
    block forever).
    """

    def __init__(self, base_url: str, token: str, timeout: float = 10.0):
        """
        Args:
            base_url: Home Assistant base URL; a trailing slash is stripped.
            token: Long-lived access token for the Authorization header.
            timeout: Per-request timeout in seconds (backward-compatible
                addition; requests.Timeout is a RequestException, so timeouts
                are handled like any other request failure).
        """
        self.base_url = base_url.rstrip('/')
        self.token = token
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'Authorization': f'Bearer {token}',
            'Content-Type': 'application/json'
        })

    def get_state(self, entity_id: str) -> Optional[Dict[str, Any]]:
        """Return the state dict for an entity, or None on any request error."""
        try:
            response = self.session.get(
                f'{self.base_url}/api/states/{entity_id}',
                timeout=self.timeout
            )
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            print(f"Error getting state for {entity_id}: {e}")
            return None

    def call_service(self, domain: str, service: str, entity_id: str,
                     **kwargs) -> bool:
        """Call a Home Assistant service; extra kwargs become service data.

        Returns True on HTTP success, False on any request/HTTP error.
        """
        try:
            data = {'entity_id': entity_id}
            data.update(kwargs)
            response = self.session.post(
                f'{self.base_url}/api/services/{domain}/{service}',
                json=data,
                timeout=self.timeout
            )
            response.raise_for_status()
            return True
        except requests.RequestException as e:
            print(f"Error calling service {domain}.{service}: {e}")
            return False

    def turn_on(self, entity_id: str, **kwargs) -> bool:
        """Turn on an entity (service domain inferred from the entity id)."""
        domain = entity_id.split('.')[0]
        return self.call_service(domain, 'turn_on', entity_id, **kwargs)

    def turn_off(self, entity_id: str, **kwargs) -> bool:
        """Turn off an entity (service domain inferred from the entity id)."""
        domain = entity_id.split('.')[0]
        return self.call_service(domain, 'turn_off', entity_id, **kwargs)

    def toggle(self, entity_id: str, **kwargs) -> bool:
        """Toggle an entity (service domain inferred from the entity id)."""
        domain = entity_id.split('.')[0]
        return self.call_service(domain, 'toggle', entity_id, **kwargs)
class IntentParser:
    """Simple pattern-based intent recognition.

    Matches lowercased utterances against regex templates, then maps the
    captured friendly entity name to a Home Assistant entity id.
    """

    # Intent patterns (can be expanded or replaced with ML-based NLU).
    # Note: the optional article group `(the )?` captures a TRAILING SPACE
    # when present, so candidates must be stripped before filtering.
    PATTERNS = {
        'turn_on': [
            r'turn on (the )?(.+)',
            r'switch on (the )?(.+)',
            r'enable (the )?(.+)',
        ],
        'turn_off': [
            r'turn off (the )?(.+)',
            r'switch off (the )?(.+)',
            r'disable (the )?(.+)',
        ],
        'toggle': [
            r'toggle (the )?(.+)',
        ],
        'get_state': [
            r'what(?:\'s| is) (the )?(.+)',
            r'how is (the )?(.+)',
            r'status of (the )?(.+)',
        ],
        'get_temperature': [
            r'what(?:\'s| is) the temperature',
            r'how (?:warm|cold|hot) is it',
        ],
    }

    # Entity name mapping (friendly names to entity IDs)
    ENTITY_MAP = {
        'living room light': 'light.living_room',
        'living room lights': 'light.living_room',
        'bedroom light': 'light.bedroom',
        'bedroom lights': 'light.bedroom',
        'kitchen light': 'light.kitchen',
        'kitchen lights': 'light.kitchen',
        'all lights': 'group.all_lights',
        'temperature': 'sensor.temperature',
        'thermostat': 'climate.thermostat',
    }

    def parse(self, text: str) -> Optional[Tuple[str, str, Dict[str, Any]]]:
        """
        Parse text into intent, entity, and parameters.

        Returns:
            (intent, entity_id, params) or None if no match
        """
        text = text.lower().strip()
        for intent, patterns in self.PATTERNS.items():
            for pattern in patterns:
                match = re.match(pattern, text, re.IGNORECASE)
                if not match:
                    continue
                # Extract the entity name: first captured group that is not
                # an article. BUGFIX: strip each candidate BEFORE the article
                # check — the `(the )?` group captures "the " with a trailing
                # space, which previously slipped past the filter and became
                # the entity name, so "turn on the kitchen light" never
                # resolved to an entity.
                entity_name = None
                for group in match.groups():
                    if not group:
                        continue
                    candidate = group.lower().strip()
                    if candidate and candidate not in ('the', 'a', 'an'):
                        entity_name = candidate
                        break
                # Map the friendly name to a Home Assistant entity id.
                entity_id = None
                if entity_name:
                    entity_id = self.ENTITY_MAP.get(entity_name)
                # get_temperature has no capture groups; use the default sensor.
                if intent == 'get_temperature':
                    entity_id = self.ENTITY_MAP.get('temperature')
                if entity_id:
                    return (intent, entity_id, {})
        return None
def load_whisper_model(model_name: str = DEFAULT_WHISPER_MODEL):
    """Return the shared Whisper model, loading it on first use.

    Subsequent calls are no-ops that return the cached instance, so the
    model name only matters on the very first call.
    """
    global whisper_model
    if whisper_model is not None:
        return whisper_model
    print(f"Loading Whisper model: {model_name}")
    whisper_model = whisper.load_model(model_name)
    print("Whisper model loaded successfully")
    return whisper_model
def transcribe_audio(audio_file_path: str) -> Optional[str]:
    """Run Whisper speech-to-text on a WAV file.

    Returns the stripped transcript, or None if loading or transcription
    fails for any reason (the error is printed, not raised).
    """
    try:
        model = load_whisper_model()
        return model.transcribe(audio_file_path)['text'].strip()
    except Exception as exc:
        print(f"Error transcribing audio: {exc}")
        return None
def generate_tts(text: str) -> Optional[bytes]:
    """Generate speech audio for *text* using Piper TTS.

    TODO: Implement Piper TTS integration
    For now, returns None - implement based on Piper installation
    """
    # Placeholder: log the request so callers can see TTS was attempted.
    print(f"TTS requested for: {text}")
    # You'll need to add Piper TTS integration here
    # Example command: piper --model <model> --output_file <file> < text
    return None
def on_wake_word_detected():
    """Activation callback invoked by the Precise runner.

    Runs on the Precise listener thread: records the detection on the
    shared queue so the main application can start recording and
    processing the user's command.
    """
    print("Wake word detected by Precise!")
    detection = {
        'timestamp': time.time(),
        'source': 'precise',
    }
    wake_word_queue.put(detection)
def start_precise_listener(model_path: str, sensitivity: float = 0.5,
                           engine_path: str = DEFAULT_PRECISE_ENGINE):
    """
    Start Mycroft Precise wake word detection.

    Args:
        model_path: Path to .net model file
        sensitivity: Detection threshold (0.0-1.0, default 0.5)
        engine_path: Path to precise-engine binary

    Returns:
        PreciseRunner instance if successful, None otherwise
    """
    global precise_runner, precise_enabled
    # Guard clauses: bail out early on any missing prerequisite.
    if not PRECISE_AVAILABLE:
        print("Error: Mycroft Precise not available")
        return None
    if not os.path.exists(model_path):
        print(f"Error: Precise model not found: {model_path}")
        return None
    if not os.path.exists(engine_path):
        print(f"Error: precise-engine not found: {engine_path}")
        print("Download from: https://github.com/MycroftAI/mycroft-precise/releases")
        return None
    try:
        engine = PreciseEngine(engine_path, model_path)
        precise_runner = PreciseRunner(
            engine,
            sensitivity=sensitivity,
            on_activation=on_wake_word_detected,
        )
        precise_runner.start()
        precise_enabled = True
        print(f"Precise listening started:")
        print(f" Model: {model_path}")
        print(f" Sensitivity: {sensitivity}")
        print(f" Engine: {engine_path}")
        return precise_runner
    except Exception as exc:
        print(f"Error starting Precise: {exc}")
        return None
def stop_precise_listener():
    """Stop Mycroft Precise wake word detection (no-op when not running)."""
    global precise_runner, precise_enabled
    if not precise_runner:
        return
    try:
        precise_runner.stop()
        precise_enabled = False
        print("Precise listener stopped")
    except Exception as exc:
        print(f"Error stopping Precise: {exc}")
def record_audio_after_wake(duration: int = 5) -> Optional[bytes]:
    """
    Record audio after wake word is detected.

    Args:
        duration: Maximum recording duration in seconds

    Returns:
        WAV audio data or None

    Note: This is for server-side wake word detection where
    the server is also doing audio capture. For Maix Duino
    client-side wake detection, audio comes from the client.
    """
    if not PRECISE_AVAILABLE:
        return None
    # Capture settings: 16 kHz mono 16-bit PCM.
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000
    try:
        p = pyaudio.PyAudio()
        try:
            stream = p.open(
                format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK
            )
            try:
                print(f"Recording for {duration} seconds...")
                frames = []
                for _ in range(0, int(RATE / CHUNK * duration)):
                    frames.append(stream.read(CHUNK))
            finally:
                # FIX: previously a failed read() leaked the open stream and
                # the PyAudio instance; always release them.
                stream.stop_stream()
                stream.close()
        finally:
            p.terminate()
        # Wrap the raw PCM frames in an in-memory WAV container.
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(pyaudio.get_sample_size(FORMAT))
            wf.setframerate(RATE)
            wf.writeframes(b''.join(frames))
        return wav_buffer.getvalue()
    except Exception as e:
        print(f"Error recording audio: {e}")
        return None
import time # Add this import at the top if not already there
def execute_intent(intent: str, entity_id: str, params: Dict[str, Any]) -> str:
    """Execute a parsed intent against Home Assistant.

    Returns human-readable response text suitable for TTS. `params` is
    accepted for future use but not currently forwarded.
    """
    # Friendly name for responses: entity id suffix with underscores spaced.
    friendly = entity_id.split('.')[-1].replace('_', ' ')
    if intent == 'turn_on':
        if ha_client.turn_on(entity_id):
            return f"Turned on {friendly}"
        return "Sorry, I couldn't turn that on"
    if intent == 'turn_off':
        if ha_client.turn_off(entity_id):
            return f"Turned off {friendly}"
        return "Sorry, I couldn't turn that off"
    if intent == 'toggle':
        if ha_client.toggle(entity_id):
            return f"Toggled {friendly}"
        return "Sorry, I couldn't toggle that"
    if intent in ('get_state', 'get_temperature'):
        state = ha_client.get_state(entity_id)
        if not state:
            return "Sorry, I couldn't get that information"
        value = state.get('state', 'unknown')
        unit = state.get('attributes', {}).get('unit_of_measurement', '')
        return f"The {friendly} is {value} {unit}".strip()
    return "I didn't understand that command"
# Flask routes
@app.route('/health', methods=['GET'])
def health():
    """Health check: report model/client load state and Precise status."""
    status = {
        'status': 'healthy',
        'whisper_loaded': whisper_model is not None,
        'ha_connected': ha_client is not None,
        'precise_enabled': precise_enabled,
        'precise_available': PRECISE_AVAILABLE
    }
    return jsonify(status)
@app.route('/wake-word/status', methods=['GET'])
def wake_word_status():
    """Get wake word detection status."""
    payload = {
        'enabled': precise_enabled,
        'available': PRECISE_AVAILABLE,
        'model': None,
        'sensitivity': None,
    }
    # Model/sensitivity are only meaningful while the listener is running.
    if precise_enabled:
        payload['model'] = DEFAULT_PRECISE_MODEL
        payload['sensitivity'] = DEFAULT_PRECISE_SENSITIVITY
    return jsonify(payload)
@app.route('/wake-word/detections', methods=['GET'])
def wake_word_detections():
    """
    Get recent wake word detections (non-blocking).

    Drains and returns every detection currently queued by the Precise
    callback thread. Used for testing and monitoring.
    """
    detections = []
    # Drain with get_nowait() alone. The old empty()/get_nowait() pair was
    # racy: empty() can change between the check and the get when the
    # Precise callback thread is producing concurrently.
    while True:
        try:
            detections.append(wake_word_queue.get_nowait())
        except queue.Empty:
            break
    return jsonify({
        'detections': detections,
        'count': len(detections)
    })
@app.route('/transcribe', methods=['POST'])
def transcribe():
    """
    Transcribe audio file.

    Expects: WAV audio file in request body
    Returns: JSON with transcribed text
    """
    if 'audio' not in request.files:
        raise BadRequest('No audio file provided')
    upload = request.files['audio']
    # Whisper needs a real file on disk; stage the upload in a temp WAV.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        upload.save(tmp.name)
        temp_path = tmp.name
    try:
        text = transcribe_audio(temp_path)
        if not text:
            return jsonify({
                'success': False,
                'error': 'Transcription failed'
            }), 500
        return jsonify({
            'success': True,
            'text': text
        })
    finally:
        # Always remove the staged temp file.
        if os.path.exists(temp_path):
            os.remove(temp_path)
@app.route('/process', methods=['POST'])
def process():
    """
    Process complete voice command.

    Expects: WAV audio file in request body
    Returns: JSON with response and audio file
    """
    if 'audio' not in request.files:
        raise BadRequest('No audio file provided')
    upload = request.files['audio']
    # Stage the uploaded audio on disk for Whisper.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        upload.save(tmp.name)
        temp_path = tmp.name
    try:
        # Step 1: speech-to-text
        text = transcribe_audio(temp_path)
        if not text:
            return jsonify({
                'success': False,
                'error': 'Transcription failed'
            }), 500
        print(f"Transcribed: {text}")
        # Step 2: intent recognition
        intent_result = IntentParser().parse(text)
        if intent_result:
            intent, entity_id, params = intent_result
            print(f"Intent: {intent}, Entity: {entity_id}")
            # Step 3: execute against Home Assistant
            response_text = execute_intent(intent, entity_id, params)
        else:
            response_text = "I didn't understand that command"
        print(f"Response: {response_text}")
        # Step 4: Generate TTS (placeholder for now)
        # audio_response = generate_tts(response_text)
        return jsonify({
            'success': True,
            'transcription': text,
            'response': response_text,
            # 'audio_available': audio_response is not None
        })
    finally:
        # Always remove the staged temp file.
        if os.path.exists(temp_path):
            os.remove(temp_path)
@app.route('/tts', methods=['POST'])
def tts():
    """
    Generate TTS audio.

    Expects: JSON with 'text' field
    Returns: WAV audio file
    """
    payload = request.get_json()
    if not payload or 'text' not in payload:
        raise BadRequest('No text provided')
    audio_data = generate_tts(payload['text'])
    # generate_tts is still a placeholder; report 501 until implemented.
    if audio_data:
        return send_file(
            io.BytesIO(audio_data),
            mimetype='audio/wav',
            as_attachment=True,
            download_name='response.wav'
        )
    return jsonify({
        'success': False,
        'error': 'TTS generation not implemented yet'
    }), 501
def main():
    """Parse CLI arguments, initialize clients and models, optionally start
    Precise wake-word detection, then run the Flask server.

    The Precise listener (a background thread) is now stopped on ANY exit
    path via try/finally — previously it was only stopped on
    KeyboardInterrupt, leaking the listener if the server died any other way.
    """
    parser = argparse.ArgumentParser(
        description="Voice Processing Server for Maix Duino Voice Assistant"
    )
    parser.add_argument('--host', default=DEFAULT_HOST,
                        help=f'Server host (default: {DEFAULT_HOST})')
    parser.add_argument('--port', type=int, default=DEFAULT_PORT,
                        help=f'Server port (default: {DEFAULT_PORT})')
    parser.add_argument('--whisper-model', default=DEFAULT_WHISPER_MODEL,
                        help=f'Whisper model to use (default: {DEFAULT_WHISPER_MODEL})')
    parser.add_argument('--ha-url', default=DEFAULT_HA_URL,
                        help=f'Home Assistant URL (default: {DEFAULT_HA_URL})')
    parser.add_argument('--ha-token', default=DEFAULT_HA_TOKEN,
                        help='Home Assistant long-lived access token')
    parser.add_argument('--enable-precise', action='store_true',
                        help='Enable Mycroft Precise wake word detection')
    parser.add_argument('--precise-model', default=DEFAULT_PRECISE_MODEL,
                        help='Path to Precise .net model file')
    parser.add_argument('--precise-sensitivity', type=float,
                        default=DEFAULT_PRECISE_SENSITIVITY,
                        help='Precise sensitivity threshold (0.0-1.0, default: 0.5)')
    parser.add_argument('--precise-engine', default=DEFAULT_PRECISE_ENGINE,
                        help=f'Path to precise-engine binary (default: {DEFAULT_PRECISE_ENGINE})')
    args = parser.parse_args()
    # Warn early: without a token every HA service call will fail auth.
    if not args.ha_token:
        print("Warning: No Home Assistant token provided!")
        print("Set HA_TOKEN environment variable or use --ha-token")
        print("Commands will not execute without authentication.")
    # Initialize the shared Home Assistant client used by execute_intent().
    global ha_client
    ha_client = HomeAssistantClient(args.ha_url, args.ha_token)
    # Load Whisper up front so the first request isn't slow.
    print(f"Starting voice processing server on {args.host}:{args.port}")
    load_whisper_model(args.whisper_model)
    # Start Precise if enabled
    if args.enable_precise:
        if not PRECISE_AVAILABLE:
            print("Error: --enable-precise specified but Mycroft Precise not installed")
            print("Install with: pip install mycroft-precise pyaudio")
            sys.exit(1)
        if not args.precise_model:
            print("Error: --enable-precise requires --precise-model")
            sys.exit(1)
        print("\nStarting Mycroft Precise wake word detection...")
        precise_result = start_precise_listener(
            args.precise_model,
            args.precise_sensitivity,
            args.precise_engine
        )
        if not precise_result:
            print("Error: Failed to start Precise listener")
            sys.exit(1)
        print("\nWake word detection active!")
        print("The server will detect wake words and queue them for processing.")
        print("Use /wake-word/detections endpoint to check for detections.\n")
    # Run the server; guarantee wake-word cleanup on every exit path.
    try:
        app.run(host=args.host, port=args.port, debug=False)
    except KeyboardInterrupt:
        print("\nShutting down...")
    finally:
        # FIX: stop the Precise background thread even when the server exits
        # for a reason other than Ctrl-C (startup error, normal return).
        if args.enable_precise:
            stop_precise_listener()
if __name__ == '__main__':
main()