minerva/scripts/voice_server.py
pyr0ball 173f7f37d4 feat: import mycroft-precise work as Minerva foundation
Ports prior voice assistant research and prototypes from devl/Devops
into the Minerva repo. Includes:

- docs/: architecture, wake word guides, ESP32-S3 spec, hardware buying guide
- scripts/: voice_server.py, voice_server_enhanced.py, setup scripts
- hardware/maixduino/: edge device scripts with WiFi credentials scrubbed
  (replaced hardcoded password with secrets.py pattern)
- config/.env.example: server config template
- .gitignore: excludes .env, secrets.py, model blobs, ELF firmware
- CLAUDE.md: Minerva product context and connection to cf-voice roadmap
2026-04-06 22:21:12 -07:00

700 lines
21 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Voice Processing Server for Maix Duino Voice Assistant
Purpose and usage:
This server runs on Heimdall (10.1.10.71) and handles:
- Audio stream reception from Maix Duino
- Speech-to-text using Whisper
- Intent recognition and Home Assistant API calls
- Text-to-speech using Piper
- Audio response streaming back to device
Path: /home/alan/voice-assistant/voice_server.py
Requirements:
- whisper (already installed)
- piper-tts
- flask
- requests
- python-dotenv
Usage:
python3 voice_server.py [--host HOST] [--port PORT] [--ha-url URL]
"""
import argparse
import io
import os
import queue
import re
import sys
import tempfile
import threading
import time
import wave
from pathlib import Path
from typing import Optional, Dict, Any, Tuple

import requests
import whisper
from flask import Flask, request, jsonify, send_file
from werkzeug.exceptions import BadRequest
# Optionally load a .env file so settings like HA_TOKEN can live outside the
# shell environment; fall back to plain environment variables if unavailable.
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    print("Warning: python-dotenv not installed. Using environment variables only.")
# Optional wake-word stack: server-side detection is disabled (but the rest of
# the server still works) when Mycroft Precise / PyAudio are not installed.
PRECISE_AVAILABLE = False
try:
    from precise_runner import PreciseEngine, PreciseRunner
    import pyaudio
    PRECISE_AVAILABLE = True
except ImportError:
    print("Warning: Mycroft Precise not installed. Wake word detection disabled.")
    print("Install with: pip install mycroft-precise pyaudio")
# Configuration: environment variables override the hardcoded fallbacks.
DEFAULT_HOST = "0.0.0.0"
DEFAULT_PORT = 5000
DEFAULT_WHISPER_MODEL = "medium"
# Home Assistant endpoint and long-lived access token.
DEFAULT_HA_URL = os.getenv("HA_URL", "http://homeassistant.local:8123")
DEFAULT_HA_TOKEN = os.getenv("HA_TOKEN", "")
# Mycroft Precise wake-word settings: .net model path, detection threshold
# (0.0-1.0), and the location of the precise-engine binary.
DEFAULT_PRECISE_MODEL = os.getenv("PRECISE_MODEL", "")
DEFAULT_PRECISE_SENSITIVITY = float(os.getenv("PRECISE_SENSITIVITY", "0.5"))
DEFAULT_PRECISE_ENGINE = "/usr/local/bin/precise-engine"
# Initialize Flask app
app = Flask(__name__)
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB max audio file
# Global variables for loaded models
whisper_model = None  # lazily loaded by load_whisper_model()
ha_client = None  # HomeAssistantClient, initialized in main()
precise_runner = None  # PreciseRunner instance while wake-word detection runs
precise_enabled = False  # True once the Precise listener has started
wake_word_queue = queue.Queue() # Queue for wake word detections
class HomeAssistantClient:
    """Thin wrapper around the Home Assistant REST API.

    Keeps one authenticated `requests.Session` for connection reuse. Every
    request carries a bounded timeout so a hung Home Assistant instance
    cannot stall the voice pipeline indefinitely (previously requests could
    block forever).
    """

    def __init__(self, base_url: str, token: str, timeout: float = 10.0):
        """
        Args:
            base_url: Home Assistant base URL; a trailing slash is stripped.
            token: Long-lived access token for the Authorization header.
            timeout: Per-request timeout in seconds (backward-compatible
                addition; requests.Timeout is a RequestException, so timeouts
                are handled like any other request failure).
        """
        self.base_url = base_url.rstrip('/')
        self.token = token
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'Authorization': f'Bearer {token}',
            'Content-Type': 'application/json'
        })

    def get_state(self, entity_id: str) -> Optional[Dict[str, Any]]:
        """Return the state dict for an entity, or None on any request error."""
        try:
            response = self.session.get(
                f'{self.base_url}/api/states/{entity_id}',
                timeout=self.timeout
            )
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            print(f"Error getting state for {entity_id}: {e}")
            return None

    def call_service(self, domain: str, service: str, entity_id: str,
                     **kwargs) -> bool:
        """Call a Home Assistant service; extra kwargs become service data.

        Returns True on HTTP success, False on any request/HTTP error.
        """
        try:
            data = {'entity_id': entity_id}
            data.update(kwargs)
            response = self.session.post(
                f'{self.base_url}/api/services/{domain}/{service}',
                json=data,
                timeout=self.timeout
            )
            response.raise_for_status()
            return True
        except requests.RequestException as e:
            print(f"Error calling service {domain}.{service}: {e}")
            return False

    def turn_on(self, entity_id: str, **kwargs) -> bool:
        """Turn on an entity (service domain inferred from the entity id)."""
        domain = entity_id.split('.')[0]
        return self.call_service(domain, 'turn_on', entity_id, **kwargs)

    def turn_off(self, entity_id: str, **kwargs) -> bool:
        """Turn off an entity (service domain inferred from the entity id)."""
        domain = entity_id.split('.')[0]
        return self.call_service(domain, 'turn_off', entity_id, **kwargs)

    def toggle(self, entity_id: str, **kwargs) -> bool:
        """Toggle an entity (service domain inferred from the entity id)."""
        domain = entity_id.split('.')[0]
        return self.call_service(domain, 'toggle', entity_id, **kwargs)
class IntentParser:
    """Simple pattern-based intent recognition.

    Matches lowercased utterances against regex templates, then maps the
    captured friendly entity name to a Home Assistant entity id.
    """

    # Intent patterns (can be expanded or replaced with ML-based NLU).
    # Note: the optional article group `(the )?` captures a TRAILING SPACE
    # when present, so candidates must be stripped before filtering.
    PATTERNS = {
        'turn_on': [
            r'turn on (the )?(.+)',
            r'switch on (the )?(.+)',
            r'enable (the )?(.+)',
        ],
        'turn_off': [
            r'turn off (the )?(.+)',
            r'switch off (the )?(.+)',
            r'disable (the )?(.+)',
        ],
        'toggle': [
            r'toggle (the )?(.+)',
        ],
        'get_state': [
            r'what(?:\'s| is) (the )?(.+)',
            r'how is (the )?(.+)',
            r'status of (the )?(.+)',
        ],
        'get_temperature': [
            r'what(?:\'s| is) the temperature',
            r'how (?:warm|cold|hot) is it',
        ],
    }

    # Entity name mapping (friendly names to entity IDs)
    ENTITY_MAP = {
        'living room light': 'light.living_room',
        'living room lights': 'light.living_room',
        'bedroom light': 'light.bedroom',
        'bedroom lights': 'light.bedroom',
        'kitchen light': 'light.kitchen',
        'kitchen lights': 'light.kitchen',
        'all lights': 'group.all_lights',
        'temperature': 'sensor.temperature',
        'thermostat': 'climate.thermostat',
    }

    def parse(self, text: str) -> Optional[Tuple[str, str, Dict[str, Any]]]:
        """
        Parse text into intent, entity, and parameters.

        Returns:
            (intent, entity_id, params) or None if no match
        """
        text = text.lower().strip()
        for intent, patterns in self.PATTERNS.items():
            for pattern in patterns:
                match = re.match(pattern, text, re.IGNORECASE)
                if not match:
                    continue
                # Extract the entity name: first captured group that is not
                # an article. BUGFIX: strip each candidate BEFORE the article
                # check — the `(the )?` group captures "the " with a trailing
                # space, which previously slipped past the filter and became
                # the entity name, so "turn on the kitchen light" never
                # resolved to an entity.
                entity_name = None
                for group in match.groups():
                    if not group:
                        continue
                    candidate = group.lower().strip()
                    if candidate and candidate not in ('the', 'a', 'an'):
                        entity_name = candidate
                        break
                # Map the friendly name to a Home Assistant entity id.
                entity_id = None
                if entity_name:
                    entity_id = self.ENTITY_MAP.get(entity_name)
                # get_temperature has no capture groups; use the default sensor.
                if intent == 'get_temperature':
                    entity_id = self.ENTITY_MAP.get('temperature')
                if entity_id:
                    return (intent, entity_id, {})
        return None
def load_whisper_model(model_name: str = DEFAULT_WHISPER_MODEL):
    """Return the shared Whisper model, loading it on first use.

    Subsequent calls are no-ops that return the cached instance, so the
    model name only matters on the very first call.
    """
    global whisper_model
    if whisper_model is not None:
        return whisper_model
    print(f"Loading Whisper model: {model_name}")
    whisper_model = whisper.load_model(model_name)
    print("Whisper model loaded successfully")
    return whisper_model
def transcribe_audio(audio_file_path: str) -> Optional[str]:
    """Run Whisper speech-to-text on a WAV file.

    Returns the stripped transcript, or None if loading or transcription
    fails for any reason (the error is printed, not raised).
    """
    try:
        model = load_whisper_model()
        return model.transcribe(audio_file_path)['text'].strip()
    except Exception as exc:
        print(f"Error transcribing audio: {exc}")
        return None
def generate_tts(text: str) -> Optional[bytes]:
    """Generate speech audio for *text* using Piper TTS.

    TODO: Implement Piper TTS integration
    For now, returns None - implement based on Piper installation
    """
    # Placeholder: log the request so callers can see TTS was attempted.
    print(f"TTS requested for: {text}")
    # You'll need to add Piper TTS integration here
    # Example command: piper --model <model> --output_file <file> < text
    return None
def on_wake_word_detected():
    """Activation callback invoked by the Precise runner.

    Runs on the Precise listener thread: records the detection on the
    shared queue so the main application can start recording and
    processing the user's command.
    """
    print("Wake word detected by Precise!")
    detection = {
        'timestamp': time.time(),
        'source': 'precise',
    }
    wake_word_queue.put(detection)
def start_precise_listener(model_path: str, sensitivity: float = 0.5,
                           engine_path: str = DEFAULT_PRECISE_ENGINE):
    """
    Start Mycroft Precise wake word detection.

    Args:
        model_path: Path to .net model file
        sensitivity: Detection threshold (0.0-1.0, default 0.5)
        engine_path: Path to precise-engine binary

    Returns:
        PreciseRunner instance if successful, None otherwise
    """
    global precise_runner, precise_enabled
    # Guard clauses: bail out early on any missing prerequisite.
    if not PRECISE_AVAILABLE:
        print("Error: Mycroft Precise not available")
        return None
    if not os.path.exists(model_path):
        print(f"Error: Precise model not found: {model_path}")
        return None
    if not os.path.exists(engine_path):
        print(f"Error: precise-engine not found: {engine_path}")
        print("Download from: https://github.com/MycroftAI/mycroft-precise/releases")
        return None
    try:
        engine = PreciseEngine(engine_path, model_path)
        precise_runner = PreciseRunner(
            engine,
            sensitivity=sensitivity,
            on_activation=on_wake_word_detected,
        )
        precise_runner.start()
        precise_enabled = True
        print(f"Precise listening started:")
        print(f" Model: {model_path}")
        print(f" Sensitivity: {sensitivity}")
        print(f" Engine: {engine_path}")
        return precise_runner
    except Exception as exc:
        print(f"Error starting Precise: {exc}")
        return None
def stop_precise_listener():
    """Stop Mycroft Precise wake word detection (no-op when not running)."""
    global precise_runner, precise_enabled
    if not precise_runner:
        return
    try:
        precise_runner.stop()
        precise_enabled = False
        print("Precise listener stopped")
    except Exception as exc:
        print(f"Error stopping Precise: {exc}")
def record_audio_after_wake(duration: int = 5) -> Optional[bytes]:
    """
    Record audio after wake word is detected.

    Args:
        duration: Maximum recording duration in seconds

    Returns:
        WAV audio data or None

    Note: This is for server-side wake word detection where
    the server is also doing audio capture. For Maix Duino
    client-side wake detection, audio comes from the client.
    """
    if not PRECISE_AVAILABLE:
        return None
    # Capture settings: 16 kHz mono 16-bit PCM.
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000
    try:
        p = pyaudio.PyAudio()
        try:
            stream = p.open(
                format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK
            )
            try:
                print(f"Recording for {duration} seconds...")
                frames = []
                for _ in range(0, int(RATE / CHUNK * duration)):
                    frames.append(stream.read(CHUNK))
            finally:
                # FIX: previously a failed read() leaked the open stream and
                # the PyAudio instance; always release them.
                stream.stop_stream()
                stream.close()
        finally:
            p.terminate()
        # Wrap the raw PCM frames in an in-memory WAV container.
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(pyaudio.get_sample_size(FORMAT))
            wf.setframerate(RATE)
            wf.writeframes(b''.join(frames))
        return wav_buffer.getvalue()
    except Exception as e:
        print(f"Error recording audio: {e}")
        return None
import time # Add this import at the top if not already there
def execute_intent(intent: str, entity_id: str, params: Dict[str, Any]) -> str:
    """Execute a parsed intent against Home Assistant.

    Returns human-readable response text suitable for TTS. `params` is
    accepted for future use but not currently forwarded.
    """
    # Friendly name for responses: entity id suffix with underscores spaced.
    friendly = entity_id.split('.')[-1].replace('_', ' ')
    if intent == 'turn_on':
        if ha_client.turn_on(entity_id):
            return f"Turned on {friendly}"
        return "Sorry, I couldn't turn that on"
    if intent == 'turn_off':
        if ha_client.turn_off(entity_id):
            return f"Turned off {friendly}"
        return "Sorry, I couldn't turn that off"
    if intent == 'toggle':
        if ha_client.toggle(entity_id):
            return f"Toggled {friendly}"
        return "Sorry, I couldn't toggle that"
    if intent in ('get_state', 'get_temperature'):
        state = ha_client.get_state(entity_id)
        if not state:
            return "Sorry, I couldn't get that information"
        value = state.get('state', 'unknown')
        unit = state.get('attributes', {}).get('unit_of_measurement', '')
        return f"The {friendly} is {value} {unit}".strip()
    return "I didn't understand that command"
# Flask routes
@app.route('/health', methods=['GET'])
def health():
    """Health check: report model/client load state and Precise status."""
    status = {
        'status': 'healthy',
        'whisper_loaded': whisper_model is not None,
        'ha_connected': ha_client is not None,
        'precise_enabled': precise_enabled,
        'precise_available': PRECISE_AVAILABLE
    }
    return jsonify(status)
@app.route('/wake-word/status', methods=['GET'])
def wake_word_status():
    """Get wake word detection status."""
    payload = {
        'enabled': precise_enabled,
        'available': PRECISE_AVAILABLE,
        'model': None,
        'sensitivity': None,
    }
    # Model/sensitivity are only meaningful while the listener is running.
    if precise_enabled:
        payload['model'] = DEFAULT_PRECISE_MODEL
        payload['sensitivity'] = DEFAULT_PRECISE_SENSITIVITY
    return jsonify(payload)
@app.route('/wake-word/detections', methods=['GET'])
def wake_word_detections():
    """
    Get recent wake word detections (non-blocking).

    Drains and returns every detection currently queued by the Precise
    callback thread. Used for testing and monitoring.
    """
    detections = []
    # Drain with get_nowait() alone. The old empty()/get_nowait() pair was
    # racy: empty() can change between the check and the get when the
    # Precise callback thread is producing concurrently.
    while True:
        try:
            detections.append(wake_word_queue.get_nowait())
        except queue.Empty:
            break
    return jsonify({
        'detections': detections,
        'count': len(detections)
    })
@app.route('/transcribe', methods=['POST'])
def transcribe():
    """
    Transcribe audio file.

    Expects: WAV audio file in request body
    Returns: JSON with transcribed text
    """
    if 'audio' not in request.files:
        raise BadRequest('No audio file provided')
    upload = request.files['audio']
    # Whisper needs a real file on disk; stage the upload in a temp WAV.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        upload.save(tmp.name)
        temp_path = tmp.name
    try:
        text = transcribe_audio(temp_path)
        if not text:
            return jsonify({
                'success': False,
                'error': 'Transcription failed'
            }), 500
        return jsonify({
            'success': True,
            'text': text
        })
    finally:
        # Always remove the staged temp file.
        if os.path.exists(temp_path):
            os.remove(temp_path)
@app.route('/process', methods=['POST'])
def process():
    """
    Process complete voice command.

    Expects: WAV audio file in request body
    Returns: JSON with response and audio file
    """
    if 'audio' not in request.files:
        raise BadRequest('No audio file provided')
    upload = request.files['audio']
    # Stage the uploaded audio on disk for Whisper.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        upload.save(tmp.name)
        temp_path = tmp.name
    try:
        # Step 1: speech-to-text
        text = transcribe_audio(temp_path)
        if not text:
            return jsonify({
                'success': False,
                'error': 'Transcription failed'
            }), 500
        print(f"Transcribed: {text}")
        # Step 2: intent recognition
        intent_result = IntentParser().parse(text)
        if intent_result:
            intent, entity_id, params = intent_result
            print(f"Intent: {intent}, Entity: {entity_id}")
            # Step 3: execute against Home Assistant
            response_text = execute_intent(intent, entity_id, params)
        else:
            response_text = "I didn't understand that command"
        print(f"Response: {response_text}")
        # Step 4: Generate TTS (placeholder for now)
        # audio_response = generate_tts(response_text)
        return jsonify({
            'success': True,
            'transcription': text,
            'response': response_text,
            # 'audio_available': audio_response is not None
        })
    finally:
        # Always remove the staged temp file.
        if os.path.exists(temp_path):
            os.remove(temp_path)
@app.route('/tts', methods=['POST'])
def tts():
    """
    Generate TTS audio.

    Expects: JSON with 'text' field
    Returns: WAV audio file
    """
    payload = request.get_json()
    if not payload or 'text' not in payload:
        raise BadRequest('No text provided')
    audio_data = generate_tts(payload['text'])
    # generate_tts is still a placeholder; report 501 until implemented.
    if audio_data:
        return send_file(
            io.BytesIO(audio_data),
            mimetype='audio/wav',
            as_attachment=True,
            download_name='response.wav'
        )
    return jsonify({
        'success': False,
        'error': 'TTS generation not implemented yet'
    }), 501
def main():
    """Parse CLI arguments, initialize clients and models, optionally start
    Precise wake-word detection, then run the Flask server.

    The Precise listener (a background thread) is now stopped on ANY exit
    path via try/finally — previously it was only stopped on
    KeyboardInterrupt, leaking the listener if the server died any other way.
    """
    parser = argparse.ArgumentParser(
        description="Voice Processing Server for Maix Duino Voice Assistant"
    )
    parser.add_argument('--host', default=DEFAULT_HOST,
                        help=f'Server host (default: {DEFAULT_HOST})')
    parser.add_argument('--port', type=int, default=DEFAULT_PORT,
                        help=f'Server port (default: {DEFAULT_PORT})')
    parser.add_argument('--whisper-model', default=DEFAULT_WHISPER_MODEL,
                        help=f'Whisper model to use (default: {DEFAULT_WHISPER_MODEL})')
    parser.add_argument('--ha-url', default=DEFAULT_HA_URL,
                        help=f'Home Assistant URL (default: {DEFAULT_HA_URL})')
    parser.add_argument('--ha-token', default=DEFAULT_HA_TOKEN,
                        help='Home Assistant long-lived access token')
    parser.add_argument('--enable-precise', action='store_true',
                        help='Enable Mycroft Precise wake word detection')
    parser.add_argument('--precise-model', default=DEFAULT_PRECISE_MODEL,
                        help='Path to Precise .net model file')
    parser.add_argument('--precise-sensitivity', type=float,
                        default=DEFAULT_PRECISE_SENSITIVITY,
                        help='Precise sensitivity threshold (0.0-1.0, default: 0.5)')
    parser.add_argument('--precise-engine', default=DEFAULT_PRECISE_ENGINE,
                        help=f'Path to precise-engine binary (default: {DEFAULT_PRECISE_ENGINE})')
    args = parser.parse_args()
    # Warn early: without a token every HA service call will fail auth.
    if not args.ha_token:
        print("Warning: No Home Assistant token provided!")
        print("Set HA_TOKEN environment variable or use --ha-token")
        print("Commands will not execute without authentication.")
    # Initialize the shared Home Assistant client used by execute_intent().
    global ha_client
    ha_client = HomeAssistantClient(args.ha_url, args.ha_token)
    # Load Whisper up front so the first request isn't slow.
    print(f"Starting voice processing server on {args.host}:{args.port}")
    load_whisper_model(args.whisper_model)
    # Start Precise if enabled
    if args.enable_precise:
        if not PRECISE_AVAILABLE:
            print("Error: --enable-precise specified but Mycroft Precise not installed")
            print("Install with: pip install mycroft-precise pyaudio")
            sys.exit(1)
        if not args.precise_model:
            print("Error: --enable-precise requires --precise-model")
            sys.exit(1)
        print("\nStarting Mycroft Precise wake word detection...")
        precise_result = start_precise_listener(
            args.precise_model,
            args.precise_sensitivity,
            args.precise_engine
        )
        if not precise_result:
            print("Error: Failed to start Precise listener")
            sys.exit(1)
        print("\nWake word detection active!")
        print("The server will detect wake words and queue them for processing.")
        print("Use /wake-word/detections endpoint to check for detections.\n")
    # Run the server; guarantee wake-word cleanup on every exit path.
    try:
        app.run(host=args.host, port=args.port, debug=False)
    except KeyboardInterrupt:
        print("\nShutting down...")
    finally:
        # FIX: stop the Precise background thread even when the server exits
        # for a reason other than Ctrl-C (startup error, normal return).
        if args.enable_precise:
            stop_precise_listener()
if __name__ == '__main__':
main()