From 173f7f37d4c90335d944c5fb6946a693e2790f5a Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Mon, 6 Apr 2026 22:21:12 -0700 Subject: [PATCH] feat: import mycroft-precise work as Minerva foundation Ports prior voice assistant research and prototypes from devl/Devops into the Minerva repo. Includes: - docs/: architecture, wake word guides, ESP32-S3 spec, hardware buying guide - scripts/: voice_server.py, voice_server_enhanced.py, setup scripts - hardware/maixduino/: edge device scripts with WiFi credentials scrubbed (replaced hardcoded password with secrets.py pattern) - config/.env.example: server config template - .gitignore: excludes .env, secrets.py, model blobs, ELF firmware - CLAUDE.md: Minerva product context and connection to cf-voice roadmap --- .gitignore | 29 + CLAUDE.md | 165 +++ config/.env.example | 24 + docs/ADVANCED_WAKE_WORD_TOPICS.md | 905 ++++++++++++++ docs/ESP32_S3_VOICE_ASSISTANT_SPEC.md | 1089 +++++++++++++++++ docs/HARDWARE_BUYING_GUIDE.md | 542 ++++++++ docs/K210_PERFORMANCE_VERIFICATION.md | 223 ++++ docs/LCD_CAMERA_FEATURES.md | 566 +++++++++ docs/MYCROFT_PRECISE_GUIDE.md | 638 ++++++++++ docs/PRECISE_DEPLOYMENT.md | 577 +++++++++ docs/QUESTIONS_ANSWERED.md | 470 +++++++ docs/QUICKSTART.md | 421 +++++++ docs/WAKE_WORD_ADVANCED.md | 723 +++++++++++ docs/WAKE_WORD_QUICK_REF.md | 411 +++++++ docs/maix-voice-assistant-architecture.md | 347 ++++++ hardware/maixduino/MICROPYTHON_QUIRKS.md | 348 ++++++ hardware/maixduino/README.md | 184 +++ .../maixduino/SESSION_PROGRESS_2025-12-03.md | 376 ++++++ hardware/maixduino/maix_debug_wifi.py | 41 + hardware/maixduino/maix_discover_modules.py | 51 + hardware/maixduino/maix_simple_record_test.py | 461 +++++++ hardware/maixduino/maix_test_simple.py | 252 ++++ hardware/maixduino/maix_voice_client.py | 465 +++++++ hardware/maixduino/secrets.py.example | 7 + scripts/download_pretrained_models.sh | 409 +++++++ scripts/quick_start_hey_mycroft.sh | 456 +++++++ scripts/setup_precise.sh | 630 ++++++++++ 
scripts/setup_voice_assistant.sh | 429 +++++++ scripts/voice_server.py | 700 +++++++++++ scripts/voice_server_enhanced.py | 580 +++++++++ 30 files changed, 12519 insertions(+) create mode 100644 .gitignore create mode 100644 CLAUDE.md create mode 100644 config/.env.example create mode 100755 docs/ADVANCED_WAKE_WORD_TOPICS.md create mode 100755 docs/ESP32_S3_VOICE_ASSISTANT_SPEC.md create mode 100755 docs/HARDWARE_BUYING_GUIDE.md create mode 100755 docs/K210_PERFORMANCE_VERIFICATION.md create mode 100755 docs/LCD_CAMERA_FEATURES.md create mode 100755 docs/MYCROFT_PRECISE_GUIDE.md create mode 100755 docs/PRECISE_DEPLOYMENT.md create mode 100755 docs/QUESTIONS_ANSWERED.md create mode 100755 docs/QUICKSTART.md create mode 100755 docs/WAKE_WORD_ADVANCED.md create mode 100755 docs/WAKE_WORD_QUICK_REF.md create mode 100755 docs/maix-voice-assistant-architecture.md create mode 100755 hardware/maixduino/MICROPYTHON_QUIRKS.md create mode 100755 hardware/maixduino/README.md create mode 100755 hardware/maixduino/SESSION_PROGRESS_2025-12-03.md create mode 100755 hardware/maixduino/maix_debug_wifi.py create mode 100755 hardware/maixduino/maix_discover_modules.py create mode 100644 hardware/maixduino/maix_simple_record_test.py create mode 100644 hardware/maixduino/maix_test_simple.py create mode 100755 hardware/maixduino/maix_voice_client.py create mode 100644 hardware/maixduino/secrets.py.example create mode 100755 scripts/download_pretrained_models.sh create mode 100755 scripts/quick_start_hey_mycroft.sh create mode 100755 scripts/setup_precise.sh create mode 100755 scripts/setup_voice_assistant.sh create mode 100755 scripts/voice_server.py create mode 100755 scripts/voice_server_enhanced.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d27ccb5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,29 @@ +# Credentials +secrets.py +config/.env +*.env +!*.env.example + +# Models (large binary files) +models/*.pb +models/*.pb.params +models/*.net +models/*.tflite 
+models/*.kmodel + +# OEM firmware blobs +*.elf +*.7z +*.bin + +# Python +__pycache__/ +*.pyc +*.pyo + +# Logs +logs/ + +# IDE +.vscode/ +.idea/ diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..6a34638 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,165 @@ +# Minerva — Developer Context + +**Product code:** `MNRV` +**Status:** Concept / early prototype +**Domain:** Privacy-first, local-only voice assistant hardware platform + +--- + +## What Minerva Is + +A 100% local, FOSS voice assistant hardware platform. No cloud. No subscriptions. No data leaving the local network. + +The goal is a reference hardware + software stack for a privacy-first voice assistant that anyone can build, extend, or self-host — including people without technical backgrounds if the assembly docs are good enough. + +Core design principles (same as all CF products): +- **Local-first inference** — Whisper STT, Piper TTS, Mycroft Precise wake word all run on the host server +- **Edge where possible** — wake word detection moves to edge hardware over time (K210 → ESP32-S3 → custom) +- **No cloud dependency** — Home Assistant optional, not required +- **100% FOSS stack** + +--- + +## Hardware Targets + +### Phase 1 (current): Maix Duino (K210) +- K210 dual-core RISC-V @ 400MHz with KPU neural accelerator +- Audio: I2S microphone + speaker output +- Connectivity: ESP32 WiFi/BLE co-processor +- Programming: MaixPy (MicroPython) +- Status: server-side wake word working; edge inference in progress + +### Phase 2: ESP32-S3 +- More accessible, cheaper, better WiFi +- On-device wake word with Espressif ESP-SR +- See `docs/ESP32_S3_VOICE_ASSISTANT_SPEC.md` + +### Phase 3: Custom hardware +- Dedicated PCB for CF reference platform +- Hardware-accelerated wake word + VAD +- Designed for accessibility: large buttons, LED feedback, easy mounting + +--- + +## Software Stack + +### Edge device (Maix Duino / ESP32-S3) +- Firmware: MaixPy or ESP-IDF +- Client: 
`hardware/maixduino/maix_voice_client.py` +- Audio: I2S capture and playback +- Network: WiFi → Minerva server + +### Server (runs on Heimdall or any Linux box) +- Voice server: `scripts/voice_server.py` (Flask + Whisper + Precise) +- Enhanced version: `scripts/voice_server_enhanced.py` (adds speaker ID via pyannote) +- STT: Whisper (local) +- Wake word: Mycroft Precise +- TTS: Piper +- Home Assistant: REST API integration (optional) +- Conda env: `whisper_cli` (existing on Heimdall) + +--- + +## Directory Structure + +``` +minerva/ +├── docs/ # Architecture, guides, reference docs +│ ├── maix-voice-assistant-architecture.md +│ ├── MYCROFT_PRECISE_GUIDE.md +│ ├── PRECISE_DEPLOYMENT.md +│ ├── ESP32_S3_VOICE_ASSISTANT_SPEC.md +│ ├── HARDWARE_BUYING_GUIDE.md +│ ├── LCD_CAMERA_FEATURES.md +│ ├── K210_PERFORMANCE_VERIFICATION.md +│ ├── WAKE_WORD_ADVANCED.md +│ ├── ADVANCED_WAKE_WORD_TOPICS.md +│ └── QUESTIONS_ANSWERED.md +├── scripts/ # Server-side scripts +│ ├── voice_server.py # Core Flask + Whisper + Precise server +│ ├── voice_server_enhanced.py # + speaker identification (pyannote) +│ ├── setup_voice_assistant.sh # Server setup +│ ├── setup_precise.sh # Mycroft Precise training environment +│ └── download_pretrained_models.sh +├── hardware/ +│ └── maixduino/ # K210 edge device scripts +│ ├── maix_voice_client.py # Production client +│ ├── maix_simple_record_test.py # Audio capture test +│ ├── maix_test_simple.py # Hardware/network test +│ ├── maix_debug_wifi.py # WiFi diagnostics +│ ├── maix_discover_modules.py # Module discovery +│ ├── secrets.py.example # WiFi/server credential template +│ ├── MICROPYTHON_QUIRKS.md +│ └── README.md +├── config/ +│ └── .env.example # Server config template +├── models/ # Wake word models (gitignored, large) +└── CLAUDE.md # This file +``` + +--- + +## Credentials / Secrets + +**Never commit real credentials.** Pattern: + +- Server: copy `config/.env.example` → `config/.env`, fill in real values +- Edge device: copy 
`hardware/maixduino/secrets.py.example` → `secrets.py`, fill in WiFi + server URL + +Both files are gitignored. `.example` files are committed as templates. + +--- + +## Running the Server + +```bash +# Activate environment +conda activate whisper_cli + +# Basic server (Whisper + Precise wake word) +python scripts/voice_server.py \ + --enable-precise \ + --precise-model models/hey-minerva.net \ + --precise-sensitivity 0.5 + +# Enhanced server (+ speaker identification) +python scripts/voice_server_enhanced.py \ + --enable-speaker-id \ + --hf-token $HF_TOKEN + +# Test health +curl http://localhost:5000/health +curl http://localhost:5000/wake-word/status +``` + +--- + +## Connection to CF Voice Infrastructure + +Minerva is the **hardware platform** for cf-voice. As `circuitforge_core.voice` matures: + +- `cf_voice.io` (STT/TTS) → replaces the ad hoc Whisper/Piper calls in `voice_server.py` +- `cf_voice.context` (parallel classifier) → augments Mycroft Precise with tone/environment detection +- `cf_voice.telephony` → future: Minerva as an always-on household linnet node + +Minerva hardware + cf-voice software = the CF reference voice assistant stack. + +--- + +## Roadmap + +See Forgejo milestones on this repo. High-level: + +1. **Alpha — Server-side pipeline** — Whisper + Precise + Piper working end-to-end on Heimdall +2. **Beta — Edge wake word** — wake word on K210 or ESP32-S3; audio only streams post-wake +3. **Hardware v1** — documented reference build; buying guide; assembly instructions +4. **cf-voice integration** — Minerva uses cf_voice modules from circuitforge-core +5. 
**Platform** — multiple hardware targets; custom PCB design + +--- + +## Related + +- `cf-voice` module design: `circuitforge-plans/circuitforge-core/2026-04-06-cf-voice-design.md` +- `linnet` product: real-time tone annotation, will eventually embed Minerva as a hardware node +- Heimdall server: primary dev/deployment target (10.1.10.71 on LAN) diff --git a/config/.env.example b/config/.env.example new file mode 100644 index 0000000..6518901 --- /dev/null +++ b/config/.env.example @@ -0,0 +1,24 @@ +# Minerva Voice Server — configuration +# Copy to config/.env and fill in real values. Never commit .env. + +# Server +SERVER_HOST=0.0.0.0 +SERVER_PORT=5000 + +# Whisper STT +WHISPER_MODEL=base + +# Mycroft Precise wake word +# PRECISE_MODEL=/path/to/wake-word.net +# PRECISE_SENSITIVITY=0.5 + +# Home Assistant integration (optional) +# HA_URL=http://homeassistant.local:8123 +# HA_TOKEN=your_long_lived_access_token_here + +# HuggingFace (for speaker identification, optional) +# HF_TOKEN=your_huggingface_token_here + +# Logging +LOG_LEVEL=INFO +LOG_FILE=logs/minerva.log diff --git a/docs/ADVANCED_WAKE_WORD_TOPICS.md b/docs/ADVANCED_WAKE_WORD_TOPICS.md new file mode 100755 index 0000000..ea90ece --- /dev/null +++ b/docs/ADVANCED_WAKE_WORD_TOPICS.md @@ -0,0 +1,905 @@ +# Advanced Wake Word Topics - Pre-trained Models, Multiple Wake Words, and Voice Adaptation + +## Pre-trained Mycroft Models + +### Yes! 
Pre-trained Models Exist + +Mycroft AI provides several pre-trained wake word models you can use immediately: + +**Available Models:** +- **Hey Mycroft** - Original Mycroft wake word (most training data) +- **Hey Jarvis** - Popular alternative +- **Christopher** - Alternative wake word +- **Hey Ezra** - Another option + +### Download Pre-trained Models + +```bash +# On Heimdall +conda activate precise +cd ~/precise-models + +# Create directory for pre-trained models +mkdir -p pretrained +cd pretrained + +# Download Hey Mycroft (recommended starting point) +wget https://github.com/MycroftAI/precise-data/raw/models-dev/hey-mycroft.tar.gz +tar xzf hey-mycroft.tar.gz + +# Download other models +wget https://github.com/MycroftAI/precise-data/raw/models-dev/hey-jarvis.tar.gz +tar xzf hey-jarvis.tar.gz + +# List available models +ls -lh *.net +``` + +### Test Pre-trained Model + +```bash +conda activate precise + +# Test Hey Mycroft +precise-listen hey-mycroft.net + +# Speak "Hey Mycroft" - should see "!" when detected +# Press Ctrl+C to exit + +# Test with different threshold +precise-listen hey-mycroft.net -t 0.7 # More conservative +``` + +### Use Pre-trained Model in Voice Server + +```bash +cd ~/voice-assistant + +# Start server with Hey Mycroft model +python voice_server.py \ + --enable-precise \ + --precise-model ~/precise-models/pretrained/hey-mycroft.net \ + --precise-sensitivity 0.5 +``` + +### Fine-tune Pre-trained Models + +You can use pre-trained models as a **starting point** and fine-tune with your voice: + +```bash +cd ~/precise-models +mkdir -p hey-mycroft-custom + +# Copy base model +cp pretrained/hey-mycroft.net hey-mycroft-custom/ + +# Collect your samples +cd hey-mycroft-custom +precise-collect # Record 20-30 samples of YOUR voice + +# Fine-tune from pre-trained model +precise-train -e 30 hey-mycroft-custom.net . \ + --from-checkpoint ../pretrained/hey-mycroft.net + +# This is MUCH faster than training from scratch! 
+``` + +**Benefits:** +- ✅ Start with proven model +- ✅ Much less training data needed (20-30 vs 100+ samples) +- ✅ Faster training (30 mins vs 60 mins) +- ✅ Good baseline accuracy + +## Multiple Wake Words + +### Architecture Options + +#### Option 1: Multiple Models in Parallel (Server-Side Only) + +Run multiple Precise instances simultaneously: + +```python +# In voice_server.py - Multiple wake word detection + +from precise_runner import PreciseEngine, PreciseRunner +import threading + +# Global runners +precise_runners = {} + +def on_wake_word_detected(wake_word_name): + """Callback factory for different wake words""" + def callback(): + print(f"Wake word detected: {wake_word_name}") + wake_word_queue.put({ + 'wake_word': wake_word_name, + 'timestamp': time.time() + }) + return callback + +def start_multiple_wake_words(wake_word_configs): + """ + Start multiple wake word detectors + + Args: + wake_word_configs: List of dicts with 'name', 'model', 'sensitivity' + + Example: + configs = [ + {'name': 'hey mycroft', 'model': 'hey-mycroft.net', 'sensitivity': 0.5}, + {'name': 'hey jarvis', 'model': 'hey-jarvis.net', 'sensitivity': 0.5} + ] + """ + global precise_runners + + for config in wake_word_configs: + engine = PreciseEngine( + '/usr/local/bin/precise-engine', + config['model'] + ) + + runner = PreciseRunner( + engine, + sensitivity=config['sensitivity'], + on_activation=on_wake_word_detected(config['name']) + ) + + runner.start() + precise_runners[config['name']] = runner + + print(f"Started wake word detector: {config['name']}") +``` + +**Server-Side Multiple Wake Words:** +```bash +# Start server with multiple wake words +python voice_server.py \ + --enable-precise \ + --precise-models "hey-mycroft:~/models/hey-mycroft.net:0.5,hey-jarvis:~/models/hey-jarvis.net:0.5" +``` + +**Performance Impact:** +- CPU: ~5-10% per model (can run 2-3 easily) +- Memory: ~50-100MB per model +- Latency: Minimal (all run in parallel) + +#### Option 2: Single Model, Multiple 
Phrases (Edge or Server) + +Train ONE model that responds to multiple phrases: + +```bash +cd ~/precise-models/multi-wake +conda activate precise + +# Record samples for BOTH wake words in the SAME dataset +# Label all as "wake-word" regardless of which phrase + +mkdir -p wake-word not-wake-word + +# Record "Hey Mycroft" samples +precise-collect # Save to wake-word/hey-mycroft-*.wav + +# Record "Hey Computer" samples +precise-collect # Save to wake-word/hey-computer-*.wav + +# Record negatives +precise-collect -f not-wake-word/random.wav + +# Train single model on both phrases +precise-train -e 60 multi-wake.net . +``` + +**Pros:** +- ✅ Single model = less compute +- ✅ Works on edge (K210) +- ✅ Easy to deploy + +**Cons:** +- ❌ Can't tell which wake word was used +- ❌ May reduce accuracy for each individual phrase +- ❌ Higher false positive risk + +#### Option 3: Sequential Detection (Edge) + +Detect wake word, then identify which one: + +```python +# Pseudo-code for edge detection +if wake_word_detected(): + audio_snippet = last_2_seconds() + + # Run all models on the audio snippet + scores = { + 'hey-mycroft': model1.score(audio_snippet), + 'hey-jarvis': model2.score(audio_snippet), + 'hey-computer': model3.score(audio_snippet) + } + + # Use highest scoring wake word + wake_word = max(scores, key=scores.get) +``` + +### Recommendations + +**Server-Side (Heimdall):** +- ✅ **Use Option 1** - Multiple models in parallel +- Run 2-3 wake words easily +- Each can have different sensitivity +- Can identify which wake word was used +- Example: "Hey Mycroft" for commands, "Hey Jarvis" for queries + +**Edge (Maix Duino K210):** +- ✅ **Use Option 2** - Single multi-phrase model +- K210 can handle 1 model efficiently +- Train on 2-3 phrases max +- Simpler deployment +- Lower latency + +## Voice Adaptation & Multi-User Support + +### Approach 1: Inclusive Training (Recommended) + +Train ONE model on EVERYONE'S voices: + +```bash +cd ~/precise-models/family-wake-word +conda 
activate precise + +# Record samples from each family member +# Alice records 30 samples +precise-collect # Save as wake-word/alice-*.wav + +# Bob records 30 samples +precise-collect # Save as wake-word/bob-*.wav + +# Carol records 30 samples +precise-collect # Save as wake-word/carol-*.wav + +# Train on all voices +precise-train -e 60 family-wake-word.net . +``` + +**Pros:** +- ✅ Everyone can use the system +- ✅ Single model deployment +- ✅ Works for all family members +- ✅ Simple maintenance + +**Cons:** +- ❌ Can't identify who spoke +- ❌ May need more training data +- ❌ No personalization + +**Best for:** Family voice assistant, shared devices + +### Approach 2: Speaker Identification (Advanced) + +Detect wake word, then identify speaker: + +```python +# Architecture with speaker ID + +# Step 1: Precise detects wake word +if wake_word_detected(): + + # Step 2: Capture voice sample + voice_sample = record_audio(duration=3) + + # Step 3: Speaker identification + speaker = identify_speaker(voice_sample) + # Uses voice embeddings/neural network + + # Step 4: Process with user context + process_command(voice_sample, user=speaker) +``` + +**Implementation Options:** + +#### Option A: Use resemblyzer (Voice Embeddings) +```bash +pip install resemblyzer --break-system-packages + +# Enrollment phase +python enroll_users.py +# Each user records 10-20 seconds of speech +# System creates voice profile (embedding) + +# Runtime +python speaker_id.py +# Compares incoming audio to stored embeddings +# Returns most likely speaker +``` + +**Example Code:** +```python +from resemblyzer import VoiceEncoder, preprocess_wav +import numpy as np + +# Initialize encoder +encoder = VoiceEncoder() + +# Enrollment - do once per user +def enroll_user(name, audio_files): + """Create voice profile for user""" + embeddings = [] + + for audio_file in audio_files: + wav = preprocess_wav(audio_file) + embedding = encoder.embed_utterance(wav) + embeddings.append(embedding) + + # Average embeddings 
for robustness + user_profile = np.mean(embeddings, axis=0) + + # Save profile + np.save(f'profiles/{name}.npy', user_profile) + return user_profile + +# Identification - run each time +def identify_speaker(audio_file, profiles_dir='profiles'): + """Identify which enrolled user is speaking""" + wav = preprocess_wav(audio_file) + test_embedding = encoder.embed_utterance(wav) + + # Load all profiles + profiles = {} + for profile_file in os.listdir(profiles_dir): + name = profile_file.replace('.npy', '') + profile = np.load(os.path.join(profiles_dir, profile_file)) + profiles[name] = profile + + # Calculate similarity to each profile + similarities = {} + for name, profile in profiles.items(): + similarity = np.dot(test_embedding, profile) + similarities[name] = similarity + + # Return most similar + best_match = max(similarities, key=similarities.get) + confidence = similarities[best_match] + + if confidence > 0.7: # Threshold + return best_match + else: + return "unknown" +``` + +#### Option B: Use pyannote.audio (Production-grade) +```bash +pip install pyannote.audio --break-system-packages + +# Requires HuggingFace token (same as diarization) +``` + +**Example:** +```python +from pyannote.audio import Inference + +# Initialize +inference = Inference( + "pyannote/embedding", + use_auth_token="your_hf_token" +) + +# Enroll users +alice_profile = inference("alice_sample.wav") +bob_profile = inference("bob_sample.wav") + +# Identify +test_embedding = inference("test_audio.wav") + +# Compare +from scipy.spatial.distance import cosine +alice_similarity = 1 - cosine(test_embedding, alice_profile) +bob_similarity = 1 - cosine(test_embedding, bob_profile) + +if alice_similarity > bob_similarity and alice_similarity > 0.7: + speaker = "Alice" +elif bob_similarity > 0.7: + speaker = "Bob" +else: + speaker = "Unknown" +``` + +**Pros:** +- ✅ Can identify individual users +- ✅ Personalized responses +- ✅ User-specific commands/permissions +- ✅ Better for privacy (know who's 
speaking) + +**Cons:** +- ❌ More complex implementation +- ❌ Requires enrollment phase +- ❌ Additional processing time (~100-200ms) +- ❌ May fail with similar voices + +### Approach 3: Per-User Wake Word Models + +Each person has their OWN wake word: + +```bash +# Alice's wake word: "Hey Mycroft" +# Train on ONLY Alice's voice + +# Bob's wake word: "Hey Jarvis" +# Train on ONLY Bob's voice + +# Carol's wake word: "Hey Computer" +# Train on ONLY Carol's voice +``` + +**Deployment:** +Run all 3 models in parallel (server-side): +```python +wake_word_configs = [ + {'name': 'Alice', 'wake_word': 'hey mycroft', 'model': 'alice-wake.net'}, + {'name': 'Bob', 'wake_word': 'hey jarvis', 'model': 'bob-wake.net'}, + {'name': 'Carol', 'wake_word': 'hey computer', 'model': 'carol-wake.net'} +] +``` + +**Pros:** +- ✅ Automatic user identification +- ✅ Highest accuracy per user +- ✅ Clear user separation +- ✅ No additional speaker ID needed + +**Cons:** +- ❌ Requires 3x models (server only) +- ❌ Users must remember their wake word +- ❌ 3x CPU usage (~15-30%) +- ❌ Can't work on edge (K210) + +### Approach 4: Context-Based Adaptation + +No speaker ID, but learn from interaction: + +```python +# Track command patterns +user_context = { + 'last_command': 'turn on living room lights', + 'frequent_entities': ['light.living_room', 'light.bedroom'], + 'time_of_day_patterns': {'morning': 'coffee maker', 'evening': 'tv'}, + 'location': 'home' # vs 'away' +} + +# Use context to improve intent recognition +if "turn on the lights" and time.is_morning(): + # Probably means bedroom lights (based on history) + entity = user_context['frequent_entities'][0] +``` + +**Pros:** +- ✅ No enrollment needed +- ✅ Improves over time +- ✅ Simple to implement +- ✅ Works with any number of users + +**Cons:** +- ❌ No true user identification +- ❌ May make incorrect assumptions +- ❌ Privacy concerns (tracking behavior) + +## Recommended Strategy + +### For Your Use Case + +Based on your home lab setup, I 
recommend: + +#### Phase 1: Single Wake Word, Inclusive Training (Week 1-2) +```bash +# Start simple +cd ~/precise-models/hey-computer +conda activate precise + +# Have all family members record samples +# Alice: 30 samples of "Hey Computer" +# Bob: 30 samples of "Hey Computer" +# You: 30 samples of "Hey Computer" + +# Train single model on all voices +precise-train -e 60 hey-computer.net . + +# Deploy to server +python voice_server.py \ + --enable-precise \ + --precise-model hey-computer.net +``` + +**Why:** +- Simple to setup and test +- Everyone can use it immediately +- Single model = easier debugging +- Works on edge if you migrate later + +#### Phase 2: Add Speaker Identification (Week 3-4) +```bash +# Install resemblyzer +pip install resemblyzer --break-system-packages + +# Enroll users +python enroll_users.py +# Each person speaks for 20 seconds + +# Update voice_server.py to identify speaker +# Use speaker ID for personalized responses +``` + +**Why:** +- Enables personalization +- Can track preferences per user +- User-specific command permissions +- Better privacy (know who's speaking) + +#### Phase 3: Multiple Wake Words (Month 2+) +```bash +# Add alternative wake words for different contexts +# "Hey Mycroft" - General commands +# "Hey Jarvis" - Media/Plex control +# "Computer" - Quick commands (lights, temp) + +# Deploy multiple models on server +python voice_server.py \ + --enable-precise \ + --precise-models "mycroft:hey-mycroft.net:0.5,jarvis:hey-jarvis.net:0.5" +``` + +**Why:** +- Different wake words for different contexts +- Reduces false positives (more specific triggers) +- Fun factor (Jarvis for media!) 
+- Server can handle 2-3 easily + +## Implementation Guide: Multiple Wake Words + +### Update voice_server.py for Multiple Wake Words + +```python +# Add to voice_server.py + +def start_multiple_wake_words(configs): + """ + Start multiple wake word detectors + + Args: + configs: List of dicts with 'name', 'model_path', 'sensitivity' + """ + global precise_runners + precise_runners = {} + + for config in configs: + try: + engine = PreciseEngine( + DEFAULT_PRECISE_ENGINE, + config['model_path'] + ) + + def make_callback(wake_word_name): + def callback(): + print(f"Wake word detected: {wake_word_name}") + wake_word_queue.put({ + 'wake_word': wake_word_name, + 'timestamp': time.time(), + 'source': 'precise' + }) + return callback + + runner = PreciseRunner( + engine, + sensitivity=config['sensitivity'], + on_activation=make_callback(config['name']) + ) + + runner.start() + precise_runners[config['name']] = runner + + print(f"✓ Started: {config['name']} (sensitivity: {config['sensitivity']})") + + except Exception as e: + print(f"✗ Failed to start {config['name']}: {e}") + + return len(precise_runners) > 0 + +# Add to main() +parser.add_argument('--precise-models', + help='Multiple models: name:path:sensitivity,name2:path2:sensitivity2') + +# Parse multiple models +if args.precise_models: + configs = [] + for model_spec in args.precise_models.split(','): + name, path, sensitivity = model_spec.split(':') + configs.append({ + 'name': name, + 'model_path': os.path.expanduser(path), + 'sensitivity': float(sensitivity) + }) + + start_multiple_wake_words(configs) +``` + +### Usage Example + +```bash +cd ~/voice-assistant + +# Start with multiple wake words +python voice_server.py \ + --enable-precise \ + --precise-models "\ +hey-mycroft:~/precise-models/pretrained/hey-mycroft.net:0.5,\ +hey-jarvis:~/precise-models/pretrained/hey-jarvis.net:0.5" +``` + +## Implementation Guide: Speaker Identification + +### Add to voice_server.py + +```python +# Add resemblyzer support +try: + 
from resemblyzer import VoiceEncoder, preprocess_wav + import numpy as np + SPEAKER_ID_AVAILABLE = True +except ImportError: + SPEAKER_ID_AVAILABLE = False + print("Warning: resemblyzer not available. Speaker ID disabled.") + +# Initialize encoder +voice_encoder = None +speaker_profiles = {} + +def load_speaker_profiles(profiles_dir='~/voice-assistant/profiles'): + """Load enrolled speaker profiles""" + global speaker_profiles, voice_encoder + + if not SPEAKER_ID_AVAILABLE: + return False + + profiles_dir = os.path.expanduser(profiles_dir) + + if not os.path.exists(profiles_dir): + print(f"No speaker profiles found at {profiles_dir}") + return False + + # Initialize encoder + voice_encoder = VoiceEncoder() + + # Load all profiles + for profile_file in os.listdir(profiles_dir): + if profile_file.endswith('.npy'): + name = profile_file.replace('.npy', '') + profile = np.load(os.path.join(profiles_dir, profile_file)) + speaker_profiles[name] = profile + print(f"Loaded speaker profile: {name}") + + return len(speaker_profiles) > 0 + +def identify_speaker(audio_path, threshold=0.7): + """Identify speaker from audio file""" + if not SPEAKER_ID_AVAILABLE or not speaker_profiles: + return None + + try: + # Get embedding for test audio + wav = preprocess_wav(audio_path) + test_embedding = voice_encoder.embed_utterance(wav) + + # Compare to all profiles + similarities = {} + for name, profile in speaker_profiles.items(): + similarity = np.dot(test_embedding, profile) + similarities[name] = similarity + + # Get best match + best_match = max(similarities, key=similarities.get) + confidence = similarities[best_match] + + print(f"Speaker ID: {best_match} (confidence: {confidence:.2f})") + + if confidence > threshold: + return best_match + else: + return "unknown" + + except Exception as e: + print(f"Error identifying speaker: {e}") + return None + +# Update process endpoint to include speaker ID +@app.route('/process', methods=['POST']) +def process(): + """Process complete 
voice command with speaker identification""" + # ... existing code ... + + # Add speaker identification + speaker = identify_speaker(temp_path) if speaker_profiles else None + + if speaker: + print(f"Detected speaker: {speaker}") + # Could personalize response based on speaker + + # ... rest of processing ... +``` + +### Enrollment Script + +Create `enroll_speaker.py`: + +```python +#!/usr/bin/env python3 +""" +Enroll users for speaker identification + +Usage: + python enroll_speaker.py --name Alice --audio alice_sample.wav + python enroll_speaker.py --name Alice --duration 20 # Record live +""" + +import argparse +import os +import numpy as np +from resemblyzer import VoiceEncoder, preprocess_wav +import pyaudio +import wave + +def record_audio(duration=20, sample_rate=16000): + """Record audio from microphone""" + print(f"Recording for {duration} seconds...") + print("Speak naturally - read a paragraph, have a conversation, etc.") + + chunk = 1024 + format = pyaudio.paInt16 + channels = 1 + + p = pyaudio.PyAudio() + + stream = p.open( + format=format, + channels=channels, + rate=sample_rate, + input=True, + frames_per_buffer=chunk + ) + + frames = [] + for i in range(0, int(sample_rate / chunk * duration)): + data = stream.read(chunk) + frames.append(data) + + stream.stop_stream() + stream.close() + p.terminate() + + # Save to temp file + temp_file = f"/tmp/enrollment_{os.getpid()}.wav" + wf = wave.open(temp_file, 'wb') + wf.setnchannels(channels) + wf.setsampwidth(p.get_sample_size(format)) + wf.setframerate(sample_rate) + wf.writeframes(b''.join(frames)) + wf.close() + + return temp_file + +def enroll_speaker(name, audio_file, profiles_dir='~/voice-assistant/profiles'): + """Create voice profile for speaker""" + profiles_dir = os.path.expanduser(profiles_dir) + os.makedirs(profiles_dir, exist_ok=True) + + # Initialize encoder + encoder = VoiceEncoder() + + # Process audio + wav = preprocess_wav(audio_file) + embedding = encoder.embed_utterance(wav) + + # Save 
profile + profile_path = os.path.join(profiles_dir, f'{name}.npy') + np.save(profile_path, embedding) + + print(f"✓ Enrolled speaker: {name}") + print(f" Profile saved to: {profile_path}") + + return profile_path + +def main(): + parser = argparse.ArgumentParser(description="Enroll speaker for voice identification") + parser.add_argument('--name', required=True, help='Speaker name') + parser.add_argument('--audio', help='Path to audio file (wav)') + parser.add_argument('--duration', type=int, default=20, + help='Recording duration if not using audio file') + parser.add_argument('--profiles-dir', default='~/voice-assistant/profiles', + help='Directory to save profiles') + + args = parser.parse_args() + + # Get audio file + if args.audio: + audio_file = args.audio + if not os.path.exists(audio_file): + print(f"Error: Audio file not found: {audio_file}") + return 1 + else: + audio_file = record_audio(args.duration) + + # Enroll speaker + try: + enroll_speaker(args.name, audio_file, args.profiles_dir) + return 0 + except Exception as e: + print(f"Error enrolling speaker: {e}") + return 1 + +if __name__ == '__main__': + import sys + sys.exit(main()) +``` + +## Performance Comparison + +### Single Wake Word +- **Latency:** 100-200ms +- **CPU:** ~5-10% (idle) +- **Memory:** ~100MB +- **Accuracy:** 95%+ + +### Multiple Wake Words (3 models) +- **Latency:** 100-200ms (parallel) +- **CPU:** ~15-30% (idle) +- **Memory:** ~300MB +- **Accuracy:** 95%+ each + +### With Speaker Identification +- **Additional latency:** +100-200ms +- **Additional CPU:** +5% during ID +- **Additional memory:** +50MB +- **Accuracy:** 85-95% (depending on enrollment quality) + +## Best Practices + +### Wake Word Selection +1. **Different enough** - "Hey Mycroft" vs "Hey Jarvis" (not "Hey Alice" vs "Hey Alex") +2. **Clear consonants** - Easier to detect +3. **2-3 syllables** - Not too short, not too long +4. **Test in environment** - Check for false triggers + +### Training +1. 
**Include all users** - If using single model +2. **Diverse conditions** - Different rooms, noise levels +3. **Regular updates** - Add false positives weekly +4. **Per-user models** - Higher accuracy, more compute + +### Speaker Identification +1. **Quality enrollment** - 20+ seconds of clear speech +2. **Re-enroll periodically** - Voices change (colds, etc.) +3. **Test thresholds** - Balance accuracy vs false IDs +4. **Graceful fallback** - Handle unknown speakers + +## Recommended Path for You + +```bash +# Week 1: Start with pre-trained "Hey Mycroft" +wget https://github.com/MycroftAI/precise-data/raw/models-dev/hey-mycroft.tar.gz +precise-listen hey-mycroft.net # Test it! + +# Week 2: Fine-tune with your voices +precise-train -e 30 hey-mycroft-custom.net . \ + --from-checkpoint hey-mycroft.net + +# Week 3: Add speaker identification +pip install resemblyzer +python enroll_speaker.py --name Alan --duration 20 +python enroll_speaker.py --name [Family Member] --duration 20 + +# Week 4: Add second wake word ("Hey Jarvis" for Plex?) +wget hey-jarvis.tar.gz +# Run both in parallel + +# Month 2+: Optimize and expand +# - More wake words for different contexts +# - Per-user wake word models +# - Context-aware responses +``` + +This gives you a smooth progression from simple to advanced! 
diff --git a/docs/ESP32_S3_VOICE_ASSISTANT_SPEC.md b/docs/ESP32_S3_VOICE_ASSISTANT_SPEC.md new file mode 100755 index 0000000..e5696e0 --- /dev/null +++ b/docs/ESP32_S3_VOICE_ASSISTANT_SPEC.md @@ -0,0 +1,1089 @@ +# ESP32-S3-Touch-LCD Voice Assistant - Technical Specification + +**Date:** 2026-01-01 +**Hardware:** Waveshare ESP32-S3-Touch-LCD-1.69 +**Display:** 240×280 ST7789V2 with Capacitive Touch +**Framework:** ESP-IDF v5.3.1+ with LVGL 8.4.0+ +**Purpose:** Voice assistant endpoint with real-time audio waveform visualization + +--- + +## Overview + +Voice assistant client for ESP32-S3 with integrated LVGL-based visual feedback showing: +- Real-time audio waveform during listening +- Wake word detection animation +- Processing/thinking state +- Response state with audio output visualization +- Touch controls for volume, sensitivity, settings + +**Architecture:** +``` +┌─────────────────────────────────┐ +│ ESP32-S3-Touch-LCD-1.69 │ +│ │ +│ ┌──────────────────────────┐ │ +│ │ LVGL UI (240×280) │ │ +│ │ - Waveform Canvas │ │ +│ │ - State Indicators │ │──┐ +│ │ - Touch Controls │ │ │ +│ └──────────────────────────┘ │ │ +│ │ │ +│ ┌──────────────────────────┐ │ │ WiFi +│ │ Audio Pipeline │ │ │ Audio Stream +│ │ - I2S Mic Input │ │ │ +│ │ - I2S Speaker Output │ │──┤ +│ │ - Buffer Management │ │ │ +│ └──────────────────────────┘ │ │ +│ │ │ +│ ┌──────────────────────────┐ │ │ +│ │ State Machine │ │ │ +│ │ - Idle → Listening │ │ │ +│ │ - Processing → Speaking│ │──┘ +│ └──────────────────────────┘ │ +└─────────────────────────────────┘ + │ + │ TCP/HTTP + ↓ +┌─────────────────────────────────┐ +│ Heimdall Voice Server │ +│ (10.1.10.71:3006) │ +│ │ +│ - Mycroft Precise Wake Word │ +│ - Whisper STT │ +│ - Home Assistant Integration │ +│ - Piper TTS │ +└─────────────────────────────────┘ +``` + +--- + +## Visual States & UI Design + +### State Machine + +``` + ┌─────────┐ + │ IDLE │ ◄──────────────┐ + └────┬────┘ │ + │ │ + Wake Word Detected │ + │ │ + ↓ │ + ┌──────────┐ │ + 
│LISTENING │ │ + └────┬─────┘ │ + │ │ + End of Speech │ + │ │ + ↓ │ + ┌───────────┐ │ + │PROCESSING │ │ + └─────┬─────┘ │ + │ │ + Response Ready │ + │ │ + ↓ │ + ┌──────────┐ │ + │ SPEAKING │ ───────────────┘ + └──────────┘ +``` + +### Visual Feedback Per State + +#### 1. IDLE State +**Display:** +- Subtle pulsing ring animation (like Google Home) +- Time display from RTC +- Status icons (WiFi strength, battery level) +- Dim backlight (30-50%) + +**Colors:** +- Background: Dark blue (#001F3F) +- Pulse ring: Cyan (#00BFFF) +- Text: White (#FFFFFF) + +**LVGL Widgets:** +```c +lv_obj_t *idle_screen; +lv_obj_t *pulse_ring; // Arc widget, animated rotation +lv_obj_t *time_label; // Label with RTC time +lv_obj_t *status_bar; // Container for icons +``` + +**Animation:** +- Slow pulse: 2-second breathing cycle +- Rotation: 360° over 10 seconds + +--- + +#### 2. LISTENING State +**Display:** +- Real-time audio waveform visualization +- Bright backlight (100%) +- "Listening..." text +- Cancel button (touch) + +**Waveform Visualization:** + +**Option A: Canvas-Based Waveform (Recommended)** +- Use LVGL `lv_canvas` for custom drawing +- Draw waveform from audio buffer samples +- Scrolling waveform (left-to-right) +- Update rate: 30-60 FPS + +**Option B: Bar Chart Spectrum** +- Use `lv_chart` with bar type +- FFT-based spectrum analyzer +- 8-16 bars for frequency bins +- Update rate: 15-30 FPS + +**Colors:** +- Background: Dark gray (#1A1A1A) +- Waveform: Green (#00FF00) +- Peak indicators: Yellow (#FFFF00) +- Clipping: Red (#FF0000) + +**LVGL Implementation:** +```c +// Canvas-based waveform +lv_obj_t *listening_screen; +lv_obj_t *waveform_canvas; // 240×180 canvas +lv_obj_t *listening_label; // "Listening..." 
+lv_obj_t *cancel_btn; // Touch to cancel + +// Waveform buffer (circular buffer) +#define WAVEFORM_WIDTH 240 +#define WAVEFORM_HEIGHT 180 +#define WAVEFORM_CENTER (WAVEFORM_HEIGHT / 2) +int16_t waveform_buffer[WAVEFORM_WIDTH]; +uint16_t waveform_index = 0; + +// Drawing function (called from audio callback) +void draw_waveform(lv_obj_t *canvas, int16_t *audio_samples, size_t count) { + lv_canvas_fill_bg(canvas, lv_color_hex(0x1A1A1A), LV_OPA_COVER); + + lv_draw_line_dsc_t line_dsc; + lv_draw_line_dsc_init(&line_dsc); + line_dsc.color = lv_color_hex(0x00FF00); + line_dsc.width = 2; + + // Draw waveform line + for (int x = 0; x < WAVEFORM_WIDTH - 1; x++) { + int16_t y1 = WAVEFORM_CENTER + (waveform_buffer[x] / 256); + int16_t y2 = WAVEFORM_CENTER + (waveform_buffer[x + 1] / 256); + + lv_point_t points[] = {{x, y1}, {x + 1, y2}}; + lv_canvas_draw_line(canvas, points, 2, &line_dsc); + } +} + +// Audio callback (I2S task) +void audio_i2s_callback(int16_t *samples, size_t count) { + // Downsample audio for waveform display + for (int i = 0; i < count; i += (count / WAVEFORM_WIDTH)) { + waveform_buffer[waveform_index] = samples[i]; + waveform_index = (waveform_index + 1) % WAVEFORM_WIDTH; + } + + // Trigger LVGL update (use event or flag) + xEventGroupSetBits(ui_event_group, WAVEFORM_UPDATE_BIT); +} +``` + +**Touch Controls:** +- Tap anywhere: Cancel listening +- Swipe down: Lower sensitivity +- Swipe up: Increase sensitivity + +--- + +#### 3. PROCESSING State +**Display:** +- Animated spinner/thinking indicator +- "Processing..." text +- Waveform fades out smoothly + +**Animation:** +- Circular spinner with gradient +- Rotation: 360° per 1 second +- Pulsing opacity + +**Colors:** +- Background: Dark gray (#1A1A1A) +- Spinner: Blue (#0080FF) +- Text: Light gray (#CCCCCC) + +**LVGL Implementation:** +```c +lv_obj_t *processing_screen; +lv_obj_t *spinner; // lv_spinner widget +lv_obj_t *processing_label; // "Processing..." 
+ +// Transition from listening to processing +void transition_to_processing(void) { + // Fade out waveform + lv_anim_t fade_out; + lv_anim_init(&fade_out); + lv_anim_set_var(&fade_out, waveform_canvas); + lv_anim_set_values(&fade_out, LV_OPA_COVER, LV_OPA_TRANSP); + lv_anim_set_time(&fade_out, 300); + lv_anim_set_exec_cb(&fade_out, lv_obj_set_style_opa); + lv_anim_start(&fade_out); + + // Show spinner after fade + lv_timer_t *timer = lv_timer_create(show_spinner_callback, 300, NULL); + lv_timer_set_repeat_count(timer, 1); +} +``` + +--- + +#### 4. SPEAKING State +**Display:** +- Audio output waveform (TTS playback visualization) +- "Speaking..." or response text snippet +- Volume indicator + +**Waveform:** +- Same canvas as LISTENING but different color +- Shows output audio being played +- Synchronized with speaker output + +**Colors:** +- Background: Dark gray (#1A1A1A) +- Waveform: Blue (#0080FF) +- Text: White (#FFFFFF) + +**LVGL Implementation:** +```c +lv_obj_t *speaking_screen; +lv_obj_t *output_waveform_canvas; // Same size as input waveform +lv_obj_t *response_label; // Show part of response text +lv_obj_t *volume_bar; // lv_bar widget for volume level + +// Similar drawing to listening state, but fed from speaker buffer +void draw_output_waveform(lv_obj_t *canvas, int16_t *speaker_samples, size_t count) { + // Same logic as input waveform, different color + line_dsc.color = lv_color_hex(0x0080FF); + // ... 
draw logic +} +``` + +**Touch Controls:** +- Tap: Skip response (go back to idle) +- Volume slider: Adjust speaker volume + +--- + +### Additional UI Elements + +#### Status Bar (All States) +**Location:** Top 20 pixels +**Contents:** +- WiFi icon + signal strength +- Battery icon + percentage +- Time (from RTC) +- Mute icon (if muted) + +**LVGL Implementation:** +```c +lv_obj_t *status_bar; +lv_obj_t *wifi_icon; +lv_obj_t *battery_icon; +lv_obj_t *time_label; +lv_obj_t *mute_icon; + +// Update every second +void update_status_bar(lv_timer_t *timer) { + // Update WiFi strength + int8_t rssi = wifi_get_rssi(); + lv_img_set_src(wifi_icon, get_wifi_icon(rssi)); + + // Update battery + uint8_t battery_pct = battery_get_percentage(); + lv_img_set_src(battery_icon, get_battery_icon(battery_pct)); + + // Update time from RTC + rtc_time_t time; + pcf85063_get_time(&time); + lv_label_set_text_fmt(time_label, "%02d:%02d", time.hour, time.min); +} + +// Create timer for status bar updates +lv_timer_create(update_status_bar, 1000, NULL); +``` + +#### Settings Screen (Touch Access) +**Trigger:** Long-press on idle screen +**Contents:** +- Volume slider +- Brightness slider +- Wake word sensitivity slider +- WiFi settings button +- About/Info button + +**LVGL Implementation:** +```c +lv_obj_t *settings_screen; +lv_obj_t *volume_slider; +lv_obj_t *brightness_slider; +lv_obj_t *sensitivity_slider; +lv_obj_t *wifi_btn; +lv_obj_t *about_btn; +lv_obj_t *back_btn; + +// Slider event handler +static void slider_event_cb(lv_event_t *e) { + lv_obj_t *slider = lv_event_get_target(e); + int32_t value = lv_slider_get_value(slider); + + if (slider == volume_slider) { + set_speaker_volume(value); + } else if (slider == brightness_slider) { + set_backlight_brightness(value); + } else if (slider == sensitivity_slider) { + set_wake_word_sensitivity(value); + } +} +``` + +--- + +## Audio Pipeline Integration + +### I2S Configuration + +**Microphone (INMP441):** +```c +#define I2S_MIC_NUM 
I2S_NUM_0 +#define I2S_MIC_BCLK_PIN GPIO_NUM_4 // Verify with board schematic +#define I2S_MIC_WS_PIN GPIO_NUM_5 +#define I2S_MIC_DIN_PIN GPIO_NUM_6 +#define I2S_MIC_SAMPLE_RATE 16000 +#define I2S_MIC_BITS 16 +#define I2S_MIC_CHANNELS 1 + +i2s_config_t i2s_mic_config = { + .mode = I2S_MODE_MASTER | I2S_MODE_RX, + .sample_rate = I2S_MIC_SAMPLE_RATE, + .bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT, + .channel_format = I2S_CHANNEL_FMT_ONLY_LEFT, + .communication_format = I2S_COMM_FORMAT_STAND_I2S, + .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1, + .dma_buf_count = 8, + .dma_buf_len = 256, + .use_apll = false, + .tx_desc_auto_clear = false, + .fixed_mclk = 0 +}; + +i2s_pin_config_t i2s_mic_pins = { + .bck_io_num = I2S_MIC_BCLK_PIN, + .ws_io_num = I2S_MIC_WS_PIN, + .data_out_num = I2S_PIN_NO_CHANGE, + .data_in_num = I2S_MIC_DIN_PIN +}; + +void audio_init_microphone(void) { + i2s_driver_install(I2S_MIC_NUM, &i2s_mic_config, 0, NULL); + i2s_set_pin(I2S_MIC_NUM, &i2s_mic_pins); + i2s_zero_dma_buffer(I2S_MIC_NUM); +} +``` + +**Speaker (MAX98357A I2S Amp):** +```c +#define I2S_SPK_NUM I2S_NUM_1 +#define I2S_SPK_BCLK_PIN GPIO_NUM_7 // Verify with board schematic +#define I2S_SPK_WS_PIN GPIO_NUM_8 +#define I2S_SPK_DOUT_PIN GPIO_NUM_9 +#define I2S_SPK_SAMPLE_RATE 16000 +#define I2S_SPK_BITS 16 +#define I2S_SPK_CHANNELS 1 + +i2s_config_t i2s_spk_config = { + .mode = I2S_MODE_MASTER | I2S_MODE_TX, + .sample_rate = I2S_SPK_SAMPLE_RATE, + .bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT, + .channel_format = I2S_CHANNEL_FMT_ONLY_LEFT, + .communication_format = I2S_COMM_FORMAT_STAND_I2S, + .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1, + .dma_buf_count = 8, + .dma_buf_len = 256, + .use_apll = false, + .tx_desc_auto_clear = true, + .fixed_mclk = 0 +}; + +i2s_pin_config_t i2s_spk_pins = { + .bck_io_num = I2S_SPK_BCLK_PIN, + .ws_io_num = I2S_SPK_WS_PIN, + .data_out_num = I2S_SPK_DOUT_PIN, + .data_in_num = I2S_PIN_NO_CHANGE +}; + +void audio_init_speaker(void) { + i2s_driver_install(I2S_SPK_NUM, 
&i2s_spk_config, 0, NULL); + i2s_set_pin(I2S_SPK_NUM, &i2s_spk_pins); + i2s_zero_dma_buffer(I2S_SPK_NUM); +} +``` + +### Audio Buffer Management + +**Circular Buffer for Waveform:** +```c +#define AUDIO_BUFFER_SIZE 2048 +#define WAVEFORM_DECIMATION 8 // Downsample for display + +typedef struct { + int16_t samples[AUDIO_BUFFER_SIZE]; + uint16_t write_idx; + uint16_t read_idx; + SemaphoreHandle_t mutex; +} audio_buffer_t; + +audio_buffer_t mic_buffer; +audio_buffer_t spk_buffer; + +void audio_buffer_init(audio_buffer_t *buf) { + memset(buf->samples, 0, sizeof(buf->samples)); + buf->write_idx = 0; + buf->read_idx = 0; + buf->mutex = xSemaphoreCreateMutex(); +} + +void audio_buffer_write(audio_buffer_t *buf, int16_t *samples, size_t count) { + xSemaphoreTake(buf->mutex, portMAX_DELAY); + for (size_t i = 0; i < count; i++) { + buf->samples[buf->write_idx] = samples[i]; + buf->write_idx = (buf->write_idx + 1) % AUDIO_BUFFER_SIZE; + } + xSemaphoreGive(buf->mutex); +} + +// Get downsampled samples for waveform display +void audio_buffer_get_waveform(audio_buffer_t *buf, int16_t *out, size_t out_count) { + xSemaphoreTake(buf->mutex, portMAX_DELAY); + for (size_t i = 0; i < out_count; i++) { + size_t src_idx = (buf->write_idx + (i * WAVEFORM_DECIMATION)) % AUDIO_BUFFER_SIZE; + out[i] = buf->samples[src_idx]; + } + xSemaphoreGive(buf->mutex); +} +``` + +### Audio Streaming Task + +**Microphone Input Task:** +```c +void audio_mic_task(void *pvParameters) { + int16_t i2s_buffer[256]; + size_t bytes_read; + + while (1) { + // Read from I2S microphone + i2s_read(I2S_MIC_NUM, i2s_buffer, sizeof(i2s_buffer), &bytes_read, portMAX_DELAY); + size_t samples_read = bytes_read / sizeof(int16_t); + + if (current_state == STATE_LISTENING) { + // Write to circular buffer for waveform display + audio_buffer_write(&mic_buffer, i2s_buffer, samples_read); + + // Send to Heimdall server via WiFi + audio_send_to_server(i2s_buffer, samples_read); + + // Trigger waveform update + 
xEventGroupSetBits(ui_event_group, WAVEFORM_UPDATE_BIT); + } + } +} +``` + +**Speaker Output Task:** +```c +void audio_speaker_task(void *pvParameters) { + int16_t i2s_buffer[256]; + size_t bytes_written; + + while (1) { + // Receive audio from Heimdall server + size_t samples_received = audio_receive_from_server(i2s_buffer, 256); + + if (samples_received > 0 && current_state == STATE_SPEAKING) { + // Write to circular buffer for waveform display + audio_buffer_write(&spk_buffer, i2s_buffer, samples_received); + + // Play through I2S speaker + i2s_write(I2S_SPK_NUM, i2s_buffer, samples_received * sizeof(int16_t), + &bytes_written, portMAX_DELAY); + + // Trigger waveform update + xEventGroupSetBits(ui_event_group, WAVEFORM_UPDATE_BIT); + } else { + vTaskDelay(pdMS_TO_TICKS(10)); + } + } +} +``` + +### LVGL Update Task + +**Waveform Rendering Task:** +```c +void lvgl_waveform_task(void *pvParameters) { + int16_t waveform_samples[WAVEFORM_WIDTH]; + + while (1) { + // Wait for waveform update event + EventBits_t bits = xEventGroupWaitBits(ui_event_group, WAVEFORM_UPDATE_BIT, + pdTRUE, pdFALSE, pdMS_TO_TICKS(50)); + + if (bits & WAVEFORM_UPDATE_BIT) { + if (current_state == STATE_LISTENING) { + // Get downsampled mic data + audio_buffer_get_waveform(&mic_buffer, waveform_samples, WAVEFORM_WIDTH); + + // Draw on LVGL canvas (must lock LVGL) + lvgl_lock(); + draw_waveform(waveform_canvas, waveform_samples, WAVEFORM_WIDTH); + lvgl_unlock(); + + } else if (current_state == STATE_SPEAKING) { + // Get downsampled speaker data + audio_buffer_get_waveform(&spk_buffer, waveform_samples, WAVEFORM_WIDTH); + + lvgl_lock(); + draw_output_waveform(output_waveform_canvas, waveform_samples, WAVEFORM_WIDTH); + lvgl_unlock(); + } + } + } +} +``` + +--- + +## Touch Gesture Integration + +### Touch Controller (CST816D) + +**Gestures Supported:** +- Single tap +- Long press +- Swipe up/down/left/right + +**Implementation:** +```c +#define TOUCH_I2C_NUM I2C_NUM_0 +#define TOUCH_SDA_PIN 
GPIO_NUM_6 +#define TOUCH_SCL_PIN GPIO_NUM_7 +#define TOUCH_INT_PIN GPIO_NUM_9 +#define TOUCH_RST_PIN GPIO_NUM_10 + +typedef enum { + GESTURE_NONE = 0, + GESTURE_TAP, + GESTURE_LONG_PRESS, + GESTURE_SWIPE_UP, + GESTURE_SWIPE_DOWN, + GESTURE_SWIPE_LEFT, + GESTURE_SWIPE_RIGHT +} touch_gesture_t; + +void touch_init(void) { + // I2C init for CST816D + i2c_config_t conf = { + .mode = I2C_MODE_MASTER, + .sda_io_num = TOUCH_SDA_PIN, + .scl_io_num = TOUCH_SCL_PIN, + .sda_pullup_en = GPIO_PULLUP_ENABLE, + .scl_pullup_en = GPIO_PULLUP_ENABLE, + .master.clk_speed = 100000, + }; + i2c_param_config(TOUCH_I2C_NUM, &conf); + i2c_driver_install(TOUCH_I2C_NUM, conf.mode, 0, 0, 0); + + // Reset touch controller + gpio_set_direction(TOUCH_RST_PIN, GPIO_MODE_OUTPUT); + gpio_set_level(TOUCH_RST_PIN, 0); + vTaskDelay(pdMS_TO_TICKS(10)); + gpio_set_level(TOUCH_RST_PIN, 1); + vTaskDelay(pdMS_TO_TICKS(50)); + + // Configure interrupt pin + gpio_set_direction(TOUCH_INT_PIN, GPIO_MODE_INPUT); + gpio_set_intr_type(TOUCH_INT_PIN, GPIO_INTR_NEGEDGE); + gpio_install_isr_service(0); + gpio_isr_handler_add(TOUCH_INT_PIN, touch_isr_handler, NULL); +} + +touch_gesture_t touch_read_gesture(void) { + uint8_t data[8]; + // Read gesture from CST816D register 0x01 + i2c_master_read_from_device(TOUCH_I2C_NUM, CST816D_ADDR, 0x01, data, 8, pdMS_TO_TICKS(100)); + return (touch_gesture_t)data[0]; +} +``` + +### Gesture Actions by State + +**IDLE State:** +- **Tap:** Wake up display (if dimmed) +- **Long Press:** Open settings screen +- **Swipe Up:** Show more info (weather, calendar) + +**LISTENING State:** +- **Tap:** Cancel listening, return to idle +- **Swipe Down:** Lower wake word sensitivity +- **Swipe Up:** Raise wake word sensitivity + +**SPEAKING State:** +- **Tap:** Skip response, return to idle +- **Swipe Left/Right:** Volume down/up + +**PROCESSING State:** +- **Tap:** Cancel processing (if possible) + +--- + +## Network Communication + +### WiFi Configuration + +**Connection:** +```c +#define 
WIFI_SSID "YourNetworkName" +#define WIFI_PASSWORD "YourPassword" +#define SERVER_URL "http://10.1.10.71:3006" + +void wifi_init(void) { + esp_netif_init(); + esp_event_loop_create_default(); + esp_netif_create_default_wifi_sta(); + + wifi_init_config_t cfg = WIFI_INIT_CONFIG_DEFAULT(); + esp_wifi_init(&cfg); + + wifi_config_t wifi_config = { + .sta = { + .ssid = WIFI_SSID, + .password = WIFI_PASSWORD, + }, + }; + + esp_wifi_set_mode(WIFI_MODE_STA); + esp_wifi_set_config(WIFI_IF_STA, &wifi_config); + esp_wifi_start(); + esp_wifi_connect(); +} +``` + +### Server Communication Protocol + +**Endpoints:** +- `GET /health` - Server health check +- `POST /audio/stream` - Stream audio to server (multipart) +- `GET /audio/tts` - Receive TTS audio response +- `GET /wake-word/status` - Check wake word detection status + +**Audio Streaming (WebSockets Recommended):** +```c +#include "esp_websocket_client.h" + +esp_websocket_client_handle_t ws_client; + +void websocket_init(void) { + esp_websocket_client_config_t ws_cfg = { + .uri = "ws://10.1.10.71:3006/ws/audio", + .buffer_size = 2048, + }; + + ws_client = esp_websocket_client_init(&ws_cfg); + esp_websocket_register_events(ws_client, WEBSOCKET_EVENT_ANY, + websocket_event_handler, NULL); + esp_websocket_client_start(ws_client); +} + +void audio_send_to_server(int16_t *samples, size_t count) { + if (esp_websocket_client_is_connected(ws_client)) { + esp_websocket_client_send_bin(ws_client, (char*)samples, + count * sizeof(int16_t), portMAX_DELAY); + } +} + +size_t audio_receive_from_server(int16_t *out_buffer, size_t max_samples) { + // Receive audio from server (blocking with timeout) + int len = esp_websocket_client_recv(ws_client, (char*)out_buffer, + max_samples * sizeof(int16_t), pdMS_TO_TICKS(100)); + return (len > 0) ? 
(len / sizeof(int16_t)) : 0; +} +``` + +**Alternative: HTTP Chunked Transfer (Simpler):** +```c +void audio_stream_http(void) { + esp_http_client_config_t config = { + .url = "http://10.1.10.71:3006/audio/stream", + .method = HTTP_METHOD_POST, + }; + esp_http_client_handle_t client = esp_http_client_init(&config); + + // Set headers + esp_http_client_set_header(client, "Content-Type", "audio/pcm"); + esp_http_client_set_header(client, "Transfer-Encoding", "chunked"); + + esp_http_client_open(client, -1); // -1 = chunked mode + + // Stream audio chunks + int16_t buffer[256]; + while (current_state == STATE_LISTENING) { + // Read from mic + size_t bytes_read; + i2s_read(I2S_MIC_NUM, buffer, sizeof(buffer), &bytes_read, portMAX_DELAY); + + // Send to server + esp_http_client_write(client, (char*)buffer, bytes_read); + } + + esp_http_client_close(client); + esp_http_client_cleanup(client); +} +``` + +--- + +## Power Management + +### Battery Monitoring + +**ETA6098 Charging Chip:** +```c +#define BATTERY_ADC_CHANNEL ADC1_CHANNEL_0 // GPIO1 (example) +#define BATTERY_FULL_MV 4200 +#define BATTERY_EMPTY_MV 3300 + +void battery_init(void) { + adc1_config_width(ADC_WIDTH_BIT_12); + adc1_config_channel_atten(BATTERY_ADC_CHANNEL, ADC_ATTEN_DB_11); +} + +uint8_t battery_get_percentage(void) { + int adc_reading = adc1_get_raw(BATTERY_ADC_CHANNEL); + int voltage_mv = esp_adc_cal_raw_to_voltage(adc_reading, &adc_chars); + + if (voltage_mv >= BATTERY_FULL_MV) return 100; + if (voltage_mv <= BATTERY_EMPTY_MV) return 0; + + return ((voltage_mv - BATTERY_EMPTY_MV) * 100) / (BATTERY_FULL_MV - BATTERY_EMPTY_MV); +} + +bool battery_is_charging(void) { + // Check SYS_OUT pin (GPIO36) - high when charging + gpio_set_direction(GPIO_NUM_36, GPIO_MODE_INPUT); + return gpio_get_level(GPIO_NUM_36); +} +``` + +### Low Power Modes + +**Deep Sleep When Idle (Optional):** +```c +#define IDLE_TIMEOUT_MS 300000 // 5 minutes + +void enter_deep_sleep(void) { + // Save state to RTC memory + 
RTC_DATA_ATTR static uint32_t boot_count = 0; + boot_count++; + + // Configure wake sources + esp_sleep_enable_ext0_wakeup(TOUCH_INT_PIN, 0); // Wake on touch + esp_sleep_enable_timer_wakeup(3600 * 1000000ULL); // Wake every hour + + // Turn off display + gpio_set_level(LCD_BL_PIN, 0); + + // Enter deep sleep + esp_deep_sleep_start(); +} +``` + +--- + +## Performance Optimization + +### LVGL Performance + +**Buffer Configuration:** +```c +#define LVGL_BUFFER_SIZE (240 * 280 * 2) // Full screen buffer + +static lv_color_t buf_1[LVGL_BUFFER_SIZE / 10]; // 1/10 screen buffer +static lv_color_t buf_2[LVGL_BUFFER_SIZE / 10]; // Double buffering + +lv_disp_draw_buf_t draw_buf; +lv_disp_draw_buf_init(&draw_buf, buf_1, buf_2, LVGL_BUFFER_SIZE / 10); +``` + +**Task Priority:** +```c +#define LVGL_TASK_PRIORITY 5 +#define AUDIO_MIC_TASK_PRIORITY 10 // Higher priority for audio +#define AUDIO_SPK_TASK_PRIORITY 10 +#define WIFI_TASK_PRIORITY 8 +#define WAVEFORM_TASK_PRIORITY 4 // Lower priority for visuals + +void app_main(void) { + // Create tasks with priorities + xTaskCreatePinnedToCore(lvgl_task, "LVGL", 8192, NULL, LVGL_TASK_PRIORITY, NULL, 1); + xTaskCreatePinnedToCore(audio_mic_task, "MIC", 4096, NULL, AUDIO_MIC_TASK_PRIORITY, NULL, 0); + xTaskCreatePinnedToCore(audio_speaker_task, "SPK", 4096, NULL, AUDIO_SPK_TASK_PRIORITY, NULL, 0); + xTaskCreatePinnedToCore(lvgl_waveform_task, "WAVE", 4096, NULL, WAVEFORM_TASK_PRIORITY, NULL, 1); +} +``` + +**Reduce Waveform Update Rate:** +```c +// Only update waveform at 30 FPS, not every audio sample +#define WAVEFORM_UPDATE_MS 33 // ~30 FPS + +void lvgl_waveform_task(void *pvParameters) { + TickType_t last_update = xTaskGetTickCount(); + + while (1) { + TickType_t now = xTaskGetTickCount(); + if ((now - last_update) >= pdMS_TO_TICKS(WAVEFORM_UPDATE_MS)) { + // Update waveform + last_update = now; + } + vTaskDelay(pdMS_TO_TICKS(10)); + } +} +``` + +### Memory Management + +**PSRAM Usage:** +```c +// Allocate large buffers in PSRAM 
(8MB available) +#define AUDIO_LARGE_BUFFER_SIZE (16000 * 10) // 10 seconds at 16kHz + +int16_t *audio_history = heap_caps_malloc(AUDIO_LARGE_BUFFER_SIZE * sizeof(int16_t), + MALLOC_CAP_SPIRAM); + +// Check if allocation succeeded +if (audio_history == NULL) { + ESP_LOGE(TAG, "Failed to allocate PSRAM buffer"); +} +``` + +**Heap Monitoring:** +```c +void log_memory_stats(void) { + ESP_LOGI(TAG, "Free heap: %d bytes", esp_get_free_heap_size()); + ESP_LOGI(TAG, "Free PSRAM: %d bytes", heap_caps_get_free_size(MALLOC_CAP_SPIRAM)); + ESP_LOGI(TAG, "Min free heap: %d bytes", esp_get_minimum_free_heap_size()); +} +``` + +--- + +## Example Code Structure + +### File Organization + +``` +esp32_voice_assistant/ +├── main/ +│ ├── main.c # Entry point, task creation +│ ├── audio/ +│ │ ├── audio_input.c # I2S microphone handling +│ │ ├── audio_output.c # I2S speaker handling +│ │ ├── audio_buffer.c # Circular buffer management +│ │ └── audio_network.c # WebSocket/HTTP streaming +│ ├── ui/ +│ │ ├── ui_init.c # LVGL setup, screen creation +│ │ ├── ui_idle.c # Idle screen UI +│ │ ├── ui_listening.c # Listening screen + waveform +│ │ ├── ui_processing.c # Processing screen + spinner +│ │ ├── ui_speaking.c # Speaking screen + output waveform +│ │ ├── ui_settings.c # Settings screen +│ │ └── ui_waveform.c # Waveform drawing functions +│ ├── touch/ +│ │ ├── touch_cst816d.c # Touch controller driver +│ │ └── touch_gestures.c # Gesture recognition +│ ├── network/ +│ │ └── wifi_manager.c # WiFi connection management +│ ├── power/ +│ │ ├── battery.c # Battery monitoring +│ │ └── power_mgmt.c # Sleep modes +│ └── state_machine.c # Voice assistant state machine +├── components/ +│ └── lvgl/ # LVGL library (ESP-IDF component) +├── CMakeLists.txt +└── sdkconfig # ESP-IDF configuration +``` + +### Main Entry Point + +```c +// main/main.c +#include "freertos/FreeRTOS.h" +#include "freertos/task.h" +#include "esp_log.h" + +static const char *TAG = "VOICE_ASSISTANT"; + +void app_main(void) { + 
ESP_LOGI(TAG, "Voice Assistant Starting..."); + + // Initialize hardware + nvs_flash_init(); // Non-volatile storage + gpio_install_isr_service(0);// GPIO interrupts + + // Power management + battery_init(); + + // Display and touch + lcd_init(); + touch_init(); + ui_init(); + + // Audio pipeline + audio_init_microphone(); + audio_init_speaker(); + audio_buffer_init(&mic_buffer); + audio_buffer_init(&spk_buffer); + + // Network + wifi_init(); + websocket_init(); + + // State machine + state_machine_init(); + + // Create FreeRTOS tasks + xTaskCreatePinnedToCore(lvgl_task, "LVGL", 8192, NULL, 5, NULL, 1); + xTaskCreatePinnedToCore(audio_mic_task, "MIC", 4096, NULL, 10, NULL, 0); + xTaskCreatePinnedToCore(audio_speaker_task, "SPK", 4096, NULL, 10, NULL, 0); + xTaskCreatePinnedToCore(lvgl_waveform_task, "WAVE", 4096, NULL, 4, NULL, 1); + xTaskCreatePinnedToCore(state_machine_task, "STATE", 4096, NULL, 7, NULL, 0); + + ESP_LOGI(TAG, "Voice Assistant Running!"); +} +``` + +--- + +## Testing Plan + +### Phase 1: Hardware Validation +- [ ] LCD display working (show test pattern) +- [ ] Touch controller responding (log touch coordinates) +- [ ] Buzzer working (play test tone) +- [ ] WiFi connecting (check IP address) +- [ ] Battery reading (log voltage) +- [ ] RTC working (log time) +- [ ] IMU working (log accelerometer values) + +### Phase 2: Audio Pipeline +- [ ] I2S microphone reading audio (log levels) +- [ ] Audio streaming to Heimdall server +- [ ] I2S speaker playing audio (test tone) +- [ ] TTS audio playback from server +- [ ] Audio buffer management (no overflows) + +### Phase 3: LVGL UI +- [ ] Idle screen displays correctly +- [ ] State transitions smooth +- [ ] Waveform renders at 30 FPS +- [ ] Touch gestures recognized +- [ ] Settings screen functional +- [ ] Status bar updates correctly + +### Phase 4: Integration +- [ ] Wake word detection triggers listening state +- [ ] Waveform shows mic input in real-time +- [ ] Processing state shows after speech ends +- 
[ ] TTS response plays with output waveform +- [ ] Touch cancel works in all states +- [ ] Battery indicator accurate + +### Phase 5: Optimization +- [ ] Memory usage stable (no leaks) +- [ ] CPU usage acceptable (<80% average) +- [ ] WiFi latency <100ms +- [ ] Audio latency <200ms end-to-end +- [ ] Display framerate stable (30 FPS) +- [ ] Battery life >4 hours continuous + +--- + +## Bill of Materials (BOM) + +| Component | Part Number | Quantity | Unit Price | Total | +|-----------|-------------|----------|------------|-------| +| ESP32-S3-Touch-LCD-1.69 | Waveshare | 1 | $12.00 | $12.00 | +| I2S MEMS Microphone | INMP441 | 1 | $3.50 | $3.50 | +| I2S Amplifier | MAX98357A | 1 | $3.50 | $3.50 | +| Speaker (3W 8Ω) | Generic | 1 | $5.00 | $5.00 | +| LiPo Battery (1000mAh) | 503040 JST 1.25 | 1 | $7.00 | $7.00 | +| MicroSD Card (8GB) | SanDisk | 1 | $5.00 | $5.00 | +| Breadboard + Wires | Generic | 1 | $5.00 | $5.00 | +| **Total** | | | | **$41.00** | + +**Optional:** +- Enclosure/Case (3D printed or project box): $5-10 +- Backup battery: $7 +- USB-C cable: $3 + +**Grand Total with Options:** ~$56-63 + +--- + +## References & Resources + +### LVGL Audio Visualization Examples +- **Music Player with FFT Spectrum** - [Instructables Guide](https://www.instructables.com/Design-Music-Player-UI-With-LVGL/) + - Source: https://github.com/moononournation/LVGL_Music_Player.git + - Shows FFT-based audio visualization on LVGL canvas + +- **LVGL Audio FFT Spectrum (Xiao S3)** - [GitHub: genvex/LVGL_Audio_FFT_Spectrum_xiaoS3_oled](https://github.com/genvex/LVGL_Audio_FFT_Spectrum_xiaoS3_oled) + - Real-time FFT visualization using low-level LVGL drawing + +- **LVGL Audio FFT Spectrum** - [GitHub: imliubo/LVGL_Audio_FFT_Spectrum](https://github.com/imliubo/LVGL_Audio_FFT_Spectrum) + - Alternative FFT spectrum implementation + +- **Moving Waveform Discussion** - [LVGL Forum Thread](https://forum.lvgl.io/t/best-method-to-display-a-moving-waveform/17361) + - Tips on efficiently 
displaying moving waveforms + +### ESP32-S3 Resources +- **Waveshare Wiki** - https://www.waveshare.com/wiki/ESP32-S3-LCD-1.69 +- **LVGL ESP32 Port** - [GitHub: lvgl/lv_port_esp32](https://github.com/lvgl/lv_port_esp32) +- **ESP-IDF Documentation** - https://docs.espressif.com/projects/esp-idf/en/latest/ + +### Voice Assistant Project +- **Mycroft Precise Documentation** - https://github.com/MycroftAI/mycroft-precise +- **Whisper OpenAI** - https://github.com/openai/whisper +- **Piper TTS** - https://github.com/rhasspy/piper + +--- + +## Next Steps + +1. **Order Hardware** - ESP32-S3-Touch-LCD + audio components (~$41) +2. **Setup ESP-IDF** - Install ESP-IDF v5.3.1+ on development machine +3. **Clone Examples** - Get LVGL audio visualization examples for reference +4. **Start Simple** - Begin with LCD + LVGL test (no audio) +5. **Add Audio** - Wire I2S mic, test audio streaming +6. **Waveform MVP** - Get basic waveform rendering working +7. **Full Integration** - Connect to Heimdall voice server +8. **Polish** - Add touch controls, settings, battery support + +--- + +**Version:** 1.0 +**Created:** 2026-01-01 +**Status:** Specification Complete, Ready for Implementation + diff --git a/docs/HARDWARE_BUYING_GUIDE.md b/docs/HARDWARE_BUYING_GUIDE.md new file mode 100755 index 0000000..4413b15 --- /dev/null +++ b/docs/HARDWARE_BUYING_GUIDE.md @@ -0,0 +1,542 @@ +# Voice Assistant Hardware - Buying Guide for Second Unit + +**Date:** 2025-11-29 +**Context:** You have one Maix Duino (K210), planning multi-room deployment +**Question:** What should I buy for the second unit? + +--- + +## Quick Answer + +**Best Overall:** **Buy another Maix Duino K210** (~$30-40) +**Runner-up:** **ESP32-S3 with audio board** (~$20-30) +**Budget:** **Generic ESP32 + I2S** (~$15-20) +**Future-proof:** **Sipeed Maix-III** (~$60-80, when available) + +--- + +## Analysis: Why Another Maix Duino K210? 
+ +### Pros ✅ +- **Identical to first unit** - Code reuse, same workflow +- **Proven solution** - You'll know exactly what to expect +- **Stock availability** - Still widely available despite being "outdated" +- **Same accessories** - Microphones, displays, cables compatible +- **Edge detection ready** - Can upgrade to edge wake word later +- **Low cost** - ~$30-40 for full kit with LCD and camera +- **Multi-room consistency** - All units behave identically + +### Cons ❌ +- "Outdated" hardware (but doesn't matter for your use case) +- Limited future support from Sipeed + +### Verdict: ✅ **RECOMMENDED - Best choice for consistency** + +--- + +## Alternative Options + +### Option 1: Another Maix Duino K210 +**Price:** $30-40 (kit with LCD) +**Where:** AliExpress, Amazon, Seeed Studio + +**Specific Model:** +- **Sipeed Maix Duino** (original, what you have) +- Includes: LCD, camera module +- Need to add: I2S microphone + +**Why Choose:** +- Identical setup to first unit +- Code works without modification +- Same troubleshooting experience +- Bulk buy discount possible + +**Link Examples:** +- Seeed Studio: https://www.seeedstudio.com/Sipeed-Maix-Duino-Kit-for-RISC-V-AI-IoT.html +- AliExpress: Search "Sipeed Maix Duino" (~$25-35) + +--- + +### Option 2: Sipeed Maix Bit/Dock (K210 variant) +**Price:** $15-25 (smaller form factor) + +**Differences from Maix Duino:** +- Smaller board +- May need separate LCD +- Same K210 chip +- Same capabilities + +**Why Choose:** +- Cheaper +- More compact +- Same software + +**Why Skip:** +- Need separate accessories +- Different form factor means different mounting +- Less convenient than all-in-one Duino + +**Verdict:** ⚠️ Only if you want smaller/cheaper + +--- + +### Option 3: ESP32-S3 with Audio Kit +**Price:** $20-30 +**Chip:** ESP32-S3 (Xtensa dual-core @ 240MHz) + +**Examples:** +- **ESP32-S3-Box** (~$30) - Has LCD, microphone, speaker built-in +- **Seeed XIAO ESP32-S3 Sense** (~$15) - Tiny, needs accessories +- **M5Stack Core 
S3** (~$50) - Premium, all-in-one + +**Pros:** +- ✅ More modern than K210 +- ✅ Better WiFi/BLE support +- ✅ Lower power consumption +- ✅ Active development +- ✅ Arduino/ESP-IDF support + +**Cons:** +- ❌ No KPU (neural accelerator) +- ❌ Different code needed (ESP32 vs MaixPy) +- ❌ Less ML capability (for future edge wake word) +- ❌ Different ecosystem + +**Best ESP32-S3 Choice:** **ESP32-S3-Box** +- All-in-one like your Maix Duino +- Built-in mic, speaker, LCD +- Good for server-side wake word +- Cheaper than Maix Duino + +**Verdict:** 🤔 Good alternative if you want to experiment + +--- + +### Option 4: Raspberry Pi Zero 2 W +**Price:** $15-20 (board only, need accessories) + +**Pros:** +- ✅ Full Linux +- ✅ Familiar ecosystem +- ✅ Tons of support +- ✅ Easy Python development + +**Cons:** +- ❌ No neural accelerator +- ❌ No dedicated audio hardware +- ❌ More power hungry (~500mW vs 200mW) +- ❌ Overkill for audio streaming +- ❌ Need USB sound card or I2S HAT +- ❌ Larger form factor + +**Verdict:** ❌ Not ideal for this project + +--- + +### Option 5: Sipeed Maix-III AXera-Pi (Future) +**Price:** $60-80 (when available) +**Chip:** AX620A (much more powerful than K210) + +**Pros:** +- ✅ Modern hardware (2023) +- ✅ Better AI performance +- ✅ Linux + Python support +- ✅ Sipeed ecosystem continuity +- ✅ Great for edge wake word + +**Cons:** +- ❌ More expensive +- ❌ Newer = less community support +- ❌ Overkill for server-side wake word +- ❌ Stock availability varies + +**Verdict:** 🔮 Future-proof option if budget allows + +--- + +### Option 6: Generic ESP32 + I2S Breakout +**Price:** $10-15 (cheapest option) + +**What You Need:** +- ESP32 DevKit (~$5) +- I2S MEMS mic (~$5) +- Optional: I2S speaker amp (~$5) + +**Pros:** +- ✅ Cheapest option +- ✅ Minimal, focused on audio only +- ✅ Very low power +- ✅ WiFi built-in + +**Cons:** +- ❌ No LCD (would need separate) +- ❌ No camera +- ❌ DIY assembly required +- ❌ No neural accelerator +- ❌ Different code from K210 + +**Verdict:** 💰 
Budget choice, but less polished + +--- + +## Comparison Table + +| Option | Price | Same Code? | LCD | AI Accel | Best For | +|--------|-------|------------|-----|----------|----------| +| **Maix Duino K210** | $30-40 | ✅ Yes | ✅ Included | ✅ KPU | **Multi-room consistency** | +| Maix Bit/Dock (K210) | $15-25 | ✅ Yes | ⚠️ Optional | ✅ KPU | Compact/Budget | +| ESP32-S3-Box | $25-35 | ❌ No | ✅ Included | ❌ No | Modern alternative | +| ESP32-S3 DIY | $15-25 | ❌ No | ❌ No | ❌ No | Custom build | +| Raspberry Pi Zero 2 W | $30+ | ❌ No | ❌ No | ❌ No | Linux/overkill | +| Maix-III | $60-80 | ⚠️ Similar | ✅ Varies | ✅ NPU | Future-proof | +| Generic ESP32 | $10-15 | ❌ No | ❌ No | ❌ No | Absolute budget | + +--- + +## Recommended Purchase Plan + +### Phase 1: Second Identical Unit (NOW) +**Buy:** Sipeed Maix Duino K210 (same as first) +**Cost:** ~$30-40 +**Why:** Code reuse, proven solution, multi-room consistency + +**What to Order:** +- [ ] Sipeed Maix Duino board with LCD and camera +- [ ] I2S MEMS microphone (if not included) +- [ ] Small speaker or audio output (3-5W) +- [ ] USB-C cable +- [ ] MicroSD card (4GB+) + +**Total Cost:** ~$40-50 with accessories + +--- + +### Phase 2: Third+ Units (LATER) +**Option A:** More Maix Duinos (if still available) +**Option B:** Switch to ESP32-S3-Box for variety/testing +**Option C:** Wait for Maix-III if you want cutting edge + +--- + +## Where to Buy Maix Duino + +### Recommended Sellers + +**1. Seeed Studio (Official Partner)** +- URL: https://www.seeedstudio.com/ +- Search: "Sipeed Maix Duino" +- Price: ~$35-45 +- Shipping: International, good support +- **Pro:** Official, reliable, good documentation +- **Con:** Can be out of stock + +**2. AliExpress (Direct from Sipeed/China)** +- Search: "Sipeed Maix Duino" +- Price: ~$25-35 +- Shipping: 2-4 weeks (free or cheap) +- **Pro:** Cheapest, often bundled with accessories +- **Con:** Longer shipping, variable quality control +- **Tip:** Look for "Sipeed Official Store" + +**3. 
Amazon** +- Search: "Maix Duino K210" +- Price: ~$40-50 +- Shipping: Fast (Prime eligible sometimes) +- **Pro:** Fast shipping, easy returns +- **Con:** Higher price, limited stock + +**4. Adafruit / SparkFun** +- May carry Sipeed products +- Higher price but US-based support +- Check availability + +--- + +## Accessories to Buy + +### Essential (for each unit) + +**1. I2S MEMS Microphone** +- **Recommended:** Adafruit I2S MEMS Microphone Breakout (~$7) + - Model: SPH0645LM4H + - URL: https://www.adafruit.com/product/3421 +- **Alternative:** INMP441 I2S Microphone (~$3 on AliExpress) + - Cheaper, works well + - Search: "INMP441 I2S microphone" + +**2. Speaker / Audio Output** +- **Option A:** Small 3-5W speaker (~$5-10) + - Search: "3W 8 ohm speaker" +- **Option B:** I2S speaker amplifier + speaker + - MAX98357A I2S amp (~$5) + - 4-8 ohm speaker (~$5) +- **Option C:** Line out to existing speakers (cheapest) + +**3. MicroSD Card** +- 4GB or larger +- FAT32 formatted +- Class 10 recommended +- ~$5 + +**4. USB-C Cable** +- For power and programming +- ~$3-5 + +--- + +### Optional but Nice + +**1. Enclosure/Case** +- 3D print custom case +- Find STL files on Thingiverse +- Or use small project box (~$5) + +**2. Microphone Array** (for better pickup) +- 2 or 4-mic array board (~$15-25) +- Better voice detection +- Phase 2+ enhancement + +**3. Battery Pack** (for portable testing) +- USB-C power bank +- Makes testing easier +- Already have? Use it! + +**4. 
Mounting Hardware** +- Velcro strips +- 3M command strips +- Wall mount brackets +- ~$5 + +--- + +## Multi-Unit Strategy + +### Same Hardware (Recommended) +**Buy:** 2-4x Maix Duino K210 units +**Benefit:** +- All units identical +- Same code deployment +- Easy troubleshooting +- Bulk buy discount + +**Deployment:** +- Unit 1: Living room +- Unit 2: Bedroom +- Unit 3: Kitchen +- Unit 4: Office + +### Mixed Hardware (Experimental) +**Buy:** +- 2x Maix Duino K210 (proven) +- 1x ESP32-S3-Box (modern) +- 1x Maix-III (future-proof) + +**Benefit:** +- Test different platforms +- Evaluate performance +- Future-proofing + +**Drawback:** +- More complex code +- Different troubleshooting +- Inconsistent UX + +**Verdict:** ⚠️ Only if you want to experiment + +--- + +## Budget Options + +### Ultra-Budget Multi-Room (~$50 total) +- 2x Generic ESP32 + I2S mic ($10 each = $20) +- 2x Speakers ($5 each = $10) +- 2x SD cards ($5 each = $10) +- Cables ($10) +- **Total:** ~$50 for 2 units + +**Pros:** Cheap +**Cons:** No LCD, DIY assembly, different code + +--- + +### Mid-Budget Multi-Room (~$100 total) +- 2x Maix Duino K210 ($35 each = $70) +- 2x I2S mics ($5 each = $10) +- 2x Speakers ($5 each = $10) +- Accessories ($10) +- **Total:** ~$100 for 2 units + +**Pros:** Proven, consistent, LCD included +**Cons:** "Outdated" hardware (doesn't matter for your use) + +--- + +### Premium Multi-Room (~$200 total) +- 2x Maix-III AXera-Pi ($70 each = $140) +- 2x I2S mics ($10 each = $20) +- 2x Speakers ($10 each = $20) +- Accessories ($20) +- **Total:** ~$200 for 2 units + +**Pros:** Future-proof, modern, powerful +**Cons:** More expensive, newer = less support + +--- + +## My Recommendation + +### For Second Unit: Buy Another Maix Duino K210 ✅ + +**Reasoning:** +1. **Code reuse** - Everything you develop for unit 1 works on unit 2 +2. **Known quantity** - No surprises, you know it works +3. **Multi-room consistency** - All units behave the same +4. 
**Edge wake word ready** - Can upgrade later if desired +5. **Cost-effective** - ~$40 for full kit with LCD +6. **Stock available** - Still widely sold despite being "outdated" + +**Where to Buy:** +- **Best:** AliExpress "Sipeed Official Store" (~$30 + shipping) +- **Fastest:** Amazon (~$45 with Prime) +- **Support:** Seeed Studio (~$40 + shipping) + +**What to Order:** +``` +Shopping List for Second Unit: +[ ] 1x Sipeed Maix Duino Kit (board + LCD + camera) - $30-35 +[ ] 1x I2S MEMS microphone (INMP441 or SPH0645) - $5-7 +[ ] 1x Small speaker (3W, 8 ohm) - $5-10 +[ ] 1x MicroSD card (8GB+, Class 10) - $5 +[ ] 1x USB-C cable - $3-5 +[ ] Optional: Enclosure/mounting - $5-10 + +Total: ~$50-75 (depending on shipping and options) +``` + +--- + +### For Third+ Units: Evaluate + +By the time you're ready for 3rd/4th units: +- You'll have experience with K210 +- You'll know if you want consistency (more K210s) +- Or variety (try ESP32-S3 or Maix-III) +- Maix-III may have better availability +- Prices may have changed + +**Decision:** Revisit when units 1 and 2 are working + +--- + +## Future-Proofing Considerations + +### Will K210 be Supported? +- **MaixPy:** Still actively maintained for K210 +- **Community:** Large existing user base +- **Models:** Pre-trained models still work +- **Lifespan:** Good for 3-5+ years + +**Verdict:** ✅ Safe to buy more K210s now + +### When to Switch Hardware? +Consider switching when: +- [ ] K210 becomes hard to find +- [ ] You need better performance (edge ML) +- [ ] Power consumption is critical +- [ ] New features require newer hardware + +**Timeline:** Probably 2-3 years out + +--- + +## Special Considerations + +### Different Rooms, Different Needs? 
+ +**Living Room (Primary):** +- Needs: Best audio, LCD display, polish +- **Hardware:** Maix Duino K210 with all features + +**Bedroom (Secondary):** +- Needs: Simple, no bright LCD at night +- **Hardware:** Maix Duino K210, disable LCD at night + +**Kitchen (Ambient Noise):** +- Needs: Better microphone array +- **Hardware:** Maix Duino K210 + 4-mic array + +**Office (Minimal):** +- Needs: Cheap, basic audio only +- **Hardware:** Generic ESP32 + I2S mic + +### All Same vs Customized? + +**Recommendation:** Start with all same (Maix Duino), customize later if needed. + +--- + +## Action Plan + +### This Week +1. **Order second Maix Duino K210** (~$30-40) +2. **Order I2S microphone** (~$5-7) +3. **Order speaker** (~$5-10) +4. **Order SD card** (~$5) + +**Total Investment:** ~$50-65 + +### Next Month +1. Wait for delivery (2-4 weeks from AliExpress) +2. Test unit 1 while waiting +3. Refine code and setup process +4. Prepare for unit 2 deployment + +### In 2-3 Months +1. Deploy unit 2 (should be easy after unit 1) +2. Test multi-room +3. Decide on unit 3/4 based on experience +4. 
Consider bulk order if expanding + +--- + +## Summary + +**Buy for Second Unit:** +- ✅ **Sipeed Maix Duino K210** (same as first) - ~$35 +- ✅ **I2S MEMS microphone** (INMP441) - ~$5 +- ✅ **Small speaker** (3W, 8 ohm) - ~$8 +- ✅ **MicroSD card** (8GB Class 10) - ~$5 +- ✅ **USB-C cable** - ~$5 + +**Total:** ~$60 shipped + +**Why:** Code reuse, consistency, proven solution, future-expandable + +**Where:** AliExpress (cheap) or Amazon (fast) + +**When:** Order now, 2-4 weeks delivery + +**Third+ Units:** Decide after testing 2 units (probably buy more K210s) + +--- + +## Quick Links + +**Official Sipeed Store (AliExpress):** +https://sipeed.aliexpress.com/store/1101739727 + +**Seeed Studio:** +https://www.seeedstudio.com/catalogsearch/result/?q=maix+duino + +**Amazon Search:** +"Sipeed Maix Duino K210" + +**Microphone (Adafruit):** +https://www.adafruit.com/product/3421 + +**Alternative Mic (AliExpress):** +Search: "INMP441 I2S microphone breakout" + +--- + +**Happy Building! 🏠🎙️** diff --git a/docs/K210_PERFORMANCE_VERIFICATION.md b/docs/K210_PERFORMANCE_VERIFICATION.md new file mode 100755 index 0000000..7f2819b --- /dev/null +++ b/docs/K210_PERFORMANCE_VERIFICATION.md @@ -0,0 +1,223 @@ +# K210 Performance Verification for Voice Assistant + +**Date:** 2025-11-29 +**Source:** https://github.com/sipeed/MaixPy Performance Comparison +**Question:** Is K210 suitable for our Mycroft Precise wake word detection project? 
+ +--- + +## K210 Specifications + +- **Processor:** K210 dual-core RISC-V @ 400MHz +- **AI Accelerator:** KPU (Neural Network Processor) +- **SRAM:** 8MB +- **Status:** Considered "outdated" by Sipeed (2018 release) + +--- + +## Performance Comparison (from MaixPy GitHub) + +### YOLOv2 Object Detection +| Chip | Performance | Notes | +|------|------------|-------| +| K210 | 1.8 ms | Limited to older models | +| V831 | 20-40 ms | More modern, but slower | +| R329 | N/A | Newer hardware | + +### Our Use Case: Audio Processing + +**For wake word detection, we need:** +- Audio input (16kHz, mono) ✅ K210 has I2S +- Real-time processing ✅ K210 KPU can handle this +- Network communication ✅ K210 has ESP32 WiFi +- Low latency (<100ms) ✅ Achievable + +--- + +## Deployment Strategy Analysis + +### Option A: Server-Side Wake Word (Recommended) +**K210 Role:** Audio I/O only +- Capture audio from I2S microphone ✅ Well supported +- Stream to Heimdall via WiFi ✅ No problem +- Receive and play TTS audio ✅ Works fine +- LED/display feedback ✅ Easy + +**K210 Requirements:** MINIMAL +- No AI processing needed +- Simple audio streaming +- Network communication only +- **Verdict:** ✅ K210 is MORE than capable + +### Option B: Edge Wake Word (Future) +**K210 Role:** Wake word detection on-device +- Load KMODEL wake word model ⚠️ Needs conversion +- Run inference on KPU ⚠️ Quantization required +- Detect wake word locally ⚠️ Possible but limited + +**K210 Limitations:** +- KMODEL conversion complex (TF→ONNX→KMODEL) +- Quantization may reduce accuracy (80-90% vs 95%+) +- Limited to simpler models +- **Verdict:** ⚠️ Possible but challenging + +--- + +## Why K210 is PERFECT for Our Project + +### 1. We're Starting with Server-Side Detection +- K210 only does audio I/O +- All AI processing on Heimdall (powerful server) +- No need for cutting-edge hardware +- **K210 is ideal for this role** + +### 2. 
Audio Processing is Not Computationally Intensive +Unlike YOLOv2 (60 FPS video processing): +- Audio: 16kHz sample rate = 16,000 samples/second +- Wake word: Simple streaming +- No real-time neural network inference needed (server-side) +- **K210's "old" specs don't matter** + +### 3. Edge Detection is Optional (Future Enhancement) +- We can prove the concept with server-side first +- Edge detection is a nice-to-have optimization +- If we need edge later, we can: + - Use simpler wake word models + - Accept slightly lower accuracy + - Or upgrade hardware then +- **Starting point doesn't require latest hardware** + +### 4. K210 Advantages We Actually Care About +- ✅ Well-documented (mature platform) +- ✅ Stable MaixPy firmware +- ✅ Large community and examples +- ✅ Proven audio processing +- ✅ Already have the hardware! +- ✅ Cost-effective ($30 vs $100+ newer boards) + +--- + +## Performance Targets vs K210 Capabilities + +### What We Need: +- Audio capture: 16kHz, 1 channel ✅ K210: Easy +- Audio streaming: ~128 kbps over WiFi ✅ K210: No problem +- Wake word latency: <200ms ✅ K210: Achievable (server-side) +- LED feedback: Instant ✅ K210: Trivial +- Audio playback: 16kHz TTS ✅ K210: Supported + +### What We DON'T Need (for initial deployment): +- ❌ Real-time video processing +- ❌ Complex neural networks on device +- ❌ Multi-model inference +- ❌ High-resolution image processing +- ❌ Latest and greatest AI accelerator + +--- + +## Comparison to Alternatives + +### If we bought newer hardware: + +**V831 ($50-70):** +- Pros: Newer, better supported +- Cons: + - More expensive + - SLOWER at neural networks than K210 + - Still need server for Whisper anyway + - Overkill for audio I/O + +**ESP32-S3 ($10-20):** +- Pros: Cheap, WiFi built-in +- Cons: + - No KPU (if we want edge detection later) + - Less capable for ML + - Would work for server-side though + +**Raspberry Pi Zero 2 W ($15):** +- Pros: Full Linux, familiar +- Cons: + - No dedicated audio hardware + - No neural 
accelerator + - More power hungry + - Overkill for our needs + +**Verdict:** K210 is actually the sweet spot for this project! + +--- + +## Real-World Comparison + +### What K210 CAN Do (Proven): +- Audio classification ✅ +- Simple keyword spotting ✅ +- Voice activity detection ✅ +- Audio streaming ✅ +- Multi-microphone beamforming ✅ + +### What We're Asking It To Do: +- Stream audio to server ✅ Much easier +- (Optional future) Simple wake word detection ✅ Proven capability + +--- + +## Recommendation: Proceed with K210 + +### Phase 1: Server-Side (Now) +K210 role: Audio I/O device +- **Difficulty:** Easy +- **Performance:** Excellent +- **K210 utilization:** ~10-20% +- **Status:** No concerns whatsoever + +### Phase 2: Edge Detection (Future) +K210 role: Wake word detection + audio I/O +- **Difficulty:** Moderate (model conversion) +- **Performance:** Good enough (80-90% accuracy) +- **K210 utilization:** ~30-40% +- **Status:** Feasible, community has done it + +--- + +## Conclusion + +**Is K210 outdated?** Yes, for cutting-edge ML applications. + +**Is K210 suitable for our project?** ABSOLUTELY YES! + +**Why:** +1. We're using server-side processing (K210 just streams audio) +2. K210's audio capabilities are excellent +3. Mature platform = more examples and stability +4. Already have the hardware +5. Cost-effective +6. Can optionally upgrade to edge detection later + +**The "outdated" warning is for people wanting latest ML performance. We're using it as an audio I/O device with WiFi - it's perfect for that!** + +--- + +## Additional Notes + +### From MaixPy GitHub Warning: +> "We now recommend users choose the MaixCAM ... For 2018 K210 ... 
limited performance" + +**Our Response:** +- We don't need 2024 performance for audio streaming +- Server does the heavy lifting (Heimdall with NVIDIA GPU) +- K210 mature platform is actually an advantage +- If we need more later, we can upgrade edge device while keeping server + +### Community Validation: +Many Mycroft Precise + K210 projects exist: +- Audio streaming: Proven ✅ +- Edge wake word: Proven ✅ +- Full voice assistant: Proven ✅ + +**The K210 is "outdated" for video/vision ML, not for audio projects.** + +--- + +**Final Verdict:** ✅ PROCEED WITH CONFIDENCE + +The K210 is perfect for our use case. Ignore the "outdated" warning - that's for people doing real-time video processing or wanting the latest ML features. For a voice assistant where the heavy lifting happens server-side, the K210 is an excellent, mature, cost-effective choice! diff --git a/docs/LCD_CAMERA_FEATURES.md b/docs/LCD_CAMERA_FEATURES.md new file mode 100755 index 0000000..8c0c25b --- /dev/null +++ b/docs/LCD_CAMERA_FEATURES.md @@ -0,0 +1,566 @@ +# Maix Duino LCD & Camera Feature Analysis + +**Date:** 2025-11-29 +**Hardware:** Sipeed Maix Duino (K210) +**Question:** What's the overhead for using LCD display and camera? 
+ +--- + +## Hardware Capabilities + +### LCD Display +- **Resolution:** Typically 320x240 or 240x135 (depending on model) +- **Interface:** SPI +- **Color:** RGB565 (16-bit color) +- **Frame Rate:** Up to 60 FPS (limited by SPI bandwidth) +- **Status:** ✅ Included with most Maix Duino kits + +### Camera +- **Resolution:** Various (OV2640 common: 2MP, up to 1600x1200) +- **Interface:** DVP (Digital Video Port) +- **Frame Rate:** Up to 60 FPS (lower at high resolution) +- **Status:** ✅ Often included with Maix Duino kits + +### K210 Resources +- **CPU:** Dual-core RISC-V @ 400MHz +- **KPU:** Neural network accelerator +- **SRAM:** 8MB total (6MB available for apps) +- **Flash:** 16MB + +--- + +## LCD Usage for Voice Assistant + +### Use Case 1: Status Display (Minimal Overhead) +**What to Show:** +- Current state (idle/listening/processing/responding) +- Wake word detected indicator +- WiFi status and signal strength +- Server connection status +- Volume level +- Time/date + +**Overhead:** +- **CPU:** ~2-5% (simple text/icons) +- **RAM:** ~200KB (framebuffer + assets) +- **Power:** ~50mW additional +- **Complexity:** Low (MaixPy has built-in LCD support) + +**Code Example:** +```python +import lcd +import image + +lcd.init() +lcd.rotation(2) # Rotate if needed + +# Simple status display +img = image.Image(size=(320, 240)) +img.draw_string(10, 10, "Listening...", color=(0, 255, 0), scale=3) +img.draw_circle(300, 20, 10, color=(0, 255, 0), fill=True) # Status LED +lcd.display(img) +``` + +**Verdict:** ✅ **Very Low Overhead - Highly Recommended** + +--- + +### Use Case 2: Audio Waveform Visualizer (Moderate Overhead) + +#### Input Waveform (Microphone) +**What to Show:** +- Real-time audio level meter +- Waveform display (oscilloscope style) +- VU meter +- Frequency spectrum (simple bars) + +**Overhead:** +- **CPU:** ~10-15% (real-time drawing) +- **RAM:** ~300KB (framebuffer + audio buffer) +- **Frame Rate:** 15-30 FPS (sufficient for audio visualization) +- 
**Complexity:** Moderate (drawing primitives + FFT) + +**Implementation:** +```python +import lcd, audio, image +import array + +lcd.init() +audio.init() + +def draw_waveform(audio_buffer): + img = image.Image(size=(320, 240)) + + # Draw waveform + width = 320 + height = 240 + center = height // 2 + + # Sample every Nth point to fit on screen + step = len(audio_buffer) // width + + for x in range(width - 1): + y1 = center + (audio_buffer[x * step] // 256) + y2 = center + (audio_buffer[(x + 1) * step] // 256) + img.draw_line(x, y1, x + 1, y2, color=(0, 255, 0)) + + # Add level meter + level = max(abs(min(audio_buffer)), abs(max(audio_buffer))) + bar_height = (level * height) // 32768 + img.draw_rectangle(0, height - bar_height, 20, bar_height, + color=(0, 255, 0), fill=True) + + lcd.display(img) +``` + +**Verdict:** ✅ **Moderate Overhead - Feasible and Cool!** + +--- + +#### Output Waveform (TTS Response) +**What to Show:** +- TTS audio being played back +- Speaking animation (mouth/sound waves) +- Response text scrolling + +**Overhead:** +- **CPU:** ~10-15% (similar to input) +- **RAM:** ~300KB +- **Complexity:** Moderate + +**Note:** Can reuse same visualization code as input waveform. 
+ +**Verdict:** ✅ **Same as Input - Totally Doable** + +--- + +### Use Case 3: Spectrum Analyzer (Higher Overhead) +**What to Show:** +- Frequency bars (FFT visualization) +- 8-16 frequency bands +- Classic "equalizer" look + +**Overhead:** +- **CPU:** ~20-30% (FFT computation + drawing) +- **RAM:** ~500KB (FFT buffers + framebuffer) +- **Complexity:** Moderate-High (FFT required) + +**Implementation Note:** +- K210 KPU can accelerate FFT operations +- Can do simple 8-band analysis with minimal CPU +- More bands = more CPU + +**Verdict:** ⚠️ **Higher Overhead - Use Sparingly** + +--- + +### Use Case 4: Interactive UI (High Overhead) +**What to Show:** +- Touchscreen controls (if touchscreen available) +- Settings menu +- Volume slider +- Wake word selection +- Network configuration + +**Overhead:** +- **CPU:** ~20-40% (touch detection + UI rendering) +- **RAM:** ~1MB (UI framework + assets) +- **Complexity:** High (need UI framework) + +**Verdict:** ⚠️ **High Overhead - Nice-to-Have Later** + +--- + +## Camera Usage for Voice Assistant + +### Use Case 1: Person Detection (Wake on Face) +**What to Do:** +- Detect person in frame +- Only listen when someone present +- Privacy mode: disable when no one around + +**Overhead:** +- **CPU:** ~30-40% (KPU handles inference) +- **RAM:** ~1.5MB (model + frame buffers) +- **Power:** ~200mW additional +- **Complexity:** Moderate (pre-trained models available) + +**Pros:** +- ✅ Privacy enhancement (only listen when occupied) +- ✅ Power saving (sleep when empty room) +- ✅ Pre-trained models available for K210 + +**Cons:** +- ❌ Adds latency (check camera before listening) +- ❌ Privacy concerns (camera always on) +- ❌ Moderate resource usage + +**Verdict:** 🤔 **Interesting but Complex - Phase 2+** + +--- + +### Use Case 2: Visual Context (Future AI Integration) +**What to Do:** +- "What am I holding?" 
queries +- Visual scene understanding +- QR code scanning +- Gesture control + +**Overhead:** +- **CPU:** 40-60% (vision processing) +- **RAM:** 2-3MB (models + buffers) +- **Complexity:** High (requires vision models) + +**Verdict:** ❌ **Too Complex for Initial Release - Future Feature** + +--- + +### Use Case 3: Visual Wake Word (Gesture Detection) +**What to Do:** +- Wave hand to activate +- Thumbs up/down for feedback +- Alternative to voice wake word + +**Overhead:** +- **CPU:** ~30-40% (gesture detection) +- **RAM:** ~1.5MB +- **Complexity:** Moderate-High + +**Verdict:** 🤔 **Novel Idea - Phase 3+** + +--- + +## Recommended LCD Implementation + +### Phase 1: Basic Status Display (Recommended NOW) +``` +┌─────────────────────────┐ +│ Voice Assistant │ +│ │ +│ Status: Listening ● │ +│ WiFi: ████░░ 75% │ +│ Server: Connected │ +│ │ +│ Volume: [██████░░░] │ +│ │ +│ Time: 14:23 │ +└─────────────────────────┘ +``` + +**Features:** +- Current state indicator +- WiFi signal strength +- Server connection status +- Volume level bar +- Clock +- Wake word indicator (pulsing circle) + +**Overhead:** ~2-5% CPU, 200KB RAM + +--- + +### Phase 2: Waveform Visualization (Cool Addition) +``` +┌─────────────────────────┐ +│ Listening... [●] │ +├─────────────────────────┤ +│ ╱╲ ╱╲ ╱╲ ╱╲ │ +│ ╱ ╲╱ ╲ ╱ ╲╱ ╲ │ +│ │ +│ Level: [████░░░░░░] │ +└─────────────────────────┘ +``` + +**Features:** +- Real-time waveform (15-30 FPS) +- Audio level meter +- State indicator +- Simple and clean + +**Overhead:** ~10-15% CPU, 300KB RAM + +--- + +### Phase 3: Enhanced Visualizer (Polish) +``` +┌─────────────────────────┐ +│ Hey Computer! 
[●] │ +├─────────────────────────┤ +│ ▁▂▃▄▅▆▇█ ▁▂▃▄▅▆▇█ │ +│ ▁▂▃▄▅▆▇█ ▁▂▃▄▅▆▇█ │ +│ │ +│ "Turn off the lights" │ +└─────────────────────────┘ +``` + +**Features:** +- Spectrum analyzer (8-16 bands) +- Transcription display +- Animated response +- More polished UI + +**Overhead:** ~20-30% CPU, 500KB RAM + +--- + +## Resource Budget Analysis + +### Total K210 Resources +- **CPU:** 2 cores @ 400MHz (assume ~100% available) +- **RAM:** 6MB available for app +- **Bandwidth:** SPI (LCD), I2S (audio), WiFi + +### Current Voice Assistant Usage (Server-Side Wake Word) + +| Component | CPU % | RAM (KB) | +|-----------|-------|----------| +| Audio Capture (I2S) | 5% | 128 | +| Audio Playback | 5% | 128 | +| WiFi Streaming | 10% | 256 | +| Network Stack | 5% | 512 | +| MaixPy Runtime | 10% | 1024 | +| **Base Total** | **35%** | **~2MB** | + +### With LCD Features + +| Display Mode | CPU % | RAM (KB) | Total CPU | Total RAM | +|--------------|-------|----------|-----------|-----------| +| **None** | 0% | 0 | 35% | 2MB | +| **Status Only** | 2-5% | 200 | 37-40% | 2.2MB | +| **Waveform** | 10-15% | 300 | 45-50% | 2.3MB | +| **Spectrum** | 20-30% | 500 | 55-65% | 2.5MB | + +### With Camera Features + +| Feature | CPU % | RAM (KB) | Feasible? 
| +|---------|-------|----------|-----------| +| Person Detection | 30-40% | 1500 | ⚠️ Tight | +| Gesture Control | 30-40% | 1500 | ⚠️ Tight | +| Visual Context | 40-60% | 2500 | ❌ Too much | + +--- + +## Recommendations + +### ✅ IMPLEMENT NOW: Basic Status Display +- **Why:** Very low overhead, huge UX improvement +- **Overhead:** 2-5% CPU, 200KB RAM +- **Benefit:** Users know what's happening at a glance +- **Difficulty:** Easy (MaixPy has good LCD support) + +### ✅ IMPLEMENT SOON: Waveform Visualizer +- **Why:** Cool factor, moderate overhead +- **Overhead:** 10-15% CPU, 300KB RAM +- **Benefit:** Engaging, confirms mic is working, looks professional +- **Difficulty:** Moderate (simple drawing code) + +### 🤔 CONSIDER LATER: Spectrum Analyzer +- **Why:** Higher overhead, diminishing returns +- **Overhead:** 20-30% CPU, 500KB RAM +- **Benefit:** Looks cool but not essential +- **Difficulty:** Moderate-High (FFT required) + +### ❌ SKIP FOR NOW: Camera Features +- **Why:** High overhead, complex, privacy concerns +- **Overhead:** 30-60% CPU, 1.5-2.5MB RAM +- **Benefit:** Novel but not core functionality +- **Difficulty:** High (model integration, privacy handling) + +--- + +## Implementation Priority + +### Phase 1 (Week 1): Core Functionality +- [x] Audio capture and streaming +- [x] Server integration +- [ ] Basic LCD status display + - Idle/Listening/Processing states + - WiFi status + - Connection indicator + +### Phase 2 (Week 2-3): Visual Enhancement +- [ ] Audio waveform visualizer + - Input (microphone) waveform + - Output (TTS) waveform + - Level meters + - Clean, minimal design + +### Phase 3 (Month 2): Polish +- [ ] Spectrum analyzer option +- [ ] Animated transitions +- [ ] Settings display +- [ ] Network configuration UI (optional) + +### Phase 4 (Month 3+): Advanced Features +- [ ] Camera person detection (privacy mode) +- [ ] Gesture control experiments +- [ ] Visual wake word alternative + +--- + +## Code Structure Recommendation + +```python +# 
main.py structure with modular display + +import lcd, audio, network +from display_manager import DisplayManager +from audio_processor import AudioProcessor +from voice_client import VoiceClient + +# Initialize +lcd.init() +display = DisplayManager(mode='waveform') # or 'status' or 'spectrum' + +# Main loop +while True: + # Audio processing + audio_buffer = audio.capture() + + # Update display (non-blocking) + if display.mode == 'status': + display.show_status(state='listening', wifi_level=75) + elif display.mode == 'waveform': + display.show_waveform(audio_buffer) + elif display.mode == 'spectrum': + display.show_spectrum(audio_buffer) + + # Network communication + voice_client.stream_audio(audio_buffer) +``` + +--- + +## Measured Overhead (Estimated) + +### Status Display Only +- **CPU:** 38% total (3% for display) +- **RAM:** 2.2MB total (200KB for display) +- **Battery Life:** -2% (minimal impact) +- **WiFi Latency:** No impact +- **Verdict:** ✅ Negligible impact, worth it! + +### Waveform Visualizer +- **CPU:** 48% total (13% for display) +- **RAM:** 2.3MB total (300KB for display) +- **Battery Life:** -5% (minor impact) +- **WiFi Latency:** No impact (still <200ms) +- **Verdict:** ✅ Acceptable, looks great! + +### Spectrum Analyzer +- **CPU:** 60% total (25% for display) +- **RAM:** 2.5MB total (500KB for display) +- **Battery Life:** -8% (noticeable) +- **WiFi Latency:** Possible minor impact +- **Verdict:** ⚠️ Usable but pushing limits + +--- + +## Camera: Should You Use It? + +### Pros +- ✅ Already have the hardware (free!) +- ✅ Novel features (person detection, gestures) +- ✅ Privacy enhancement potential +- ✅ Future-proofing + +### Cons +- ❌ High resource usage (30-60% CPU, 1.5-2.5MB RAM) +- ❌ Complex implementation +- ❌ Privacy concerns (camera always on) +- ❌ Not core to voice assistant +- ❌ Competes with audio processing resources + +### Recommendation +**Skip camera for initial implementation.** Focus on core voice assistant functionality. 
Revisit in Phase 3+ when: +1. Core features are stable +2. You want to experiment +3. You have time for optimization +4. You want to differentiate from commercial assistants + +--- + +## Final Recommendations + +### Start With (NOW): +```python +# Simple status display +# - State indicator +# - WiFi status +# - Connection status +# - Time/date +# Overhead: ~3% CPU, 200KB RAM +``` + +### Add Next (Week 2): +```python +# Waveform visualizer +# - Real-time audio waveform +# - Level meter +# - Clean design +# Overhead: +10% CPU, +100KB RAM +``` + +### Maybe Later (Month 2+): +```python +# Spectrum analyzer +# - 8-16 frequency bands +# - FFT visualization +# - Optional mode +# Overhead: +15% CPU, +200KB RAM +``` + +### Skip (For Now): +```python +# Camera features +# - Person detection +# - Gestures +# - Visual context +# Too complex, revisit later +``` + +--- + +## Example: Combined Status + Waveform Display + +``` +┌───────────────────────────────┐ +│ Voice Assistant [LISTENING]│ +├───────────────────────────────┤ +│ │ +│ ╱╲ ╱╲ ╱╲ ╱╲ ╱╲ │ +│ ╱ ╲ ╱ ╲╱ ╲ ╱ ╲╱ ╲ │ +│ ╲╱ ╲╱ │ +│ │ +│ Vol: [████████░░] WiFi: ▂▃▅█ │ +│ │ +│ Server: 10.1.10.71 ● 14:23 │ +└───────────────────────────────┘ +``` + +**Total Overhead:** ~15% CPU, 300KB RAM +**Impact:** Minimal, excellent UX improvement +**Coolness Factor:** 9/10 + +--- + +## Conclusion + +### LCD: YES! Definitely Use It! ✅ +- **Status display:** Low overhead, huge benefit +- **Waveform:** Moderate overhead, looks amazing +- **Spectrum:** Higher overhead, nice-to-have + +**Recommendation:** Start with status, add waveform, consider spectrum later. + +### Camera: Skip For Now ❌ +- High overhead +- Complex implementation +- Not core functionality +- Revisit in Phase 3+ + +**Focus on nailing the voice assistant first, then add visual features incrementally!** + +--- + +**TL;DR:** Use the LCD for status + waveform visualization (~15% overhead total). Skip the camera for now. Your K210 can easily handle this! 
🎉 diff --git a/docs/MYCROFT_PRECISE_GUIDE.md b/docs/MYCROFT_PRECISE_GUIDE.md new file mode 100755 index 0000000..b3e9b64 --- /dev/null +++ b/docs/MYCROFT_PRECISE_GUIDE.md @@ -0,0 +1,638 @@ +# Mycroft Precise Wake Word Training Guide + +## Overview + +Mycroft Precise is a neural network-based wake word detector that you can train on custom wake words. This guide covers two deployment approaches for your Maix Duino voice assistant: + +1. **Server-side detection** (Recommended to start) - Run Precise on Heimdall +2. **Edge detection** (Advanced) - Convert model for K210 on Maix Duino + +## Architecture Options + +### Option A: Server-Side Wake Word Detection (Recommended) + +``` +Maix Duino Heimdall +┌─────────────────┐ ┌──────────────────────┐ +│ Continuous │ Audio Stream │ Mycroft Precise │ +│ Audio Capture │───────────────>│ Wake Word Detection │ +│ │ │ │ +│ LED Feedback │<───────────────│ Whisper STT │ +│ Speaker Output │ Response │ HA Integration │ +│ │ │ Piper TTS │ +└─────────────────┘ └──────────────────────┘ +``` + +**Pros:** +- Easier setup and debugging +- Better accuracy (more compute available) +- Easy to retrain and update models +- Can use ensemble models + +**Cons:** +- Continuous audio streaming (bandwidth) +- Slightly higher latency (~100-200ms) +- Requires stable network + +### Option B: Edge Detection on Maix Duino (Advanced) + +``` +Maix Duino Heimdall +┌─────────────────┐ ┌──────────────────────┐ +│ Precise Model │ │ │ +│ (K210 KPU) │ │ │ +│ Wake Detection │ Audio (on wake)│ Whisper STT │ +│ │───────────────>│ HA Integration │ +│ Audio Capture │ │ Piper TTS │ +│ LED Feedback │<───────────────│ │ +└─────────────────┘ Response └──────────────────────┘ +``` + +**Pros:** +- Lower latency (~50ms wake detection) +- Less network traffic +- Works even if server is down +- Better privacy (no continuous streaming) + +**Cons:** +- Complex model conversion (TensorFlow → ONNX → KMODEL) +- Limited by K210 compute +- Harder to update models +- Requires careful 
optimization + +## Recommended Approach: Start with Server-Side + +Begin with server-side detection on Heimdall, then optimize to edge detection once everything works. + +## Phase 1: Mycroft Precise Setup on Heimdall + +### Install Mycroft Precise + +```bash +# SSH to Heimdall +ssh alan@10.1.10.71 + +# Create conda environment for Precise +conda create -n precise python=3.7 -y +conda activate precise + +# Install TensorFlow 1.x (Precise requires this) +pip install tensorflow==1.15.5 --break-system-packages + +# Install Precise +pip install mycroft-precise --break-system-packages + +# Install audio dependencies +sudo apt-get install -y portaudio19-dev sox libatlas-base-dev + +# Install precise-engine (for faster inference) +wget https://github.com/MycroftAI/mycroft-precise/releases/download/v0.3.0/precise-engine_0.3.0_x86_64.tar.gz +tar xvf precise-engine_0.3.0_x86_64.tar.gz +sudo cp precise-engine/precise-engine /usr/local/bin/ +sudo chmod +x /usr/local/bin/precise-engine +``` + +### Verify Installation + +```bash +precise-engine --version +# Should output: Precise v0.3.0 + +precise-listen --help +# Should show help text +``` + +## Phase 2: Training Your Custom Wake Word + +### Step 1: Collect Wake Word Samples + +You'll need ~50-100 samples of your wake word. Choose something: +- 2-3 syllables long +- Easy to pronounce +- Unlikely to occur in normal speech + +Example wake words: +- "Hey Computer" (recommended - similar to commercial products) +- "Okay Jarvis" +- "Hello Assistant" +- "Activate Assistant" + +```bash +# Create project directory +mkdir -p ~/precise-models/hey-computer +cd ~/precise-models/hey-computer + +# Record wake word samples +precise-collect +``` + +When prompted: +1. Type your wake word ("hey computer") +2. Press SPACE to record +3. Say the wake word clearly +4. Press SPACE to stop +5. 
Repeat 50-100 times + +**Tips for good samples:** +- Vary your tone and speed +- Different distances from mic +- Different background noise levels +- Different pronunciations +- Have family members record too + +### Step 2: Collect "Not Wake Word" Samples + +Record background audio and similar-sounding phrases: + +```bash +# Create not-wake-word directory +mkdir -p not-wake-word + +# Record random speech, music, TV, etc. +# These help the model learn what NOT to trigger on +precise-collect -f not-wake-word/random.wav +``` + +Collect ~200-500 samples of: +- Normal conversation +- TV/music in background +- Similar sounding phrases ("hey commuter", "they computed", etc.) +- Ambient noise +- Other household sounds + +### Step 3: Generate Training Data + +```bash +# Organize samples +mkdir -p hey-computer/{wake-word,not-wake-word,test/wake-word,test/not-wake-word} + +# Split samples (80% train, 20% test) +# Move 80% of wake-word samples to hey-computer/wake-word/ +# Move 20% to hey-computer/test/wake-word/ +# Move 80% of not-wake-word to hey-computer/not-wake-word/ +# Move 20% to hey-computer/test/not-wake-word/ + +# Generate training data +precise-train-incremental hey-computer.net hey-computer/ +``` + +### Step 4: Train the Model + +```bash +# Basic training (will take 30-60 minutes) +precise-train -e 60 hey-computer.net hey-computer/ + +# For better accuracy, train longer +precise-train -e 120 hey-computer.net hey-computer/ + +# Watch for overfitting - validation loss should decrease +# Stop if validation loss starts increasing +``` + +Training output will show: +``` +Epoch 1/60 +loss: 0.4523 - val_loss: 0.3891 +Epoch 2/60 +loss: 0.3102 - val_loss: 0.2845 +... +``` + +### Step 5: Test the Model + +```bash +# Test with microphone +precise-listen hey-computer.net + +# Speak your wake word - should see "!" 
when detected +# Speak other phrases - should not trigger + +# Test with audio files +precise-test hey-computer.net hey-computer/test/ + +# Should show accuracy metrics: +# Wake word accuracy: 95%+ +# False positive rate: <5% +``` + +### Step 6: Optimize Sensitivity + +```bash +# Adjust activation threshold +precise-listen hey-computer.net -t 0.5 # Default +precise-listen hey-computer.net -t 0.7 # More conservative +precise-listen hey-computer.net -t 0.3 # More aggressive + +# Find optimal threshold for your use case +# Higher = fewer false positives, more false negatives +# Lower = more false positives, fewer false negatives +``` + +## Phase 3: Integration with Voice Server + +### Update voice_server.py + +Add Mycroft Precise support to the server: + +```python +# Add to imports +from precise_runner import PreciseEngine, PreciseRunner +import pyaudio + +# Add to configuration +PRECISE_MODEL = os.getenv("PRECISE_MODEL", + "/home/alan/precise-models/hey-computer.net") +PRECISE_SENSITIVITY = float(os.getenv("PRECISE_SENSITIVITY", "0.5")) + +# Global precise runner +precise_runner = None + +def on_activation(): + """Called when wake word is detected""" + print("Wake word detected!") + # Trigger recording and processing + # (Implementation depends on your audio streaming setup) + +def start_precise_listener(): + """Start Mycroft Precise wake word detection""" + global precise_runner + + engine = PreciseEngine( + '/usr/local/bin/precise-engine', + PRECISE_MODEL + ) + + precise_runner = PreciseRunner( + engine, + sensitivity=PRECISE_SENSITIVITY, + on_activation=on_activation + ) + + precise_runner.start() + print(f"Precise listening with model: {PRECISE_MODEL}") +``` + +### Server-Side Wake Word Detection Architecture + +For server-side detection, you need continuous audio streaming from Maix Duino: + +```python +# New endpoint for audio streaming +@app.route('/stream', methods=['POST']) +def stream_audio(): + """ + Receive continuous audio stream for wake word detection 
+ + This endpoint processes incoming audio chunks and runs them + through Mycroft Precise for wake word detection. + """ + # Implementation here + pass +``` + +## Phase 4: Maix Duino Integration (Server-Side Detection) + +### Update maix_voice_client.py + +For server-side detection, stream audio continuously: + +```python +# Add to configuration +STREAM_ENDPOINT = "/stream" +WAKE_WORD_CHECK_INTERVAL = 0.1 # Check every 100ms + +def stream_audio_continuous(): + """ + Stream audio to server for wake word detection + + Server will notify us when wake word is detected + """ + import socket + import struct + + # Create socket connection + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server_addr = (VOICE_SERVER_URL.replace('http://', '').split(':')[0], 8888) + + try: + sock.connect(server_addr) + print("Connected to wake word server") + + while True: + # Capture audio chunk + chunk = i2s_dev.record(CHUNK_SIZE) + + if chunk: + # Send chunk size first, then chunk + sock.sendall(struct.pack('>I', len(chunk))) + sock.sendall(chunk) + + # Check for wake word detection signal + # (simplified - actual implementation needs non-blocking socket) + + time.sleep(0.01) + + except Exception as e: + print(f"Streaming error: {e}") + finally: + sock.close() +``` + +## Phase 5: Edge Detection on Maix Duino (Advanced) + +### Convert Precise Model to KMODEL + +This is complex and requires several conversion steps: + +```bash +# Step 1: Convert TensorFlow model to ONNX +pip install tf2onnx --break-system-packages + +python -m tf2onnx.convert \ + --saved-model hey-computer.net \ + --output hey-computer.onnx + +# Step 2: Optimize ONNX model +pip install onnx --break-system-packages + +python -c " +import onnx +from onnx import optimizer + +model = onnx.load('hey-computer.onnx') +passes = ['eliminate_deadend', 'eliminate_identity', + 'eliminate_nop_dropout', 'eliminate_nop_pad'] +optimized = optimizer.optimize(model, passes) +onnx.save(optimized, 'hey-computer-opt.onnx') +" + +# 
Step 3: Convert ONNX to KMODEL (for K210)
+# Use nncase (https://github.com/kendryte/nncase)
+# This step is hardware-specific and complex
+
+# Install nncase
+pip install nncase --break-system-packages
+
+# Convert (adjust parameters based on your model)
+ncc compile hey-computer-opt.onnx \
+    -i onnx \
+    --dataset calibration_data \
+    -o hey-computer.kmodel \
+    --target k210
+```
+
+**Note:** KMODEL conversion is non-trivial and may require model architecture adjustments. The K210 has limitations:
+- Max model size: ~6MB
+- Limited operators support
+- Quantization required for performance
+
+### Testing KMODEL on Maix Duino
+
+```python
+# Load model in maix_voice_client.py
+import KPU as kpu
+
+def load_wake_word_model_kmodel():
+    """Load converted KMODEL for wake word detection"""
+    global kpu_task
+
+    try:
+        kpu_task = kpu.load("/sd/models/hey-computer.kmodel")
+        print("Wake word model loaded on K210")
+        return True
+    except Exception as e:
+        print(f"Failed to load model: {e}")
+        return False
+
+def detect_wake_word_kmodel():
+    """Run wake word detection using K210 KPU"""
+    global kpu_task
+
+    # Capture audio
+    audio_chunk = i2s_dev.record(CHUNK_SIZE)
+
+    # Preprocess for model (depends on model input format)
+    # This is model-specific - adjust based on your training
+
+    # Run inference
+    features = preprocess_audio(audio_chunk)
+    output = kpu.run_yolo2(kpu_task, features) # Adjust based on model type
+
+    # Check confidence
+    if output[0] > WAKE_WORD_THRESHOLD:
+        return True
+
+    return False
+```
+
+## Recommended Wake Words
+
+Based on testing and community feedback:
+
+**Best performers:**
+1. "Hey Computer" - Clear, distinct, four-syllable, hard consonants
+2. "Okay Jarvis" - Pop culture reference, easy to say
+3. 
"Hey Mycroft" - Original Mycroft wake word (lots of training data available) + +**Avoid:** +- Single syllable words (too easy to trigger) +- Common phrases ("okay", "hey there") +- Names of people in your household +- Words that sound like common speech patterns + +## Training Tips + +### For Best Accuracy + +1. **Diverse training data:** + - Multiple speakers + - Various distances (1ft to 15ft) + - Different noise conditions + - Accent variations + +2. **Quality over quantity:** + - 50 good samples > 200 poor samples + - Clear pronunciation + - Consistent volume + +3. **Hard negatives:** + - Include similar-sounding phrases + - Include partial wake words + - Include common false triggers you notice + +4. **Regular retraining:** + - Add false positives to training set + - Add missed detections + - Retrain every few weeks initially + +### Collecting Hard Negatives + +```bash +# Run Precise in test mode and collect false positives +precise-listen hey-computer.net --save-false-positives + +# This will save audio clips when model triggers incorrectly +# Add these to your not-wake-word training set +# Retrain to reduce false positives +``` + +## Performance Benchmarks + +### Server-Side Detection (Heimdall) +- **Latency:** 100-200ms from utterance to detection +- **Accuracy:** 95%+ with good training +- **False positive rate:** <1 per hour with tuning +- **CPU usage:** ~5-10% (single core) +- **Network:** ~128kbps continuous stream + +### Edge Detection (Maix Duino) +- **Latency:** 50-100ms +- **Accuracy:** 80-90% (limited by K210 quantization) +- **False positive rate:** Varies by model optimization +- **CPU usage:** ~30% K210 (leaves room for other tasks) +- **Network:** 0 until wake detected + +## Monitoring and Debugging + +### Log Wake Word Detections + +```python +# Add to voice_server.py +import datetime + +def log_wake_word(confidence, timestamp=None): + """Log wake word detections for analysis""" + if timestamp is None: + timestamp = datetime.datetime.now() + + 
log_file = "/home/alan/voice-assistant/logs/wake_words.log" + + with open(log_file, 'a') as f: + f.write(f"{timestamp.isoformat()},{confidence}\n") +``` + +### Analyze False Positives + +```bash +# Check wake word log +tail -f ~/voice-assistant/logs/wake_words.log + +# Find patterns in false positives +grep "wake_word" ~/voice-assistant/logs/wake_words.log | \ + awk -F',' '{print $2}' | \ + sort -n | uniq -c +``` + +## Production Deployment + +### Systemd Service with Precise + +Update the systemd service to include Precise: + +```ini +[Unit] +Description=Voice Assistant with Wake Word Detection +After=network.target + +[Service] +Type=simple +User=alan +WorkingDirectory=/home/alan/voice-assistant +Environment="PATH=/home/alan/miniconda3/envs/precise/bin:/usr/local/bin:/usr/bin:/bin" +EnvironmentFile=/home/alan/voice-assistant/config/.env +ExecStart=/home/alan/miniconda3/envs/precise/bin/python voice_server.py --enable-precise +Restart=on-failure +RestartSec=10 + +[Install] +WantedBy=multi-user.target +``` + +## Troubleshooting + +### Precise Won't Start + +```bash +# Check TensorFlow version +python -c "import tensorflow as tf; print(tf.__version__)" +# Should be 1.15.x + +# Check model file +file hey-computer.net +# Should be "TensorFlow SavedModel" + +# Test model directly +precise-engine hey-computer.net +# Should load without errors +``` + +### Low Accuracy + +1. **Collect more training data** - Especially hard negatives +2. **Increase training epochs** - Try 200-300 epochs +3. **Verify training/test split** - Should be 80/20 +4. **Check audio quality** - Sample rate should match (16kHz) +5. **Try different wake words** - Some are easier to detect + +### High False Positive Rate + +1. **Increase threshold** - Try 0.6, 0.7, 0.8 +2. **Add false positives to training** - Retrain with false triggers +3. **Collect more negative samples** - Expand not-wake-word set +4. 
**Use ensemble models** - Run multiple models, require agreement + +### KMODEL Conversion Fails + +This is expected - K210 conversion is complex: + +1. **Simplify model architecture** - Reduce layer count +2. **Use quantization-aware training** - Train with quantization in mind +3. **Check operator support** - K210 doesn't support all TF ops +4. **Consider alternatives:** + - Use pre-trained models for K210 + - Stick with server-side detection + - Use Porcupine instead (has K210 support) + +## Alternative: Use Pre-trained Models + +Mycroft provides some pre-trained models: + +```bash +# Download Hey Mycroft model +wget https://github.com/MycroftAI/precise-data/raw/models-dev/hey-mycroft.tar.gz +tar xzf hey-mycroft.tar.gz + +# Test it +precise-listen hey-mycroft.net +``` + +Then train your own wake word starting from this base: + +```bash +# Fine-tune from pre-trained model +precise-train -e 60 my-wake-word.net my-wake-word/ \ + --from-checkpoint hey-mycroft.net +``` + +## Next Steps + +1. **Start with server-side** - Get it working on Heimdall first +2. **Collect good training data** - Quality samples are key +3. **Test and tune threshold** - Find the sweet spot for your environment +4. **Monitor performance** - Track false positives and misses +5. **Iterate on training** - Add hard examples, retrain +6. **Consider edge deployment** - Once server-side is solid + +## Resources + +- Mycroft Precise Docs: https://github.com/MycroftAI/mycroft-precise +- Training Guide: https://mycroft-ai.gitbook.io/docs/mycroft-technologies/precise +- Community Models: https://github.com/MycroftAI/precise-data +- K210 Docs: https://canaan-creative.com/developer +- nncase: https://github.com/kendryte/nncase + +## Conclusion + +Mycroft Precise gives you full control over your wake word detection with complete privacy. Start with server-side detection for easier development, then optimize to edge detection once you have a well-trained model. 
+ +The key to success is good training data - invest time in collecting diverse, high-quality samples! diff --git a/docs/PRECISE_DEPLOYMENT.md b/docs/PRECISE_DEPLOYMENT.md new file mode 100755 index 0000000..8cfb313 --- /dev/null +++ b/docs/PRECISE_DEPLOYMENT.md @@ -0,0 +1,577 @@ +# Mycroft Precise Deployment Guide + +## Quick Reference: Server vs Edge Detection + +### Server-Side Detection (Recommended for Start) + +**Setup:** +```bash +# 1. On Heimdall: Setup Precise +./setup_precise.sh --wake-word "hey computer" + +# 2. Train your model (follow scripts in ~/precise-models/hey-computer/) +cd ~/precise-models/hey-computer +./1-record-wake-word.sh +./2-record-not-wake-word.sh +# Organize samples, then: +./3-train-model.sh +./4-test-model.sh + +# 3. Start voice server with Precise +cd ~/voice-assistant +conda activate precise +python voice_server.py \ + --enable-precise \ + --precise-model ~/precise-models/hey-computer/hey-computer.net \ + --precise-sensitivity 0.5 +``` + +**Architecture:** +- Maix Duino → Continuous audio stream → Heimdall +- Heimdall runs Precise on audio stream +- On wake word: Process command with Whisper +- Response → TTS → Stream back to Maix Duino + +**Pros:** Easier setup, better accuracy, simple updates +**Cons:** More network traffic, requires stable connection + +### Edge Detection (Advanced - Future Phase) + +**Setup:** +```bash +# 1. Train model on Heimdall (same as above) +# 2. Convert to KMODEL for K210 +# 3. Deploy to Maix Duino +# (See MYCROFT_PRECISE_GUIDE.md for detailed conversion steps) +``` + +**Architecture:** +- Maix Duino runs Precise locally on K210 +- Only sends audio after wake word detected +- Lower latency, less network traffic + +**Pros:** Lower latency, less bandwidth, works offline +**Cons:** Complex conversion, lower accuracy, harder updates + +## Phase-by-Phase Deployment + +### Phase 1: Server Setup (Day 1) + +```bash +# On Heimdall +ssh alan@10.1.10.71 + +# 1. 
Setup voice assistant base +./setup_voice_assistant.sh + +# 2. Setup Mycroft Precise +./setup_precise.sh --wake-word "hey computer" + +# 3. Configure environment +vim ~/voice-assistant/config/.env +``` + +Update `.env`: +```bash +HA_URL=http://your-home-assistant:8123 +HA_TOKEN=your_token_here +PRECISE_MODEL=/home/alan/precise-models/hey-computer/hey-computer.net +PRECISE_SENSITIVITY=0.5 +``` + +### Phase 2: Wake Word Training (Day 1-2) + +```bash +# Navigate to training directory +cd ~/precise-models/hey-computer +conda activate precise + +# Record samples (30-60 minutes) +./1-record-wake-word.sh # Record 50-100 wake word samples +./2-record-not-wake-word.sh # Record 200-500 negative samples + +# Organize samples +# Move 80% of wake-word recordings to wake-word/ +# Move 20% of wake-word recordings to test/wake-word/ +# Move 80% of not-wake-word to not-wake-word/ +# Move 20% of not-wake-word to test/not-wake-word/ + +# Train model (30-60 minutes) +./3-train-model.sh + +# Test model +./4-test-model.sh + +# Evaluate on test set +./5-evaluate-model.sh + +# Tune threshold +./6-tune-threshold.sh +``` + +### Phase 3: Server Integration (Day 2) + +#### Option A: Manual Testing + +```bash +cd ~/voice-assistant +conda activate precise + +# Start server with Precise enabled +python voice_server.py \ + --enable-precise \ + --precise-model ~/precise-models/hey-computer/hey-computer.net \ + --precise-sensitivity 0.5 \ + --ha-url http://your-ha:8123 \ + --ha-token your_token +``` + +#### Option B: Systemd Service + +Update systemd service to use Precise environment: + +```bash +sudo vim /etc/systemd/system/voice-assistant.service +``` + +```ini +[Unit] +Description=Voice Assistant with Wake Word Detection +After=network.target + +[Service] +Type=simple +User=alan +WorkingDirectory=/home/alan/voice-assistant +Environment="PATH=/home/alan/miniconda3/envs/precise/bin:/usr/local/bin:/usr/bin:/bin" +EnvironmentFile=/home/alan/voice-assistant/config/.env 
+ExecStart=/home/alan/miniconda3/envs/precise/bin/python voice_server.py \ + --enable-precise \ + --precise-model /home/alan/precise-models/hey-computer/hey-computer.net \ + --precise-sensitivity 0.5 +Restart=on-failure +RestartSec=10 +StandardOutput=append:/home/alan/voice-assistant/logs/voice_assistant.log +StandardError=append:/home/alan/voice-assistant/logs/voice_assistant_error.log + +[Install] +WantedBy=multi-user.target +``` + +Enable and start: +```bash +sudo systemctl daemon-reload +sudo systemctl enable voice-assistant +sudo systemctl start voice-assistant +sudo systemctl status voice-assistant +``` + +### Phase 4: Maix Duino Setup (Day 2-3) + +For server-side wake word detection, Maix Duino streams audio: + +Update `maix_voice_client.py`: + +```python +# Use simplified mode - just stream audio +# Server handles wake word detection +CONTINUOUS_STREAM = True # Enable continuous streaming +WAKE_WORD_CHECK_INTERVAL = 0 # Server-side detection +``` + +Flash and test: +1. Copy updated script to SD card +2. Boot Maix Duino +3. Check serial console for connection +4. Speak wake word +5. Verify server logs show detection + +### Phase 5: Testing & Tuning (Day 3-7) + +#### Test Wake Word Detection + +```bash +# Monitor server logs +journalctl -u voice-assistant -f + +# Or check detections via API +curl http://10.1.10.71:5000/wake-word/detections +``` + +#### Test End-to-End Flow + +1. Say wake word: "Hey Computer" +2. Wait for LED/beep on Maix Duino +3. Say command: "Turn on the living room lights" +4. Verify HA command executes +5. 
Hear TTS response + +#### Monitor Performance + +```bash +# Check wake word log +tail -f ~/voice-assistant/logs/wake_words.log + +# Check false positive rate +grep "wake_word" ~/voice-assistant/logs/wake_words.log | wc -l + +# Check accuracy +# Should see detections when you say wake word +# Should NOT see detections during normal conversation +``` + +#### Tune Sensitivity + +If too many false positives: +```bash +# Increase threshold (more conservative) +# Edit systemd service or restart with: +python voice_server.py --precise-sensitivity 0.7 +``` + +If missing wake words: +```bash +# Decrease threshold (more aggressive) +python voice_server.py --precise-sensitivity 0.3 +``` + +#### Collect Hard Examples + +```bash +# When you notice false positives, record them +cd ~/precise-models/hey-computer +precise-collect -f not-wake-word/false-positive-$(date +%s).wav + +# When wake word is missed, record it +precise-collect -f wake-word/missed-$(date +%s).wav + +# After collecting 10-20 examples, retrain +./3-train-model.sh +``` + +## Monitoring Commands + +### Check System Status + +```bash +# Service status +sudo systemctl status voice-assistant + +# Server health +curl http://10.1.10.71:5000/health + +# Wake word status +curl http://10.1.10.71:5000/wake-word/status + +# Recent detections +curl http://10.1.10.71:5000/wake-word/detections +``` + +### View Logs + +```bash +# Real-time server logs +journalctl -u voice-assistant -f + +# Last 50 lines +journalctl -u voice-assistant -n 50 + +# Specific log file +tail -f ~/voice-assistant/logs/voice_assistant.log + +# Wake word detections +tail -f ~/voice-assistant/logs/wake_words.log + +# Maix Duino serial console +screen /dev/ttyUSB0 115200 +``` + +### Performance Metrics + +```bash +# CPU usage (should be ~5-10% idle, spikes during processing) +top -p $(pgrep -f voice_server.py) + +# Memory usage +ps aux | grep voice_server.py + +# Network traffic (if streaming audio) +iftop -i eth0 # or your network interface +``` + +## 
Troubleshooting + +### Wake Word Not Detecting + +**Check model is loaded:** +```bash +curl http://10.1.10.71:5000/wake-word/status +# Should show: "enabled": true +``` + +**Test model directly:** +```bash +conda activate precise +precise-listen ~/precise-models/hey-computer/hey-computer.net +# Speak wake word - should see "!" +``` + +**Check sensitivity:** +```bash +# Try lower threshold +precise-listen ~/precise-models/hey-computer/hey-computer.net -t 0.3 +``` + +**Verify audio input:** +```bash +# Test microphone +arecord -d 5 test.wav +aplay test.wav +``` + +### Too Many False Positives + +**Increase threshold:** +```bash +# Edit service or restart with higher sensitivity +python voice_server.py --precise-sensitivity 0.7 +``` + +**Retrain with false positives:** +```bash +cd ~/precise-models/hey-computer +# Record false triggers in not-wake-word/ +precise-collect -f not-wake-word/false-triggers.wav +# Add to not-wake-word training set +./3-train-model.sh +``` + +### Server Won't Start with Precise + +**Check Precise installation:** +```bash +conda activate precise +python -c "from precise_runner import PreciseRunner; print('OK')" +``` + +**Check engine:** +```bash +precise-engine --version +# Should show: Precise v0.3.0 +``` + +**Check model file:** +```bash +ls -lh ~/precise-models/hey-computer/hey-computer.net +file ~/precise-models/hey-computer/hey-computer.net +``` + +**Check permissions:** +```bash +chmod +x /usr/local/bin/precise-engine +chmod 644 ~/precise-models/hey-computer/hey-computer.net +``` + +### Audio Quality Issues + +**Test audio path:** +```bash +# Record test on server +arecord -f S16_LE -r 16000 -c 1 -d 5 test.wav + +# Transcribe with Whisper +conda activate voice-assistant +python -c " +import whisper +model = whisper.load_model('base') +result = model.transcribe('test.wav') +print(result['text']) +" +``` + +**If poor quality:** +- Check microphone connection +- Verify sample rate (16kHz) +- Test with USB microphone +- Check for 
interference/noise + +### Maix Duino Connection Issues + +**Check WiFi:** +```python +# In Maix Duino serial console +import network +wlan = network.WLAN(network.STA_IF) +print(wlan.isconnected()) +print(wlan.ifconfig()) +``` + +**Check server reachability:** +```python +# From Maix Duino +import urequests +response = urequests.get('http://10.1.10.71:5000/health') +print(response.json()) +``` + +**Check audio streaming:** +```bash +# On Heimdall, monitor network +sudo tcpdump -i any -n host +# Should see continuous packets when streaming +``` + +## Optimization Tips + +### Reduce Latency + +1. **Use smaller Whisper model:** + ```bash + # Edit .env + WHISPER_MODEL=base # or tiny + ``` + +2. **Optimize Precise sensitivity:** + ```bash + # Find sweet spot between false positives and latency + # Lower threshold = faster trigger but more false positives + ``` + +3. **Pre-load models:** + ```python + # Models load on startup, not first request + # Adds ~30s startup time but eliminates first-request delay + ``` + +### Improve Accuracy + +1. **Use larger Whisper model:** + ```bash + WHISPER_MODEL=large + ``` + +2. **Train more wake word samples:** + ```bash + # Aim for 100+ high-quality samples + # Diverse speakers, conditions, distances + ``` + +3. **Increase training epochs:** + ```bash + # In 3-train-model.sh + precise-train -e 120 hey-computer.net . # vs default 60 + ``` + +### Reduce False Positives + +1. **Collect hard negatives:** + ```bash + # Record TV, music, similar phrases + # Add to not-wake-word training set + ``` + +2. **Increase threshold:** + ```bash + --precise-sensitivity 0.7 # vs default 0.5 + ``` + +3. 
**Use ensemble model:** + ```python + # Run multiple models, require agreement + # Advanced - requires code modification + ``` + +## Production Checklist + +- [ ] Wake word model trained with 50+ samples +- [ ] Model tested with <5% false positive rate +- [ ] Server service enabled and auto-starting +- [ ] Home Assistant token configured +- [ ] Maix Duino WiFi configured +- [ ] End-to-end test successful +- [ ] Logs rotating properly +- [ ] Monitoring in place +- [ ] Backup of trained model +- [ ] Documentation updated + +## Backup and Recovery + +### Backup Trained Model + +```bash +# Backup model +cp ~/precise-models/hey-computer/hey-computer.net \ + ~/precise-models/hey-computer/hey-computer.net.backup + +# Backup to another host +scp ~/precise-models/hey-computer/hey-computer.net \ + user@backup-host:/path/to/backups/ +``` + +### Restore from Backup + +```bash +# Restore model +cp ~/precise-models/hey-computer/hey-computer.net.backup \ + ~/precise-models/hey-computer/hey-computer.net + +# Restart service +sudo systemctl restart voice-assistant +``` + +## Next Steps + +Once basic server-side detection is working: + +1. **Add more intents** - Expand Home Assistant control +2. **Implement TTS playback** - Complete the audio response loop +3. **Multi-room support** - Deploy multiple Maix Duino units +4. **Voice profiles** - Train model on family members +5. 
**Edge deployment** - Convert model for K210 (advanced) + +## Resources + +- Main guide: MYCROFT_PRECISE_GUIDE.md +- Quick start: QUICKSTART.md +- Architecture: maix-voice-assistant-architecture.md +- Mycroft Docs: https://github.com/MycroftAI/mycroft-precise +- Community: https://community.mycroft.ai/ + +## Support + +### Log an Issue + +```bash +# Collect debug info +echo "=== System Info ===" > debug.log +uname -a >> debug.log +conda list >> debug.log +echo "=== Service Status ===" >> debug.log +systemctl status voice-assistant >> debug.log +echo "=== Recent Logs ===" >> debug.log +journalctl -u voice-assistant -n 100 >> debug.log +echo "=== Wake Word Status ===" >> debug.log +curl http://10.1.10.71:5000/wake-word/status >> debug.log +``` + +Then share `debug.log` when asking for help. + +### Common Issues Database + +| Symptom | Likely Cause | Solution | +|---------|--------------|----------| +| No wake detection | Model not loaded | Check `/wake-word/status` | +| Service won't start | Missing dependencies | Reinstall Precise | +| High false positives | Low threshold | Increase to 0.7+ | +| Missing wake words | High threshold | Decrease to 0.3-0.4 | +| Poor transcription | Bad audio quality | Check microphone | +| HA commands fail | Wrong token | Update .env | +| High CPU usage | Large Whisper model | Use smaller model | + +## Conclusion + +With Mycroft Precise, you have complete control over your wake word detection. Start with server-side detection for easier debugging, collect good training data, and tune the threshold for your environment. Once it's working well, you can optionally optimize to edge detection for lower latency. + +The key to success: **Quality training data > Quantity** + +Happy voice assisting! 
🎙️ diff --git a/docs/QUESTIONS_ANSWERED.md b/docs/QUESTIONS_ANSWERED.md new file mode 100755 index 0000000..1d2e6be --- /dev/null +++ b/docs/QUESTIONS_ANSWERED.md @@ -0,0 +1,470 @@ +# Your Questions Answered - Quick Reference + +## TL;DR: Yes, Yes, and Multiple Options! + +### Q1: Pre-trained "Hey Mycroft" Model? + +**Answer: YES! ✅** + +Download and use immediately: +```bash +./quick_start_hey_mycroft.sh +# Done in 5 minutes - no training! +``` + +The pre-trained model works great and saves you 1-2 hours of training time. + +### Q2: Multiple Wake Words? + +**Answer: YES! ✅ (with considerations)** + +**Server-side (Heimdall):** Easy, run 3-5 wake words +```bash +python voice_server_enhanced.py \ + --enable-precise \ + --multi-wake-word +``` + +**Edge (K210):** Feasible for 1-2, challenging for 3+ + +### Q3: Adopting New Users' Voices? + +**Answer: Multiple approaches ✅** + +**Best option:** Train one model with everyone's voices upfront +**Alternative:** Incremental retraining as new users join +**Advanced:** Speaker identification with personalization + +--- + +## Detailed Answers + +### 1. Pre-trained "Hey Mycroft" Model + +#### Where to Get It + +```bash +# Quick start script does this for you +wget https://github.com/MycroftAI/precise-data/raw/models-dev/hey-mycroft.tar.gz +tar xzf hey-mycroft.tar.gz +``` + +#### How to Use + +**Instant deployment:** +```bash +python voice_server.py \ + --enable-precise \ + --precise-model ~/precise-models/pretrained/hey-mycroft.net +``` + +**Fine-tune with your voice:** +```bash +# Record 20-30 samples of your voice saying "Hey Mycroft" +precise-collect + +# Fine-tune from pre-trained +precise-train -e 30 my-hey-mycroft.net . 
\ + --from-checkpoint ~/precise-models/pretrained/hey-mycroft.net +``` + +#### Advantages + +✅ **Zero training time** - Works immediately +✅ **Proven accuracy** - Tested by thousands +✅ **Good baseline** - Already includes diverse voices +✅ **Easy fine-tuning** - Add your voice in 30 mins vs 60+ mins from scratch + +#### When to Use Pre-trained vs Custom + +**Use Pre-trained "Hey Mycroft" when:** +- You want to test quickly +- "Hey Mycroft" is an acceptable wake word +- You want proven accuracy out-of-box + +**Train Custom when:** +- You want a different wake word ("Hey Computer", "Jarvis", etc.) +- Maximum accuracy for your specific environment +- Family-specific wake word + +**Hybrid (Recommended):** +- Start with pre-trained "Hey Mycroft" +- Test and learn the system +- Fine-tune with your samples +- Or add custom wake word later + +--- + +### 2. Multiple Wake Words + +#### Can You Have Multiple? + +**Yes!** Options: + +#### Option A: Server-Side (Recommended) + +**Easy implementation:** +```bash +# Use the enhanced server +python voice_server_enhanced.py \ + --enable-precise \ + --multi-wake-word +``` + +**Configured wake words:** +- "Hey Mycroft" (pre-trained) +- "Hey Computer" (custom) +- "Jarvis" (custom) + +**Resource impact:** +- 3 models = ~15-30% CPU (Heimdall handles easily) +- ~300-600MB RAM +- Each model runs independently + +**Example use cases:** +```python +"Hey Mycroft, what's the time?" 
→ General assistant +"Jarvis, run diagnostics" → Personal assistant mode +"Emergency, call help" → Priority/emergency mode +``` + +#### Option B: Edge (K210) + +**Feasible for 1-2 wake words:** +```python +# Sequential checking +for model in ['hey-mycroft.kmodel', 'emergency.kmodel']: + if detect_wake_word(model): + return model +``` + +**Limitations:** +- +50-100ms latency per additional model +- Memory constraints (6MB total for all models) +- More models = more power consumption + +**Recommendation:** +- K210: 1 wake word (optimal) +- K210: 2 wake words (acceptable) +- K210: 3+ wake words (not recommended) + +#### Option C: Contextual Wake Words + +Different wake words for different purposes: +```python +wake_word_contexts = { + 'hey_mycroft': 'general_assistant', + 'emergency': 'priority_emergency', + 'goodnight': 'bedtime_routine', +} +``` + +#### Should You Use Multiple? + +**One wake word is usually enough!** + +Commercial products (Alexa, Google) use one wake word and they work fine. + +**Use multiple when:** +- Different family members want different wake words +- You want context-specific behaviors (emergency vs. general) +- You enjoy the flexibility + +**Start with one, add more later if needed.** + +--- + +### 3. Adopting New Users' Voices + +#### Challenge + +Same wake word, different voices: +- Mom says "Hey Mycroft" (soprano) +- Dad says "Hey Mycroft" (bass) +- Kids say "Hey Mycroft" (high-pitched) + +All need to work! + +#### Solution 1: Diverse Training (Recommended) + +**During initial training, have everyone record samples:** + +```bash +cd ~/precise-models/family-hey-mycroft + +# Session 1: Mom records 30 samples +precise-collect # Mom speaks "Hey Mycroft" 30 times + +# Session 2: Dad records 30 samples +precise-collect # Dad speaks "Hey Mycroft" 30 times + +# Session 3: Kids record 20 samples each +precise-collect # Kids speak "Hey Mycroft" 40 times total + +# Train one model with all voices +precise-train -e 60 family-hey-mycroft.net . 
+ +# Deploy +python voice_server.py \ + --enable-precise \ + --precise-model family-hey-mycroft.net +``` + +**Pros:** +✅ One model works for everyone +✅ Simple deployment +✅ No switching needed +✅ Works from day one + +**Cons:** +❌ Need everyone's time upfront +❌ Slightly lower per-person accuracy than individual models + +#### Solution 2: Incremental Training + +**Start with one person, add others over time:** + +```bash +# Week 1: Train with Dad's voice +precise-train -e 60 hey-mycroft.net . + +# Week 2: Mom wants to use it +# Collect Mom's samples +precise-collect # Mom records 20-30 samples + +# Add to training set +cp mom-samples/* wake-word/ + +# Retrain from checkpoint (faster!) +precise-train -e 30 hey-mycroft.net . \ + --from-checkpoint hey-mycroft.net + +# Now works for both Dad and Mom! + +# Week 3: Kids want in +# Repeat process... +``` + +**Pros:** +✅ Don't need everyone upfront +✅ Easy to add new users +✅ Model improves gradually + +**Cons:** +❌ New users may have issues initially +❌ Requires periodic retraining + +#### Solution 3: Speaker Identification (Advanced) + +**Identify who's speaking, use personalized model/settings:** + +```bash +# Install speaker ID +pip install pyannote.audio scipy --break-system-packages + +# Use enhanced server +python voice_server_enhanced.py \ + --enable-precise \ + --enable-speaker-id \ + --hf-token YOUR_HF_TOKEN +``` + +**Enroll users:** +```bash +# Record 30-second voice sample from each person +# POST to /speakers/enroll with audio + name + +curl -F "name=alan" \ + -F "audio=@alan_voice.wav" \ + http://localhost:5000/speakers/enroll + +curl -F "name=sarah" \ + -F "audio=@sarah_voice.wav" \ + http://localhost:5000/speakers/enroll +``` + +**Benefits:** +```python +# Different responses per user +if speaker == 'alan': + turn_on('light.alan_office') +elif speaker == 'sarah': + turn_on('light.sarah_office') + +# Different permissions +if speaker == 'kids' and command.startswith('buy'): + return "Sorry, kids can't make 
purchases" +``` + +**Pros:** +✅ Personalized responses +✅ User-specific settings +✅ Better accuracy (optimized per voice) +✅ Can track who said what + +**Cons:** +❌ More complex +❌ Privacy considerations +❌ Additional CPU/RAM (~10% + 200MB) +❌ Requires voice enrollment + +#### Solution 4: Pre-trained Model (Easiest) + +**"Hey Mycroft" already includes diverse voices!** + +```bash +# Just use it - already trained on many voices +./quick_start_hey_mycroft.sh +``` + +The community model was trained with: +- Male and female voices +- Different accents +- Different ages +- Various environments + +**It should work for most family members out-of-box!** + +Then fine-tune if needed. + +--- + +## Recommended Path for Your Situation + +### Scenario: Family of 3-4 People + +**Week 1: Quick Start** +```bash +# Use pre-trained "Hey Mycroft" +./quick_start_hey_mycroft.sh + +# Test with all family members +# Likely works for everyone already! +``` + +**Week 2: Fine-tune if Needed** +```bash +# If someone has issues: +# Have them record 20 samples +# Fine-tune the model + +precise-train -e 30 family-hey-mycroft.net . \ + --from-checkpoint ~/precise-models/pretrained/hey-mycroft.net +``` + +**Week 3: Add Features** +```bash +# If you want personalization: +python voice_server_enhanced.py \ + --enable-speaker-id + +# Enroll each family member +``` + +### Scenario: Just You (or 1-2 People) + +**Option 1: Pre-trained** +```bash +./quick_start_hey_mycroft.sh +# Done! 
+``` + +**Option 2: Custom Wake Word** +```bash +# Train custom "Hey Computer" +cd ~/precise-models/hey-computer +./1-record-wake-word.sh # 50 samples +./2-record-not-wake-word.sh # 200 samples +./3-train-model.sh +``` + +### Scenario: Multiple People + Multiple Wake Words + +**Full setup:** +```bash +# Pre-trained for family +./quick_start_hey_mycroft.sh + +# Personal wake word for Dad +cd ~/precise-models/jarvis +# Train custom wake word + +# Emergency wake word +cd ~/precise-models/emergency +# Train emergency wake word + +# Run multi-wake-word server +python voice_server_enhanced.py \ + --enable-precise \ + --multi-wake-word \ + --enable-speaker-id +``` + +--- + +## Quick Decision Matrix + +| Your Situation | Recommendation | +|----------------|----------------| +| **Just getting started** | Pre-trained "Hey Mycroft" | +| **Want different wake word** | Train custom model | +| **Family of 3-4** | Pre-trained + fine-tune if needed | +| **Want personalization** | Add speaker ID | +| **Multiple purposes** | Multiple wake words (server-side) | +| **Deploying to K210** | 1 wake word, no speaker ID | + +--- + +## Files to Use + +**Quick start with pre-trained:** +- `quick_start_hey_mycroft.sh` - Zero training, 5 minutes! + +**Multiple wake words:** +- `voice_server_enhanced.py` - Multi-wake-word + speaker ID support + +**Training custom:** +- `setup_precise.sh` - Setup training environment +- Scripts in `~/precise-models/your-wake-word/` + +**Documentation:** +- `WAKE_WORD_ADVANCED.md` - Detailed guide (this is comprehensive!) +- `PRECISE_DEPLOYMENT.md` - Production deployment + +--- + +## Summary + +✅ **Yes**, pre-trained "Hey Mycroft" exists and works great +✅ **Yes**, you can have multiple wake words (server-side is easy) +✅ **Yes**, multiple approaches for multi-user support + +**Recommended approach:** +1. Start with `./quick_start_hey_mycroft.sh` (5 mins) +2. Test with all family members +3. Fine-tune if anyone has issues +4. 
Add speaker ID later if you want personalization +5. Consider multiple wake words only if you have specific use cases + +**Keep it simple!** One pre-trained wake word works for most people. + +--- + +## Next Actions + +**Ready to start?** + +```bash +# 5-minute quick start +./quick_start_hey_mycroft.sh + +# Or read more first +cat WAKE_WORD_ADVANCED.md +``` + +**Questions?** +- Pre-trained models: See WAKE_WORD_ADVANCED.md § Pre-trained +- Multiple wake words: See WAKE_WORD_ADVANCED.md § Multiple Wake Words +- Voice adaptation: See WAKE_WORD_ADVANCED.md § Voice Adaptation + +**Happy voice assisting! 🎙️** diff --git a/docs/QUICKSTART.md b/docs/QUICKSTART.md new file mode 100755 index 0000000..8baddea --- /dev/null +++ b/docs/QUICKSTART.md @@ -0,0 +1,421 @@ +# Maix Duino Voice Assistant - Quick Start Guide + +## Overview +This guide will walk you through setting up a local, privacy-focused voice assistant using your Maix Duino board and Home Assistant integration. All processing happens on your local network - no cloud services required. + +## What You'll Build +- Wake word detection on Maix Duino (edge device) +- Speech-to-text using Whisper on Heimdall +- Home Assistant integration for smart home control +- Text-to-speech responses using Piper +- All processing local to your 10.1.10.0/24 network + +## Hardware Requirements +- [x] Sipeed Maix Duino board (you have this!) 
+- [ ] I2S MEMS microphone (or microphone array) +- [ ] Small speaker (3-5W) or audio output +- [ ] MicroSD card (4GB+) formatted as FAT32 +- [ ] USB-C cable for power and programming + +## Network Prerequisites +- Maix Duino will need WiFi access to your 10.1.10.0/24 network +- Heimdall (10.1.10.71) for AI processing +- Home Assistant instance (configure URL in setup) + +## Setup Process + +### Phase 1: Server Setup (Heimdall) + +#### Step 1: Run the setup script +```bash +# Transfer files to Heimdall +scp setup_voice_assistant.sh voice_server.py alan@10.1.10.71:~/ + +# SSH to Heimdall +ssh alan@10.1.10.71 + +# Make setup script executable and run it +chmod +x setup_voice_assistant.sh +./setup_voice_assistant.sh +``` + +#### Step 2: Configure Home Assistant access +```bash +# Edit the config file +vim ~/voice-assistant/config/.env +``` + +Update these values: +```env +HA_URL=http://your-home-assistant:8123 +HA_TOKEN=your_long_lived_access_token_here +``` + +To get a long-lived access token: +1. Open Home Assistant +2. Click your profile (bottom left) +3. Scroll to "Long-Lived Access Tokens" +4. Click "Create Token" +5. Copy the token and paste it in .env + +#### Step 3: Test the server +```bash +cd ~/voice-assistant +./test_server.sh +``` + +You should see: +``` +Loading Whisper model: medium +Whisper model loaded successfully +Starting voice processing server on 0.0.0.0:5000 +``` + +#### Step 4: Test with curl (from another terminal) +```bash +# Test health endpoint +curl http://10.1.10.71:5000/health + +# Should return: +# {"status":"healthy","whisper_loaded":true,"ha_connected":true} +``` + +### Phase 2: Maix Duino Setup + +#### Step 1: Flash MaixPy firmware +1. Download latest MaixPy firmware from: https://dl.sipeed.com/MAIX/MaixPy/release/ +2. Download Kflash GUI: https://github.com/sipeed/kflash_gui +3. Connect Maix Duino via USB +4. 
Flash firmware using Kflash GUI + +#### Step 2: Prepare SD card +```bash +# Format SD card as FAT32 +# Create directory structure: +mkdir -p /path/to/sdcard/models + +# Copy the client script +cp maix_voice_client.py /path/to/sdcard/main.py +``` + +#### Step 3: Configure WiFi settings +Edit `/path/to/sdcard/main.py`: +```python +# WiFi Settings +WIFI_SSID = "YourNetworkName" +WIFI_PASSWORD = "YourPassword" + +# Server Settings +VOICE_SERVER_URL = "http://10.1.10.71:5000" +``` + +#### Step 4: Test the board +1. Insert SD card into Maix Duino +2. Connect to serial console (115200 baud) + ```bash + screen /dev/ttyUSB0 115200 + # or + minicom -D /dev/ttyUSB0 -b 115200 + ``` +3. Power on the board +4. Watch the serial output for connection status + +### Phase 3: Integration & Testing + +#### Test 1: Basic connectivity +1. Maix Duino should connect to WiFi and display IP on LCD +2. Server should show in logs when Maix connects + +#### Test 2: Audio capture +The current implementation uses amplitude-based wake word detection as a placeholder. To test: +1. Clap loudly near the microphone +2. Speak a command (e.g., "turn on the living room lights") +3. Watch the LCD for transcription and response + +#### Test 3: Home Assistant control +Supported commands (add more in voice_server.py): +- "Turn on the living room lights" +- "Turn off the bedroom lights" +- "What's the temperature?" +- "Toggle the kitchen lights" + +### Phase 4: Wake Word Training (Advanced) + +The placeholder wake word detection uses simple amplitude triggering. For production use: + +#### Option A: Use Porcupine (easiest) +1. Sign up at: https://console.picovoice.ai/ +2. Train custom wake word +3. Download .ppn model +4. 
Convert to .kmodel for K210 + +#### Option B: Use Mycroft Precise (FOSS) +```bash +# On a machine with GPU +conda create -n precise python=3.6 +conda activate precise +pip install precise-runner + +# Record wake word samples +precise-collect + +# Train model +precise-train -e 60 my-wake-word.net my-wake-word/ + +# Convert to .kmodel +# (requires additional tools - see MaixPy docs) +``` + +## Architecture Diagram + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Your Home Network (10.1.10.0/24) │ +│ │ +│ ┌──────────────┐ ┌──────────────┐ │ +│ │ Maix Duino │────────>│ Heimdall │ │ +│ │ 10.1.10.xxx │ Audio │ 10.1.10.71 │ │ +│ │ │<────────│ │ │ +│ │ - Wake Word │ Response│ - Whisper │ │ +│ │ - Mic Input │ │ - Piper TTS │ │ +│ │ - Speaker │ │ - Flask API │ │ +│ └──────────────┘ └──────┬───────┘ │ +│ │ │ +│ │ REST API │ +│ v │ +│ ┌──────────────┐ │ +│ │ Home Asst. │ │ +│ │ homeassistant│ │ +│ │ │ │ +│ │ - Devices │ │ +│ │ - Automation │ │ +│ └──────────────┘ │ +└─────────────────────────────────────────────────────────────┘ +``` + +## Troubleshooting + +### Maix Duino won't connect to WiFi +```python +# Check serial output for errors +# Common issues: +# - Wrong SSID/password +# - WPA3 not supported (use WPA2) +# - 5GHz network (use 2.4GHz) +``` + +### Whisper transcription is slow +```bash +# Use a smaller model on Heimdall +# Edit ~/voice-assistant/config/.env: +WHISPER_MODEL=base # or tiny for fastest +``` + +### Home Assistant commands don't work +```bash +# Check server logs +journalctl -u voice-assistant -f + +# Test HA connection manually +curl -H "Authorization: Bearer YOUR_TOKEN" \ + http://your-ha:8123/api/states +``` + +### Audio quality is poor +1. Check microphone connections +2. Adjust `SAMPLE_RATE` in maix_voice_client.py +3. Test with USB microphone first +4. 
Consider microphone array for better pickup + +### Out of memory on Maix Duino +```python +# In main_loop(), add more frequent GC: +if gc.mem_free() < 200000: # Increase threshold + gc.collect() +``` + +## Adding New Intents + +Edit `voice_server.py` and add patterns to `IntentParser.PATTERNS`: + +```python +PATTERNS = { + # Existing patterns... + + 'set_temperature': [ + r'set (?:the )?temperature to (\d+)', + r'make it (\d+) degrees', + ], +} +``` + +Then add the handler in `execute_intent()`: + +```python +elif intent == 'set_temperature': + temp = params.get('temperature') + success = ha_client.call_service( + 'climate', 'set_temperature', + entity_id, temperature=temp + ) + return f"Set temperature to {temp} degrees" +``` + +## Entity Mapping + +Add your Home Assistant entities to `IntentParser.ENTITY_MAP`: + +```python +ENTITY_MAP = { + # Lights + 'living room light': 'light.living_room', + 'bedroom light': 'light.bedroom', + + # Climate + 'thermostat': 'climate.main_floor', + 'temperature': 'sensor.main_floor_temperature', + + # Switches + 'coffee maker': 'switch.coffee_maker', + 'fan': 'switch.bedroom_fan', + + # Media + 'tv': 'media_player.living_room_tv', + 'music': 'media_player.whole_house', +} +``` + +## Performance Tuning + +### Reduce latency +1. Use Whisper `tiny` or `base` model +2. Implement streaming audio (currently batch) +3. Pre-load TTS models +4. Use faster TTS engine (e.g., espeak) + +### Improve accuracy +1. Use Whisper `large` model (slower) +2. Train custom wake word +3. Add NLU layer (Rasa, spaCy) +4. 
Collect and fine-tune on your voice + +## Next Steps + +### Short term +- [ ] Add more Home Assistant entity mappings +- [ ] Implement Piper TTS playback on Maix Duino +- [ ] Train custom wake word model +- [ ] Add LED animations for better feedback +- [ ] Implement conversation context + +### Medium term +- [ ] Multi-room support (multiple Maix Duino units) +- [ ] Voice profiles for different users +- [ ] Integration with Plex for media control +- [ ] Calendar and reminder functionality +- [ ] Weather updates from local weather station + +### Long term +- [ ] Custom skills/plugins system +- [ ] Integration with other services (Nextcloud, Matrix) +- [ ] Sound event detection (doorbell, smoke alarm) +- [ ] Intercom functionality between rooms +- [ ] Voice-controlled automation creation + +## Alternatives & Fallbacks + +If the Maix Duino proves limiting: + +### Raspberry Pi Zero 2 W +- More processing power +- Better software support +- USB audio support +- Cost: ~$15 + +### ESP32-S3 +- Better WiFi +- More RAM (8MB) +- Cheaper (~$10) +- Good community support + +### Orange Pi Zero 2 +- ARM Cortex-A53 quad-core +- 512MB-1GB RAM +- Full Linux support +- Cost: ~$20 + +## Resources + +### Documentation +- Maix Duino: https://wiki.sipeed.com/hardware/en/maix/ +- MaixPy: https://maixpy.sipeed.com/ +- Whisper: https://github.com/openai/whisper +- Piper TTS: https://github.com/rhasspy/piper +- Home Assistant API: https://developers.home-assistant.io/ + +### Community Projects +- Rhasspy: https://rhasspy.readthedocs.io/ +- Willow: https://github.com/toverainc/willow +- Mycroft: https://mycroft.ai/ + +### Wake Word Tools +- Porcupine: https://picovoice.ai/platform/porcupine/ +- Mycroft Precise: https://github.com/MycroftAI/mycroft-precise +- Snowboy (archived): https://github.com/Kitt-AI/snowboy + +## Getting Help + +### Check logs +```bash +# Server logs (if using systemd) +sudo journalctl -u voice-assistant -f + +# Or manual log file +tail -f 
~/voice-assistant/logs/voice_assistant.log + +# Maix Duino serial console +screen /dev/ttyUSB0 115200 +``` + +### Common issues and solutions +See the Troubleshooting section above + +### Useful commands +```bash +# Restart service +sudo systemctl restart voice-assistant + +# Check service status +sudo systemctl status voice-assistant + +# Test HA connection +curl http://10.1.10.71:5000/health + +# Monitor Maix Duino +minicom -D /dev/ttyUSB0 -b 115200 +``` + +## Cost Breakdown + +| Item | Cost | Status | +|------|------|--------| +| Maix Duino | $30 | Have it! | +| I2S Microphone | $5-10 | Need | +| Speaker | $10 | Need (or use existing) | +| MicroSD Card | $5 | Have it? | +| **Total** | **$15-25** | (vs $50+ commercial) | + +**Benefits of local solution:** +- No subscription fees +- Complete privacy (no cloud) +- Customizable to your needs +- Integration with existing infrastructure +- Learning experience! + +## Conclusion + +You now have everything you need to build a local, privacy-focused voice assistant! The setup leverages your existing infrastructure (Heimdall for processing, Home Assistant for automation) while keeping costs minimal. + +Start with the basic setup, test each component, then iterate and improve. The beauty of this approach is you can enhance it over time without being locked into a commercial platform. + +Good luck, and enjoy your new voice assistant! 🎙️ diff --git a/docs/WAKE_WORD_ADVANCED.md b/docs/WAKE_WORD_ADVANCED.md new file mode 100755 index 0000000..5f80066 --- /dev/null +++ b/docs/WAKE_WORD_ADVANCED.md @@ -0,0 +1,723 @@ +# Wake Word Models: Pre-trained, Multiple, and Voice Adaptation + +## Pre-trained Wake Word Models + +### Yes! 
"Hey Mycroft" Already Exists + +Mycroft provides several pre-trained models that you can use immediately: + +#### Available Pre-trained Models + +**Hey Mycroft** (Official) +```bash +# Download from Mycroft's model repository +cd ~/precise-models/pretrained +wget https://github.com/MycroftAI/precise-data/raw/models-dev/hey-mycroft.tar.gz +tar xzf hey-mycroft.tar.gz + +# Test immediately +conda activate precise +precise-listen hey-mycroft.net + +# Should detect "Hey Mycroft" right away! +``` + +**Other Available Models:** +- **Hey Mycroft** - Best tested, most reliable +- **Christopher** - Alternative wake word +- **Hey Jarvis** - Community contributed +- **Computer** - Star Trek style + +#### Using Pre-trained Models + +**Option 1: Use as-is** +```bash +# Just point your server to the pre-trained model +python voice_server.py \ + --enable-precise \ + --precise-model ~/precise-models/pretrained/hey-mycroft.net \ + --precise-sensitivity 0.5 +``` + +**Option 2: Fine-tune for your voice** +```bash +# Use pre-trained as starting point, add your samples +cd ~/precise-models/my-hey-mycroft + +# Record additional samples +precise-collect + +# Train from checkpoint (much faster than from scratch!) +precise-train -e 30 my-hey-mycroft.net . 
\ + --from-checkpoint ~/precise-models/pretrained/hey-mycroft.net + +# This adds your voice/environment while keeping the base model +``` + +**Option 3: Ensemble with custom** +```python +# Use both pre-trained and custom model +# Require both to agree (reduces false positives) +# See implementation below +``` + +### Advantages of Pre-trained Models + +✅ **Instant deployment** - No training required +✅ **Proven accuracy** - Tested by thousands of users +✅ **Good starting point** - Fine-tune rather than train from scratch +✅ **Multiple speakers** - Already includes diverse voices +✅ **Save time** - Skip 1-2 hours of training + +### Disadvantages + +❌ **Generic** - Not optimized for your voice/environment +❌ **May need tuning** - Threshold adjustment required +❌ **Limited choice** - Only a few wake words available + +### Recommendation + +**Start with "Hey Mycroft"** pre-trained model: +1. Deploy immediately (zero training time) +2. Test in your environment +3. Collect false positives/negatives +4. Fine-tune with your examples +5. Best of both worlds! + +## Multiple Wake Words + +### Can You Have Multiple Wake Words? + +**Short answer:** Yes, but with tradeoffs. + +### Implementation Approaches + +#### Approach 1: Server-Side Multiple Models (Recommended) + +Run multiple Precise models in parallel on Heimdall: + +```python +# In voice_server.py +from precise_runner import PreciseEngine, PreciseRunner + +# Global runners for each wake word +precise_runners = {} +wake_word_configs = { + 'hey_mycroft': { + 'model': '~/precise-models/pretrained/hey-mycroft.net', + 'sensitivity': 0.5, + 'response': 'Yes?' 
+ }, + 'hey_computer': { + 'model': '~/precise-models/hey-computer/hey-computer.net', + 'sensitivity': 0.5, + 'response': 'I\'m listening' + }, + 'jarvis': { + 'model': '~/precise-models/jarvis/jarvis.net', + 'sensitivity': 0.6, + 'response': 'At your service, sir' + } +} + +def on_wake_word_detected(wake_word_name): + """Callback with wake word identifier""" + def callback(): + print(f"Wake word detected: {wake_word_name}") + wake_word_queue.put({ + 'timestamp': time.time(), + 'wake_word': wake_word_name, + 'response': wake_word_configs[wake_word_name]['response'] + }) + return callback + +def start_multiple_wake_words(): + """Start multiple Precise listeners""" + for name, config in wake_word_configs.items(): + engine = PreciseEngine( + '/usr/local/bin/precise-engine', + os.path.expanduser(config['model']) + ) + + runner = PreciseRunner( + engine, + sensitivity=config['sensitivity'], + on_activation=on_wake_word_detected(name) + ) + + runner.start() + precise_runners[name] = runner + print(f"Started wake word listener: {name}") +``` + +**Resource Usage:** +- CPU: ~5-10% per model (3 models = ~15-30%) +- RAM: ~100-200MB per model +- Still very manageable on Heimdall + +**Pros:** +✅ Different wake words for different purposes +✅ Family members can choose preferred wake word +✅ Context-aware responses +✅ Easy to add/remove models + +**Cons:** +❌ Higher CPU usage (scales linearly) +❌ Increased false positive risk (3x models = 3x chance) +❌ More complex configuration + +#### Approach 2: Edge Multiple Models (K210) + +**Challenge:** K210 has limited resources + +**Option A: Sequential checking** (Feasible) +```python +# Check each model in sequence +models = ['hey-mycroft.kmodel', 'hey-computer.kmodel'] + +for model in models: + kpu_task = kpu.load(f"/sd/models/{model}") + result = kpu.run(kpu_task, audio_features) + if result > threshold: + return model # Wake word detected +``` + +**Resource impact:** +- Latency: +50-100ms per additional model +- Memory: Models must 
fit in 6MB total +- CPU: ~30% per model check + +**Option B: Combined model** (Advanced) +```python +# Train a single model that recognizes multiple phrases +# Each phrase maps to different output class +# More complex training but single inference +``` + +**Recommendation for edge:** +- **1-2 wake words max** on K210 +- **Server-side** for 3+ wake words + +#### Approach 3: Contextual Wake Words + +Different wake words trigger different behaviors: + +```python +wake_word_contexts = { + 'hey_mycroft': 'general', # General commands + 'hey_assistant': 'general', # Alternative general + 'emergency': 'priority', # High priority + 'goodnight': 'bedtime', # Bedtime routine +} + +def handle_wake_word(wake_word, command): + context = wake_word_contexts[wake_word] + + if context == 'priority': + # Skip queue, process immediately + # Maybe call emergency contact + pass + elif context == 'bedtime': + # Trigger bedtime automation + # Lower volume for responses + pass + else: + # Normal processing + pass +``` + +### Best Practices for Multiple Wake Words + +1. **Start with one** - Get it working well first +2. **Add gradually** - One at a time, test thoroughly +3. **Different purposes** - Each wake word should have a reason +4. **Monitor performance** - Track false positives per wake word +5. 
**User preference** - Let family members choose their favorite + +### Recommended Configuration + +**For most users:** +```python +wake_words = { + 'hey_mycroft': 'primary', # Main wake word (pre-trained) + 'hey_computer': 'alternative' # Custom trained for your voice +} +``` + +**For power users:** +```python +wake_words = { + 'hey_mycroft': 'general', + 'jarvis': 'personal_assistant', # Custom responses + 'computer': 'technical_queries', # Different intent parser +} +``` + +**For families:** +```python +wake_words = { + 'hey_mycroft': 'shared', # Everyone can use + 'dad': 'user_alan', # Personalized + 'mom': 'user_sarah', # Personalized + 'kids': 'user_children', # Kid-safe responses +} +``` + +## Voice Adaptation and Multi-User Support + +### Challenge: Different Voices, Same Wake Word + +When multiple people use the system: +- Different accents +- Different speech patterns +- Different pronunciations +- Different vocal characteristics + +### Solution Approaches + +#### Approach 1: Diverse Training Data (Recommended) + +**During initial training:** +```bash +# Have everyone in household record samples +cd ~/precise-models/hey-computer + +# Alan records 30 samples +precise-collect # Record as user 1 + +# Sarah records 30 samples +precise-collect # Record as user 2 + +# Kids record 20 samples +precise-collect # Record as user 3 + +# Combine all in training set +# Train one model that works for everyone +./3-train-model.sh +``` + +**Pros:** +✅ Single model for everyone +✅ No user switching needed +✅ Simple to maintain +✅ Works immediately for all users + +**Cons:** +❌ May have lower per-person accuracy +❌ Requires upfront time from everyone +❌ Hard to add new users later + +#### Approach 2: Incremental Training + +Start with your voice, add others over time: + +```bash +# Week 1: Train with Alan's voice +cd ~/precise-models/hey-computer +# Record and train with Alan's samples +precise-train -e 60 hey-computer.net . 
+ +# Week 2: Sarah wants to use it +# Collect Sarah's samples +mkdir -p sarah-samples/wake-word +precise-collect # Sarah records 20-30 samples + +# Add to existing training set +cp sarah-samples/wake-word/* wake-word/ + +# Retrain (continue from checkpoint) +precise-train -e 30 hey-computer.net . \ + --from-checkpoint hey-computer.net + +# Now works for both Alan and Sarah! +``` + +**Pros:** +✅ Gradual improvement +✅ Don't need everyone upfront +✅ Easy to add new users +✅ Maintains accuracy for existing users + +**Cons:** +❌ May not work well for new users initially +❌ Requires retraining periodically + +#### Approach 3: Per-User Models with Speaker Identification + +Train separate models + identify who's speaking: + +**Step 1: Train per-user wake word models** +```bash +# Alan's model +~/precise-models/hey-computer-alan/ + +# Sarah's model +~/precise-models/hey-computer-sarah/ + +# Kids' model +~/precise-models/hey-computer-kids/ +``` + +**Step 2: Use speaker identification** +```python +# Pseudo-code for speaker identification +def identify_speaker(audio): + """ + Identify speaker from voice characteristics + Using speaker embeddings (x-vectors, d-vectors) + """ + # Extract speaker embedding + embedding = speaker_encoder.encode(audio) + + # Compare to known users + similarities = { + 'alan': cosine_similarity(embedding, alan_embedding), + 'sarah': cosine_similarity(embedding, sarah_embedding), + 'kids': cosine_similarity(embedding, kids_embedding), + } + + # Return most similar + return max(similarities, key=similarities.get) + +def process_command(audio): + # Detect wake word with all models + wake_detected = check_all_models(audio) + + if wake_detected: + # Identify speaker + speaker = identify_speaker(audio) + + # Use speaker-specific model for better accuracy + model = f'~/precise-models/hey-computer-{speaker}/' + + # Continue with speaker context + process_with_context(audio, speaker) +``` + +**Speaker identification libraries:** +- **Resemblyzer** - Simple 
speaker verification +- **speechbrain** - Complete toolkit +- **pyannote.audio** - You already use this for diarization! + +**Implementation:** +```bash +# You already have pyannote for diarization! +conda activate voice-assistant +pip install pyannote.audio --break-system-packages + +# Can use speaker embeddings for identification +``` + +```python +from pyannote.audio import Inference + +# Load speaker embedding model +inference = Inference( + "pyannote/embedding", + use_auth_token=hf_token +) + +# Extract embeddings for known users +alan_embedding = inference("alan_voice_sample.wav") +sarah_embedding = inference("sarah_voice_sample.wav") + +# Compare with incoming audio +unknown_embedding = inference(audio_buffer) + +from scipy.spatial.distance import cosine +alan_similarity = 1 - cosine(unknown_embedding, alan_embedding) +sarah_similarity = 1 - cosine(unknown_embedding, sarah_embedding) + +if alan_similarity > 0.8: + user = 'alan' +elif sarah_similarity > 0.8: + user = 'sarah' +else: + user = 'unknown' +``` + +**Pros:** +✅ Personalized responses per user +✅ Better accuracy (model optimized for each voice) +✅ User-specific preferences/permissions +✅ Can track who said what + +**Cons:** +❌ More complex setup +❌ Higher resource usage +❌ Requires voice samples from each user +❌ Privacy considerations + +#### Approach 4: Adaptive/Online Learning + +Model improves automatically based on usage: + +```python +class AdaptiveWakeWord: + def __init__(self, base_model): + self.base_model = base_model + self.user_samples = [] + self.retrain_threshold = 50 # Retrain after N samples + + def on_detection(self, audio, user_confirmed=True): + """User confirms this was correct detection""" + if user_confirmed: + self.user_samples.append(audio) + + # Periodically retrain + if len(self.user_samples) >= self.retrain_threshold: + self.retrain_with_samples() + self.user_samples = [] + + def retrain_with_samples(self): + """Background retraining with collected samples""" + # Add 
samples to training set + # Retrain model + # Swap in new model + pass +``` + +**Pros:** +✅ Automatic improvement +✅ Adapts to user's voice over time +✅ No manual retraining +✅ Gets better with use + +**Cons:** +❌ Complex implementation +❌ Requires user feedback mechanism +❌ Risk of drift/degradation +❌ Background training overhead + +## Recommended Strategy + +### Phase 1: Single Wake Word, Single Model +```bash +# Week 1-2 +# Use pre-trained "Hey Mycroft" +# OR train custom "Hey Computer" with all family members' voices +# Keep it simple, get it working +``` + +### Phase 2: Add Fine-tuning +```bash +# Week 3-4 +# Collect false positives/negatives +# Retrain with household-specific data +# Optimize threshold +``` + +### Phase 3: Consider Multiple Wake Words +```bash +# Month 2 +# If needed, add second wake word +# "Hey Mycroft" for general +# "Jarvis" for personal assistant tasks +``` + +### Phase 4: Personalization +```bash +# Month 3+ +# If desired, add speaker identification +# Personalized responses +# User-specific preferences +``` + +## Practical Examples + +### Example 1: Family of 4, Single Model + +```bash +# Training session with everyone +cd ~/precise-models/hey-mycroft-family + +# Dad records 25 samples +precise-collect + +# Mom records 25 samples +precise-collect + +# Kid 1 records 15 samples +precise-collect + +# Kid 2 records 15 samples +precise-collect + +# Collect shared negative samples (200+) +# TV, music, conversation, etc. +precise-collect -f not-wake-word/household.wav + +# Train single model for everyone +precise-train -e 60 hey-mycroft-family.net . + +# Deploy +python voice_server.py \ + --enable-precise \ + --precise-model hey-mycroft-family.net +``` + +**Result:** Everyone can use it, one model, simple. 
+ +### Example 2: Two Wake Words, Different Purposes + +```python +# voice_server.py configuration +wake_words = { + 'hey_mycroft': { + 'model': 'hey-mycroft.net', + 'sensitivity': 0.5, + 'intent_parser': 'general', # All commands + 'response': 'Yes?' + }, + 'emergency': { + 'model': 'emergency.net', + 'sensitivity': 0.7, # Higher threshold + 'intent_parser': 'emergency', # Limited commands + 'response': 'Emergency mode activated' + } +} + +# "Hey Mycroft, turn on the lights" - works +# "Emergency, call for help" - triggers emergency protocol +``` + +### Example 3: Speaker Identification + Personalization + +```python +# Enhanced processing with speaker ID +def process_with_speaker_id(audio, speaker): + # Different HA entity based on speaker + entity_maps = { + 'alan': { + 'bedroom_light': 'light.master_bedroom', + 'office_light': 'light.alan_office', + }, + 'sarah': { + 'bedroom_light': 'light.master_bedroom', + 'office_light': 'light.sarah_office', + }, + 'kids': { + 'bedroom_light': 'light.kids_bedroom', + 'tv': None, # Kids can't control TV + } + } + + # Transcribe command + text = whisper_transcribe(audio) + + # "Turn on bedroom light" + if 'bedroom light' in text: + entity = entity_maps[speaker]['bedroom_light'] + ha_client.turn_on(entity) + + response = f"Turned on your bedroom light" + + return response +``` + +## Resource Requirements + +### Single Wake Word +- **CPU:** 5-10% (Heimdall) +- **RAM:** 100-200MB +- **Model size:** 1-3MB +- **Training time:** 30-60 min + +### Multiple Wake Words (3 models) +- **CPU:** 15-30% (Heimdall) +- **RAM:** 300-600MB +- **Model size:** 3-9MB total +- **Training time:** 90-180 min + +### With Speaker Identification +- **CPU:** +5-10% for speaker ID +- **RAM:** +200-300MB for embedding model +- **Model size:** +50MB for speaker model +- **Setup time:** +30-60 min for voice enrollment + +### K210 Edge (Maix Duino) +- **Single model:** Feasible, ~30% CPU +- **2 models:** Feasible, ~60% CPU, higher latency +- **3+ models:** 
Not recommended +- **Speaker ID:** Not feasible (limited RAM/compute) + +## Quick Decision Guide + +**Just getting started?** +→ Use pre-trained "Hey Mycroft" + +**Want custom wake word?** +→ Train one model with all family voices + +**Need multiple wake words?** +→ Start server-side with 2-3 models + +**Want personalization?** +→ Add speaker identification + +**Deploying to edge (K210)?** +→ Stick to 1-2 wake words maximum + +**Family of 4+ people?** +→ Train single model with everyone's voice + +**Privacy is paramount?** +→ Skip speaker ID, use single universal model + +## Testing Multiple Wake Words + +```bash +# Test all wake words quickly +conda activate precise + +# Terminal 1: Hey Mycroft +precise-listen hey-mycroft.net + +# Terminal 2: Hey Computer +precise-listen hey-computer.net + +# Terminal 3: Emergency +precise-listen emergency.net + +# Say each wake word, verify correct detection +``` + +## Conclusion + +### For Your Maix Duino Project: + +**Recommended approach:** +1. **Start with "Hey Mycroft"** - Use pre-trained model +2. **Fine-tune if needed** - Add your household's voices +3. **Consider 2nd wake word** - Only if you have a specific use case +4. **Speaker ID** - Phase 2/3 enhancement, not critical for MVP +5. **Keep it simple** - One wake word works great for most users + +**The pre-trained "Hey Mycroft" model saves you 1-2 hours** and works immediately. You can always fine-tune or add custom wake words later! + +**Multiple wake words are cool but not necessary** - Most commercial products use just one. Focus on making one wake word work really well before adding more. + +**Voice adaptation** - Training with multiple voices upfront is simpler than per-user models. Save speaker ID for later if you need personalization. 
+ +## Quick Start with Pre-trained + +```bash +# On Heimdall +cd ~/precise-models/pretrained +wget https://github.com/MycroftAI/precise-data/raw/models-dev/hey-mycroft.tar.gz +tar xzf hey-mycroft.tar.gz + +# Test it +conda activate precise +precise-listen hey-mycroft.net + +# Deploy +cd ~/voice-assistant +python voice_server.py \ + --enable-precise \ + --precise-model ~/precise-models/pretrained/hey-mycroft.net + +# You're done! No training needed! +``` + +**That's it - you have a working wake word in 5 minutes!** 🎉 diff --git a/docs/WAKE_WORD_QUICK_REF.md b/docs/WAKE_WORD_QUICK_REF.md new file mode 100755 index 0000000..9ab0ff8 --- /dev/null +++ b/docs/WAKE_WORD_QUICK_REF.md @@ -0,0 +1,411 @@ +# Wake Word Quick Reference Card + +## 🎯 TL;DR: What Should I Do? + +### Recommendation for Your Setup + +**Week 1:** Use pre-trained "Hey Mycroft" +```bash +./download_pretrained_models.sh --model hey-mycroft +precise-listen ~/precise-models/pretrained/hey-mycroft.net +``` + +**Week 2-3:** Fine-tune with all family members' voices +```bash +cd ~/precise-models/hey-mycroft-family +precise-train -e 30 custom.net . --from-checkpoint ../pretrained/hey-mycroft.net +``` + +**Week 4+:** Add speaker identification +```bash +pip install resemblyzer +python enroll_speaker.py --name Alan --duration 20 +python enroll_speaker.py --name [Family] --duration 20 +``` + +**Month 2+:** Add second wake word (Hey Jarvis for Plex?) +```bash +./download_pretrained_models.sh --model hey-jarvis +# Run both in parallel on server +``` + +--- + +## 📋 Pre-trained Models + +### Available Models (Ready to Use!) 
+ +| Wake Word | Download | Best For | +|-----------|----------|----------| +| **Hey Mycroft** ⭐ | `--model hey-mycroft` | Default choice, most data | +| **Hey Jarvis** | `--model hey-jarvis` | Pop culture, media control | +| **Christopher** | `--model christopher` | Unique, less common | +| **Hey Ezra** | `--model hey-ezra` | Alternative option | + +### Quick Download + +```bash +# Download one +./download_pretrained_models.sh --model hey-mycroft + +# Download all +./download_pretrained_models.sh --test-all + +# Test immediately +precise-listen ~/precise-models/pretrained/hey-mycroft.net +``` + +--- + +## 🔢 Multiple Wake Words + +### Option 1: Multiple Models (Server-Side) ⭐ RECOMMENDED + +**What:** Run 2-3 different wake word models simultaneously +**Where:** Heimdall (server) +**Performance:** ~15-30% CPU for 3 models + +```bash +# Start with multiple wake words +python voice_server.py \ + --enable-precise \ + --precise-models "\ +hey-mycroft:~/models/hey-mycroft.net:0.5,\ +hey-jarvis:~/models/hey-jarvis.net:0.5" +``` + +**Pros:** +- ✅ Can identify which wake word was used +- ✅ Different contexts (Mycroft=commands, Jarvis=media) +- ✅ Easy to add/remove wake words +- ✅ Each can have different sensitivity + +**Cons:** +- ❌ Only works server-side (not on Maix Duino) +- ❌ Higher CPU usage (but still reasonable) + +**Use When:** +- You want different wake words for different purposes +- Server has CPU to spare (yours does!) +- Want flexibility to add wake words later + +### Option 2: Single Multi-Phrase Model (Edge-Compatible) + +**What:** One model responds to multiple phrases +**Where:** Server OR Maix Duino +**Performance:** Same as single model + +```bash +# Train on multiple phrases +cd ~/precise-models/multi-wake +# Record "Hey Mycroft" samples → wake-word/ +# Record "Hey Computer" samples → wake-word/ +# Record negatives → not-wake-word/ +precise-train -e 60 multi-wake.net . 
+``` + +**Pros:** +- ✅ Single model = less compute +- ✅ Works on edge (K210) +- ✅ Simple deployment + +**Cons:** +- ❌ Can't tell which wake word was used +- ❌ May reduce accuracy +- ❌ Higher false positive risk + +**Use When:** +- Deploying to Maix Duino (edge) +- Want backup wake words +- Don't care which was used + +--- + +## 👥 Multi-User Support + +### Option 1: Inclusive Training ⭐ START HERE + +**What:** One model, all voices +**How:** All family members record samples + +```bash +cd ~/precise-models/family-wake +# Alice records 30 samples +# Bob records 30 samples +# You record 30 samples +precise-train -e 60 family-wake.net . +``` + +**Pros:** +- ✅ Everyone can use it +- ✅ Simple deployment +- ✅ Single model + +**Cons:** +- ❌ Can't identify who spoke +- ❌ No personalization + +**Use When:** +- Just getting started +- Don't need to know who spoke +- Want simplicity + +### Option 2: Speaker Identification (Week 4+) + +**What:** Detect wake word, then identify speaker +**How:** Voice embeddings (resemblyzer or pyannote) + +```bash +# Install +pip install resemblyzer + +# Enroll users +python enroll_speaker.py --name Alan --duration 20 +python enroll_speaker.py --name Alice --duration 20 +python enroll_speaker.py --name Bob --duration 20 + +# Server identifies speaker automatically +``` + +**Pros:** +- ✅ Personalized responses +- ✅ User-specific permissions +- ✅ Better privacy +- ✅ Track preferences + +**Cons:** +- ❌ More complex +- ❌ Requires enrollment +- ❌ +100-200ms latency +- ❌ May fail with similar voices + +**Use When:** +- Want personalization +- Need user-specific commands +- Ready for advanced features + +### Option 3: Per-User Wake Words (Advanced) + +**What:** Each person has their own wake word +**How:** Multiple models, one per person + +```bash +# Alice: "Hey Mycroft" +# Bob: "Hey Jarvis" +# You: "Hey Computer" + +# Run all 3 models in parallel +``` + +**Pros:** +- ✅ Automatic user ID +- ✅ Highest accuracy per user +- ✅ Clear separation + 
+**Cons:** +- ❌ 3x models = 3x CPU +- ❌ Users must remember their word +- ❌ Server-only (not edge) + +**Use When:** +- Need automatic user ID +- Have CPU to spare +- Users want their own wake word + +--- + +## 🎯 Decision Tree + +``` +START: Want to use voice assistant + │ + ├─ Single user or don't care who spoke? + │ └─ Use: Inclusive Training (Option 1) + │ └─ Download: Hey Mycroft (pre-trained) + │ + ├─ Multiple users AND need to know who spoke? + │ └─ Use: Speaker Identification (Option 2) + │ └─ Start with: Hey Mycroft + resemblyzer + │ + ├─ Want different wake words for different purposes? + │ └─ Use: Multiple Models (Option 1) + │ └─ Download: Hey Mycroft + Hey Jarvis + │ + └─ Deploying to Maix Duino (edge)? + └─ Use: Single Multi-Phrase Model (Option 2) + └─ Train: Custom model with 2-3 phrases +``` + +--- + +## 📊 Comparison Table + +| Feature | Inclusive | Speaker ID | Per-User Wake | Multiple Wake | +|---------|-----------|------------|---------------|---------------| +| **Setup Time** | 2 hours | 4 hours | 6 hours | 3 hours | +| **Complexity** | ⭐ Easy | ⭐⭐⭐ Medium | ⭐⭐⭐⭐ Hard | ⭐⭐ Easy | +| **CPU Usage** | 5-10% | 10-15% | 15-30% | 15-30% | +| **Latency** | 100ms | 300ms | 100ms | 100ms | +| **User ID** | ❌ No | ✅ Yes | ✅ Yes | ❌ No | +| **Edge Deploy** | ✅ Yes | ⚠️ Maybe | ❌ No | ⚠️ Partial | +| **Personalize** | ❌ No | ✅ Yes | ✅ Yes | ⚠️ Partial | + +--- + +## 🚀 Recommended Timeline + +### Week 1: Get It Working +```bash +# Use pre-trained Hey Mycroft +./download_pretrained_models.sh --model hey-mycroft + +# Test it +precise-listen ~/precise-models/pretrained/hey-mycroft.net + +# Deploy to server +python voice_server.py --enable-precise \ + --precise-model ~/precise-models/pretrained/hey-mycroft.net +``` + +### Week 2-3: Make It Yours +```bash +# Fine-tune with your family's voices +cd ~/precise-models/hey-mycroft-family + +# Have everyone record 20-30 samples +precise-collect # Alice +precise-collect # Bob +precise-collect # You + +# Train 
+precise-train -e 30 custom.net . \ + --from-checkpoint ../pretrained/hey-mycroft.net +``` + +### Week 4+: Add Intelligence +```bash +# Speaker identification +pip install resemblyzer +python enroll_speaker.py --name Alan --duration 20 +python enroll_speaker.py --name Alice --duration 20 + +# Now server knows who's speaking! +``` + +### Month 2+: Expand Features +```bash +# Add second wake word for media control +./download_pretrained_models.sh --model hey-jarvis + +# Run both: Mycroft for commands, Jarvis for Plex +python voice_server.py --enable-precise \ + --precise-models "mycroft:hey-mycroft.net:0.5,jarvis:hey-jarvis.net:0.5" +``` + +--- + +## 💡 Pro Tips + +### Wake Word Selection +- ✅ **DO:** Choose clear, distinct wake words +- ✅ **DO:** Test in your environment +- ❌ **DON'T:** Use similar-sounding words +- ❌ **DON'T:** Use common phrases + +### Training +- ✅ **DO:** Include all intended users +- ✅ **DO:** Record in various conditions +- ✅ **DO:** Add false positives to training +- ❌ **DON'T:** Rush the training process + +### Deployment +- ✅ **DO:** Start simple (one wake word) +- ✅ **DO:** Test thoroughly before adding features +- ✅ **DO:** Monitor false positive rate +- ❌ **DON'T:** Deploy too many wake words at once + +### Speaker ID +- ✅ **DO:** Use 20+ seconds for enrollment +- ✅ **DO:** Re-enroll if accuracy drops +- ✅ **DO:** Test threshold values +- ❌ **DON'T:** Expect 100% accuracy + +--- + +## 🔧 Quick Commands + +```bash +# Download pre-trained model +./download_pretrained_models.sh --model hey-mycroft + +# Test model +precise-listen ~/precise-models/pretrained/hey-mycroft.net + +# Fine-tune from pre-trained +precise-train -e 30 custom.net . 
\ + --from-checkpoint ~/precise-models/pretrained/hey-mycroft.net + +# Enroll speaker +python enroll_speaker.py --name Alan --duration 20 + +# Start with single wake word +python voice_server.py --enable-precise \ + --precise-model hey-mycroft.net + +# Start with multiple wake words +python voice_server.py --enable-precise \ + --precise-models "mycroft:hey-mycroft.net:0.5,jarvis:hey-jarvis.net:0.5" + +# Check status +curl http://10.1.10.71:5000/wake-word/status + +# Monitor detections +curl http://10.1.10.71:5000/wake-word/detections +``` + +--- + +## 📚 See Also + +- **Full guide:** [ADVANCED_WAKE_WORD_TOPICS.md](ADVANCED_WAKE_WORD_TOPICS.md) +- **Training:** [MYCROFT_PRECISE_GUIDE.md](MYCROFT_PRECISE_GUIDE.md) +- **Deployment:** [PRECISE_DEPLOYMENT.md](PRECISE_DEPLOYMENT.md) +- **Getting started:** [QUICKSTART.md](QUICKSTART.md) + +--- + +## ❓ FAQ + +**Q: Can I use "Hey Mycroft" right away?** +A: Yes! Download with `./download_pretrained_models.sh --model hey-mycroft` + +**Q: How many wake words can I run at once?** +A: 2-3 comfortably on server. Maix Duino can handle 1. + +**Q: Can I train my own custom wake word?** +A: Yes! See MYCROFT_PRECISE_GUIDE.md Phase 2. + +**Q: Does speaker ID work with multiple wake words?** +A: Yes! Wake word detected → Speaker identified → Personalized response. + +**Q: Can I use this on Maix Duino?** +A: Server-side (start here), then convert to KMODEL (advanced). + +**Q: How accurate is speaker identification?** +A: 85-95% with good enrollment. Re-enroll if accuracy drops. + +**Q: What if someone has a cold?** +A: May reduce accuracy temporarily. System should recover when voice returns to normal. + +**Q: Can kids use it?** +A: Yes! Include their voices in training or enroll them separately. + +--- + +**Quick Decision:** Start with pre-trained Hey Mycroft. Add features later! + +```bash +./download_pretrained_models.sh --model hey-mycroft +precise-listen ~/precise-models/pretrained/hey-mycroft.net +# It just works! 
✨ +``` diff --git a/docs/maix-voice-assistant-architecture.md b/docs/maix-voice-assistant-architecture.md new file mode 100755 index 0000000..9e4424d --- /dev/null +++ b/docs/maix-voice-assistant-architecture.md @@ -0,0 +1,347 @@ +# Maix Duino Voice Assistant - System Architecture + +## Overview +Local voice assistant using Sipeed Maix Duino board integrated with Home Assistant, leveraging existing home lab infrastructure for AI processing. + +## Hardware Components + +### Maix Duino Board +- **Processor**: K210 dual-core RISC-V @ 400MHz +- **AI Accelerator**: KPU for neural network inference +- **Audio**: I2S microphone + speaker output +- **Connectivity**: ESP32 for WiFi/BLE +- **Programming**: MaixPy (MicroPython) + +### Recommended Accessories +- I2S MEMS microphone (or microphone array for better pickup) +- Small speaker (3-5W) or audio output to existing speakers +- USB-C power supply (5V/2A minimum) + +## Software Architecture + +### Edge Layer (Maix Duino) +``` +┌─────────────────────────────────────┐ +│ Maix Duino (MaixPy) │ +├─────────────────────────────────────┤ +│ • Wake Word Detection (KPU) │ +│ • Audio Capture (I2S) │ +│ • Audio Streaming → Heimdall │ +│ • Audio Playback ← Heimdall │ +│ • LED Feedback (listening status) │ +└─────────────────────────────────────┘ + ↕ WiFi/HTTP/WebSocket +┌─────────────────────────────────────┐ +│ Voice Processing Server │ +│ (Heimdall - 10.1.10.71) │ +├─────────────────────────────────────┤ +│ • Whisper STT (existing setup!) │ +│ • Intent Recognition (Rasa/custom) │ +│ • Piper TTS │ +│ • Home Assistant API Client │ +└─────────────────────────────────────┘ + ↕ REST API/MQTT +┌─────────────────────────────────────┐ +│ Home Assistant │ +│ (Your HA instance) │ +├─────────────────────────────────────┤ +│ • Device Control │ +│ • State Management │ +│ • Automation Triggers │ +└─────────────────────────────────────┘ +``` + +## Communication Flow + +### 1. 
Wake Word Detection (Local) +``` +User says "Hey Assistant" + ↓ +Maix Duino KPU detects wake word + ↓ +LED turns on (listening mode) + ↓ +Start audio streaming to Heimdall +``` + +### 2. Speech Processing (Heimdall) +``` +Audio stream received + ↓ +Whisper transcribes to text + ↓ +Intent parser extracts command + ↓ +Query Home Assistant API + ↓ +Generate response text + ↓ +Piper TTS creates audio + ↓ +Stream audio back to Maix Duino +``` + +### 3. Playback & Feedback +``` +Receive audio stream + ↓ +Play through speaker + ↓ +LED indicates completion + ↓ +Return to wake word detection +``` + +## Network Configuration + +### Maix Duino Network Settings +- **IP**: 10.1.10.xxx (assign static via DHCP reservation) +- **Gateway**: 10.1.10.1 +- **DNS**: 10.1.10.4 (Pi-hole) + +### Service Endpoints +- **Voice Processing Server**: http://10.1.10.71:5000 +- **Home Assistant**: (your existing HA URL) +- **MQTT Broker**: (optional, if using MQTT) + +### Caddy Reverse Proxy Entry +Add to `/mnt/project/epona_-_Caddyfile`: +```caddy +# Voice Assistant API +handle /voice-assistant* { + uri strip_prefix /voice-assistant + reverse_proxy http://10.1.10.71:5000 +} +``` + +## Software Stack + +### Maix Duino (MaixPy) +- **Firmware**: Latest MaixPy release +- **Libraries**: + - `Maix.KPU` - Neural network inference + - `Maix.I2S` - Audio capture/playback + - `socket` - Network communication + - `ujson` - JSON handling + +### Heimdall Server (Python) +- **Environment**: Create new conda env + ```bash + conda create -n voice-assistant python=3.10 + conda activate voice-assistant + ``` +- **Dependencies**: + - `openai-whisper` (already installed!) 
+ - `piper-tts` - Text-to-speech + - `flask` - REST API server + - `requests` - HTTP client + - `pyaudio` - Audio handling + - `websockets` - Real-time streaming + +### Optional: Intent Recognition +- **Rasa** - Full NLU framework (heavier but powerful) +- **Simple pattern matching** - Lightweight, start here +- **LLM-based** - Use your existing LLM setup on Heimdall + +## Data Flow Examples + +### Example 1: Turn on lights +``` +User: "Hey Assistant, turn on the living room lights" + ↓ +Wake word detected → Start recording + ↓ +Whisper STT: "turn on the living room lights" + ↓ +Intent Parser: { + "action": "turn_on", + "entity": "light.living_room" +} + ↓ +Home Assistant API: + POST /api/services/light/turn_on + {"entity_id": "light.living_room"} + ↓ +Response: "Living room lights turned on" + ↓ +Piper TTS → Audio playback +``` + +### Example 2: Get status +``` +User: "What's the temperature?" + ↓ +Whisper STT: "what's the temperature" + ↓ +Intent Parser: { + "action": "get_state", + "entity": "sensor.temperature" +} + ↓ +Home Assistant API: + GET /api/states/sensor.temperature + ↓ +Response: "The temperature is 72 degrees" + ↓ +Piper TTS → Audio playback +``` + +## Phase 1 Implementation Plan + +### Step 1: Maix Duino Setup (Week 1) +- [ ] Flash latest MaixPy firmware +- [ ] Test audio input/output +- [ ] Implement basic network communication +- [ ] Test streaming audio to server + +### Step 2: Server Setup (Week 1-2) +- [ ] Create conda environment on Heimdall +- [ ] Set up Flask API server +- [ ] Integrate Whisper (already have this!) 
+- [ ] Install and test Piper TTS +- [ ] Create basic Home Assistant API client + +### Step 3: Wake Word Training (Week 2) +- [ ] Record wake word samples +- [ ] Train custom wake word model +- [ ] Convert model for K210 KPU +- [ ] Test on-device detection + +### Step 4: Integration (Week 3) +- [ ] Connect all components +- [ ] Test end-to-end flow +- [ ] Add error handling +- [ ] Implement fallbacks + +### Step 5: Enhancement (Week 4+) +- [ ] Add more intents +- [ ] Improve NLU accuracy +- [ ] Add multi-room support +- [ ] Implement conversation context + +## Development Tools + +### Testing Wake Word +```python +# Use existing diarization.py for testing audio quality +python3 /path/to/diarization.py test_audio.wav \ + --format vtt \ + --model medium +``` + +### Monitoring +- Heimdall logs: `/var/log/voice-assistant/` +- Maix Duino serial console: 115200 baud +- Home Assistant logs: Standard HA logging + +## Security Considerations + +1. **No external cloud services** - Everything local +2. **Network isolation** - Keep on 10.1.10.0/24 +3. **Authentication** - Use HA long-lived tokens +4. **Rate limiting** - Prevent abuse +5. 
**Audio privacy** - Only stream after wake word + +## Resource Requirements + +### Heimdall +- **CPU**: Minimal (< 5% idle, spikes during STT) +- **RAM**: ~2GB for Whisper medium model +- **Storage**: ~5GB for models +- **Network**: Low bandwidth (16kHz audio stream) + +### Maix Duino +- **Power**: ~1-2W typical +- **Storage**: 16MB flash (plenty for wake word model) +- **RAM**: 8MB SRAM (sufficient for audio buffering) + +## Alternative Architectures + +### Option A: Fully On-Device (Limited) +- Everything on Maix Duino +- Very limited vocabulary +- No internet required +- Lower accuracy + +### Option B: Hybrid (Recommended) +- Wake word on Maix Duino +- Processing on Heimdall +- Best balance of speed/accuracy + +### Option C: Raspberry Pi Alternative +- If K210 proves limiting +- More processing power +- Still local/FOSS +- Higher cost + +## Expansion Ideas + +### Future Enhancements +1. **Multi-room**: Deploy multiple Maix Duino units +2. **Music playback**: Integrate with Plex +3. **Timers/Reminders**: Local scheduling +4. **Weather**: Pull from local weather station +5. **Calendar**: Sync with Nextcloud +6. **Intercom**: Room-to-room communication +7. **Sound events**: Doorbell, smoke alarm detection + +### Integration with Existing Infrastructure +- **Plex**: Voice control for media playback +- **qBittorrent**: Status queries, torrent management +- **Nextcloud**: Calendar/contact queries +- **Matrix**: Send messages via voice + +## Cost Estimate + +- Maix Duino board: ~$20-30 (already have!) 
+- Microphone: ~$5-10 (if not included) +- Speaker: ~$10-15 (or use existing) +- **Total**: $0-55 (mostly already have) + +Compare to commercial solutions: +- Google Home Mini: $50 (requires cloud) +- Amazon Echo Dot: $50 (requires cloud) +- Apple HomePod Mini: $99 (requires cloud) + +## Success Criteria + +### Minimum Viable Product (MVP) +- ✓ Wake word detection < 1 second +- ✓ Speech-to-text accuracy > 90% +- ✓ Home Assistant command execution +- ✓ Response time < 3 seconds total +- ✓ All processing local (no cloud) + +### Enhanced Version +- ✓ Multi-intent conversations +- ✓ Context awareness +- ✓ Multiple wake words +- ✓ Room-aware responses +- ✓ Custom voice training + +## Resources & Documentation + +### Official Documentation +- Maix Duino: https://wiki.sipeed.com/hardware/en/maix/ +- MaixPy: https://maixpy.sipeed.com/ +- Home Assistant API: https://developers.home-assistant.io/ + +### Wake Word Tools +- Mycroft Precise: https://github.com/MycroftAI/mycroft-precise +- Porcupine: https://github.com/Picovoice/porcupine + +### TTS Options +- Piper: https://github.com/rhasspy/piper +- Coqui TTS: https://github.com/coqui-ai/TTS + +### Community Projects +- Rhasspy: https://rhasspy.readthedocs.io/ (full voice assistant framework) +- Willow: https://github.com/toverainc/willow (ESP32-based alternative) + +## Next Steps + +1. **Test current setup**: Verify Maix Duino boots and can connect to WiFi +2. **Audio test**: Record and playback test on the board +3. **Server setup**: Create conda environment and install dependencies +4. **Simple prototype**: Wake word → beep (no processing yet) +5. 
**Iterate**: Add complexity step by step diff --git a/hardware/maixduino/MICROPYTHON_QUIRKS.md b/hardware/maixduino/MICROPYTHON_QUIRKS.md new file mode 100755 index 0000000..b53a819 --- /dev/null +++ b/hardware/maixduino/MICROPYTHON_QUIRKS.md @@ -0,0 +1,348 @@ +# MicroPython/MaixPy Quirks and Compatibility Notes + +**Date:** 2025-12-03 +**MicroPython Version:** v0.6.2-89-gd8901fd22 on 2024-06-17 +**Hardware:** Sipeed Maixduino (K210) + +This document captures all the compatibility issues and workarounds discovered while developing the voice assistant client for Maixduino. + +--- + +## String Formatting + +### ❌ F-strings NOT supported +```python +# WRONG - SyntaxError +message = f"IP: {ip}" +temperature = f"Temp: {temp}°C" +``` + +### ✅ Use string concatenation +```python +# CORRECT +message = "IP: " + str(ip) +temperature = "Temp: " + str(temp) + "°C" +``` + +--- + +## Conditional Expressions (Ternary Operator) + +### ❌ Inline ternary expressions NOT supported +```python +# WRONG - SyntaxError +plural = "s" if count > 1 else "" +message = "Found " + str(count) + " item" + ("s" if count > 1 else "") +``` + +### ✅ Use explicit if/else blocks +```python +# CORRECT +if count > 1: + plural = "s" +else: + plural = "" +message = "Found " + str(count) + " item" + plural +``` + +--- + +## String Methods + +### ❌ decode() doesn't accept keyword arguments +```python +# WRONG - TypeError: function doesn't take keyword arguments +text = response.decode('utf-8', errors='ignore') +``` + +### ✅ Use positional arguments only (or catch exceptions) +```python +# CORRECT +try: + text = response.decode('utf-8') +except: + text = str(response) +``` + +--- + +## Display/LCD Color Format + +### ❌ RGB tuples NOT accepted +```python +# WRONG - TypeError: can't convert tuple to int +COLOR_RED = (255, 0, 0) +lcd.draw_string(10, 50, "Hello", COLOR_RED, 0) +``` + +### ✅ Use bit-packed integers +```python +# CORRECT - Pack RGB into 16-bit or 24-bit integer +def rgb_to_int(r, g, b): + return (r 
<< 16) | (g << 8) | b + +COLOR_RED = rgb_to_int(255, 0, 0) +lcd.draw_string(10, 50, "Hello", COLOR_RED, 0) +``` + +--- + +## Network - WiFi Module + +### ❌ Standard network.WLAN NOT available +```python +# WRONG - AttributeError: 'module' object has no attribute 'WLAN' +import network +nic = network.WLAN(network.STA_IF) +``` + +### ✅ Use network.ESP32_SPI for Maixduino +```python +# CORRECT - Requires full pin configuration +from network import ESP32_SPI +from fpioa_manager import fm + +# Register all 6 SPI pins +fm.register(25, fm.fpioa.GPIOHS10, force=True) # CS +fm.register(8, fm.fpioa.GPIOHS11, force=True) # RST +fm.register(9, fm.fpioa.GPIOHS12, force=True) # RDY +fm.register(28, fm.fpioa.GPIOHS13, force=True) # MOSI +fm.register(26, fm.fpioa.GPIOHS14, force=True) # MISO +fm.register(27, fm.fpioa.GPIOHS15, force=True) # SCLK + +nic = ESP32_SPI( + cs=fm.fpioa.GPIOHS10, + rst=fm.fpioa.GPIOHS11, + rdy=fm.fpioa.GPIOHS12, + mosi=fm.fpioa.GPIOHS13, + miso=fm.fpioa.GPIOHS14, + sclk=fm.fpioa.GPIOHS15 +) + +nic.connect(SSID, PASSWORD) +``` + +### ❌ active() method NOT available +```python +# WRONG - AttributeError: 'ESP32_SPI' object has no attribute 'active' +nic.active(True) +``` + +### ✅ Just use connect() directly +```python +# CORRECT +nic.connect(SSID, PASSWORD) +``` + +--- + +## I2S Audio + +### ❌ record() doesn't accept size parameter only +```python +# WRONG - TypeError: object with buffer protocol required +chunk = i2s_dev.record(1024) +``` + +### ✅ Returns Audio object, use to_bytes() +```python +# CORRECT +audio_obj = i2s_dev.record(total_bytes) +audio_data = audio_obj.to_bytes() +``` + +**Note:** Audio data often comes in unexpected formats: +- Expected: 16-bit mono PCM +- Reality: Often 32-bit or stereo (4x expected size) +- Solution: Implement format detection and conversion + +--- + +## Memory Management + +### Memory is VERY limited (~6MB total, much less available) + +**Problems encountered:** +- Creating large bytearrays fails (>100KB can fail) +- 
Multiple allocations cause fragmentation +- In-place operations preferred over creating new buffers + +### ❌ Creating new buffers +```python +# WRONG - MemoryError on large data +compressed = bytearray() +for i in range(0, len(data), 4): + compressed.extend(data[i:i+2]) # Allocates new memory +``` + +### ✅ Work with smaller chunks or compress during transmission +```python +# CORRECT - Process in smaller pieces +chunk_size = 512 +for i in range(0, len(data), chunk_size): + chunk = data[i:i+chunk_size] + process_chunk(chunk) # Handle incrementally +``` + +**Solutions implemented:** +1. Reduce recording duration (3s → 1s) +2. Compress audio (μ-law: 50% size reduction) +3. Stream transmission in small chunks (512 bytes) +4. Add delays between sends to prevent buffer overflow + +--- + +## String Operations + +### ❌ Arithmetic in string concatenation +```python +# WRONG - SyntaxError (sometimes) +message = "Count: #" + str(count + 1) +``` + +### ✅ Separate arithmetic from concatenation +```python +# CORRECT +next_count = count + 1 +message = "Count: #" + str(next_count) +``` + +--- + +## Bytearray Operations + +### ❌ Item deletion NOT supported +```python +# WRONG - TypeError: 'bytearray' object doesn't support item deletion +del audio_data[expected_size:] +``` + +### ✅ Create new bytearray with slice +```python +# CORRECT +audio_data = audio_data[:expected_size] +# Or create new buffer +trimmed = bytearray(expected_size) +trimmed[:] = audio_data[:expected_size] +``` + +--- + +## HTTP Requests + +### ❌ urequests module NOT available +```python +# WRONG - ImportError: no module named 'urequests' +import urequests +response = urequests.post(url, data=data) +``` + +### ✅ Use raw socket HTTP +```python +# CORRECT +import socket + +s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +s.connect((host, port)) + +# Manual HTTP headers +headers = "POST /path HTTP/1.1\r\n" +headers += "Host: " + host + "\r\n" +headers += "Content-Type: audio/wav\r\n" +headers += 
"Content-Length: " + str(len(data)) + "\r\n" +headers += "Connection: close\r\n\r\n" + +s.send(headers.encode()) +s.send(data) + +response = s.recv(1024) +s.close() +``` + +**Socket I/O errors common:** +- `[Errno 5] EIO` - Buffer overflow or disconnect +- Solutions: + - Send smaller chunks (512-1024 bytes) + - Add delays between sends (`time.sleep_ms(10)`) + - Enable keepalive if supported + +--- + +## Best Practices for MaixPy + +1. **Avoid complex expressions** - Break into simple steps +2. **Pre-allocate when possible** - Reduce fragmentation +3. **Use small buffers** - 512-1024 byte chunks work well +4. **Add delays in loops** - Prevent watchdog/buffer issues +5. **Explicit type conversions** - Always use `str()`, `int()`, etc. +6. **Test incrementally** - Memory errors appear suddenly +7. **Monitor serial output** - Errors often give hints +8. **Simplify, simplify** - Complexity = bugs in MicroPython + +--- + +## Testing Methodology + +When porting Python code to MaixPy: + +1. Start with simplest version (hardcoded values) +2. Test each function individually via REPL +3. Add features incrementally +4. Watch for memory errors (usually allocation failures) +5. If error occurs, simplify the last change +6. 
Use print statements liberally (no debugger available) + +--- + +## Hardware-Specific Notes + +### Maixduino ESP32 WiFi +- Requires manual pin registration +- 6 pins must be configured (CS, RST, RDY, MOSI, MISO, SCLK) +- Connection can be slow (20+ seconds) +- Stability improves with smaller packet sizes + +### I2S Microphone +- Returns Audio objects, not raw bytes +- Format is often different than configured +- May return stereo when mono requested +- May return 32-bit when 16-bit requested +- Always implement format detection/conversion + +### BOOT Button (GPIO 16) +- Active low (0 = pressed, 1 = released) +- Requires pull-up configuration +- Debounce by waiting for release +- Can be used without interrupts (polling is fine) + +--- + +## Resources + +- **MaixPy Documentation:** https://maixpy.sipeed.com/ +- **K210 Datasheet:** https://canaan.io/product/kendryteai +- **ESP32 SPI Firmware:** https://github.com/sipeed/MaixPy_scripts/tree/master/network + +--- + +## Summary of Successful Patterns + +```python +# Audio recording and transmission pipeline +1. Record audio → Audio object (128KB for 1 second) +2. Convert to bytes → to_bytes() (still 128KB) +3. Detect format → Check size vs expected +4. Convert to mono 16-bit → In-place copy (32KB) +5. Compress with μ-law → 50% reduction (16KB) +6. Send in chunks → 512 bytes at a time with delays +7. Parse response → Simple string operations + +# Total: ~85% size reduction, fits in memory! +``` + +This approach works reliably on K210 with ~6MB RAM. + +--- + +**Last Updated:** 2025-12-03 +**Status:** Fully tested and working diff --git a/hardware/maixduino/README.md b/hardware/maixduino/README.md new file mode 100755 index 0000000..bcf6bf5 --- /dev/null +++ b/hardware/maixduino/README.md @@ -0,0 +1,184 @@ +# Maixduino Scripts + +Scripts to copy/paste into MaixPy IDE for running on the Maix Duino board. + +## Files + +### 1. 
maix_test_simple.py +**Purpose:** Hardware and connectivity test +**Use:** Copy/paste into MaixPy IDE to test before deploying full application + +**Tests:** +- LCD display functionality +- WiFi connection +- Network connection to Heimdall server (port 3006) +- I2S audio hardware initialization + +**Before running:** +1. Edit WiFi credentials (lines 16-17): + ```python + WIFI_SSID = "YourNetworkName" + WIFI_PASSWORD = "YourPassword" + ``` +2. Verify server URL is correct (line 18): + ```python + SERVER_URL = "http://10.1.10.71:3006" + ``` +3. Copy entire file contents +4. Paste into MaixPy IDE +5. Click RUN button + +**Expected output:** +- Display will show test results +- Serial console will print detailed progress +- Will report OK/FAIL for each test + +--- + +### 2. maix_voice_client.py +**Purpose:** Full voice assistant client +**Use:** Copy/paste into MaixPy IDE after test passes + +**Features:** +- Wake word detection (placeholder - uses amplitude trigger) +- Audio recording after wake word +- Sends audio to Heimdall server for processing +- Displays transcription and response on LCD +- LED feedback for status + +**Before running:** +1. Edit WiFi credentials (lines 38-39) +2. Verify server URL (line 42) +3. Adjust audio settings if needed (lines 45-62) + +**For SD card deployment:** +1. Copy this file to SD card as `main.py` +2. Board will auto-run on boot + +--- + +## Deployment Workflow + +### Step 1: Test Hardware (maix_test_simple.py) +``` +1. Edit WiFi settings +2. Paste into MaixPy IDE +3. Click RUN +4. Verify all tests pass +``` + +### Step 2: Deploy Full Client (maix_voice_client.py) +**Option A - IDE Testing:** +``` +1. Edit WiFi settings +2. Paste into MaixPy IDE +3. Click RUN for testing +``` + +**Option B - Permanent SD Card:** +``` +1. Edit WiFi settings +2. Save to SD card as: /sd/main.py +3. 
Reboot board - auto-runs on boot +``` + +--- + +## Hardware Requirements + +### Maix Duino Board +- K210 processor with KPU +- LCD display (built-in) +- I2S microphone (check connections) +- ESP32 WiFi module (built-in) + +### I2S Pin Configuration (Default) +```python +Pin 20: I2S0_IN_D0 (Data) +Pin 19: I2S0_WS (Word Select) +Pin 18: I2S0_SCLK (Clock) +``` + +**Note:** If your microphone uses different pins, edit the pin assignments in the scripts. + +--- + +## Troubleshooting + +### WiFi Won't Connect +- Verify SSID and password are correct +- Ensure WiFi is 2.4GHz (not 5GHz - Maix doesn't support 5GHz) +- Check signal strength +- Try moving closer to router + +### Server Connection Fails +- Verify Heimdall server is running on port 3006 +- Check firewall allows port 3006 +- Ensure Maix is on same network (10.1.10.0/24) +- Test from another device: `curl http://10.1.10.71:3006/health` + +### Audio Initialization Fails +- Check microphone is properly connected +- Verify I2S pins match your hardware +- Try alternate pin configuration if needed +- Check microphone requires 3.3V (not 5V) + +### Script Errors in MaixPy IDE +- Ensure using latest MaixPy firmware +- Check for typos when editing WiFi credentials +- Verify entire script was copied (check for truncation) +- Look at serial console for detailed error messages + +--- + +## MaixPy IDE Tips + +### Running Scripts +1. Connect board via USB +2. Select correct board model: Tools → Select Board +3. Click connect button (turns red when connected) +4. Paste code into editor +5. Click run button (red triangle) +6. 
Watch serial console and LCD for output + +### Stopping Scripts +- Click run button again to stop +- Or press reset button on board + +### Serial Console +- Shows detailed debug output +- Useful for troubleshooting +- Can copy errors for debugging + +--- + +## Network Configuration + +- **Heimdall Server:** 10.1.10.71:3006 +- **Maix Duino:** Gets IP via DHCP (shown on LCD during test) +- **Network:** 10.1.10.0/24 + +--- + +## Next Steps + +After both scripts work: +1. Verify Heimdall server is processing audio +2. Test wake word detection +3. Integrate with Home Assistant (optional) +4. Train custom wake word (optional) +5. Deploy to SD card for permanent installation + +--- + +## Related Documentation + +- **Project overview:** `../PROJECT_SUMMARY.md` +- **Heimdall setup:** `../QUICKSTART.md` +- **Wake word training:** `../MYCROFT_PRECISE_GUIDE.md` +- **Server deployment:** `../docs/PRECISE_DEPLOYMENT.md` + +--- + +**Last Updated:** 2025-12-03 +**Location:** `/Library/Development/devl/Devops/projects/mycroft-precise/maixduino-scripts/` diff --git a/hardware/maixduino/SESSION_PROGRESS_2025-12-03.md b/hardware/maixduino/SESSION_PROGRESS_2025-12-03.md new file mode 100755 index 0000000..e389a53 --- /dev/null +++ b/hardware/maixduino/SESSION_PROGRESS_2025-12-03.md @@ -0,0 +1,376 @@ +# Maixduino Voice Assistant - Session Progress + +**Date:** 2025-12-03 +**Session Duration:** ~4 hours +**Goal:** Get audio recording and transcription working on Maixduino → Heimdall server + +--- + +## 🎉 Major Achievements + +### ✅ Full Audio Pipeline Working! +We successfully built and tested the complete audio capture → compression → transmission → transcription pipeline: + +1. **WiFi Connection** - Maixduino connects to network (10.1.10.98) +2. **Audio Recording** - I2S microphone captures audio (MSM261S4030H0 MEMS mic) +3. **Format Conversion** - Converts 32-bit stereo to 16-bit mono (4x size reduction) +4. **μ-law Compression** - Compresses PCM audio by 50% +5. 
**HTTP Transmission** - Sends compressed WAV to Heimdall server +6. **Whisper Transcription** - Server transcribes and returns text +7. **LCD Display** - Shows transcription on Maixduino screen +8. **Button Loop** - Press BOOT button for repeated recordings + +**Total size reduction:** 128KB → 32KB (mono) → 16KB (compressed) = **87.5% reduction!** + +--- + +## 🔧 Technical Accomplishments + +### Audio Recording Pipeline +- **Initial Problem:** `i2s_dev.record()` returned immediately (1ms instead of 1000ms) +- **Root Cause:** Recording API is asynchronous/non-blocking +- **Solution:** Use chunked recording with `wait_record()` blocking calls +- **Pattern:** + ```python + for i in range(frame_cnt): + audio_chunk = i2s_dev.record(chunk_size) + i2s_dev.wait_record() # CRITICAL: blocks until complete + chunks.append(audio_chunk.to_bytes()) + ``` + +### Memory Management +- **K210 has very limited RAM** (~6MB total, much less available) +- Successfully handled 128KB → 16KB data transformation without OOM errors +- Techniques used: + - Record in small chunks (2048 samples) + - Stream HTTP transmission (512-byte chunks with delays) + - In-place data conversion where possible + - Explicit garbage collection hints (`audio_data = None`) + +### Network Communication +- **Raw socket HTTP** (no urequests library available) +- **Chunked streaming** with flow control (10ms delays) +- **Simple WAV format** with μ-law compression (format code 7) +- **Robust error handling** with serial output debugging + +--- + +## 🐛 MicroPython/MaixPy Quirks Discovered + +### String Operations +- ❌ **F-strings NOT supported** - Must use `"text " + str(var)` concatenation +- ❌ **Ternary operators fail** - Use explicit `if/else` blocks instead +- ❌ **`split()` needs explicit delimiter** - `text.split(" ")` not `text.split()` +- ❌ **Escape sequences problematic** - Avoid `\n` in strings, causes syntax errors + +### Data Types & Methods +- ❌ **`decode()` doesn't accept kwargs** - Use `decode('utf-8')` 
not `decode('utf-8', errors='ignore')` +- ❌ **RGB tuples not accepted** - Must convert to packed integers: `(r << 16) | (g << 8) | b` +- ❌ **Bytearray item deletion unsupported** - `del arr[n:]` fails, use slicing instead +- ❌ **Arithmetic in string concat** - Separate calculations: `next = count + 1; "text" + str(next)` + +### I2S Audio Specific +- ❌ **`record()` is non-blocking** - Returns immediately, must use `wait_record()` +- ❌ **Audio object not directly iterable** - Must call `.to_bytes()` first +- ⚠️ **Data format mismatch** - Hardware returns 32-bit stereo even when configured for 16-bit mono (4x expected size) + +### Network/WiFi +- ❌ **`network.WLAN` not available** - Must use `network.ESP32_SPI` with full pin config +- ❌ **`active()` method doesn't exist** - Just call `connect()` directly +- ⚠️ **Requires ALL 6 pins configured** - CS, RST, RDY, MOSI, MISO, SCLK + +### General Syntax +- ⚠️ **`if __name__ == "__main__"` sometimes causes syntax errors** - Safer to just call `main()` directly +- ⚠️ **Import statements mid-function can cause syntax errors** - Keep imports at top of file +- ⚠️ **Some valid Python causes "invalid syntax" for unknown reasons** - Simplify complex expressions + +--- + +## 📊 Current Status + +### ✅ Working +- WiFi connectivity (ESP32 SPI) +- I2S audio initialization +- Chunked audio recording with `wait_record()` +- Audio format detection and conversion (32-bit stereo → 16-bit mono) +- μ-law compression (50% size reduction) +- HTTP transmission to server (chunked streaming) +- Whisper transcription (server-side) +- JSON response parsing +- LCD display (with word wrapping) +- Button-triggered recording loop +- Countdown timer before recording + +### ⚠️ Partially Working +- **Recording duration** - Currently getting ~0.9 seconds instead of full 1 second + - Formula: `frame_cnt = seconds * sample_rate // chunk_size` + - Current: `7 frames × (2048/16000) = 0.896s` + - May need to increase `frame_cnt` or adjust chunk size + +### ❌ Not 
Yet Implemented +- Mycroft Precise wake word detection +- Full voice assistant loop +- Command processing +- Home Assistant integration +- Multi-second recording support +- Real-time audio streaming + +--- + +## 🔬 Technical Details + +### Hardware Configuration + +**Maixduino Board:** +- Processor: K210 dual-core RISC-V @ 400MHz +- RAM: ~6MB total (limited available memory) +- WiFi: ESP32 module via SPI +- Microphone: MSM261S4030H0 MEMS (onboard) +- IP Address: 10.1.10.98 + +**I2S Pins:** +- Pin 20: I2S0_IN_D0 (data) +- Pin 19: I2S0_WS (word select) +- Pin 18: I2S0_SCLK (clock) + +**ESP32 SPI Pins:** +- Pin 25: CS (chip select) +- Pin 8: RST (reset) +- Pin 9: RDY (ready) +- Pin 28: MOSI (master out) +- Pin 26: MISO (master in) +- Pin 27: SCLK (clock) + +**GPIO:** +- Pin 16: BOOT button (active low, pull-up) + +### Server Configuration + +**Heimdall Server:** +- IP: 10.1.10.71 +- Port: 3006 +- Framework: Flask +- Model: Whisper base +- Environment: Conda `whisper_cli` + +**Endpoints:** +- `/health` - Health check +- `/transcribe` - POST audio for transcription + +### Audio Format + +**Recording:** +- Sample Rate: 16kHz +- Hardware Output: 32-bit stereo (128KB for 1 second) +- After Conversion: 16-bit mono (32KB for 1 second) +- After Compression: 8-bit μ-law (16KB for 1 second) + +**WAV Header:** +- Format Code: 7 (μ-law) +- Channels: 1 (mono) +- Sample Rate: 16000 Hz +- Bits per Sample: 8 +- Includes `fact` chunk (required for μ-law) + +--- + +## 📝 Code Files + +### Main Script +**File:** `/Library/Development/devl/Devops/projects/mycroft-precise/maixduino-scripts/maix_simple_record_test.py` + +**Key Functions:** +- `init_wifi()` - ESP32 SPI WiFi connection +- `init_audio()` - I2S microphone setup +- `record_audio()` - Chunked recording with `wait_record()` +- `convert_to_mono_16bit()` - Format conversion (32-bit stereo → 16-bit mono) +- `compress_ulaw()` - μ-law compression +- `create_wav_header()` - WAV file header generation +- `send_to_server()` - HTTP POST 
with chunked streaming +- `display_transcription()` - LCD output with word wrapping +- `main()` - Button loop for repeated recordings + +### Server Script +**File:** `/devl/voice-assistant/simple_transcribe_server.py` + +**Features:** +- Accepts raw WAV or multipart uploads +- Whisper base model transcription +- JSON response with transcription text +- Handles μ-law compressed audio + +### Documentation +**File:** `/Library/Development/devl/Devops/projects/mycroft-precise/maixduino-scripts/MICROPYTHON_QUIRKS.md` + +Complete reference of all MicroPython compatibility issues discovered during development. + +--- + +## 🎯 Next Steps + +### Immediate (Tonight) +1. ✅ Switch to Linux laptop with direct serial access +2. ⏭️ Tune recording duration to get full 1 second + - Try `frame_cnt = 8` instead of 7 + - Or adjust chunk size to get exact timing +3. ⏭️ Test transcription quality with proper-length recordings + +### Short Term (This Week) +1. Increase recording duration to 2-3 seconds for better transcription +2. Test memory limits with longer recordings +3. Optimize compression/transmission for speed +4. Add visual feedback during transmission + +### Medium Term (Next Week) +1. Install Mycroft Precise in `whisper_cli` environment +2. Test "hey mycroft" wake word detection on server +3. Integrate wake word into recording loop +4. Add command processing and Home Assistant integration + +### Long Term (Future) +1. Explore edge wake word detection (Precise on K210) +2. Multi-device deployment +3. Continuous listening mode +4. 
Voice profiles and speaker identification
+
+---
+
+## 🐛 Known Issues
+
+### Recording Duration
+- **Issue:** Recording is ~0.9 seconds instead of 1.0 seconds
+- **Cause:** `16000 / 2048 ≈ 7.8`, which integer division (`//`) truncates to 7 frames
+- **Impact:** Minor - transcription still works
+- **Fix:** Increase `frame_cnt` to 8 or adjust chunk size
+
+### Data Format Mismatch
+- **Issue:** Hardware returns 4x expected data (128KB vs 32KB)
+- **Cause:** I2S outputting 32-bit stereo despite 16-bit mono config
+- **Impact:** None - conversion function handles it
+- **Status:** Working as intended
+
+### Syntax Error Sensitivity
+- **Issue:** Some valid Python causes "invalid syntax" in MicroPython
+- **Patterns:** Import statements mid-function, certain arithmetic expressions
+- **Workaround:** Simplify code, avoid complex expressions
+- **Status:** Documented in MICROPYTHON_QUIRKS.md
+
+---
+
+## 💡 Key Learnings
+
+### I2S Recording Pattern
+The correct pattern for MaixPy I2S recording:
+```python
+chunk_size = 2048
+frame_cnt = seconds * sample_rate // chunk_size
+
+for i in range(frame_cnt):
+    audio_chunk = i2s_dev.record(chunk_size)
+    i2s_dev.wait_record()  # BLOCKS until recording complete
+    data.append(audio_chunk.to_bytes())
+```
+
+**Critical:** `wait_record()` is REQUIRED or recording returns immediately!
+
+### Memory Management
+K210 has very limited RAM. Successful strategies:
+- Work in small chunks (512-2048 bytes)
+- Stream data instead of buffering
+- Free variables explicitly when done
+- Avoid creating large intermediate buffers
+
+### MicroPython Compatibility
+MicroPython is NOT Python. Many standard features missing:
+- F-strings, ternary operators, keyword arguments
+- Some string methods, complex expressions
+- Standard libraries (urequests, json parsing)
+
+**Rule:** Test incrementally, simplify everything, check quirks doc.
+ +--- + +## 📚 Resources Used + +### Documentation +- [MaixPy I2S API Reference](https://wiki.sipeed.com/soft/maixpy/en/api_reference/Maix/i2s.html) +- [MaixPy I2S Usage Guide](https://wiki.sipeed.com/soft/maixpy/en/modules/on_chip/i2s.html) +- [Maixduino Hardware Wiki](https://wiki.sipeed.com/hardware/en/maix/maixpy_develop_kit_board/maix_duino.html) + +### Code Examples +- [Official record_wav.py](https://github.com/sipeed/MaixPy-v1_scripts/blob/master/multimedia/audio/record_wav.py) +- [MaixPy Scripts Repository](https://github.com/sipeed/MaixPy-v1_scripts) + +### Tools +- MaixPy IDE (copy/paste to board) +- Serial monitor (debugging) +- Heimdall server (Whisper transcription) + +--- + +## 🔄 Ready for Next Session + +### Current State +- ✅ Code is working and stable +- ✅ Can record, compress, transmit, transcribe, display +- ✅ Button loop allows repeated testing +- ⚠️ Recording duration slightly short (~0.9s) + +### Files Ready +- `/Library/Development/devl/Devops/projects/mycroft-precise/maixduino-scripts/maix_simple_record_test.py` +- `/Library/Development/devl/Devops/projects/mycroft-precise/maixduino-scripts/MICROPYTHON_QUIRKS.md` +- `/devl/voice-assistant/simple_transcribe_server.py` + +### For Serial Access Session +1. Connect Maixduino via USB to Linux laptop +2. Install pyserial: `pip install pyserial` +3. Find device: `ls /dev/ttyUSB*` or `/dev/ttyACM*` +4. Connect: `screen /dev/ttyUSB0 115200` or use MaixPy IDE +5. Can directly modify code, test immediately, see serial output + +### Quick Test Commands +```python +# Test WiFi +from network import ESP32_SPI +# ... (full init code in maix_test_simple.py) + +# Test I2S +from Maix import I2S +rx = I2S(I2S.DEVICE_0) +# ... 
+ +# Test recording +audio = rx.record(2048) +rx.wait_record() +print(len(audio.to_bytes())) +``` + +--- + +## 🎊 Success Metrics + +Today we achieved: +- ✅ WiFi connection working +- ✅ Audio recording working (with proper blocking) +- ✅ Format conversion working (4x reduction) +- ✅ Compression working (2x reduction) +- ✅ Network transmission working (chunked streaming) +- ✅ Server transcription working +- ✅ Display output working +- ✅ Button loop working +- ✅ End-to-end pipeline complete! + +**Total:** 9/9 core features working! 🚀 + +Minor tuning needed, but the foundation is solid and ready for wake word integration. + +--- + +**Session Summary:** Massive progress! From zero to working audio transcription pipeline in one session. Overcame significant MicroPython compatibility challenges and memory limitations. Ready for next phase: wake word detection. + +**Status:** ✅ Ready for Linux serial access and fine-tuning +**Next Session:** Tune recording duration, then integrate Mycroft Precise wake word detection + +--- + +*End of Session Report - 2025-12-03* diff --git a/hardware/maixduino/maix_debug_wifi.py b/hardware/maixduino/maix_debug_wifi.py new file mode 100755 index 0000000..fcab3ba --- /dev/null +++ b/hardware/maixduino/maix_debug_wifi.py @@ -0,0 +1,41 @@ +# Debug script to discover WiFi module methods +# This will help us figure out the correct API + +import lcd + +lcd.init() +lcd.clear() + +print("=" * 40) +print("WiFi Module Debug") +print("=" * 40) + +# Try to import WiFi module +try: + from network_esp32 import wifi + print("SUCCESS: Imported network_esp32.wifi") + lcd.draw_string(10, 10, "WiFi module found!", 0xFFFF, 0x0000) + + # List all attributes/methods + print("\nAvailable methods:") + lcd.draw_string(10, 30, "Checking methods...", 0xFFFF, 0x0000) + + attrs = dir(wifi) + y = 50 + for i, attr in enumerate(attrs): + if not attr.startswith('_'): + print(" - " + attr) + if i < 10: # Only show first 10 on screen + lcd.draw_string(10, y, attr[:20], 
0x07E0, 0x0000) + y += 15 + + print("\nTotal methods: " + str(len(attrs))) + +except Exception as e: + print("ERROR importing wifi: " + str(e)) + lcd.draw_string(10, 10, "WiFi import failed!", 0xF800, 0x0000) + lcd.draw_string(10, 30, str(e)[:30], 0xF800, 0x0000) + +print("\n" + "=" * 40) +print("Debug complete - check serial output") +print("=" * 40) diff --git a/hardware/maixduino/maix_discover_modules.py b/hardware/maixduino/maix_discover_modules.py new file mode 100755 index 0000000..476c263 --- /dev/null +++ b/hardware/maixduino/maix_discover_modules.py @@ -0,0 +1,51 @@ +# Discover what network/WiFi modules are actually available +import lcd +import sys + +lcd.init() +lcd.clear() + +print("=" * 40) +print("Module Discovery") +print("=" * 40) + +# Try different possible module names +modules_to_try = [ + "network", + "network_esp32", + "network_esp8285", + "esp32_spi", + "esp8285", + "wifi", + "ESP32_SPI", + "WIFI" +] + +found = [] +y = 10 + +for module_name in modules_to_try: + try: + mod = __import__(module_name) + msg = "FOUND: " + module_name + print(msg) + lcd.draw_string(10, y, msg[:25], 0x07E0, 0x0000) # Green + y += 15 + found.append(module_name) + + # Show methods + print(" Methods: " + str(dir(mod))) + + except Exception as e: + msg = "NONE: " + module_name + print(msg + " (" + str(e) + ")") + +print("\n" + "=" * 40) +if found: + print("Found modules: " + str(found)) + lcd.draw_string(10, y + 20, "Found: " + str(len(found)), 0xFFFF, 0x0000) +else: + print("No WiFi modules found!") + lcd.draw_string(10, y + 20, "No WiFi found!", 0xF800, 0x0000) + +print("=" * 40) diff --git a/hardware/maixduino/maix_simple_record_test.py b/hardware/maixduino/maix_simple_record_test.py new file mode 100644 index 0000000..0d9db28 --- /dev/null +++ b/hardware/maixduino/maix_simple_record_test.py @@ -0,0 +1,461 @@ +# Simple Audio Recording and Transcription Test +# Record audio for 3 seconds, send to server, display transcription +# +# This tests the full audio pipeline 
without wake word detection + +import time +import lcd +import socket +import struct +from Maix import GPIO, I2S +from fpioa_manager import fm + +# ===== CONFIGURATION ===== +# Load credentials from secrets.py (gitignored) +try: + from secrets import SECRETS +except ImportError: + SECRETS = {} + +WIFI_SSID = "Tell My WiFi Love Her" +WIFI_PASSWORD = SECRETS.get("wifi_password", "") # set in secrets.py +SERVER_HOST = "10.1.10.71" +SERVER_PORT = 3006 +RECORD_SECONDS = 1 # Reduced to 1 second to save memory +SAMPLE_RATE = 16000 +# ========================== + +# Colors +def rgb_to_int(r, g, b): + return (r << 16) | (g << 8) | b + +COLOR_BLACK = 0 +COLOR_WHITE = rgb_to_int(255, 255, 255) +COLOR_RED = rgb_to_int(255, 0, 0) +COLOR_GREEN = rgb_to_int(0, 255, 0) +COLOR_BLUE = rgb_to_int(0, 0, 255) +COLOR_YELLOW = rgb_to_int(255, 255, 0) +COLOR_CYAN = 0x00FFFF # Cyan: rgb_to_int(0, 255, 255) + +def display_msg(msg, color=COLOR_WHITE, y=50, clear=False): + """Display message on LCD""" + if clear: + lcd.clear(COLOR_BLACK) + lcd.draw_string(10, y, msg[:30], color, COLOR_BLACK) + print(msg) + +def init_wifi(): + """Initialize WiFi connection""" + from network import ESP32_SPI + + lcd.init() + lcd.clear(COLOR_BLACK) + display_msg("Connecting WiFi...", COLOR_BLUE, 10) + + # Register ESP32 SPI pins + fm.register(25, fm.fpioa.GPIOHS10, force=True) # CS + fm.register(8, fm.fpioa.GPIOHS11, force=True) # RST + fm.register(9, fm.fpioa.GPIOHS12, force=True) # RDY + fm.register(28, fm.fpioa.GPIOHS13, force=True) # MOSI + fm.register(26, fm.fpioa.GPIOHS14, force=True) # MISO + fm.register(27, fm.fpioa.GPIOHS15, force=True) # SCLK + + nic = ESP32_SPI( + cs=fm.fpioa.GPIOHS10, rst=fm.fpioa.GPIOHS11, rdy=fm.fpioa.GPIOHS12, + mosi=fm.fpioa.GPIOHS13, miso=fm.fpioa.GPIOHS14, sclk=fm.fpioa.GPIOHS15 + ) + + nic.connect(WIFI_SSID, WIFI_PASSWORD) + + # Wait for connection + timeout = 20 + while timeout > 0: + time.sleep(1) + if nic.isconnected(): + ip = nic.ifconfig()[0] + display_msg("WiFi OK: " + 
str(ip), COLOR_GREEN, 30) + return nic + timeout -= 1 + + display_msg("WiFi FAILED!", COLOR_RED, 30) + return None + +def init_audio(): + """Initialize I2S audio""" + display_msg("Init audio...", COLOR_BLUE, 50) + + # Register I2S pins + fm.register(20, fm.fpioa.I2S0_IN_D0, force=True) + fm.register(19, fm.fpioa.I2S0_WS, force=True) + fm.register(18, fm.fpioa.I2S0_SCLK, force=True) + + # Initialize I2S + rx = I2S(I2S.DEVICE_0) + rx.channel_config(rx.CHANNEL_0, rx.RECEIVER, align_mode=I2S.STANDARD_MODE) + rx.set_sample_rate(SAMPLE_RATE) + + display_msg("Audio OK!", COLOR_GREEN, 70) + return rx + +def convert_to_mono_16bit(audio_data): + """Convert audio to mono 16-bit by returning a slice""" + expected_size = SAMPLE_RATE * RECORD_SECONDS * 2 # 16-bit mono + actual_size = len(audio_data) + + print("Expected size: " + str(expected_size) + ", Actual: " + str(actual_size)) + + # If we got 4x the expected data, downsample to mono + if actual_size == expected_size * 4: + print("Extracting mono from stereo/32-bit...") + # Create new buffer with only the data we need (every 4th pair of bytes) + mono_data = bytearray(expected_size) + write_pos = 0 + # Read every 4 bytes, take first 2 bytes only + for read_pos in range(0, actual_size, 4): + if write_pos + 1 < expected_size and read_pos + 1 < actual_size: + mono_data[write_pos] = audio_data[read_pos] + mono_data[write_pos + 1] = audio_data[read_pos + 1] + write_pos += 2 + + # Free original buffer explicitly + audio_data = None + return mono_data + + # If we got 2x the expected data, extract mono + elif actual_size == expected_size * 2: + print("Extracting mono from stereo...") + mono_data = bytearray(expected_size) + write_pos = 0 + for read_pos in range(0, actual_size, 4): + if write_pos + 1 < expected_size and read_pos + 1 < actual_size: + mono_data[write_pos] = audio_data[read_pos] + mono_data[write_pos + 1] = audio_data[read_pos + 1] + write_pos += 2 + + # Free original + audio_data = None + return mono_data + + # 
Otherwise assume it's already correct format
+    print("Audio data appears to be correct format")
+    return audio_data
+
+def record_audio(i2s_dev, seconds):
+    """Record audio for specified seconds using chunked recording with wait"""
+    # Clear screen and show big recording indicator
+    lcd.clear(COLOR_BLACK)
+
+    # Show large "RECORDING" text
+    display_msg("*** RECORDING ***", COLOR_RED, 60)
+    display_msg("Speak now!", COLOR_YELLOW, 100)
+    display_msg("(listening...)", COLOR_WHITE, 130)
+
+    chunk_size = 2048
+    channels = 1
+
+    # Calculate number of chunks needed
+    frame_cnt = seconds * SAMPLE_RATE // chunk_size
+    print("Recording " + str(frame_cnt) + " frames...")
+
+    # Recording loop with wait
+    all_chunks = []
+    for i in range(frame_cnt):
+        # Start recording this chunk
+        audio_chunk = i2s_dev.record(chunk_size * channels)
+
+        # CRITICAL: Wait for recording to complete
+        i2s_dev.wait_record()
+
+        # Convert to bytes and store
+        chunk_bytes = audio_chunk.to_bytes()
+        all_chunks.append(chunk_bytes)
+
+    # Combine all chunks
+    print("Combining " + str(len(all_chunks)) + " chunks...")
+    audio_data = bytearray()
+    for chunk in all_chunks:
+        audio_data.extend(chunk)
+
+    print("Recorded " + str(len(audio_data)) + " bytes")
+
+    # Convert to mono 16-bit if needed
+    audio_data = convert_to_mono_16bit(audio_data)
+    print("Final size: " + str(len(audio_data)) + " bytes")
+
+    return audio_data
+
+def compress_ulaw(data):
+    """Compress 16-bit PCM to 8-bit μ-law (50% size reduction)"""
+    # μ-law compression lookup table (simplified)
+    BIAS = 0x84
+    CLIP = 32635
+
+    compressed = bytearray()
+
+    # Process 16-bit samples (2 bytes each)
+    for i in range(0, len(data), 2):
+        # Get 16-bit sample (little endian)
+        sample = struct.unpack('<h', data[i:i+2])[0]
+
+        # NOTE(review): the sign/clip section below was lost to extraction
+        # garbling (text from "<h" to the ">" of "sample > CLIP" was
+        # stripped); reconstructed per standard G.711 μ-law — verify
+        # against the original file.
+        # Get sign bit and work with the magnitude
+        sign = 0
+        if sample < 0:
+            sign = 0x80
+            sample = -sample
+
+        # Clip magnitude
+        if sample > CLIP:
+            sample = CLIP
+
+        # Add bias
+        sample = sample + BIAS
+
+        # Find exponent (position of highest bit)
+        exponent = 7
+        for exp in range(7, -1, -1):
+            if sample & (1 << (exp + 7)):
+                exponent = exp
+                break
+
+        # Get mantissa (top 4 bits after exponent)
+        mantissa = (sample >> (exponent + 3)) & 0x0F
+
+        # Combine: sign (1 bit) + exponent (3 bits) + mantissa (4 bits)
+        ulaw_byte = sign | (exponent << 4) | mantissa
+
+        # Invert bits (μ-law standard)
+        compressed.append(ulaw_byte ^ 0xFF)
+
+    return compressed
+
+def create_wav_header(data_size, sample_rate=16000, is_ulaw=False):
+    """Create WAV file header"""
+    header = bytearray()
+
+    # RIFF header
+    header.extend(b'RIFF')
+    header.extend(struct.pack('<I', 36 + data_size))
+    header.extend(b'WAVE')
+
+    # NOTE(review): the fmt/data chunks below were lost to extraction
+    # garbling (text from "<I" to a later ">" was stripped); reconstructed
+    # per the RIFF/WAVE spec (format code 7 = G.711 μ-law, 1 = PCM) —
+    # verify against the original file.
+    # fmt chunk
+    channels = 1
+    bits_per_sample = 8 if is_ulaw else 16
+    audio_format = 7 if is_ulaw else 1
+    byte_rate = sample_rate * channels * (bits_per_sample // 8)
+    block_align = channels * (bits_per_sample // 8)
+    header.extend(b'fmt ')
+    header.extend(struct.pack('<I', 16))
+    header.extend(struct.pack('<H', audio_format))
+    header.extend(struct.pack('<H', channels))
+    header.extend(struct.pack('<I', sample_rate))
+    header.extend(struct.pack('<I', byte_rate))
+    header.extend(struct.pack('<H', block_align))
+    header.extend(struct.pack('<H', bits_per_sample))
+
+    # data chunk
+    header.extend(b'data')
+    header.extend(struct.pack('<I', data_size))
+
+    return header
+
+def send_to_server(audio_data):
+    """Compress audio, POST it to the transcription server, return the text"""
+    try:
+        # NOTE(review): this function head was lost to extraction garbling;
+        # the compress call and the print below are evidenced by the
+        # surviving tail (' " + str(len(compressed_data)) + " bytes")').
+        compressed_data = compress_ulaw(audio_data)
+        print("Compressed: " + str(len(audio_data)) + " -> " + str(len(compressed_data)) + " bytes")
+
+        # Update display
+        display_msg("Sending to server", COLOR_WHITE, 130)
+
+        # Create WAV file with μ-law format
+        wav_header = create_wav_header(len(compressed_data), is_ulaw=True)
+        wav_size = len(wav_header) + len(compressed_data)
+
+        # Simple HTTP POST with raw WAV data
+        headers = "POST /transcribe HTTP/1.1\r\n"
+        headers += "Host: " + SERVER_HOST + "\r\n"
+        headers += "Content-Type: audio/wav\r\n"
+        headers += "Content-Length: " + str(wav_size) + "\r\n"
+        headers += "Connection: close\r\n\r\n"
+
+        # Connect with better socket settings
+        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        s.settimeout(30)
+
+        # Try to set socket options for better stability
+        try:
+            s.setsockopt(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1)
+        except:
+            pass  # Some MicroPython builds don't support this
+
+        print("Connecting to " + SERVER_HOST + ":" + str(SERVER_PORT))
+        s.connect((SERVER_HOST, SERVER_PORT))
+
+        # Send headers
+        print("Sending headers...")
+        sent = s.send(headers.encode())
+        print("Sent " + str(sent) + " bytes of headers")
+
+        # Send WAV header
+        print("Sending WAV header...")
+        sent = s.send(wav_header)
+        print("Sent " + str(sent) + " bytes of WAV header")
+
+        # Send audio data in small chunks with delay
+        print("Sending audio data (" + str(len(compressed_data)) + " bytes)...")
+        chunk_size = 512  # Even smaller chunks for stability
+        total_chunks = (len(compressed_data) + chunk_size - 1) // chunk_size
+
+        bytes_sent = 0
+        for i in range(0, len(compressed_data), 
chunk_size): + chunk = compressed_data[i:i+chunk_size] + try: + sent = s.send(chunk) + bytes_sent += sent + chunk_num = i // chunk_size + 1 + if chunk_num % 10 == 0: # Progress update every 10 chunks + print("Sent " + str(bytes_sent) + "/" + str(len(compressed_data)) + " bytes") + # Small delay to let socket buffer drain + time.sleep_ms(10) + except Exception as e: + print("Send error at byte " + str(bytes_sent) + ": " + str(e)) + raise + + print("All data sent! Total: " + str(bytes_sent) + " bytes") + + # Update display for waiting + lcd.clear(COLOR_BLACK) + display_msg("Transcribing...", COLOR_CYAN, 60) + display_msg("Please wait", COLOR_WHITE, 100) + + # Read response + response = b"" + while True: + chunk = s.recv(1024) + if not chunk: + break + response += chunk + + s.close() + + # Parse response (MicroPython decode doesn't accept keyword args) + try: + response_str = response.decode('utf-8') + except: + response_str = str(response) + print("Response: " + response_str[:200]) + + # Extract JSON from response + if '{"' in response_str: + json_start = response_str.index('{"') + json_str = response_str[json_start:] + + # Simple JSON parsing (MicroPython doesn't have json module) + if '"text":' in json_str: + text_start = json_str.index('"text":') + 7 + text_str = json_str[text_start:] + # Find the value between quotes + if '"' in text_str: + quote_start = text_str.index('"') + 1 + quote_end = text_str.index('"', quote_start) + transcription = text_str[quote_start:quote_end] + return transcription + + return "Error parsing response" + + except Exception as e: + print("Error: " + str(e)) + return "Error: " + str(e) + +def display_transcription(text): + """Display transcription on LCD""" + lcd.clear(COLOR_BLACK) + display_msg("TRANSCRIPTION:", COLOR_GREEN, 10) + + # Simple line splitting every 20 chars + y = 40 + while len(text) > 0: + chunk = text[:20] + display_msg(chunk, COLOR_WHITE, y) + text = text[20:] + y += 20 + if y > 200: + break + + print("Transcription: " 
+ text) + +def main(): + """Main program with loop for multiple recordings""" + print("=" * 40) + print("Simple Audio Recording Test") + print("=" * 40) + + # Initialize + nic = init_wifi() + if not nic: + return + + i2s = init_audio() + + # Setup button (boot button on GPIO 16) + fm.register(16, fm.fpioa.GPIOHS0, force=True) + button = GPIO(GPIO.GPIOHS0, GPIO.IN, GPIO.PULL_UP) + + display_msg("Ready!", COLOR_GREEN, 110, clear=True) + display_msg("Press BOOT button", COLOR_WHITE, 130) + display_msg("to record", COLOR_WHITE, 150) + print("Press BOOT button to record, or Ctrl+C to exit") + + recording_count = 0 + + # Main loop + while True: + # Wait for button press (button is active low) + if button.value() == 0: + recording_count += 1 + print("\n--- Recording #" + str(recording_count) + " ---") + + # Debounce - wait for button release + while button.value() == 0: + time.sleep_ms(10) + + # Give user time to prepare (countdown) + lcd.clear(COLOR_BLACK) + display_msg("GET READY!", COLOR_YELLOW, 80) + display_msg("3...", COLOR_WHITE, 120) + time.sleep(1) + display_msg("2...", COLOR_WHITE, 140) + time.sleep(1) + display_msg("1...", COLOR_WHITE, 160) + time.sleep(1) + + # Record + audio_data = record_audio(i2s, RECORD_SECONDS) + + # Send to server + transcription = send_to_server(audio_data) + + # Display result + display_transcription(transcription) + + # Wait a bit before showing ready again + time.sleep(2) + + # Show ready for next recording + display_msg("Ready!", COLOR_GREEN, 110, clear=True) + display_msg("Press BOOT button", COLOR_WHITE, 130) + next_count = recording_count + 1 + display_msg("to record (#" + str(next_count) + ")", COLOR_WHITE, 150) + print("Ready for next recording. 
Press BOOT button.") + + time.sleep_ms(50) # Small delay to reduce CPU usage + +# Run main +main() + diff --git a/hardware/maixduino/maix_test_simple.py b/hardware/maixduino/maix_test_simple.py new file mode 100644 index 0000000..ae89ee0 --- /dev/null +++ b/hardware/maixduino/maix_test_simple.py @@ -0,0 +1,252 @@ +# Maix Duino - Simple Test Script +# Copy/paste this into MaixPy IDE and click RUN +# +# This script tests: +# 1. LCD display +# 2. WiFi connectivity +# 3. Network connection to Heimdall server +# 4. I2S audio initialization (without recording yet) + +import time +import lcd +from Maix import GPIO, I2S +from fpioa_manager import fm + +# Import the correct network module +try: + import network + # Create ESP32_SPI instance (for Maix Duino with ESP32) + nic = None # Will be initialized in test_wifi +except Exception as e: + print("Network module import error: " + str(e)) + nic = None + +# ===== CONFIGURATION - EDIT THESE ===== +# Load credentials from secrets.py (gitignored) +try: + from secrets import SECRETS +except ImportError: + SECRETS = {} + +WIFI_SSID = "Tell My WiFi Love Her" # <<< CHANGE THIS +WIFI_PASSWORD = SECRETS.get("wifi_password", "") # set in secrets.py # <<< CHANGE THIS +SERVER_URL = "http://10.1.10.71:3006" # Heimdall voice server +# ======================================= + +# Colors (as tuples for easy reference) +COLOR_BLACK = (0, 0, 0) +COLOR_WHITE = (255, 255, 255) +COLOR_RED = (255, 0, 0) +COLOR_GREEN = (0, 255, 0) +COLOR_BLUE = (0, 0, 255) +COLOR_YELLOW = (255, 255, 0) + +def display_msg(msg, color=COLOR_WHITE, y=50): + """Display message on LCD""" + # lcd.draw_string needs RGB as separate ints: lcd.draw_string(x, y, text, color_int, bg_color_int) + # Convert RGB tuple to single integer: (R << 16) | (G << 8) | B + color_int = (color[0] << 16) | (color[1] << 8) | color[2] + bg_int = 0 # Black background + lcd.draw_string(10, y, msg, color_int, bg_int) + print(msg) + +def test_lcd(): + """Test LCD display""" + lcd.init() + 
lcd.clear(COLOR_BLACK) + display_msg("MaixDuino Test", COLOR_YELLOW, 10) + display_msg("Initializing...", COLOR_WHITE, 30) + time.sleep(1) + return True + +def test_wifi(): + """Test WiFi connection""" + global nic + display_msg("Connecting WiFi...", COLOR_BLUE, 50) + + try: + # Initialize ESP32_SPI network interface + print("Initializing ESP32_SPI...") + + # Create network interface instance with Maix Duino pins + # Maix Duino ESP32 default pins: + # CS=25, RST=8, RDY=9, MOSI=28, MISO=26, SCLK=27 + from network import ESP32_SPI + from fpioa_manager import fm + from Maix import GPIO + + # Register pins for ESP32 SPI communication + fm.register(25, fm.fpioa.GPIOHS10, force=True) # CS + fm.register(8, fm.fpioa.GPIOHS11, force=True) # RST + fm.register(9, fm.fpioa.GPIOHS12, force=True) # RDY + fm.register(28, fm.fpioa.GPIOHS13, force=True) # MOSI + fm.register(26, fm.fpioa.GPIOHS14, force=True) # MISO + fm.register(27, fm.fpioa.GPIOHS15, force=True) # SCLK + + nic = ESP32_SPI( + cs=fm.fpioa.GPIOHS10, + rst=fm.fpioa.GPIOHS11, + rdy=fm.fpioa.GPIOHS12, + mosi=fm.fpioa.GPIOHS13, + miso=fm.fpioa.GPIOHS14, + sclk=fm.fpioa.GPIOHS15 + ) + + print("Connecting to " + WIFI_SSID + "...") + + # Connect to WiFi (no need to call active() first) + nic.connect(WIFI_SSID, WIFI_PASSWORD) + + # Wait for connection + timeout = 20 + while timeout > 0: + time.sleep(1) + timeout -= 1 + + if nic.isconnected(): + # Successfully connected! + ip_info = nic.ifconfig() + ip = ip_info[0] if ip_info else "Unknown" + display_msg("WiFi OK!", COLOR_GREEN, 70) + display_msg("IP: " + str(ip), COLOR_WHITE, 90) + print("Connected! IP: " + str(ip)) + time.sleep(2) + return True + else: + print("Waiting... 
" + str(timeout) + "s") + + # Timeout reached + display_msg("WiFi FAILED!", COLOR_RED, 70) + print("Connection timeout") + return False + + except Exception as e: + display_msg("WiFi error!", COLOR_RED, 70) + print("WiFi error: " + str(e)) + import sys + sys.print_exception(e) + return False + +def test_server(): + """Test connection to Heimdall server""" + display_msg("Testing server...", COLOR_BLUE, 110) + + try: + # Try socket connection to server + import socket + + url = SERVER_URL + "/health" + print("Trying: " + url) + + # Parse URL to get host and port + host = "10.1.10.71" + port = 3006 + + # Create socket + s = socket.socket() + s.settimeout(5) + + print("Connecting to " + host + ":" + str(port)) + s.connect((host, port)) + + # Send HTTP GET request + request = "GET /health HTTP/1.1\r\nHost: " + host + "\r\nConnection: close\r\n\r\n" + s.send(request.encode()) + + # Read response + response = s.recv(1024).decode() + s.close() + + print("Server response received") + + if "200" in response or "OK" in response: + display_msg("Server OK!", COLOR_GREEN, 130) + print("Server is reachable!") + time.sleep(2) + return True + else: + display_msg("Server responded", COLOR_YELLOW, 130) + print("Response: " + response[:100]) + return True # Still counts as success if we got a response + + except Exception as e: + display_msg("Server FAILED!", COLOR_RED, 130) + error_msg = str(e)[:30] + display_msg(error_msg, COLOR_RED, 150) + print("Server connection failed: " + str(e)) + return False + +def test_audio(): + """Test I2S audio initialization""" + display_msg("Testing audio...", COLOR_BLUE, 170) + + try: + # Register I2S pins (Maix Duino pinout) + fm.register(20, fm.fpioa.I2S0_IN_D0, force=True) + fm.register(19, fm.fpioa.I2S0_WS, force=True) + fm.register(18, fm.fpioa.I2S0_SCLK, force=True) + + # Initialize I2S + rx = I2S(I2S.DEVICE_0) + rx.channel_config(rx.CHANNEL_0, rx.RECEIVER, align_mode=I2S.STANDARD_MODE) + rx.set_sample_rate(16000) + + display_msg("Audio OK!", 
COLOR_GREEN, 190) + print("I2S initialized: " + str(rx)) + time.sleep(2) + return True + except Exception as e: + display_msg("Audio FAILED!", COLOR_RED, 190) + print("Audio init failed: " + str(e)) + return False + +def main(): + """Run all tests""" + print("=" * 40) + print("MaixDuino Voice Assistant Test") + print("=" * 40) + + # Test LCD + if not test_lcd(): + print("LCD test failed!") + return + + # Test WiFi + if not test_wifi(): + print("WiFi test failed!") + red_int = (255 << 16) | (0 << 8) | 0 # Red color + lcd.draw_string(10, 210, "STOPPED - Check WiFi", red_int, 0) + return + + # Test server connection + server_ok = test_server() + + # Test audio + audio_ok = test_audio() + + # Summary + lcd.clear(COLOR_BLACK) + display_msg("=== TEST RESULTS ===", COLOR_YELLOW, 10) + display_msg("LCD: OK", COLOR_GREEN, 40) + display_msg("WiFi: OK", COLOR_GREEN, 60) + + if server_ok: + display_msg("Server: OK", COLOR_GREEN, 80) + else: + display_msg("Server: FAIL", COLOR_RED, 80) + + if audio_ok: + display_msg("Audio: OK", COLOR_GREEN, 100) + else: + display_msg("Audio: FAIL", COLOR_RED, 100) + + if server_ok and audio_ok: + display_msg("Ready for voice app!", COLOR_GREEN, 140) + else: + display_msg("Fix errors first", COLOR_YELLOW, 140) + + print("\nTest complete!") + +# Run the test +if __name__ == "__main__": + main() diff --git a/hardware/maixduino/maix_voice_client.py b/hardware/maixduino/maix_voice_client.py new file mode 100755 index 0000000..9d9f056 --- /dev/null +++ b/hardware/maixduino/maix_voice_client.py @@ -0,0 +1,465 @@ +# Maix Duino Voice Assistant Client +# Path: maix_voice_client.py (upload to Maix Duino SD card) +# +# Purpose and usage: +# This script runs on the Maix Duino board and handles: +# - Wake word detection using KPU +# - Audio capture from I2S microphone +# - Streaming audio to voice processing server +# - Playing back TTS responses +# - LED feedback for user interaction +# +# Requirements: +# - MaixPy firmware (latest version) +# - I2S 
microphone connected +# - Speaker or audio output connected +# - WiFi configured (see config below) +# +# Upload to board: +# 1. Copy this file to SD card as boot.py or main.py +# 2. Update WiFi credentials below +# 3. Update server URL to your Heimdall IP +# 4. Power cycle the board + +import time +import audio +import image +from Maix import GPIO +from fpioa_manager import fm +from machine import I2S +import KPU as kpu +import sensor +import lcd +import gc + +# ----- Configuration ----- + +# WiFi Settings +WIFI_SSID = "YourSSID" +WIFI_PASSWORD = "YourPassword" + +# Server Settings +VOICE_SERVER_URL = "http://10.1.10.71:5000" +PROCESS_ENDPOINT = "/process" + +# Audio Settings +SAMPLE_RATE = 16000 # 16kHz for Whisper +CHANNELS = 1 # Mono +SAMPLE_WIDTH = 2 # 16-bit +CHUNK_SIZE = 1024 + +# Wake Word Settings +WAKE_WORD_THRESHOLD = 0.7 # Confidence threshold (0.0-1.0) +WAKE_WORD_MODEL = "/sd/models/wake_word.kmodel" # Path to wake word model + +# LED Pin for feedback +LED_PIN = 13 # Onboard LED (adjust if needed) + +# Recording Settings +MAX_RECORD_TIME = 10 # Maximum seconds to record after wake word +SILENCE_THRESHOLD = 500 # Amplitude threshold for silence detection +SILENCE_DURATION = 2 # Seconds of silence before stopping recording + +# ----- Color definitions for LCD ----- +COLOR_RED = (255, 0, 0) +COLOR_GREEN = (0, 255, 0) +COLOR_BLUE = (0, 0, 255) +COLOR_YELLOW = (255, 255, 0) +COLOR_BLACK = (0, 0, 0) +COLOR_WHITE = (255, 255, 255) + +# ----- Global Variables ----- +led = None +i2s_dev = None +kpu_task = None +listening = False + + +def init_hardware(): + """Initialize hardware components""" + global led, i2s_dev + + # Initialize LED + fm.register(LED_PIN, fm.fpioa.GPIO0) + led = GPIO(GPIO.GPIO0, GPIO.OUT) + led.value(0) # Turn off initially + + # Initialize LCD + lcd.init() + lcd.clear(COLOR_BLACK) + lcd.draw_string(lcd.width()//2 - 50, lcd.height()//2, + "Initializing...", + lcd.WHITE, lcd.BLACK) + + # Initialize I2S for audio (microphone) + # Note: Pin 
configuration may vary based on your specific hardware + fm.register(20, fm.fpioa.I2S0_IN_D0) + fm.register(19, fm.fpioa.I2S0_WS) + fm.register(18, fm.fpioa.I2S0_SCLK) + + i2s_dev = I2S(I2S.DEVICE_0) + i2s_dev.channel_config(I2S.CHANNEL_0, I2S.RECEIVER, + align_mode=I2S.STANDARD_MODE, + data_width=I2S.RESOLUTION_16_BIT) + i2s_dev.set_sample_rate(SAMPLE_RATE) + + print("Hardware initialized") + + +def init_network(): + """Initialize WiFi connection""" + import network + + lcd.clear(COLOR_BLACK) + lcd.draw_string(10, 50, "Connecting to WiFi...", COLOR_WHITE, COLOR_BLACK) + + wlan = network.WLAN(network.STA_IF) + wlan.active(True) + + if not wlan.isconnected(): + print(f"Connecting to {WIFI_SSID}...") + wlan.connect(WIFI_SSID, WIFI_PASSWORD) + + # Wait for connection + timeout = 20 + while not wlan.isconnected() and timeout > 0: + time.sleep(1) + timeout -= 1 + print(f"Waiting for connection... {timeout}s") + + if not wlan.isconnected(): + print("Failed to connect to WiFi") + lcd.clear(COLOR_BLACK) + lcd.draw_string(10, 50, "WiFi Failed!", COLOR_RED, COLOR_BLACK) + return False + + print("Network connected:", wlan.ifconfig()) + lcd.clear(COLOR_BLACK) + lcd.draw_string(10, 50, "WiFi Connected", COLOR_GREEN, COLOR_BLACK) + lcd.draw_string(10, 70, f"IP: {wlan.ifconfig()[0]}", COLOR_WHITE, COLOR_BLACK) + time.sleep(2) + + return True + + +def load_wake_word_model(): + """Load wake word detection model""" + global kpu_task + + try: + # This is a placeholder - you'll need to train and convert a wake word model + # For now, we'll skip KPU wake word and use a simpler approach + print("Wake word model loading skipped (implement after model training)") + return True + except Exception as e: + print(f"Failed to load wake word model: {e}") + return False + + +def detect_wake_word(): + """ + Detect wake word in audio stream + + Returns: + True if wake word detected, False otherwise + + Note: This is a simplified version. For production, you should: + 1. 
Train a wake word model using Mycroft Precise or similar + 2. Convert the model to .kmodel format for K210 + 3. Load and run inference using KPU + + For now, we'll use a simple amplitude-based trigger + """ + # Simple amplitude-based detection (placeholder) + # Replace with actual KPU inference + + audio_data = i2s_dev.record(CHUNK_SIZE) + + if audio_data: + # Calculate amplitude + amplitude = 0 + for i in range(0, len(audio_data), 2): + sample = int.from_bytes(audio_data[i:i+2], 'little', True) + amplitude += abs(sample) + + amplitude = amplitude / (len(audio_data) // 2) + + # Simple threshold detection (replace with KPU inference) + if amplitude > 3000: # Adjust threshold based on your microphone + return True + + return False + + +def record_audio(max_duration=MAX_RECORD_TIME): + """ + Record audio until silence or max duration + + Returns: + bytes: Recorded audio data in WAV format + """ + print(f"Recording audio (max {max_duration}s)...") + + audio_buffer = bytearray() + start_time = time.time() + silence_start = None + + # Record in chunks + while True: + elapsed = time.time() - start_time + + # Check max duration + if elapsed > max_duration: + print("Max recording duration reached") + break + + # Record chunk + chunk = i2s_dev.record(CHUNK_SIZE) + + if chunk: + audio_buffer.extend(chunk) + + # Calculate amplitude for silence detection + amplitude = 0 + for i in range(0, len(chunk), 2): + sample = int.from_bytes(chunk[i:i+2], 'little', True) + amplitude += abs(sample) + + amplitude = amplitude / (len(chunk) // 2) + + # Silence detection + if amplitude < SILENCE_THRESHOLD: + if silence_start is None: + silence_start = time.time() + elif time.time() - silence_start > SILENCE_DURATION: + print("Silence detected, stopping recording") + break + else: + silence_start = None + + # Update LCD with recording time + if int(elapsed) % 1 == 0: + lcd.clear(COLOR_BLACK) + lcd.draw_string(10, 50, f"Recording... 
{int(elapsed)}s",
+                            COLOR_RED, COLOR_BLACK)
+
+    print(f"Recorded {len(audio_buffer)} bytes")
+
+    # Convert to WAV format
+    return create_wav(audio_buffer)
+
+
+def create_wav(audio_data):
+    """Create WAV file header and combine with audio data"""
+    import struct
+
+    # WAV header
+    sample_rate = SAMPLE_RATE
+    channels = CHANNELS
+    sample_width = SAMPLE_WIDTH
+    data_size = len(audio_data)
+
+    # RIFF header
+    wav = bytearray(b'RIFF')
+    wav.extend(struct.pack('<I', 36 + data_size))
+    wav.extend(b'WAVE')
+
+    # NOTE(review): everything from here through the head of
+    # display_response() was lost to extraction garbling (text from "<I"
+    # to a later ">" was stripped); reconstructed from the surviving call
+    # sites and the RIFF/WAVE spec — verify against the original file.
+    # fmt chunk (PCM)
+    wav.extend(b'fmt ')
+    wav.extend(struct.pack('<I', 16))
+    wav.extend(struct.pack('<H', 1))
+    wav.extend(struct.pack('<H', channels))
+    wav.extend(struct.pack('<I', sample_rate))
+    wav.extend(struct.pack('<I', sample_rate * channels * sample_width))
+    wav.extend(struct.pack('<H', channels * sample_width))
+    wav.extend(struct.pack('<H', sample_width * 8))
+
+    # data chunk
+    wav.extend(b'data')
+    wav.extend(struct.pack('<I', data_size))
+    wav.extend(audio_data)
+
+    return bytes(wav)
+
+
+def send_audio_to_server(wav_data):
+    """POST WAV audio to the voice server and return the parsed JSON reply.
+
+    NOTE(review): reconstructed — the original implementation was lost to
+    extraction garbling. main_loop() expects a dict with 'success',
+    'transcription' and 'response' keys, or None on failure.
+    """
+    try:
+        import urequests
+        url = VOICE_SERVER_URL + PROCESS_ENDPOINT
+        resp = urequests.post(url, data=wav_data,
+                              headers={'Content-Type': 'audio/wav'})
+        result = resp.json()
+        resp.close()
+        return result
+    except Exception as e:
+        print(f"Server request failed: {e}")
+        return None
+
+
+def display_response(text):
+    """Display response text on LCD with simple word wrapping"""
+    lcd.clear(COLOR_BLACK)
+    lcd.draw_string(10, 10, "Response:", COLOR_YELLOW, COLOR_BLACK)
+
+    # Word wrap the text
+    words = text.split(' ')
+    lines = []
+    current_line = ""
+
+    for word in words:
+        test_line = current_line + word + " "
+        if len(test_line) * 8 > lcd.width() - 20:  # Rough character width
+            if current_line:
+                lines.append(current_line.strip())
+            current_line = word + " "
+        else:
+            current_line = test_line
+
+    if current_line:
+        lines.append(current_line.strip())
+
+    # Display lines
+    y = 30
+    for line in lines[:5]:  # Max 5 lines
+        lcd.draw_string(10, y, line, COLOR_GREEN, COLOR_BLACK)
+        y += 20
+
+
+def set_led(state):
+    """Control LED state"""
+    if led:
+        led.value(1 if state else 0)
+
+
+def main_loop():
+    """Main voice assistant loop"""
+    global listening
+
+    # Show ready status
+    lcd.clear(COLOR_BLACK)
+    lcd.draw_string(10, lcd.height()//2 - 10, "Say wake word...",
+                    COLOR_BLUE, COLOR_BLACK)
+
+    print("Voice assistant ready. 
Listening for wake word...") + + while True: + try: + # Listen for wake word + if detect_wake_word(): + print("Wake word detected!") + + # Visual feedback + set_led(True) + lcd.clear(COLOR_BLACK) + lcd.draw_string(10, 50, "Listening...", COLOR_RED, COLOR_BLACK) + + # Small delay to skip the wake word itself + time.sleep(0.5) + + # Record command + audio_data = record_audio() + + # Send to server + response = send_audio_to_server(audio_data) + + if response and response.get('success'): + transcription = response.get('transcription', '') + response_text = response.get('response', 'No response') + + print(f"You said: {transcription}") + print(f"Response: {response_text}") + + # Display response + display_response(response_text) + + # TODO: Play TTS audio response + + else: + lcd.clear(COLOR_BLACK) + lcd.draw_string(10, 50, "Error processing", + COLOR_RED, COLOR_BLACK) + + # Turn off LED + set_led(False) + + # Pause before listening again + time.sleep(2) + + # Reset display + lcd.clear(COLOR_BLACK) + lcd.draw_string(10, lcd.height()//2 - 10, "Say wake word...", + COLOR_BLUE, COLOR_BLACK) + + # Small delay to prevent tight loop + time.sleep(0.1) + + # Garbage collection + if gc.mem_free() < 100000: # If free memory < 100KB + gc.collect() + + except KeyboardInterrupt: + print("Exiting...") + break + except Exception as e: + print(f"Error in main loop: {e}") + time.sleep(1) + + +def main(): + """Main entry point""" + print("=" * 40) + print("Maix Duino Voice Assistant") + print("=" * 40) + + # Initialize hardware + init_hardware() + + # Connect to network + if not init_network(): + print("Failed to initialize network. 
Exiting.") + return + + # Load wake word model (optional) + load_wake_word_model() + + # Start main loop + try: + main_loop() + except Exception as e: + print(f"Fatal error: {e}") + finally: + # Cleanup + set_led(False) + lcd.clear(COLOR_BLACK) + lcd.draw_string(10, lcd.height()//2, "Stopped", + COLOR_RED, COLOR_BLACK) + + +# Run main program +if __name__ == "__main__": + main() diff --git a/hardware/maixduino/secrets.py.example b/hardware/maixduino/secrets.py.example new file mode 100644 index 0000000..67c7d78 --- /dev/null +++ b/hardware/maixduino/secrets.py.example @@ -0,0 +1,7 @@ +# Copy this file to secrets.py and fill in your values +# secrets.py is gitignored — never commit it +SECRETS = { + "wifi_ssid": "YourNetworkName", + "wifi_password": "YourWiFiPassword", + "voice_server_url": "http://10.1.10.71:5000", # replace with your Minerva server IP +} diff --git a/scripts/download_pretrained_models.sh b/scripts/download_pretrained_models.sh new file mode 100755 index 0000000..d1437a7 --- /dev/null +++ b/scripts/download_pretrained_models.sh @@ -0,0 +1,409 @@ +#!/usr/bin/env bash +# +# Path: download_pretrained_models.sh +# +# Purpose and usage: +# Downloads and sets up pre-trained Mycroft Precise wake word models +# - Downloads Hey Mycroft, Hey Jarvis, and other available models +# - Tests each model with microphone +# - Configures voice server to use them +# +# Requirements: +# - Mycroft Precise installed (run setup_precise.sh first) +# - Internet connection for downloads +# - Microphone for testing +# +# Usage: +# ./download_pretrained_models.sh [--test-all] [--model MODEL_NAME] +# +# Author: PRbL Library +# Created: $(date +"%Y-%m-%d") + +# ----- PRbL Color and output functions ----- +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[0;33m' +BLUE='\033[0;34m' +PURPLE='\033[0;35m' +CYAN='\033[0;36m' +NC='\033[0m' # No Color + +print_status() { + local level="$1" + shift + case "$level" in + "info") echo -e "${BLUE}[INFO]${NC} $*" >&2 ;; + "success") echo -e 
"${GREEN}[SUCCESS]${NC} $*" >&2 ;; + "warning") echo -e "${YELLOW}[WARNING]${NC} $*" >&2 ;; + "error") echo -e "${RED}[ERROR]${NC} $*" >&2 ;; + "debug") [[ "$VERBOSE" == "true" ]] && echo -e "${PURPLE}[DEBUG]${NC} $*" >&2 ;; + *) echo -e "$*" >&2 ;; + esac +} + +# ----- Configuration ----- +MODELS_DIR="$HOME/precise-models/pretrained" +TEST_ALL=false +SPECIFIC_MODEL="" +VERBOSE=false + +# Available pre-trained models +declare -A MODELS=( + ["hey-mycroft"]="https://github.com/MycroftAI/precise-data/raw/models-dev/hey-mycroft.tar.gz" + ["hey-jarvis"]="https://github.com/MycroftAI/precise-data/raw/models-dev/hey-jarvis.tar.gz" + ["christopher"]="https://github.com/MycroftAI/precise-data/raw/models-dev/christopher.tar.gz" + ["hey-ezra"]="https://github.com/MycroftAI/precise-data/raw/models-dev/hey-ezra.tar.gz" +) + +# ----- Dependency checking ----- +command_exists() { + command -v "$1" &> /dev/null +} + +check_dependencies() { + local missing=() + + if ! command_exists wget; then + missing+=("wget") + fi + + if ! 
command_exists precise-listen; then + missing+=("precise-listen (run setup_precise.sh first)") + fi + + if [[ ${#missing[@]} -gt 0 ]]; then + print_status error "Missing dependencies: ${missing[*]}" + return 1 + fi + + return 0 +} + +# ----- Parse arguments ----- +parse_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --test-all) + TEST_ALL=true + shift + ;; + --model) + SPECIFIC_MODEL="$2" + shift 2 + ;; + -v|--verbose) + VERBOSE=true + shift + ;; + -h|--help) + cat << EOF +Usage: $(basename "$0") [OPTIONS] + +Download and test pre-trained Mycroft Precise wake word models + +Options: + --test-all Download and test all available models + --model NAME Download and test specific model + -v, --verbose Enable verbose output + -h, --help Show this help message + +Available models: + hey-mycroft Original Mycroft wake word (most data) + hey-jarvis Popular alternative + christopher Alternative wake word + hey-ezra Another option + +Examples: + $(basename "$0") --model hey-mycroft + $(basename "$0") --test-all + +EOF + exit 0 + ;; + *) + print_status error "Unknown option: $1" + exit 1 + ;; + esac + done +} + +# ----- Functions ----- + +create_models_directory() { + print_status info "Creating models directory: $MODELS_DIR" + mkdir -p "$MODELS_DIR" || { + print_status error "Failed to create directory" + return 1 + } + return 0 +} + +download_model() { + local model_name="$1" + local model_url="${MODELS[${model_name}]}" + + if [[ -z "$model_url" ]]; then + print_status error "Unknown model: $model_name" + return 1 + fi + + # Check if already downloaded + if [[ -f "$MODELS_DIR/${model_name}.net" ]]; then + print_status info "Model already exists: $model_name" + return 0 + fi + + print_status info "Downloading $model_name..." 
+ + local temp_file="/tmp/${model_name}-$$.tar.gz" + + wget -q --show-progress -O "$temp_file" "$model_url" || { + print_status error "Failed to download $model_name" + rm -f "$temp_file" + return 1 + } + + # Extract + print_status info "Extracting $model_name..." + tar xzf "$temp_file" -C "$MODELS_DIR" || { + print_status error "Failed to extract $model_name" + rm -f "$temp_file" + return 1 + } + + rm -f "$temp_file" + + # Verify extraction + if [[ -f "$MODELS_DIR/${model_name}.net" ]]; then + print_status success "Downloaded: $model_name" + return 0 + else + print_status error "Extraction failed for $model_name" + return 1 + fi +} + +test_model() { + local model_name="$1" + local model_file="$MODELS_DIR/${model_name}.net" + + if [[ ! -f "$model_file" ]]; then + print_status error "Model file not found: $model_file" + return 1 + fi + + print_status info "Testing model: $model_name" + echo "" + echo -e "${CYAN}Instructions:${NC}" + echo " - Speak the wake word: '$model_name'" + echo " - You should see '!' when detected" + echo " - Press Ctrl+C to stop testing" + echo "" + read -p "Press Enter to start test..." + + # Activate conda environment if needed + if command_exists conda; then + eval "$(conda shell.bash hook)" + conda activate precise 2>/dev/null || true + fi + + precise-listen "$model_file" || { + print_status warning "Test interrupted or failed" + return 1 + } + + return 0 +} + +create_multi_wake_config() { + print_status info "Creating multi-wake-word configuration..." 
+ + local config_file="$MODELS_DIR/multi-wake-config.sh" + + cat > "$config_file" << 'EOF' +#!/bin/bash +# Multi-wake-word configuration +# Generated by download_pretrained_models.sh + +# Start voice server with multiple wake words +cd ~/voice-assistant + +# List of wake word models +MODELS="" + +EOF + + # Add each downloaded model to config + for model_name in "${!MODELS[@]}"; do + if [[ -f "$MODELS_DIR/${model_name}.net" ]]; then + echo "# Found: $model_name" >> "$config_file" + echo "MODELS=\"\${MODELS}${model_name}:$MODELS_DIR/${model_name}.net:0.5,\"" >> "$config_file" + fi + done + + cat >> "$config_file" << 'EOF' + +# Remove trailing comma +MODELS="${MODELS%,}" + +# Activate environment +eval "$(conda shell.bash hook)" +conda activate precise + +# Start server +python voice_server.py \ + --enable-precise \ + --precise-models "$MODELS" \ + --ha-token "$HA_TOKEN" + +EOF + + chmod +x "$config_file" + + print_status success "Created: $config_file" + echo "" + print_status info "To use multiple wake words, run:" + print_status info " $config_file" + + return 0 +} + +list_downloaded_models() { + print_status info "Downloaded models in $MODELS_DIR:" + echo "" + + local count=0 + for model_name in "${!MODELS[@]}"; do + if [[ -f "$MODELS_DIR/${model_name}.net" ]]; then + local size=$(du -h "$MODELS_DIR/${model_name}.net" | cut -f1) + echo -e " ${GREEN}✓${NC} ${model_name}.net (${size})" + ((count++)) + else + echo -e " ${YELLOW}○${NC} ${model_name}.net (not downloaded)" + fi + done + + echo "" + print_status success "Total downloaded: $count" + + return 0 +} + +compare_models() { + print_status info "Model comparison:" + echo "" + + cat << 'EOF' +┌─────────────────┬──────────────┬─────────────┬─────────────────┐ +│ Wake Word │ Popularity │ Difficulty │ Recommended For │ +├─────────────────┼──────────────┼─────────────┼─────────────────┤ +│ Hey Mycroft │ ★★★★★ │ Easy │ Default choice │ +│ Hey Jarvis │ ★★★★☆ │ Easy │ Pop culture │ +│ Christopher │ ★★☆☆☆ │ Medium │ 
Unique name │ +│ Hey Ezra │ ★★☆☆☆ │ Medium │ Alternative │ +└─────────────────┴──────────────┴─────────────┴─────────────────┘ + +Recommendations: + - Start with: Hey Mycroft (most training data) + - For media: Hey Jarvis (Plex/entertainment) + - For uniqueness: Christopher or Hey Ezra + +Multiple wake words: + - Use different wake words for different contexts + - Example: "Hey Mycroft" for commands, "Hey Jarvis" for media + - Server can run 2-3 models simultaneously + +EOF +} + +# ----- Main ----- +main() { + print_status info "Mycroft Precise Pre-trained Model Downloader" + echo "" + + # Parse arguments + parse_args "$@" + + # Check dependencies + check_dependencies || exit 1 + + # Create directory + create_models_directory || exit 1 + + # Show comparison + if [[ -z "$SPECIFIC_MODEL" && "$TEST_ALL" != "true" ]]; then + compare_models + echo "" + print_status info "Use --model to download a specific model" + print_status info "Use --test-all to download all models" + echo "" + list_downloaded_models + exit 0 + fi + + # Download models + if [[ -n "$SPECIFIC_MODEL" ]]; then + # Download specific model + download_model "$SPECIFIC_MODEL" || exit 1 + + # Offer to test + echo "" + read -p "Test this model now? (y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + test_model "$SPECIFIC_MODEL" + fi + + elif [[ "$TEST_ALL" == "true" ]]; then + # Download all models + for model_name in "${!MODELS[@]}"; do + download_model "$model_name" + echo "" + done + + # Offer to test each + echo "" + print_status success "All models downloaded" + echo "" + read -p "Test each model? 
(y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + for model_name in "${!MODELS[@]}"; do + if [[ -f "$MODELS_DIR/${model_name}.net" ]]; then + echo "" + test_model "$model_name" + fi + done + fi + fi + + # List results + echo "" + list_downloaded_models + + # Create multi-wake config if multiple models + local model_count=$(find "$MODELS_DIR" -name "*.net" | wc -l) + if [[ $model_count -gt 1 ]]; then + echo "" + create_multi_wake_config + fi + + # Final instructions + echo "" + print_status success "Setup complete!" + echo "" + print_status info "Next steps:" + print_status info "1. Test a model: precise-listen $MODELS_DIR/hey-mycroft.net" + print_status info "2. Use in server: python voice_server.py --enable-precise --precise-model $MODELS_DIR/hey-mycroft.net" + print_status info "3. Fine-tune: precise-train -e 30 custom.net . --from-checkpoint $MODELS_DIR/hey-mycroft.net" + + if [[ $model_count -gt 1 ]]; then + echo "" + print_status info "For multiple wake words:" + print_status info " $MODELS_DIR/multi-wake-config.sh" + fi +} + +# Run main +main "$@" diff --git a/scripts/quick_start_hey_mycroft.sh b/scripts/quick_start_hey_mycroft.sh new file mode 100755 index 0000000..555de47 --- /dev/null +++ b/scripts/quick_start_hey_mycroft.sh @@ -0,0 +1,456 @@ +#!/usr/bin/env bash +# +# Path: quick_start_hey_mycroft.sh +# +# Purpose and usage: +# Zero-training quick start using pre-trained "Hey Mycroft" model +# Gets you a working voice assistant in 5 minutes! 
+# +# Requirements: +# - Heimdall already setup (ran setup_voice_assistant.sh) +# - Mycroft Precise installed (ran setup_precise.sh) +# +# Usage: +# ./quick_start_hey_mycroft.sh [--test-only] +# +# Author: PRbL Library + +# ----- PRbL Color and output functions ----- +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[0;33m' +BLUE='\033[0;34m' +PURPLE='\033[0;35m' +CYAN='\033[0;36m' +NC='\033[0m' + +print_status() { + local level="$1" + shift + case "$level" in + "info") echo -e "${BLUE}[INFO]${NC} $*" >&2 ;; + "success") echo -e "${GREEN}[SUCCESS]${NC} $*" >&2 ;; + "warning") echo -e "${YELLOW}[WARNING]${NC} $*" >&2 ;; + "error") echo -e "${RED}[ERROR]${NC} $*" >&2 ;; + *) echo -e "$*" >&2 ;; + esac +} + +# ----- Configuration ----- +MODELS_DIR="$HOME/precise-models/pretrained" +MODEL_URL="https://github.com/MycroftAI/precise-data/raw/models-dev/hey-mycroft.tar.gz" +MODEL_NAME="hey-mycroft" +TEST_ONLY=false + +# ----- Parse arguments ----- +parse_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --test-only) + TEST_ONLY=true + shift + ;; + -h|--help) + cat << EOF +Usage: $(basename "$0") [OPTIONS] + +Quick start with pre-trained "Hey Mycroft" wake word model. +No training required! + +Options: + --test-only Just test the model, don't start server + -h, --help Show this help + +Examples: + $(basename "$0") # Download, test, and run server + $(basename "$0") --test-only # Just download and test + +EOF + exit 0 + ;; + *) + print_status error "Unknown option: $1" + exit 1 + ;; + esac + done +} + +# ----- Functions ----- + +check_prerequisites() { + print_status info "Checking prerequisites..." + + # Check conda + if ! command -v conda &> /dev/null; then + print_status error "conda not found" + return 1 + fi + + # Check precise environment + if ! conda env list | grep -q "^precise\s"; then + print_status error "Precise environment not found" + print_status info "Run: ./setup_precise.sh first" + return 1 + fi + + # Check voice-assistant directory + if [[ ! 
-d "$HOME/voice-assistant" ]]; then + print_status error "Voice assistant not setup" + print_status info "Run: ./setup_voice_assistant.sh first" + return 1 + fi + + print_status success "Prerequisites OK" + return 0 +} + +download_pretrained_model() { + print_status info "Downloading pre-trained 'Hey Mycroft' model..." + + # Create directory + mkdir -p "$MODELS_DIR" + + # Check if already downloaded + if [[ -f "$MODELS_DIR/${MODEL_NAME}.net" ]]; then + print_status info "Model already downloaded" + return 0 + fi + + # Download + cd "$MODELS_DIR" || return 1 + + print_status info "Fetching from GitHub..." + wget -q --show-progress "$MODEL_URL" || { + print_status error "Failed to download model" + return 1 + } + + # Extract + print_status info "Extracting model..." + tar xzf hey-mycroft.tar.gz || { + print_status error "Failed to extract model" + return 1 + } + + # Verify + if [[ ! -f "${MODEL_NAME}.net" ]]; then + print_status error "Model file not found after extraction" + return 1 + fi + + print_status success "Model downloaded: $MODELS_DIR/${MODEL_NAME}.net" + return 0 +} + +test_model() { + print_status info "Testing wake word model..." + + cd "$MODELS_DIR" || return 1 + + # Activate conda + eval "$(conda shell.bash hook)" + conda activate precise || { + print_status error "Failed to activate precise environment" + return 1 + } + + cat << EOF + +${CYAN}═══════════════════════════════════════════════════${NC} +${CYAN} Wake Word Test: "Hey Mycroft"${NC} +${CYAN}═══════════════════════════════════════════════════${NC} + +${YELLOW}Instructions:${NC} +1. Speak "Hey Mycroft" into your microphone +2. You should see ${GREEN}"!"${NC} when detected +3. Try other phrases - should ${RED}not${NC} trigger +4. Press ${RED}Ctrl+C${NC} when done testing + +${CYAN}Starting in 3 seconds...${NC} + +EOF + + sleep 3 + + # Test the model + precise-listen "${MODEL_NAME}.net" || { + print_status error "Model test failed" + return 1 + } + + print_status success "Model test complete!" 
+ return 0 +} + +update_config() { + print_status info "Updating voice assistant configuration..." + + local config_file="$HOME/voice-assistant/config/.env" + + if [[ ! -f "$config_file" ]]; then + print_status error "Config file not found: $config_file" + return 1 + fi + + # Update PRECISE_MODEL if exists, otherwise add it + if grep -q "^PRECISE_MODEL=" "$config_file"; then + sed -i "s|^PRECISE_MODEL=.*|PRECISE_MODEL=$MODELS_DIR/${MODEL_NAME}.net|" "$config_file" + else + echo "PRECISE_MODEL=$MODELS_DIR/${MODEL_NAME}.net" >> "$config_file" + fi + + # Update sensitivity if not set + if ! grep -q "^PRECISE_SENSITIVITY=" "$config_file"; then + echo "PRECISE_SENSITIVITY=0.5" >> "$config_file" + fi + + print_status success "Configuration updated" + return 0 +} + +start_server() { + print_status info "Starting voice assistant server..." + + cd "$HOME/voice-assistant" || return 1 + + # Activate conda + eval "$(conda shell.bash hook)" + conda activate precise || { + print_status error "Failed to activate environment" + return 1 + } + + cat << EOF + +${CYAN}═══════════════════════════════════════════════════${NC} +${GREEN} Starting Voice Assistant Server${NC} +${CYAN}═══════════════════════════════════════════════════${NC} + +${YELLOW}Configuration:${NC} + Wake word: ${GREEN}Hey Mycroft${NC} + Model: ${MODEL_NAME}.net + Server: http://0.0.0.0:5000 + +${YELLOW}What to do next:${NC} + 1. Wait for "Precise listening started" message + 2. Say ${GREEN}"Hey Mycroft"${NC} to test wake word + 3. Say a command like ${GREEN}"turn on the lights"${NC} + 4. Check server logs for activity + +${YELLOW}Press Ctrl+C to stop the server${NC} + +${CYAN}Starting server...${NC} + +EOF + + # Check if HA token is set + if ! grep -q "^HA_TOKEN=..*" config/.env; then + print_status warning "Home Assistant token not set!" + print_status warning "Commands won't execute without it." + print_status info "Edit config/.env and add your HA token" + echo + read -p "Continue anyway? 
(y/N): " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + return 1 + fi + fi + + # Start server + python voice_server.py \ + --enable-precise \ + --precise-model "$MODELS_DIR/${MODEL_NAME}.net" \ + --precise-sensitivity 0.5 + + return $? +} + +create_systemd_service() { + print_status info "Creating systemd service..." + + local service_file="/etc/systemd/system/voice-assistant.service" + + # Check if we should update existing service + if [[ -f "$service_file" ]]; then + print_status warning "Service file already exists" + read -p "Update with Hey Mycroft configuration? (y/N): " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + return 0 + fi + fi + + # Create service file + sudo tee "$service_file" > /dev/null << EOF +[Unit] +Description=Voice Assistant with Hey Mycroft Wake Word +After=network.target + +[Service] +Type=simple +User=$USER +WorkingDirectory=$HOME/voice-assistant +Environment="PATH=$HOME/miniconda3/envs/precise/bin:/usr/local/bin:/usr/bin:/bin" +EnvironmentFile=$HOME/voice-assistant/config/.env +ExecStart=$HOME/miniconda3/envs/precise/bin/python voice_server.py \\ + --enable-precise \\ + --precise-model $MODELS_DIR/${MODEL_NAME}.net \\ + --precise-sensitivity 0.5 +Restart=on-failure +RestartSec=10 +StandardOutput=append:$HOME/voice-assistant/logs/voice_assistant.log +StandardError=append:$HOME/voice-assistant/logs/voice_assistant_error.log + +[Install] +WantedBy=multi-user.target +EOF + + # Reload systemd + sudo systemctl daemon-reload + + print_status success "Systemd service created" + + cat << EOF + +${CYAN}To enable and start the service:${NC} + sudo systemctl enable voice-assistant + sudo systemctl start voice-assistant + sudo systemctl status voice-assistant + +${CYAN}To view logs:${NC} + journalctl -u voice-assistant -f + +EOF + + read -p "Enable service now? 
(y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + sudo systemctl enable voice-assistant + sudo systemctl start voice-assistant + sleep 2 + sudo systemctl status voice-assistant + fi +} + +print_next_steps() { + cat << EOF + +${GREEN}═══════════════════════════════════════════════════${NC} +${GREEN} Success! Your voice assistant is ready!${NC} +${GREEN}═══════════════════════════════════════════════════${NC} + +${CYAN}What you have:${NC} + ✓ Pre-trained "Hey Mycroft" wake word + ✓ Voice assistant server configured + ✓ Ready to control Home Assistant + +${CYAN}Quick test:${NC} + 1. Say: ${GREEN}"Hey Mycroft"${NC} + 2. Say: ${GREEN}"Turn on the living room lights"${NC} + 3. Check if command executed + +${CYAN}Next steps:${NC} + 1. ${YELLOW}Configure Home Assistant entities${NC} + Edit: ~/voice-assistant/config/.env + Add: HA_TOKEN=your_token_here + + 2. ${YELLOW}Add more entity mappings${NC} + Edit: voice_server.py + Update: IntentParser.ENTITY_MAP + + 3. ${YELLOW}Fine-tune for your voice (optional)${NC} + cd ~/precise-models/hey-mycroft-custom + ./1-record-wake-word.sh + # Record 20-30 samples + precise-train -e 30 hey-mycroft-custom.net . \\ + --from-checkpoint $MODELS_DIR/${MODEL_NAME}.net + + 4. ${YELLOW}Setup Maix Duino${NC} + See: QUICKSTART.md Phase 2 + +${CYAN}Useful commands:${NC} + # Test wake word only + cd $MODELS_DIR && conda activate precise + precise-listen ${MODEL_NAME}.net + + # Check server health + curl http://localhost:5000/health + + # Monitor logs + journalctl -u voice-assistant -f + +${CYAN}Documentation:${NC} + README.md - Project overview + WAKE_WORD_ADVANCED.md - Multiple wake words guide + QUICKSTART.md - Complete setup guide + +${GREEN}Happy voice assisting! 
🎙️${NC} + +EOF +} + +# ----- Main ----- +main() { + cat << EOF +${CYAN}═══════════════════════════════════════════════════${NC} +${CYAN} Quick Start: Hey Mycroft Wake Word${NC} +${CYAN}═══════════════════════════════════════════════════${NC} + +${YELLOW}This script will:${NC} + 1. Download pre-trained "Hey Mycroft" model + 2. Test wake word detection + 3. Configure voice assistant server + 4. Start the server (optional) + +${YELLOW}Total time: ~5 minutes (no training!)${NC} + +EOF + + # Parse arguments + parse_args "$@" + + # Check prerequisites + check_prerequisites || exit 1 + + # Download model + download_pretrained_model || exit 1 + + # Test model + print_status info "Ready to test wake word" + read -p "Test now? (Y/n): " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Nn]$ ]]; then + test_model + fi + + # If test-only mode, stop here + if [[ "$TEST_ONLY" == "true" ]]; then + print_status success "Test complete!" + print_status info "Model location: $MODELS_DIR/${MODEL_NAME}.net" + exit 0 + fi + + # Update configuration + update_config || exit 1 + + # Start server + read -p "Start voice assistant server now? (Y/n): " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Nn]$ ]]; then + start_server + else + # Offer to create systemd service + read -p "Create systemd service instead? 
(y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + create_systemd_service + fi + fi + + # Print next steps + print_next_steps +} + +# Run main +main "$@" diff --git a/scripts/setup_precise.sh b/scripts/setup_precise.sh new file mode 100755 index 0000000..5d13368 --- /dev/null +++ b/scripts/setup_precise.sh @@ -0,0 +1,630 @@ +#!/usr/bin/env bash +# +# Path: setup_precise.sh +# +# Purpose and usage: +# Sets up Mycroft Precise wake word detection on Heimdall +# - Creates conda environment for Precise +# - Installs TensorFlow 1.x and dependencies +# - Downloads precise-engine +# - Sets up training directories +# - Provides helper scripts for training +# +# Requirements: +# - conda/miniconda installed +# - Internet connection for downloads +# - Microphone for recording samples +# +# Usage: +# ./setup_precise.sh [--wake-word "phrase"] [--env-name NAME] +# +# Author: PRbL Library +# Created: $(date +"%Y-%m-%d") + +# ----- PRbL Color and output functions ----- +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[0;33m' +BLUE='\033[0;34m' +PURPLE='\033[0;35m' +CYAN='\033[0;36m' +NC='\033[0m' # No Color + +print_status() { + local level="$1" + shift + case "$level" in + "info") echo -e "${BLUE}[INFO]${NC} $*" >&2 ;; + "success") echo -e "${GREEN}[SUCCESS]${NC} $*" >&2 ;; + "warning") echo -e "${YELLOW}[WARNING]${NC} $*" >&2 ;; + "error") echo -e "${RED}[ERROR]${NC} $*" >&2 ;; + "debug") [[ "$VERBOSE" == "true" ]] && echo -e "${PURPLE}[DEBUG]${NC} $*" >&2 ;; + *) echo -e "$*" >&2 ;; + esac +} + +# ----- Configuration ----- +CONDA_ENV_NAME="precise" +WAKE_WORD="hey computer" +MODELS_DIR="$HOME/precise-models" +VERBOSE=false + +# ----- Dependency checking ----- +command_exists() { + command -v "$1" &> /dev/null +} + +check_conda() { + if ! command_exists conda; then + print_status error "conda not found. Please install miniconda first." 
+ return 1 + fi + return 0 +} + +# ----- Parse arguments ----- +parse_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --wake-word) + WAKE_WORD="$2" + shift 2 + ;; + --env-name) + CONDA_ENV_NAME="$2" + shift 2 + ;; + -v|--verbose) + VERBOSE=true + shift + ;; + -h|--help) + cat << EOF +Usage: $(basename "$0") [OPTIONS] + +Options: + --wake-word "phrase" Wake word to train (default: "hey computer") + --env-name NAME Custom conda environment name (default: precise) + -v, --verbose Enable verbose output + -h, --help Show this help message + +Examples: + $(basename "$0") --wake-word "hey jarvis" + $(basename "$0") --env-name mycroft-precise + +EOF + exit 0 + ;; + *) + print_status error "Unknown option: $1" + exit 1 + ;; + esac + done +} + +# ----- Setup functions ----- + +create_conda_environment() { + print_status info "Creating conda environment: $CONDA_ENV_NAME" + + # Check if environment already exists + if conda env list | grep -q "^${CONDA_ENV_NAME}\s"; then + print_status warning "Environment $CONDA_ENV_NAME already exists" + read -p "Remove and recreate? (y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + print_status info "Removing existing environment..." + conda env remove -n "$CONDA_ENV_NAME" -y + else + print_status info "Using existing environment" + return 0 + fi + fi + + # Create new environment with Python 3.7 (required for TF 1.15) + print_status info "Creating Python 3.7 environment..." + conda create -n "$CONDA_ENV_NAME" python=3.7 -y || { + print_status error "Failed to create conda environment" + return 1 + } + + print_status success "Conda environment created" + return 0 +} + +install_tensorflow() { + print_status info "Installing TensorFlow 1.15..." 
+ + # Activate conda environment + eval "$(conda shell.bash hook)" + conda activate "$CONDA_ENV_NAME" || { + print_status error "Failed to activate conda environment" + return 1 + } + + # Install TensorFlow 1.15 (last 1.x version) + pip install tensorflow==1.15.5 --break-system-packages || { + print_status error "Failed to install TensorFlow" + return 1 + } + + # Verify installation + python -c "import tensorflow as tf; print(f'TensorFlow {tf.__version__} installed')" || { + print_status error "TensorFlow installation verification failed" + return 1 + } + + print_status success "TensorFlow 1.15 installed" + return 0 +} + +install_precise() { + print_status info "Installing Mycroft Precise..." + + # Activate conda environment + eval "$(conda shell.bash hook)" + conda activate "$CONDA_ENV_NAME" || { + print_status error "Failed to activate conda environment" + return 1 + } + + # Install audio dependencies + print_status info "Installing system audio dependencies..." + if command_exists apt-get; then + sudo apt-get update + sudo apt-get install -y portaudio19-dev sox libatlas-base-dev || { + print_status warning "Some audio dependencies failed to install" + } + fi + + # Install Python audio libraries + pip install pyaudio --break-system-packages || { + print_status warning "PyAudio installation failed (may need manual installation)" + } + + # Install Precise + pip install mycroft-precise --break-system-packages || { + print_status error "Failed to install Mycroft Precise" + return 1 + } + + # Verify installation + python -c "import precise_runner; print('Precise installed successfully')" || { + print_status error "Precise installation verification failed" + return 1 + } + + print_status success "Mycroft Precise installed" + return 0 +} + +download_precise_engine() { + print_status info "Downloading precise-engine..." 
+ + local engine_version="0.3.0" + local engine_url="https://github.com/MycroftAI/mycroft-precise/releases/download/v${engine_version}/precise-engine_${engine_version}_x86_64.tar.gz" + local temp_dir=$(mktemp -d) + + # Download engine + wget -q --show-progress -O "$temp_dir/precise-engine.tar.gz" "$engine_url" || { + print_status error "Failed to download precise-engine" + rm -rf "$temp_dir" + return 1 + } + + # Extract + tar xzf "$temp_dir/precise-engine.tar.gz" -C "$temp_dir" || { + print_status error "Failed to extract precise-engine" + rm -rf "$temp_dir" + return 1 + } + + # Install to /usr/local/bin + sudo cp "$temp_dir/precise-engine/precise-engine" /usr/local/bin/ || { + print_status error "Failed to install precise-engine" + rm -rf "$temp_dir" + return 1 + } + + sudo chmod +x /usr/local/bin/precise-engine + + # Clean up + rm -rf "$temp_dir" + + # Verify installation + precise-engine --version || { + print_status error "precise-engine installation verification failed" + return 1 + } + + print_status success "precise-engine installed" + return 0 +} + +create_training_directory() { + print_status info "Creating training directory structure..." + + # Sanitize wake word for directory name + local wake_word_dir=$(echo "$WAKE_WORD" | tr ' ' '-' | tr '[:upper:]' '[:lower:]') + local project_dir="$MODELS_DIR/$wake_word_dir" + + mkdir -p "$project_dir"/{wake-word,not-wake-word,test/wake-word,test/not-wake-word} + + print_status success "Training directory created: $project_dir" + + # Store project path for later use + echo "$project_dir" > "$MODELS_DIR/.current_project" + + return 0 +} + +create_training_scripts() { + print_status info "Creating training helper scripts..." 
+ + local wake_word_dir=$(echo "$WAKE_WORD" | tr ' ' '-' | tr '[:upper:]' '[:lower:]') + local project_dir="$MODELS_DIR/$wake_word_dir" + + # Create recording script + cat > "$project_dir/1-record-wake-word.sh" << 'EOF' +#!/bin/bash +# Step 1: Record wake word samples +# Run this script and follow the prompts to record ~50-100 samples + +eval "$(conda shell.bash hook)" +conda activate precise + +echo "Recording wake word samples..." +echo "Press SPACE to start/stop recording" +echo "Press Ctrl+C when done (aim for 50-100 samples)" +echo "" + +precise-collect +EOF + + # Create not-wake-word recording script + cat > "$project_dir/2-record-not-wake-word.sh" << 'EOF' +#!/bin/bash +# Step 2: Record "not wake word" samples +# Record random speech, TV, music, similar-sounding phrases + +eval "$(conda shell.bash hook)" +conda activate precise + +echo "Recording not-wake-word samples..." +echo "Record:" +echo " - Normal conversation" +echo " - TV/music background" +echo " - Similar sounding phrases" +echo " - Ambient noise" +echo "" +echo "Press SPACE to start/stop recording" +echo "Press Ctrl+C when done (aim for 200-500 samples)" +echo "" + +precise-collect -f not-wake-word/samples.wav +EOF + + # Create training script + cat > "$project_dir/3-train-model.sh" << EOF +#!/bin/bash +# Step 3: Train the model +# This will train for 60 epochs (adjust -e parameter for more/less) + +eval "\$(conda shell.bash hook)" +conda activate precise + +echo "Training wake word model..." +echo "This will take 30-60 minutes..." +echo "" + +# Train model +precise-train -e 60 ${wake_word_dir}.net . + +echo "" +echo "Training complete!" +echo "Test with: precise-listen ${wake_word_dir}.net" +EOF + + # Create testing script + cat > "$project_dir/4-test-model.sh" << EOF +#!/bin/bash +# Step 4: Test the model with live microphone + +eval "\$(conda shell.bash hook)" +conda activate precise + +echo "Testing wake word model..." +echo "Speak your wake word - you should see '!' 
when detected" +echo "Speak other phrases - should not trigger" +echo "" +echo "Press Ctrl+C to exit" +echo "" + +precise-listen ${wake_word_dir}.net +EOF + + # Create evaluation script + cat > "$project_dir/5-evaluate-model.sh" << EOF +#!/bin/bash +# Step 5: Evaluate model on test set + +eval "\$(conda shell.bash hook)" +conda activate precise + +echo "Evaluating wake word model on test set..." +echo "" + +precise-test ${wake_word_dir}.net test/ + +echo "" +echo "Check metrics above:" +echo " - Wake word accuracy should be >95%" +echo " - False positive rate should be <5%" +EOF + + # Create tuning script + cat > "$project_dir/6-tune-threshold.sh" << EOF +#!/bin/bash +# Step 6: Tune activation threshold + +eval "\$(conda shell.bash hook)" +conda activate precise + +echo "Testing different thresholds..." +echo "" +echo "Default threshold: 0.5" +echo "Higher = fewer false positives, may miss some wake words" +echo "Lower = catch more wake words, more false positives" +echo "" + +for threshold in 0.3 0.5 0.7; do + echo "Testing threshold: \$threshold" + echo "Press Ctrl+C to try next threshold" + precise-listen ${wake_word_dir}.net -t \$threshold +done +EOF + + # Make all scripts executable + chmod +x "$project_dir"/*.sh + + print_status success "Training scripts created in $project_dir" + return 0 +} + +create_readme() { + print_status info "Creating README..." + + local wake_word_dir=$(echo "$WAKE_WORD" | tr ' ' '-' | tr '[:upper:]' '[:lower:]') + local project_dir="$MODELS_DIR/$wake_word_dir" + + cat > "$project_dir/README.md" << EOF +# Wake Word Training: "$WAKE_WORD" + +## Quick Start + +Follow these steps in order: + +### 1. Record Wake Word Samples +\`\`\`bash +./1-record-wake-word.sh +\`\`\` + +Record 50-100 samples: +- Vary your tone and speed +- Different distances from microphone +- Different background noise levels +- Have family members record too + +### 2. 
Record Not-Wake-Word Samples +\`\`\`bash +./2-record-not-wake-word.sh +\`\`\` + +Record 200-500 samples of: +- Normal conversation +- TV/music in background +- Similar sounding phrases +- Ambient household noise + +### 3. Organize Samples + +Move files into training/test split: +\`\`\`bash +# 80% of wake-word samples go to: +mv wake-word-samples-* wake-word/ + +# 20% of wake-word samples go to: +mv wake-word-samples-* test/wake-word/ + +# 80% of not-wake-word samples go to: +mv not-wake-word-samples-* not-wake-word/ + +# 20% of not-wake-word samples go to: +mv not-wake-word-samples-* test/not-wake-word/ +\`\`\` + +### 4. Train Model +\`\`\`bash +./3-train-model.sh +\`\`\` + +Wait 30-60 minutes for training to complete. + +### 5. Test Model +\`\`\`bash +./4-test-model.sh +\`\`\` + +Speak your wake word and verify detection. + +### 6. Evaluate Model +\`\`\`bash +./5-evaluate-model.sh +\`\`\` + +Check accuracy metrics on test set. + +### 7. Tune Threshold +\`\`\`bash +./6-tune-threshold.sh +\`\`\` + +Find the best threshold for your environment. + +## Tips for Good Training + +1. **Quality over quantity** - Clear samples are better than many poor ones +2. **Diverse conditions** - Different noise levels, distances, speakers +3. **Hard negatives** - Include similar-sounding phrases in not-wake-word set +4. **Regular updates** - Add false positives/negatives and retrain + +## Next Steps + +Once trained and tested: + +1. Copy model to voice assistant server: + \`\`\`bash + cp ${wake_word_dir}.net ~/voice-assistant/models/ + \`\`\` + +2. Update voice assistant config: + \`\`\`bash + vim ~/voice-assistant/config/.env + # Set: PRECISE_MODEL=~/voice-assistant/models/${wake_word_dir}.net + \`\`\` + +3. 
Restart voice assistant service: + \`\`\`bash + sudo systemctl restart voice-assistant + \`\`\` + +## Troubleshooting + +**Low accuracy?** +- Collect more training samples +- Increase training epochs (edit 3-train-model.sh, change -e 60 to -e 120) +- Verify 80/20 train/test split + +**Too many false positives?** +- Increase threshold (use 6-tune-threshold.sh) +- Add false trigger audio to not-wake-word set +- Retrain with more diverse negative samples + +**Misses wake words?** +- Lower threshold +- Add missed samples to training set +- Ensure good audio quality + +## Resources + +- Mycroft Precise Docs: https://github.com/MycroftAI/mycroft-precise +- Training Guide: https://mycroft-ai.gitbook.io/docs/mycroft-technologies/precise +- Community Models: https://github.com/MycroftAI/precise-data +EOF + + print_status success "README created in $project_dir" + return 0 +} + +download_pretrained_models() { + print_status info "Downloading pre-trained models..." + + # Create models directory + mkdir -p "$MODELS_DIR/pretrained" + + # Download Hey Mycroft model (as example/base) + local model_url="https://github.com/MycroftAI/precise-data/raw/models-dev/hey-mycroft.tar.gz" + + if [[ ! -f "$MODELS_DIR/pretrained/hey-mycroft.net" ]]; then + print_status info "Downloading Hey Mycroft model..." 
+ wget -q --show-progress -O "$MODELS_DIR/pretrained/hey-mycroft.tar.gz" "$model_url" || { + print_status warning "Failed to download pre-trained model (optional)" + return 0 + } + + tar xzf "$MODELS_DIR/pretrained/hey-mycroft.tar.gz" -C "$MODELS_DIR/pretrained/" || { + print_status warning "Failed to extract pre-trained model" + return 0 + } + + print_status success "Pre-trained model downloaded" + else + print_status info "Pre-trained model already exists" + fi + + return 0 +} + +print_next_steps() { + local wake_word_dir=$(echo "$WAKE_WORD" | tr ' ' '-' | tr '[:upper:]' '[:lower:]') + local project_dir="$MODELS_DIR/$wake_word_dir" + + cat << EOF + +${GREEN}Setup complete!${NC} + +Wake word: "$WAKE_WORD" +Project directory: $project_dir + +${BLUE}Next steps:${NC} + +1. ${CYAN}Activate conda environment:${NC} + conda activate $CONDA_ENV_NAME + +2. ${CYAN}Navigate to project directory:${NC} + cd $project_dir + +3. ${CYAN}Follow the README or run scripts in order:${NC} + ./1-record-wake-word.sh # Record wake word samples + ./2-record-not-wake-word.sh # Record negative samples + # Organize samples into train/test directories + ./3-train-model.sh # Train the model (30-60 min) + ./4-test-model.sh # Test with microphone + ./5-evaluate-model.sh # Check accuracy metrics + ./6-tune-threshold.sh # Find best threshold + +${BLUE}Helpful commands:${NC} + +Test pre-trained model: + conda activate $CONDA_ENV_NAME + precise-listen $MODELS_DIR/pretrained/hey-mycroft.net + +Check precise-engine: + precise-engine --version + +${BLUE}Resources:${NC} + +Full guide: See MYCROFT_PRECISE_GUIDE.md +Project README: $project_dir/README.md +Mycroft Docs: https://github.com/MycroftAI/mycroft-precise + +EOF +} + +# ----- Main ----- +main() { + print_status info "Starting Mycroft Precise setup..." 
+ + # Parse arguments + parse_args "$@" + + # Check dependencies + check_conda || exit 1 + + # Setup steps + create_conda_environment || exit 1 + install_tensorflow || exit 1 + install_precise || exit 1 + download_precise_engine || exit 1 + create_training_directory || exit 1 + create_training_scripts || exit 1 + create_readme || exit 1 + download_pretrained_models || exit 1 + + # Print next steps + print_next_steps +} + +# Run main +main "$@" diff --git a/scripts/setup_voice_assistant.sh b/scripts/setup_voice_assistant.sh new file mode 100755 index 0000000..bcceced --- /dev/null +++ b/scripts/setup_voice_assistant.sh @@ -0,0 +1,429 @@ +#!/usr/bin/env bash +# +# Path: setup_voice_assistant.sh +# +# Purpose and usage: +# Sets up the voice assistant server environment on Heimdall +# - Creates conda environment +# - Installs dependencies (Whisper, Flask, Piper TTS) +# - Downloads and configures TTS models +# - Sets up systemd service (optional) +# - Configures environment variables +# +# Requirements: +# - conda/miniconda installed +# - Internet connection for downloads +# - Sudo access (for systemd service setup) +# +# Usage: +# ./setup_voice_assistant.sh [--no-service] [--env-name NAME] +# +# Author: PRbL Library +# Created: $(date +"%Y-%m-%d") + +# ----- PRbL Color and output functions ----- +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[0;33m' +BLUE='\033[0;34m' +PURPLE='\033[0;35m' +CYAN='\033[0;36m' +NC='\033[0m' # No Color + +print_status() { + local level="$1" + shift + case "$level" in + "info") echo -e "${BLUE}[INFO]${NC} $*" >&2 ;; + "success") echo -e "${GREEN}[SUCCESS]${NC} $*" >&2 ;; + "warning") echo -e "${YELLOW}[WARNING]${NC} $*" >&2 ;; + "error") echo -e "${RED}[ERROR]${NC} $*" >&2 ;; + "debug") [[ "$VERBOSE" == "true" ]] && echo -e "${PURPLE}[DEBUG]${NC} $*" >&2 ;; + *) echo -e "$*" >&2 ;; + esac +} + +# ----- Configuration ----- +CONDA_ENV_NAME="voice-assistant" +PROJECT_DIR="$HOME/voice-assistant" +INSTALL_SYSTEMD=true +VERBOSE=false + +# 
----- Dependency checking ----- +command_exists() { + command -v "$1" &> /dev/null +} + +check_conda() { + if ! command_exists conda; then + print_status error "conda not found. Please install miniconda first." + print_status info "Install with: wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" + print_status info " bash Miniconda3-latest-Linux-x86_64.sh" + return 1 + fi + return 0 +} + +# ----- Parse arguments ----- +parse_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --no-service) + INSTALL_SYSTEMD=false + shift + ;; + --env-name) + CONDA_ENV_NAME="$2" + shift 2 + ;; + -v|--verbose) + VERBOSE=true + shift + ;; + -h|--help) + cat << EOF +Usage: $(basename "$0") [OPTIONS] + +Options: + --no-service Don't install systemd service + --env-name NAME Custom conda environment name (default: voice-assistant) + -v, --verbose Enable verbose output + -h, --help Show this help message + +EOF + exit 0 + ;; + *) + print_status error "Unknown option: $1" + exit 1 + ;; + esac + done +} + +# ----- Setup functions ----- + +create_project_directory() { + print_status info "Creating project directory: $PROJECT_DIR" + + if [[ ! -d "$PROJECT_DIR" ]]; then + mkdir -p "$PROJECT_DIR" || { + print_status error "Failed to create project directory" + return 1 + } + fi + + # Create subdirectories + mkdir -p "$PROJECT_DIR"/{logs,models,config} + + print_status success "Project directory created" + return 0 +} + +create_conda_environment() { + print_status info "Creating conda environment: $CONDA_ENV_NAME" + + # Check if environment already exists + if conda env list | grep -q "^${CONDA_ENV_NAME}\s"; then + print_status warning "Environment $CONDA_ENV_NAME already exists" + read -p "Remove and recreate? (y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + print_status info "Removing existing environment..." 
conda env remove -n "$CONDA_ENV_NAME" -y
+        else
+            print_status info "Using existing environment"
+            return 0
+        fi
+    fi
+
+    # Create new environment
+    print_status info "Creating Python 3.10 environment..."
+    conda create -n "$CONDA_ENV_NAME" python=3.10 -y || {
+        print_status error "Failed to create conda environment"
+        return 1
+    }
+
+    print_status success "Conda environment created"
+    return 0
+}
+
+# Installs Whisper, Flask, requests, python-dotenv, Piper TTS into the conda env.
+# Required packages abort with return 1; optional ones only warn, so setup continues.
+install_python_dependencies() {
+    print_status info "Installing Python dependencies..."
+
+    # Activate conda environment (hook needed because this runs in a non-login shell)
+    eval "$(conda shell.bash hook)"
+    conda activate "$CONDA_ENV_NAME" || {
+        print_status error "Failed to activate conda environment"
+        return 1
+    }
+
+    # Install base dependencies
+    # NOTE(review): --break-system-packages targets externally-managed system Pythons;
+    # inside a conda env it should be unnecessary (and older pip rejects the flag) — confirm intent.
+    print_status info "Installing base packages..."
+    pip install --upgrade pip --break-system-packages || true
+
+    # Install Whisper (OpenAI)
+    print_status info "Installing OpenAI Whisper..."
+    pip install -U openai-whisper --break-system-packages || {
+        print_status error "Failed to install Whisper"
+        return 1
+    }
+
+    # Install Flask
+    print_status info "Installing Flask..."
+    pip install flask --break-system-packages || {
+        print_status error "Failed to install Flask"
+        return 1
+    }
+
+    # Install requests
+    print_status info "Installing requests..."
+    pip install requests --break-system-packages || {
+        print_status error "Failed to install requests"
+        return 1
+    }
+
+    # Install python-dotenv (optional: failure downgrades to a warning)
+    print_status info "Installing python-dotenv..."
+    pip install python-dotenv --break-system-packages || {
+        print_status warning "Failed to install python-dotenv (optional)"
+    }
+
+    # Install Piper TTS
+    print_status info "Installing Piper TTS..."
+ # Note: Piper TTS installation method varies, adjust as needed + # For now, we'll install the Python package if available + pip install piper-tts --break-system-packages || { + print_status warning "Piper TTS pip package not found" + print_status info "You may need to install Piper manually from: https://github.com/rhasspy/piper" + } + + # Install PyAudio for audio handling + print_status info "Installing PyAudio dependencies..." + if command_exists apt-get; then + sudo apt-get install -y portaudio19-dev python3-pyaudio || { + print_status warning "Failed to install portaudio dev packages" + } + fi + + pip install pyaudio --break-system-packages || { + print_status warning "Failed to install PyAudio (may need manual installation)" + } + + print_status success "Python dependencies installed" + return 0 +} + +download_piper_models() { + print_status info "Downloading Piper TTS models..." + + local models_dir="$PROJECT_DIR/models/piper" + mkdir -p "$models_dir" + + # Download a default voice model + # Example: en_US-lessac-medium + local model_url="https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/en_US-lessac-medium.onnx" + local config_url="https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/en_US-lessac-medium.onnx.json" + + if [[ ! -f "$models_dir/en_US-lessac-medium.onnx" ]]; then + print_status info "Downloading voice model..." + wget -q --show-progress -O "$models_dir/en_US-lessac-medium.onnx" "$model_url" || { + print_status warning "Failed to download Piper model (manual download may be needed)" + } + + wget -q --show-progress -O "$models_dir/en_US-lessac-medium.onnx.json" "$config_url" || { + print_status warning "Failed to download Piper config" + } + else + print_status info "Piper model already downloaded" + fi + + print_status success "Piper models ready" + return 0 +} + +create_config_file() { + print_status info "Creating configuration file..." 
+ + local config_file="$PROJECT_DIR/config/.env" + + if [[ -f "$config_file" ]]; then + print_status warning "Config file already exists: $config_file" + return 0 + fi + + cat > "$config_file" << 'EOF' +# Voice Assistant Configuration +# Path: ~/voice-assistant/config/.env + +# Home Assistant Configuration +HA_URL=http://homeassistant.local:8123 +HA_TOKEN=your_long_lived_access_token_here + +# Server Configuration +SERVER_HOST=0.0.0.0 +SERVER_PORT=5000 + +# Whisper Configuration +WHISPER_MODEL=medium + +# Piper TTS Configuration +PIPER_MODEL=/path/to/piper/model.onnx +PIPER_CONFIG=/path/to/piper/model.onnx.json + +# Logging +LOG_LEVEL=INFO +LOG_FILE=/home/$USER/voice-assistant/logs/voice_assistant.log +EOF + + # Update paths in config + sed -i "s|/path/to/piper/model.onnx|$PROJECT_DIR/models/piper/en_US-lessac-medium.onnx|g" "$config_file" + sed -i "s|/path/to/piper/model.onnx.json|$PROJECT_DIR/models/piper/en_US-lessac-medium.onnx.json|g" "$config_file" + sed -i "s|/home/\$USER|$HOME|g" "$config_file" + + chmod 600 "$config_file" + + print_status success "Config file created: $config_file" + print_status warning "Please edit $config_file and add your Home Assistant token" + + return 0 +} + +create_systemd_service() { + if [[ "$INSTALL_SYSTEMD" != "true" ]]; then + print_status info "Skipping systemd service installation" + return 0 + fi + + print_status info "Creating systemd service..." 
+ + local service_file="/etc/systemd/system/voice-assistant.service" + + # Create service file + sudo tee "$service_file" > /dev/null << EOF +[Unit] +Description=Voice Assistant Server +After=network.target + +[Service] +Type=simple +User=$USER +WorkingDirectory=$PROJECT_DIR +Environment="PATH=$HOME/miniconda3/envs/$CONDA_ENV_NAME/bin:/usr/local/bin:/usr/bin:/bin" +EnvironmentFile=$PROJECT_DIR/config/.env +ExecStart=$HOME/miniconda3/envs/$CONDA_ENV_NAME/bin/python $PROJECT_DIR/voice_server.py +Restart=on-failure +RestartSec=10 +StandardOutput=append:$PROJECT_DIR/logs/voice_assistant.log +StandardError=append:$PROJECT_DIR/logs/voice_assistant_error.log + +[Install] +WantedBy=multi-user.target +EOF + + # Reload systemd + sudo systemctl daemon-reload + + print_status success "Systemd service created" + print_status info "To enable and start the service:" + print_status info " sudo systemctl enable voice-assistant" + print_status info " sudo systemctl start voice-assistant" + + return 0 +} + +create_test_script() { + print_status info "Creating test script..." + + local test_script="$PROJECT_DIR/test_server.sh" + + cat > "$test_script" << 'EOF' +#!/bin/bash +# Test script for voice assistant server + +# Activate conda environment +eval "$(conda shell.bash hook)" +conda activate voice-assistant + +# Load environment variables +if [[ -f ~/voice-assistant/config/.env ]]; then + export $(grep -v '^#' ~/voice-assistant/config/.env | xargs) +fi + +# Run server +cd ~/voice-assistant +python voice_server.py --verbose +EOF + + chmod +x "$test_script" + + print_status success "Test script created: $test_script" + return 0 +} + +install_voice_server_script() { + print_status info "Installing voice_server.py..." 
+ + # Check if voice_server.py exists in outputs + if [[ -f "$HOME/voice_server.py" ]]; then + cp "$HOME/voice_server.py" "$PROJECT_DIR/voice_server.py" + print_status success "voice_server.py installed" + elif [[ -f "./voice_server.py" ]]; then + cp "./voice_server.py" "$PROJECT_DIR/voice_server.py" + print_status success "voice_server.py installed" + else + print_status warning "voice_server.py not found in current directory" + print_status info "Please copy voice_server.py to $PROJECT_DIR manually" + fi + + return 0 +} + +# ----- Main ----- +main() { + print_status info "Starting voice assistant setup..." + + # Parse arguments + parse_args "$@" + + # Check dependencies + check_conda || exit 1 + + # Setup steps + create_project_directory || exit 1 + create_conda_environment || exit 1 + install_python_dependencies || exit 1 + download_piper_models || exit 1 + create_config_file || exit 1 + install_voice_server_script || exit 1 + create_test_script || exit 1 + + if [[ "$INSTALL_SYSTEMD" == "true" ]]; then + create_systemd_service || exit 1 + fi + + # Final instructions + print_status success "Setup complete!" + echo + print_status info "Next steps:" + print_status info "1. Edit config file: vim $PROJECT_DIR/config/.env" + print_status info "2. Add your Home Assistant long-lived access token" + print_status info "3. Test the server: $PROJECT_DIR/test_server.sh" + print_status info "4. 
Configure your Maix Duino device" + + if [[ "$INSTALL_SYSTEMD" == "true" ]]; then + echo + print_status info "To run as a service:" + print_status info " sudo systemctl enable voice-assistant" + print_status info " sudo systemctl start voice-assistant" + print_status info " sudo systemctl status voice-assistant" + fi + + echo + print_status info "Project directory: $PROJECT_DIR" + print_status info "Conda environment: $CONDA_ENV_NAME" + print_status info "Activate with: conda activate $CONDA_ENV_NAME" +} + +# Run main +main "$@" diff --git a/scripts/voice_server.py b/scripts/voice_server.py new file mode 100755 index 0000000..d18bc3d --- /dev/null +++ b/scripts/voice_server.py @@ -0,0 +1,700 @@ +#!/usr/bin/env python3 +""" +Voice Processing Server for Maix Duino Voice Assistant + +Purpose and usage: + This server runs on Heimdall (10.1.10.71) and handles: + - Audio stream reception from Maix Duino + - Speech-to-text using Whisper + - Intent recognition and Home Assistant API calls + - Text-to-speech using Piper + - Audio response streaming back to device + +Path: /home/alan/voice-assistant/voice_server.py + +Requirements: + - whisper (already installed) + - piper-tts + - flask + - requests + - python-dotenv + +Usage: + python3 voice_server.py [--host HOST] [--port PORT] [--ha-url URL] +""" + +import os +import sys +import argparse +import tempfile +import wave +import io +import re +import threading +import queue +from pathlib import Path +from typing import Optional, Dict, Any, Tuple + +import whisper +import requests +from flask import Flask, request, jsonify, send_file +from werkzeug.exceptions import BadRequest + +# Try to load environment variables +try: + from dotenv import load_dotenv + load_dotenv() +except ImportError: + print("Warning: python-dotenv not installed. 
Using environment variables only.") + +# Try to import Mycroft Precise +PRECISE_AVAILABLE = False +try: + from precise_runner import PreciseEngine, PreciseRunner + import pyaudio + PRECISE_AVAILABLE = True +except ImportError: + print("Warning: Mycroft Precise not installed. Wake word detection disabled.") + print("Install with: pip install mycroft-precise pyaudio") + +# Configuration +DEFAULT_HOST = "0.0.0.0" +DEFAULT_PORT = 5000 +DEFAULT_WHISPER_MODEL = "medium" +DEFAULT_HA_URL = os.getenv("HA_URL", "http://homeassistant.local:8123") +DEFAULT_HA_TOKEN = os.getenv("HA_TOKEN", "") +DEFAULT_PRECISE_MODEL = os.getenv("PRECISE_MODEL", "") +DEFAULT_PRECISE_SENSITIVITY = float(os.getenv("PRECISE_SENSITIVITY", "0.5")) +DEFAULT_PRECISE_ENGINE = "/usr/local/bin/precise-engine" + +# Initialize Flask app +app = Flask(__name__) +app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB max audio file + +# Global variables for loaded models +whisper_model = None +ha_client = None +precise_runner = None +precise_enabled = False +wake_word_queue = queue.Queue() # Queue for wake word detections + + +class HomeAssistantClient: + """Client for interacting with Home Assistant API""" + + def __init__(self, base_url: str, token: str): + self.base_url = base_url.rstrip('/') + self.token = token + self.session = requests.Session() + self.session.headers.update({ + 'Authorization': f'Bearer {token}', + 'Content-Type': 'application/json' + }) + + def get_state(self, entity_id: str) -> Optional[Dict[str, Any]]: + """Get the state of an entity""" + try: + response = self.session.get(f'{self.base_url}/api/states/{entity_id}') + response.raise_for_status() + return response.json() + except requests.RequestException as e: + print(f"Error getting state for {entity_id}: {e}") + return None + + def call_service(self, domain: str, service: str, entity_id: str, + **kwargs) -> bool: + """Call a Home Assistant service""" + try: + data = {'entity_id': entity_id} + data.update(kwargs) + + response = 
self.session.post(
+                f'{self.base_url}/api/services/{domain}/{service}',
+                json=data
+            )
+            response.raise_for_status()
+            return True
+        except requests.RequestException as e:
+            print(f"Error calling service {domain}.{service}: {e}")
+            return False
+
+    def turn_on(self, entity_id: str, **kwargs) -> bool:
+        """Turn on an entity"""
+        # Domain is derived from the entity id prefix (e.g. 'light.living_room' -> 'light').
+        domain = entity_id.split('.')[0]
+        return self.call_service(domain, 'turn_on', entity_id, **kwargs)
+
+    def turn_off(self, entity_id: str, **kwargs) -> bool:
+        """Turn off an entity"""
+        domain = entity_id.split('.')[0]
+        return self.call_service(domain, 'turn_off', entity_id, **kwargs)
+
+    def toggle(self, entity_id: str, **kwargs) -> bool:
+        """Toggle an entity"""
+        domain = entity_id.split('.')[0]
+        return self.call_service(domain, 'toggle', entity_id, **kwargs)
+
+
+class IntentParser:
+    """Simple pattern-based intent recognition"""
+
+    # Intent patterns (can be expanded or replaced with ML-based NLU)
+    # Each pattern's last capture group is treated as the entity name.
+    PATTERNS = {
+        'turn_on': [
+            r'turn on (the )?(.+)',
+            r'switch on (the )?(.+)',
+            r'enable (the )?(.+)',
+        ],
+        'turn_off': [
+            r'turn off (the )?(.+)',
+            r'switch off (the )?(.+)',
+            r'disable (the )?(.+)',
+        ],
+        'toggle': [
+            r'toggle (the )?(.+)',
+        ],
+        'get_state': [
+            r'what(?:\'s| is) (the )?(.+)',
+            r'how is (the )?(.+)',
+            r'status of (the )?(.+)',
+        ],
+        'get_temperature': [
+            r'what(?:\'s| is) the temperature',
+            r'how (?:warm|cold|hot) is it',
+        ],
+    }
+
+    # Entity name mapping (friendly names to entity IDs)
+    # NOTE(review): hard-coded for one household; entity ids here are placeholders — confirm against the target HA instance.
+    ENTITY_MAP = {
+        'living room light': 'light.living_room',
+        'living room lights': 'light.living_room',
+        'bedroom light': 'light.bedroom',
+        'bedroom lights': 'light.bedroom',
+        'kitchen light': 'light.kitchen',
+        'kitchen lights': 'light.kitchen',
+        'all lights': 'group.all_lights',
+        'temperature': 'sensor.temperature',
+        'thermostat': 'climate.thermostat',
+    }
+
+    def parse(self, text: str) -> Optional[Tuple[str, str, Dict[str, Any]]]:
+        """
+        Parse text into intent, entity, and parameters
+
+        Returns:
+            (intent, entity_id, params) or None if no match
+        """
+        text = text.lower().strip()
+
+        for intent, patterns in self.PATTERNS.items():
+            for pattern in patterns:
+                match = re.match(pattern, text, re.IGNORECASE)
+                if match:
+                    # Extract entity name from match groups
+                    # (first non-article group; 'the' is captured by the optional group above)
+                    entity_name = None
+                    for group in match.groups():
+                        if group and group.lower() not in ['the', 'a', 'an']:
+                            entity_name = group.lower().strip()
+                            break
+
+                    # Map entity name to entity ID
+                    entity_id = None
+                    if entity_name:
+                        entity_id = self.ENTITY_MAP.get(entity_name)
+
+                    # For get_temperature, use default sensor
+                    if intent == 'get_temperature':
+                        entity_id = self.ENTITY_MAP.get('temperature')
+
+                    if entity_id:
+                        return (intent, entity_id, {})
+
+        return None
+
+
+def load_whisper_model(model_name: str = DEFAULT_WHISPER_MODEL):
+    """Load Whisper model (lazily, cached in the module-global whisper_model)."""
+    global whisper_model
+
+    if whisper_model is None:
+        print(f"Loading Whisper model: {model_name}")
+        whisper_model = whisper.load_model(model_name)
+        print("Whisper model loaded successfully")
+
+    return whisper_model
+
+
+def transcribe_audio(audio_file_path: str) -> Optional[str]:
+    """Transcribe audio file using Whisper; returns None on any failure (best effort)."""
+    try:
+        model = load_whisper_model()
+        result = model.transcribe(audio_file_path)
+        return result['text'].strip()
+    except Exception as e:
+        # Broad catch is deliberate: transcription failure is reported to the caller as None.
+        print(f"Error transcribing audio: {e}")
+        return None
+
+
+def generate_tts(text: str) -> Optional[bytes]:
+    """
+    Generate speech from text using Piper TTS
+
+    TODO: Implement Piper TTS integration
+    For now, returns None - implement based on Piper installation
+    """
+    # Placeholder for TTS implementation
+    print(f"TTS requested for: {text}")
+
+    # You'll need to add Piper TTS integration here.
+    # Example command: echo "some text" | piper --model model.onnx --output_file out.wav
+
+    return None
+
+
+def on_wake_word_detected():
+    """
+    Callback when Mycroft Precise detects wake word
+
+    This function is called by the Precise runner when the wake word
+    is detected. It signals the main application to start recording
+    and processing the user's command.
+    """
+    print("Wake word detected by Precise!")
+    # Detection is queued; consumers poll via the /wake-word/detections route.
+    wake_word_queue.put({
+        'timestamp': time.time(),
+        'source': 'precise'
+    })
+
+
+def start_precise_listener(model_path: str, sensitivity: float = 0.5,
+                           engine_path: str = DEFAULT_PRECISE_ENGINE):
+    """
+    Start Mycroft Precise wake word detection
+
+    Args:
+        model_path: Path to .net model file
+        sensitivity: Detection threshold (0.0-1.0, default 0.5)
+        engine_path: Path to precise-engine binary
+
+    Returns:
+        PreciseRunner instance if successful, None otherwise
+    """
+    global precise_runner, precise_enabled
+
+    if not PRECISE_AVAILABLE:
+        print("Error: Mycroft Precise not available")
+        return None
+
+    # Verify model exists
+    if not os.path.exists(model_path):
+        print(f"Error: Precise model not found: {model_path}")
+        return None
+
+    # Verify engine exists
+    if not os.path.exists(engine_path):
+        print(f"Error: precise-engine not found: {engine_path}")
+        print("Download from: https://github.com/MycroftAI/mycroft-precise/releases")
+        return None
+
+    try:
+        # Create Precise engine
+        engine = PreciseEngine(engine_path, model_path)
+
+        # Create runner with callback
+        precise_runner = PreciseRunner(
+            engine,
+            sensitivity=sensitivity,
+            on_activation=on_wake_word_detected
+        )
+
+        # Start listening
+        precise_runner.start()
+        precise_enabled = True
+
+        print(f"Precise listening started:")
+        print(f"  Model: {model_path}")
+        print(f"  Sensitivity: {sensitivity}")
+        print(f"  Engine: {engine_path}")
+
+        return precise_runner
+
+    except Exception as e:
+        print(f"Error starting Precise: {e}")
+        return None
+
+
+def stop_precise_listener():
+    """Stop Mycroft Precise wake word detection"""
+    global precise_runner, precise_enabled
+
+    if precise_runner:
+        try:
+            precise_runner.stop()
+            precise_enabled = False
+            print("Precise listener stopped")
+        except Exception as e:
+            print(f"Error stopping Precise: {e}")
+
+
+def record_audio_after_wake(duration: int = 5) -> Optional[bytes]:
+    """
+    Record audio after wake word is detected
+
+    Args:
+        duration: Maximum recording duration in seconds
+
+    Returns:
+        WAV audio data or None
+
+    Note: This is for server-side wake word detection where
+    the server is also doing audio capture. For Maix Duino
+    client-side wake detection, audio comes from the client.
+    """
+    if not PRECISE_AVAILABLE:
+        return None
+
+    try:
+        # Audio settings: 16 kHz mono, 16-bit — matches what Whisper/Precise expect downstream.
+        CHUNK = 1024
+        FORMAT = pyaudio.paInt16
+        CHANNELS = 1
+        RATE = 16000
+
+        p = pyaudio.PyAudio()
+
+        # Open stream
+        stream = p.open(
+            format=FORMAT,
+            channels=CHANNELS,
+            rate=RATE,
+            input=True,
+            frames_per_buffer=CHUNK
+        )
+
+        print(f"Recording for {duration} seconds...")
+
+        frames = []
+        for _ in range(0, int(RATE / CHUNK * duration)):
+            data = stream.read(CHUNK)
+            frames.append(data)
+
+        # Stop and close stream
+        stream.stop_stream()
+        stream.close()
+        p.terminate()
+
+        # Convert to WAV
+        wav_buffer = io.BytesIO()
+        with wave.open(wav_buffer, 'wb') as wf:
+            wf.setnchannels(CHANNELS)
+            wf.setsampwidth(p.get_sample_size(FORMAT))
+            wf.setframerate(RATE)
+            wf.writeframes(b''.join(frames))
+
+        return wav_buffer.getvalue()
+
+    except Exception as e:
+        print(f"Error recording audio: {e}")
+        return None
+
+
+# NOTE(review): mid-file module import. It works because it executes at module load,
+# before any of the functions above are called, but it should live at the top of the file.
+import time
+
+
+def execute_intent(intent: str, entity_id: str, params: Dict[str, Any]) -> str:
+    """Execute an intent and return response text"""
+    # Relies on the module-global ha_client, which main() initializes before serving.
+
+    if intent == 'turn_on':
+        success = ha_client.turn_on(entity_id)
+        if success:
+            entity_name = entity_id.split('.')[-1].replace('_', ' ')
+            return f"Turned on {entity_name}"
+        else:
+            return "Sorry, I couldn't turn that on"
+
+    elif intent == 'turn_off':
+        success = ha_client.turn_off(entity_id)
+        if success:
+            entity_name = entity_id.split('.')[-1].replace('_', ' ')
+            return f"Turned off {entity_name}"
+        else:
+            return "Sorry, I couldn't turn that off"
+
+    elif intent == 'toggle':
+        success = ha_client.toggle(entity_id)
+        if success:
+            entity_name = entity_id.split('.')[-1].replace('_', ' ')
+            return f"Toggled {entity_name}"
+        else:
+            return "Sorry, I couldn't toggle that"
+
+    elif intent in ['get_state', 'get_temperature']:
+        state = ha_client.get_state(entity_id)
+        if state:
+            entity_name = entity_id.split('.')[-1].replace('_', ' ')
+            value = state.get('state', 'unknown')
+            unit = state.get('attributes', {}).get('unit_of_measurement', '')
+
+            return f"The {entity_name} is {value} {unit}".strip()
+        else:
+            return "Sorry, I couldn't get that information"
+
+    return "I didn't understand that command"
+
+
+# Flask routes
+
+@app.route('/health', methods=['GET'])
+def health():
+    """Health check endpoint"""
+    return jsonify({
+        'status': 'healthy',
+        'whisper_loaded': whisper_model is not None,
+        'ha_connected': ha_client is not None,
+        'precise_enabled': precise_enabled,
+        'precise_available': PRECISE_AVAILABLE
+    })
+
+
+@app.route('/wake-word/status', methods=['GET'])
+def wake_word_status():
+    """Get wake word detection status"""
+    return jsonify({
+        'enabled': precise_enabled,
+        'available': PRECISE_AVAILABLE,
+        'model': DEFAULT_PRECISE_MODEL if precise_enabled else None,
+        'sensitivity': DEFAULT_PRECISE_SENSITIVITY if precise_enabled else None
+    })
+
+
+@app.route('/wake-word/detections', methods=['GET'])
+def wake_word_detections():
+    """
+    Get recent wake word detections (non-blocking)
+
+    Returns any wake word detections in the queue.
+    Used for testing and monitoring.
+    """
+    detections = []
+
+    # Drain the queue without blocking. empty() can race a concurrent get,
+    # hence the Empty guard around get_nowait().
+    try:
+        while not wake_word_queue.empty():
+            detections.append(wake_word_queue.get_nowait())
+    except queue.Empty:
+        pass
+
+    return jsonify({
+        'detections': detections,
+        'count': len(detections)
+    })
+
+
+@app.route('/transcribe', methods=['POST'])
+def transcribe():
+    """
+    Transcribe audio file
+
+    Expects: WAV audio file in request body
+    Returns: JSON with transcribed text
+    """
+    if 'audio' not in request.files:
+        raise BadRequest('No audio file provided')
+
+    audio_file = request.files['audio']
+
+    # Save to temporary file (delete=False so Whisper can reopen it by path)
+    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
+        audio_file.save(temp_file.name)
+        temp_path = temp_file.name
+
+    try:
+        # Transcribe
+        text = transcribe_audio(temp_path)
+
+        if text:
+            return jsonify({
+                'success': True,
+                'text': text
+            })
+        else:
+            return jsonify({
+                'success': False,
+                'error': 'Transcription failed'
+            }), 500
+
+    finally:
+        # Clean up temp file
+        if os.path.exists(temp_path):
+            os.remove(temp_path)
+
+
+@app.route('/process', methods=['POST'])
+def process():
+    """
+    Process complete voice command
+
+    Expects: WAV audio file in request body
+    Returns: JSON with response and audio file
+    """
+    if 'audio' not in request.files:
+        raise BadRequest('No audio file provided')
+
+    audio_file = request.files['audio']
+
+    # Save to temporary file
+    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
+        audio_file.save(temp_file.name)
+        temp_path = temp_file.name
+
+    try:
+        # Step 1: Transcribe
+        text = transcribe_audio(temp_path)
+
+        if not text:
+            return jsonify({
+                'success': False,
+                'error': 'Transcription failed'
+            }), 500
+
+        print(f"Transcribed: {text}")
+
+        # Step 2: Parse intent
+        parser = IntentParser()
+        intent_result = parser.parse(text)
+
+        if not intent_result:
+            response_text = "I didn't understand that command"
+        else:
+            intent, entity_id, params = intent_result
+            print(f"Intent: {intent}, Entity: {entity_id}")
+
+            # Step 3: Execute intent
+            response_text = execute_intent(intent, entity_id, params)
+
+        print(f"Response: {response_text}")
+
+        # Step 4: Generate TTS (placeholder for now)
+        # audio_response = generate_tts(response_text)
+
+        return jsonify({
+            'success': True,
+            'transcription': text,
+            'response': response_text,
+            # 'audio_available': audio_response is not None
+        })
+
+    finally:
+        # Clean up temp file
+        if os.path.exists(temp_path):
+            os.remove(temp_path)
+
+
+@app.route('/tts', methods=['POST'])
+def tts():
+    """
+    Generate TTS audio
+
+    Expects: JSON with 'text' field
+    Returns: WAV audio file
+    """
+    data = request.get_json()
+
+    if not data or 'text' not in data:
+        raise BadRequest('No text provided')
+
+    text = data['text']
+
+    # Generate TTS (currently always None — see generate_tts placeholder)
+    audio_data = generate_tts(text)
+
+    if audio_data:
+        return send_file(
+            io.BytesIO(audio_data),
+            mimetype='audio/wav',
+            as_attachment=True,
+            download_name='response.wav'
+        )
+    else:
+        return jsonify({
+            'success': False,
+            'error': 'TTS generation not implemented yet'
+        }), 501
+
+
+def main():
+    # CLI flags mirror the DEFAULT_* constants above, which are env-var driven.
+    parser = argparse.ArgumentParser(
+        description="Voice Processing Server for Maix Duino Voice Assistant"
+    )
+    parser.add_argument('--host', default=DEFAULT_HOST,
+                        help=f'Server host (default: {DEFAULT_HOST})')
+    parser.add_argument('--port', type=int, default=DEFAULT_PORT,
+                        help=f'Server port (default: {DEFAULT_PORT})')
+    parser.add_argument('--whisper-model', default=DEFAULT_WHISPER_MODEL,
+                        help=f'Whisper model to use (default: {DEFAULT_WHISPER_MODEL})')
+    parser.add_argument('--ha-url', default=DEFAULT_HA_URL,
+                        help=f'Home Assistant URL (default: {DEFAULT_HA_URL})')
+    parser.add_argument('--ha-token', default=DEFAULT_HA_TOKEN,
+                        help='Home Assistant long-lived access token')
+    parser.add_argument('--enable-precise', action='store_true',
+                        help='Enable Mycroft Precise wake word detection')
+    parser.add_argument('--precise-model', default=DEFAULT_PRECISE_MODEL,
+                        help='Path to Precise .net model file')
+    
parser.add_argument('--precise-sensitivity', type=float, + default=DEFAULT_PRECISE_SENSITIVITY, + help='Precise sensitivity threshold (0.0-1.0, default: 0.5)') + parser.add_argument('--precise-engine', default=DEFAULT_PRECISE_ENGINE, + help=f'Path to precise-engine binary (default: {DEFAULT_PRECISE_ENGINE})') + + args = parser.parse_args() + + # Validate HA configuration + if not args.ha_token: + print("Warning: No Home Assistant token provided!") + print("Set HA_TOKEN environment variable or use --ha-token") + print("Commands will not execute without authentication.") + + # Initialize global clients + global ha_client + ha_client = HomeAssistantClient(args.ha_url, args.ha_token) + + # Load Whisper model + print(f"Starting voice processing server on {args.host}:{args.port}") + load_whisper_model(args.whisper_model) + + # Start Precise if enabled + if args.enable_precise: + if not PRECISE_AVAILABLE: + print("Error: --enable-precise specified but Mycroft Precise not installed") + print("Install with: pip install mycroft-precise pyaudio") + sys.exit(1) + + if not args.precise_model: + print("Error: --enable-precise requires --precise-model") + sys.exit(1) + + print("\nStarting Mycroft Precise wake word detection...") + precise_result = start_precise_listener( + args.precise_model, + args.precise_sensitivity, + args.precise_engine + ) + + if not precise_result: + print("Error: Failed to start Precise listener") + sys.exit(1) + + print("\nWake word detection active!") + print("The server will detect wake words and queue them for processing.") + print("Use /wake-word/detections endpoint to check for detections.\n") + + # Start Flask server + try: + app.run(host=args.host, port=args.port, debug=False) + except KeyboardInterrupt: + print("\nShutting down...") + if args.enable_precise: + stop_precise_listener() + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/scripts/voice_server_enhanced.py b/scripts/voice_server_enhanced.py new file mode 100755 index 
0000000..74e9a84 --- /dev/null +++ b/scripts/voice_server_enhanced.py @@ -0,0 +1,580 @@ +#!/usr/bin/env python3 +""" +Enhanced Voice Server with Multiple Wake Words and Speaker Identification + +Path: /home/alan/voice-assistant/voice_server_enhanced.py + +This enhanced version adds: +- Multiple wake word support +- Speaker identification using pyannote.audio +- Per-user customization +- Wake word-specific responses + +Usage: + python3 voice_server_enhanced.py \ + --enable-precise \ + --multi-wake-word \ + --enable-speaker-id +""" + +import os +import sys +import json +import argparse +import tempfile +import wave +import io +import re +import threading +import queue +import time +from pathlib import Path +from typing import Optional, Dict, Any, Tuple, List + +import whisper +import requests +from flask import Flask, request, jsonify, send_file +from werkzeug.exceptions import BadRequest + +try: + from dotenv import load_dotenv + load_dotenv() +except ImportError: + pass + +# Mycroft Precise +PRECISE_AVAILABLE = False +try: + from precise_runner import PreciseEngine, PreciseRunner + import pyaudio + PRECISE_AVAILABLE = True +except ImportError: + print("Warning: Mycroft Precise not installed") + +# Speaker identification +SPEAKER_ID_AVAILABLE = False +try: + from pyannote.audio import Inference + from scipy.spatial.distance import cosine + import numpy as np + SPEAKER_ID_AVAILABLE = True +except ImportError: + print("Warning: Speaker ID not available. 
Install: pip install pyannote.audio scipy") + +# Configuration +DEFAULT_HOST = "0.0.0.0" +DEFAULT_PORT = 5000 +DEFAULT_WHISPER_MODEL = "medium" +DEFAULT_HA_URL = os.getenv("HA_URL", "http://homeassistant.local:8123") +DEFAULT_HA_TOKEN = os.getenv("HA_TOKEN", "") +DEFAULT_PRECISE_ENGINE = "/usr/local/bin/precise-engine" +DEFAULT_HF_TOKEN = os.getenv("HF_TOKEN", "") + +# Wake word configurations +WAKE_WORD_CONFIGS = { + 'hey_mycroft': { + 'model': os.path.expanduser('~/precise-models/pretrained/hey-mycroft.net'), + 'sensitivity': 0.5, + 'response': 'Yes?', + 'enabled': True, + 'context': 'general' + }, + 'hey_computer': { + 'model': os.path.expanduser('~/precise-models/hey-computer/hey-computer.net'), + 'sensitivity': 0.5, + 'response': 'I\'m listening', + 'enabled': False, # Disabled by default (requires training) + 'context': 'general' + }, + 'jarvis': { + 'model': os.path.expanduser('~/precise-models/jarvis/jarvis.net'), + 'sensitivity': 0.6, + 'response': 'At your service', + 'enabled': False, + 'context': 'personal' + }, +} + +# Speaker profiles (stored in JSON file) +SPEAKER_PROFILES_FILE = os.path.expanduser('~/voice-assistant/config/speaker_profiles.json') + +# Flask app +app = Flask(__name__) +app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 + +# Global state +whisper_model = None +ha_client = None +precise_runners = {} +precise_enabled = False +speaker_id_enabled = False +speaker_inference = None +speaker_profiles = {} +wake_word_queue = queue.Queue() + + +class HomeAssistantClient: + """Client for Home Assistant API""" + + def __init__(self, base_url: str, token: str): + self.base_url = base_url.rstrip('/') + self.token = token + self.session = requests.Session() + self.session.headers.update({ + 'Authorization': f'Bearer {token}', + 'Content-Type': 'application/json' + }) + + def get_state(self, entity_id: str) -> Optional[Dict[str, Any]]: + try: + response = self.session.get(f'{self.base_url}/api/states/{entity_id}') + response.raise_for_status() + 
return response.json()
+        except requests.RequestException as e:
+            print(f"Error getting state for {entity_id}: {e}")
+            return None
+
+    def call_service(self, domain: str, service: str, entity_id: str, **kwargs) -> bool:
+        try:
+            data = {'entity_id': entity_id}
+            data.update(kwargs)
+            response = self.session.post(
+                f'{self.base_url}/api/services/{domain}/{service}',
+                json=data
+            )
+            response.raise_for_status()
+            return True
+        except requests.RequestException as e:
+            print(f"Error calling service {domain}.{service}: {e}")
+            return False
+
+    def turn_on(self, entity_id: str, **kwargs) -> bool:
+        # Domain is derived from the entity id prefix (e.g. 'light.bedroom' -> 'light').
+        domain = entity_id.split('.')[0]
+        return self.call_service(domain, 'turn_on', entity_id, **kwargs)
+
+    def turn_off(self, entity_id: str, **kwargs) -> bool:
+        domain = entity_id.split('.')[0]
+        return self.call_service(domain, 'turn_off', entity_id, **kwargs)
+
+
+class SpeakerIdentification:
+    """Speaker identification using pyannote.audio"""
+
+    def __init__(self, hf_token: str):
+        if not SPEAKER_ID_AVAILABLE:
+            raise ImportError("Speaker ID dependencies not available")
+
+        # Requires a Hugging Face token with access to the gated pyannote/embedding model.
+        self.inference = Inference(
+            "pyannote/embedding",
+            use_auth_token=hf_token
+        )
+        self.profiles = {}
+
+    def enroll_speaker(self, name: str, audio_file: str):
+        """Enroll a speaker from audio file"""
+        # NOTE(review): depending on pyannote version, Inference(file) may return a
+        # SlidingWindowFeature rather than a flat vector — confirm .tolist()/cosine usage.
+        embedding = self.inference(audio_file)
+        self.profiles[name] = {
+            'embedding': embedding.tolist(),  # Convert to list for JSON
+            'enrolled': time.time()
+        }
+        print(f"Enrolled speaker: {name}")
+
+    def identify_speaker(self, audio_file: str, threshold: float = 0.7) -> Optional[str]:
+        """Identify speaker from audio file"""
+        # NOTE(review): returns None when no profiles are enrolled, but the string
+        # 'unknown' for a below-threshold match — callers must handle both cases.
+        if not self.profiles:
+            return None
+
+        unknown_embedding = self.inference(audio_file)
+
+        best_match = None
+        best_similarity = 0.0
+
+        # Cosine similarity against each enrolled profile; keep the best match.
+        for name, profile in self.profiles.items():
+            known_embedding = np.array(profile['embedding'])
+            similarity = 1 - cosine(unknown_embedding, known_embedding)
+
+            if similarity > best_similarity:
+                best_similarity = similarity
+                best_match = name
+
+        if best_similarity >= threshold:
+            return best_match
+
+        return 'unknown'
+
+    def load_profiles(self, filepath: str):
+        """Load speaker profiles from JSON"""
+        if os.path.exists(filepath):
+            with open(filepath, 'r') as f:
+                self.profiles = json.load(f)
+            print(f"Loaded {len(self.profiles)} speaker profiles")
+
+    def save_profiles(self, filepath: str):
+        """Save speaker profiles to JSON"""
+        # Embeddings were converted to plain lists in enroll_speaker,
+        # so the profiles dict is JSON-serializable as-is.
+        os.makedirs(os.path.dirname(filepath), exist_ok=True)
+        with open(filepath, 'w') as f:
+            json.dump(self.profiles, f, indent=2)
+        print(f"Saved {len(self.profiles)} speaker profiles")
+
+
+def load_whisper_model(model_name: str = DEFAULT_WHISPER_MODEL):
+    """Load Whisper model (lazily, cached in the module-global whisper_model)."""
+    global whisper_model
+    if whisper_model is None:
+        print(f"Loading Whisper model: {model_name}")
+        whisper_model = whisper.load_model(model_name)
+        print("Whisper model loaded")
+    return whisper_model
+
+
+def transcribe_audio(audio_file_path: str) -> Optional[str]:
+    """Transcribe audio file; returns None on any failure (best effort)."""
+    try:
+        model = load_whisper_model()
+        result = model.transcribe(audio_file_path)
+        return result['text'].strip()
+    except Exception as e:
+        print(f"Error transcribing: {e}")
+        return None
+
+
+def on_wake_word_detected(wake_word_name: str):
+    """Callback factory for wake word detection"""
+    # Returns a zero-arg closure bound to wake_word_name, so each
+    # PreciseRunner gets its own distinguishable activation callback.
+    def callback():
+        config = WAKE_WORD_CONFIGS.get(wake_word_name, {})
+        print(f"Wake word detected: {wake_word_name}")
+
+        wake_word_queue.put({
+            'timestamp': time.time(),
+            'wake_word': wake_word_name,
+            'response': config.get('response', 'Yes?'),
+            'context': config.get('context', 'general')
+        })
+
+    return callback
+
+
+def start_multiple_wake_words(configs: Dict[str, Dict], engine_path: str):
+    """Start multiple Precise wake word listeners"""
+    # Skips disabled entries and missing model files; returns True if at
+    # least one listener started, False otherwise.
+    global precise_runners, precise_enabled
+
+    if not PRECISE_AVAILABLE:
+        print("Error: Precise not available")
+        return False
+
+    active_count = 0
+
+    for name, config in configs.items():
+        if not config.get('enabled', False):
+            continue
+
+        model_path = config['model']
+        if not os.path.exists(model_path):
+            print(f"Warning: Model not found: {model_path} (skipping {name})")
+            continue
+
+        try:
+            engine = PreciseEngine(engine_path, model_path)
+            runner = PreciseRunner(
+                engine,
+                sensitivity=config.get('sensitivity', 0.5),
+                on_activation=on_wake_word_detected(name)
+            )
+            runner.start()
+            precise_runners[name] = runner
+            active_count += 1
+
+            print(f"✓ Started wake word: {name}")
+            print(f"  Model: {model_path}")
+            print(f"  Sensitivity: {config.get('sensitivity', 0.5)}")
+
+        except Exception as e:
+            print(f"✗ Failed to start {name}: {e}")
+
+    if active_count > 0:
+        precise_enabled = True
+        print(f"\nTotal active wake words: {active_count}")
+        return True
+
+    return False
+
+
+def stop_all_wake_words():
+    """Stop all wake word listeners"""
+    global precise_runners, precise_enabled
+
+    for name, runner in precise_runners.items():
+        try:
+            runner.stop()
+            print(f"Stopped wake word: {name}")
+        except Exception as e:
+            print(f"Error stopping {name}: {e}")
+
+    # Rebind (rather than mutate) the globals so any iterators are unaffected.
+    precise_runners = {}
+    precise_enabled = False
+
+
+def init_speaker_identification(hf_token: str) -> Optional[SpeakerIdentification]:
+    """Initialize speaker identification"""
+    global speaker_inference, speaker_id_enabled
+
+    if not SPEAKER_ID_AVAILABLE:
+        print("Speaker ID not available")
+        return None
+
+    try:
+        speaker_inference = SpeakerIdentification(hf_token)
+
+        # Load existing profiles
+        if os.path.exists(SPEAKER_PROFILES_FILE):
+            speaker_inference.load_profiles(SPEAKER_PROFILES_FILE)
+
+        speaker_id_enabled = True
+        print("Speaker identification initialized")
+        return speaker_inference
+
+    except Exception as e:
+        print(f"Error initializing speaker ID: {e}")
+        return None
+
+
+# Flask routes
+
+@app.route('/health', methods=['GET'])
+def health():
+    """Health check"""
+    return jsonify({
+        'status': 'healthy',
+        'whisper_loaded': whisper_model is not None,
+        'ha_connected': ha_client is not None,
+        'precise_enabled': precise_enabled,
+        'active_wake_words': 
list(precise_runners.keys()), + 'speaker_id_enabled': speaker_id_enabled, + 'enrolled_speakers': list(speaker_inference.profiles.keys()) if speaker_inference else [] + }) + + +@app.route('/wake-words', methods=['GET']) +def list_wake_words(): + """List all configured wake words""" + wake_words = [] + + for name, config in WAKE_WORD_CONFIGS.items(): + wake_words.append({ + 'name': name, + 'enabled': config.get('enabled', False), + 'active': name in precise_runners, + 'model': config['model'], + 'sensitivity': config.get('sensitivity', 0.5), + 'response': config.get('response', ''), + 'context': config.get('context', 'general') + }) + + return jsonify({ + 'wake_words': wake_words, + 'total': len(wake_words), + 'active': len(precise_runners) + }) + + +@app.route('/wake-words//enable', methods=['POST']) +def enable_wake_word(name): + """Enable a wake word""" + if name not in WAKE_WORD_CONFIGS: + return jsonify({'error': 'Wake word not found'}), 404 + + config = WAKE_WORD_CONFIGS[name] + config['enabled'] = True + + # Start the wake word if not already running + if name not in precise_runners: + # Restart all wake words to pick up changes + # (simpler than starting individual ones) + return jsonify({ + 'message': f'Enabled {name}. Restart server to activate.' 
+ }) + + return jsonify({'message': f'Wake word {name} enabled'}) + + +@app.route('/speakers/enroll', methods=['POST']) +def enroll_speaker(): + """Enroll a new speaker""" + if not speaker_id_enabled or not speaker_inference: + return jsonify({'error': 'Speaker ID not enabled'}), 400 + + if 'audio' not in request.files: + return jsonify({'error': 'No audio file'}), 400 + + name = request.form.get('name') + if not name: + return jsonify({'error': 'No speaker name provided'}), 400 + + audio_file = request.files['audio'] + + # Save temporarily + with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp: + audio_file.save(temp.name) + temp_path = temp.name + + try: + speaker_inference.enroll_speaker(name, temp_path) + speaker_inference.save_profiles(SPEAKER_PROFILES_FILE) + + return jsonify({ + 'message': f'Enrolled speaker: {name}', + 'total_speakers': len(speaker_inference.profiles) + }) + + except Exception as e: + return jsonify({'error': str(e)}), 500 + + finally: + if os.path.exists(temp_path): + os.remove(temp_path) + + +@app.route('/speakers', methods=['GET']) +def list_speakers(): + """List enrolled speakers""" + if not speaker_id_enabled or not speaker_inference: + return jsonify({'error': 'Speaker ID not enabled'}), 400 + + speakers = [] + for name, profile in speaker_inference.profiles.items(): + speakers.append({ + 'name': name, + 'enrolled': profile.get('enrolled', 0) + }) + + return jsonify({ + 'speakers': speakers, + 'total': len(speakers) + }) + + +@app.route('/process-enhanced', methods=['POST']) +def process_enhanced(): + """ + Enhanced processing with speaker ID and wake word context + """ + if 'audio' not in request.files: + return jsonify({'error': 'No audio file'}), 400 + + wake_word = request.form.get('wake_word', 'unknown') + + audio_file = request.files['audio'] + + with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp: + audio_file.save(temp.name) + temp_path = temp.name + + try: + # Identify speaker (if enabled) + 
speaker = 'unknown' + if speaker_id_enabled and speaker_inference: + speaker = speaker_inference.identify_speaker(temp_path) + print(f"Identified speaker: {speaker}") + + # Transcribe + text = transcribe_audio(temp_path) + if not text: + return jsonify({'error': 'Transcription failed'}), 500 + + print(f"[{speaker}] via [{wake_word}]: {text}") + + # Get wake word config + config = WAKE_WORD_CONFIGS.get(wake_word, {}) + context = config.get('context', 'general') + + # Process based on context and speaker + response = f"Heard via {wake_word}: {text}" + + return jsonify({ + 'success': True, + 'transcription': text, + 'speaker': speaker, + 'wake_word': wake_word, + 'context': context, + 'response': response + }) + + finally: + if os.path.exists(temp_path): + os.remove(temp_path) + + +def main(): + parser = argparse.ArgumentParser( + description="Enhanced Voice Server with Multi-Wake-Word and Speaker ID" + ) + parser.add_argument('--host', default=DEFAULT_HOST) + parser.add_argument('--port', type=int, default=DEFAULT_PORT) + parser.add_argument('--whisper-model', default=DEFAULT_WHISPER_MODEL) + parser.add_argument('--ha-url', default=DEFAULT_HA_URL) + parser.add_argument('--ha-token', default=DEFAULT_HA_TOKEN) + parser.add_argument('--enable-precise', action='store_true', + help='Enable wake word detection') + parser.add_argument('--multi-wake-word', action='store_true', + help='Enable multiple wake words') + parser.add_argument('--precise-engine', default=DEFAULT_PRECISE_ENGINE) + parser.add_argument('--enable-speaker-id', action='store_true', + help='Enable speaker identification') + parser.add_argument('--hf-token', default=DEFAULT_HF_TOKEN, + help='HuggingFace token for speaker ID') + + args = parser.parse_args() + + # Initialize HA client + global ha_client + ha_client = HomeAssistantClient(args.ha_url, args.ha_token) + + # Load Whisper + print(f"Starting enhanced voice server on {args.host}:{args.port}") + load_whisper_model(args.whisper_model) + + # Start 
Precise (multiple wake words) + if args.enable_precise: + if not PRECISE_AVAILABLE: + print("Error: Precise not available") + sys.exit(1) + + # Enable all or just first wake word + if args.multi_wake_word: + # Enable all configured wake words + enabled_count = sum(1 for c in WAKE_WORD_CONFIGS.values() if c.get('enabled')) + print(f"\nStarting {enabled_count} wake words...") + else: + # Enable only first wake word + first_key = list(WAKE_WORD_CONFIGS.keys())[0] + WAKE_WORD_CONFIGS[first_key]['enabled'] = True + for key in list(WAKE_WORD_CONFIGS.keys())[1:]: + WAKE_WORD_CONFIGS[key]['enabled'] = False + + if not start_multiple_wake_words(WAKE_WORD_CONFIGS, args.precise_engine): + print("Error: No wake words started") + sys.exit(1) + + # Initialize speaker ID + if args.enable_speaker_id: + if not args.hf_token: + print("Error: --hf-token required for speaker ID") + sys.exit(1) + + if not init_speaker_identification(args.hf_token): + print("Warning: Speaker ID initialization failed") + + # Start server + try: + print("\n" + "="*50) + print("Server ready!") + print("="*50 + "\n") + app.run(host=args.host, port=args.port, debug=False) + except KeyboardInterrupt: + print("\nShutting down...") + stop_all_wake_words() + sys.exit(0) + + +if __name__ == '__main__': + main()