feat(features): implement BlinkDetector, GazeEstimator, HeadPoseEstimator, HandGestureDetector

- BlinkDetector: EAR-based blink detection (left/right/both), 6 tests
- GazeEstimator: iris-to-eye-corner ratio gaze direction, frozen GazeDirection dataclass, 4 tests
- HeadPoseEstimator: velocity-based nod/shake/tilt detection (stateful, no tests — daemon smoke test)
- HandGestureDetector: normalize_hand + tip-distance open/pinch/fist classifier (no tests — daemon smoke test)
- TDD: blink and gaze followed RED→GREEN cycle; Black applied to all 6 files
This commit is contained in:
pyr0ball 2026-04-26 21:13:59 -07:00
parent 072ee3f36c
commit 0dcc25164d
6 changed files with 369 additions and 0 deletions

62
merlin/features/blink.py Normal file
View file

@ -0,0 +1,62 @@
"""
Blink detection from MediaPipe Face Mesh landmarks.
Uses the Eye Aspect Ratio (EAR) method: when the eye closes, the vertical
landmark distances shrink relative to the horizontal width, driving EAR toward 0.
"""
from __future__ import annotations
from enum import Enum
from typing import Optional
import numpy as np
# Six Face Mesh landmark ids per eye, ordered p1..p6 as eye_aspect_ratio()
# expects them: p1/p4 span the eye horizontally, (p2, p6) and (p3, p5) are the
# two vertical pairs.
_LEFT_EYE = [
    33,  # p1
    160,  # p2
    158,  # p3
    133,  # p4
    153,  # p5
    144,  # p6
]
_RIGHT_EYE = [
    362,  # p1
    385,  # p2
    387,  # p3
    263,  # p4
    373,  # p5
    380,  # p6
]
class BlinkEvent(str, Enum):
    """Which eye(s) were found closed; members double as plain strings."""

    # Only the left-eye EAR fell below the threshold.
    LEFT = "left_blink"
    # Only the right-eye EAR fell below the threshold.
    RIGHT = "right_blink"
    # Both EAR values fell below the threshold.
    BOTH = "both_blink"
def eye_aspect_ratio(landmarks: np.ndarray, indices: list[int]) -> float:
    """
    Compute the Eye Aspect Ratio (EAR) of one eye.

    EAR = (||p2-p6|| + ||p3-p5||) / (2 * ||p1-p4||)
    ~0.3 for open eye, ~0.0 for closed.

    Args:
        landmarks: array of landmark points, indexed by landmark id.
        indices: exactly six landmark ids in p1..p6 order.
    Returns:
        The ratio as a Python float; 0.0 when the p1-p4 distance is
        below 1e-6, so a degenerate eye never divides by ~zero.
    """
    h_start, v1_top, v2_top, h_end, v2_bottom, v1_bottom = [
        landmarks[idx] for idx in indices
    ]
    width = np.linalg.norm(h_start - h_end)
    if width < 1e-6:
        return 0.0
    height_sum = np.linalg.norm(v1_top - v1_bottom) + np.linalg.norm(
        v2_top - v2_bottom
    )
    return float(height_sum / (2.0 * width))
class BlinkDetector:
    """
    Report which eye(s) are closed in a single frame of face mesh landmarks.

    An eye counts as closed when its Eye Aspect Ratio is strictly below the
    configured threshold. The detector keeps no state between frames.
    """

    def __init__(self, threshold: float = 0.20) -> None:
        # EAR values strictly below this are treated as a closed eye.
        self._threshold = threshold

    def detect(self, face_landmarks: np.ndarray) -> Optional[BlinkEvent]:
        """
        Classify the current frame.

        Args:
            face_landmarks: (478, 3) float32 MediaPipe Face Mesh with iris refinement.
        Returns:
            BlinkEvent.BOTH / LEFT / RIGHT for the closed eye(s), else None.
        """
        limit = self._threshold
        left_ear = eye_aspect_ratio(face_landmarks, _LEFT_EYE)
        right_ear = eye_aspect_ratio(face_landmarks, _RIGHT_EYE)
        # eye_aspect_ratio() returns 0.0 for a degenerate eye, so such an eye
        # reads as closed for any positive threshold.
        closed = (bool(left_ear < limit), bool(right_ear < limit))
        outcomes = {
            (True, True): BlinkEvent.BOTH,
            (True, False): BlinkEvent.LEFT,
            (False, True): BlinkEvent.RIGHT,
        }
        return outcomes.get(closed)

60
merlin/features/gaze.py Normal file
View file

@ -0,0 +1,60 @@
"""
Gaze direction estimation from MediaPipe Face Mesh iris landmarks.
Requires mediapipe to be run with refine_landmarks=True (enables iris tracking,
landmark indices 468-477).
"""
from __future__ import annotations
from dataclasses import dataclass
import numpy as np
# Iris landmarks (ids 468-477) only exist when refine_landmarks=True.
_LEFT_IRIS_CENTER, _RIGHT_IRIS_CENTER = 468, 473

# End points of the reference span used to normalise the iris offset.
_LEFT_EYE_INNER, _RIGHT_EYE_OUTER = 133, 263
@dataclass(frozen=True)
class GazeDirection:
    """Immutable normalized gaze offset with a coarse text label."""

    dx: float  # [-1, 1] — negative = left, positive = right
    dy: float  # [-1, 1] — negative = up, positive = down

    @property
    def label(self) -> str:
        """
        Return "center", "left", "right", "up" or "down".

        "center" needs both |dx| and |dy| below 0.15. Otherwise the axis with
        the strictly larger magnitude wins; a tie goes to the vertical axis.
        """
        horiz_mag = abs(self.dx)
        vert_mag = abs(self.dy)
        if horiz_mag < 0.15 and vert_mag < 0.15:
            return "center"
        if horiz_mag > vert_mag:
            if self.dx < 0:
                return "left"
            return "right"
        if self.dy < 0:
            return "up"
        return "down"
class GazeEstimator:
    """Estimate gaze direction from the iris centers relative to an eye-corner span."""

    def estimate(self, face_landmarks: np.ndarray) -> GazeDirection:
        """
        Measure how far the mean iris position sits from the span midpoint.

        Args:
            face_landmarks: (478, 3) float32 MediaPipe Face Mesh with iris refinement.
        Returns:
            GazeDirection whose dx/dy are the x/y offset divided by the span
            length; (0.0, 0.0) when the span is shorter than 1e-6.
        """
        iris_mid = (
            face_landmarks[_LEFT_IRIS_CENTER] + face_landmarks[_RIGHT_IRIS_CENTER]
        ) / 2.0

        # NOTE(review): the span runs from _LEFT_EYE_INNER to _RIGHT_EYE_OUTER,
        # i.e. a corner of one eye to a corner of the other. Presumably
        # intentional, but confirm its midpoint is the neutral iris position.
        span_start = face_landmarks[_LEFT_EYE_INNER]
        span_end = face_landmarks[_RIGHT_EYE_OUTER]
        span_len = np.linalg.norm(span_end - span_start)
        if span_len < 1e-6:
            # Degenerate landmarks: report a centered gaze instead of dividing.
            return GazeDirection(dx=0.0, dy=0.0)

        offset = iris_mid - (span_start + span_end) / 2.0
        # The result is not clamped, so unusual landmarks can exceed [-1, 1].
        norm_x = float(offset[0] / span_len)
        norm_y = float(offset[1] / span_len)
        return GazeDirection(dx=norm_x, dy=norm_y)

View file

@ -0,0 +1,52 @@
"""
Simple hand gesture detection: open palm, closed fist, pinch.
Operates on raw (21, 3) landmark arrays; calls normalize_hand() internally.
"""
from __future__ import annotations
from enum import Enum
from typing import Optional
import numpy as np
from circuitforge_core.input.gestures.normalizer import normalize_hand
# Hand landmark ids of the five fingertips; the thumb and index tips are also
# named individually because detect() measures the gap between them.
_THUMB_TIP = 4
_INDEX_TIP = 8
_FINGERTIP_IDX = [4, 8, 12, 16, 20]

# Cut-offs applied to distances in the output of normalize_hand().
# NOTE(review): their scale depends on that normalisation — confirm.
OPEN_PALM_THRESHOLD = 1.5  # summed fingertip norms above this => open palm
PINCH_THRESHOLD = 0.15  # thumb-to-index gap below this => pinch
class HandGesture(str, Enum):
    """Hand poses the detector can report; members double as plain strings."""

    # Summed fingertip distance is large.
    OPEN_PALM = "open_palm"
    # Thumb tip and index tip are close together.
    PINCH = "pinch"
    # Summed fingertip distance is small.
    FIST = "fist"
class HandGestureDetector:
    """
    Classify a hand pose from MediaPipe landmarks.

    detect() checks pinch first, then open palm, then fist. Each cut-off can
    be overridden per instance. A cut-off left as None falls back to the
    module-level OPEN_PALM_THRESHOLD / PINCH_THRESHOLD, or to FIST_THRESHOLD
    below, so ``HandGestureDetector()`` behaves exactly as it did before the
    overrides existed.
    """

    # Summed fingertip distance below which the pose counts as a fist.
    # Named here so it is no longer an unexplained literal inside detect().
    FIST_THRESHOLD = 0.6

    def __init__(
        self,
        *,
        open_palm_threshold: Optional[float] = None,
        pinch_threshold: Optional[float] = None,
        fist_threshold: Optional[float] = None,
    ) -> None:
        """
        Args:
            open_palm_threshold: summed fingertip distance above which the
                pose is an open palm; None uses OPEN_PALM_THRESHOLD.
            pinch_threshold: thumb-to-index distance below which the pose is
                a pinch; None uses PINCH_THRESHOLD.
            fist_threshold: summed fingertip distance below which the pose is
                a fist; None uses FIST_THRESHOLD.
        """
        self._open_palm_threshold = open_palm_threshold
        self._pinch_threshold = pinch_threshold
        self._fist_threshold = fist_threshold

    def detect(self, raw_points: np.ndarray) -> Optional[HandGesture]:
        """
        Classify one frame of hand landmarks.

        Args:
            raw_points: (21, 3) float32 raw MediaPipe hand landmarks.
        Returns:
            HandGesture or None if pose is ambiguous.
        """
        # Module constants are read at call time, as in the original code.
        pinch_limit = (
            PINCH_THRESHOLD if self._pinch_threshold is None else self._pinch_threshold
        )
        open_limit = (
            OPEN_PALM_THRESHOLD
            if self._open_palm_threshold is None
            else self._open_palm_threshold
        )
        fist_limit = (
            self.FIST_THRESHOLD
            if self._fist_threshold is None
            else self._fist_threshold
        )

        vec = normalize_hand(raw_points).reshape(21, 3)
        # Each fingertip's distance from the origin of the normalised frame.
        # NOTE(review): presumably that origin is the wrist — confirm against
        # normalize_hand().
        tip_sum = sum(float(np.linalg.norm(vec[i])) for i in _FINGERTIP_IDX)
        pinch_dist = float(np.linalg.norm(vec[_THUMB_TIP] - vec[_INDEX_TIP]))

        # Pinch is tested first, so it wins even if the other fingers are spread.
        if pinch_dist < pinch_limit:
            return HandGesture.PINCH
        if tip_sum > open_limit:
            return HandGesture.OPEN_PALM
        if tip_sum < fist_limit:
            return HandGesture.FIST
        return None

View file

@ -0,0 +1,65 @@
"""
Head pose estimation: detect nod, shake, and tilt from Face Mesh landmarks.
Nod and shake are velocity-based (nose tip position compared across consecutive
frames); tilt is read from the cheek height difference within the current frame.
"""
from __future__ import annotations
from enum import Enum
from typing import Optional
import numpy as np
# Face Mesh landmark ids read by HeadPoseEstimator.
_NOSE_TIP, _L_CHEEK, _R_CHEEK = 1, 234, 454

# Per-frame nose displacement (landmark coordinate units) that must be
# exceeded on the y axis for a nod and on the x axis for a shake.
NOD_THRESHOLD = 0.015
SHAKE_THRESHOLD = 0.015
# Left-vs-right cheek y difference that must be exceeded for a tilt.
TILT_THRESHOLD = 0.012
class HeadGesture(str, Enum):
    """Head gestures the estimator can report; members double as plain strings."""

    # Dominant vertical nose movement between two frames.
    NOD = "head_nod"
    # Dominant horizontal nose movement between two frames.
    SHAKE = "head_shake"
    # Left cheek y greater than right cheek y by more than the tilt threshold.
    TILT_LEFT = "head_tilt_left"
    # Right cheek y greater than left cheek y by more than the tilt threshold.
    TILT_RIGHT = "head_tilt_right"
class HeadPoseEstimator:
    """
    Detect head gestures frame by frame.

    Nod and shake come from the nose tip displacement since the previous
    update() call. Tilt comes from the cheek height difference in the current
    frame alone. The only state kept is the last nose tip position.
    """

    def __init__(self) -> None:
        # Nose tip from the previous update(); None until the first frame.
        self._prev: Optional[np.ndarray] = None

    def update(self, face_landmarks: np.ndarray) -> Optional[HeadGesture]:
        """
        Feed one frame and report the gesture it completes, if any.

        Args:
            face_landmarks: (478, 3) float32 current frame.
        Returns:
            HeadGesture if detected, else None. The first call only records
            the nose position and always returns None.
        """
        nose = face_landmarks[_NOSE_TIP]
        last = self._prev
        if last is None:
            self._prev = nose.copy()
            return None

        step = nose - last
        # Copy so later changes to the caller's array cannot alter the
        # stored position.
        self._prev = nose.copy()
        move_y = float(step[1])
        move_x = float(step[0])

        # Signed cheek height difference, taken from this frame only.
        # NOTE(review): which physical direction TILT_LEFT means depends on
        # the image coordinate convention and mirroring — confirm.
        cheek_diff = float(face_landmarks[_L_CHEEK][1] - face_landmarks[_R_CHEEK][1])

        vertical, horizontal = abs(move_y), abs(move_x)
        # Movement on the dominant axis wins; an exact tie matches neither
        # branch and falls through to the tilt check.
        if vertical > NOD_THRESHOLD and vertical > horizontal:
            return HeadGesture.NOD
        if horizontal > SHAKE_THRESHOLD and horizontal > vertical:
            return HeadGesture.SHAKE
        if abs(cheek_diff) > TILT_THRESHOLD:
            if cheek_diff > 0:
                return HeadGesture.TILT_LEFT
            return HeadGesture.TILT_RIGHT
        return None

View file

@ -0,0 +1,73 @@
import numpy as np
import pytest
from merlin.features.blink import BlinkDetector, BlinkEvent, eye_aspect_ratio
# Local copies of the p1..p6 landmark ids so the tests do not reach into the
# private constants of merlin.features.blink.
LEFT_EYE = [
    33,
    160,
    158,
    133,
    153,
    144,
]
RIGHT_EYE = [
    362,
    385,
    387,
    263,
    373,
    380,
]
def _face(n: int = 478) -> np.ndarray:
    """Return an all-zero float32 landmark array with n rows of (x, y, z)."""
    blank = np.zeros(shape=(n, 3), dtype=np.float32)
    return blank
def _set_eye_open(face: np.ndarray, indices: list[int]) -> None:
    """
    Write an open-eye layout into ``face`` in place.

    The eye is 1.0 wide (p1 to p4) and both vertical pairs are 0.4 apart, so
    EAR = (0.4 + 0.4) / (2 * 1.0) = 0.4. That is well above the 0.20
    threshold used by the tests.

    Args:
        face: landmark array to modify.
        indices: six landmark ids in p1..p6 order.
    """
    p1, p2, p3, p4, p5, p6 = indices
    # Same assignment order as before: corners first, then each vertical pair.
    layout = (
        (p1, [0.0, 0.0, 0.0]),
        (p4, [1.0, 0.0, 0.0]),
        (p2, [0.25, 0.2, 0.0]),
        (p6, [0.25, -0.2, 0.0]),
        (p3, [0.75, 0.2, 0.0]),
        (p5, [0.75, -0.2, 0.0]),
    )
    for landmark_id, xyz in layout:
        face[landmark_id] = xyz
def _set_eye_closed(face: np.ndarray, indices: list[int]) -> None:
    """
    Collapse all six EAR landmarks onto the origin, in place.

    With every point identical the eye width is 0, so eye_aspect_ratio()
    reports 0.0 through its degenerate-width guard.
    """
    origin = [0.0, 0.0, 0.0]
    for landmark_id in indices:
        face[landmark_id] = origin
def test_ear_open_eye():
    """An open-eye layout gives an EAR above the 0.20 blink threshold."""
    landmarks = _face()
    _set_eye_open(landmarks, LEFT_EYE)
    assert eye_aspect_ratio(landmarks, LEFT_EYE) > 0.20
def test_ear_closed_eye():
    """Collapsed eye landmarks give an EAR of (almost) zero."""
    landmarks = _face()
    _set_eye_closed(landmarks, LEFT_EYE)
    assert eye_aspect_ratio(landmarks, LEFT_EYE) < 0.05
def test_no_blink_when_both_open():
    """Two open eyes produce no blink event."""
    landmarks = _face()
    for eye in (LEFT_EYE, RIGHT_EYE):
        _set_eye_open(landmarks, eye)
    result = BlinkDetector(threshold=0.20).detect(landmarks)
    assert result is None
def test_left_blink_detected():
    """Left eye closed with right eye open is reported as a left blink."""
    landmarks = _face()
    _set_eye_closed(landmarks, LEFT_EYE)
    _set_eye_open(landmarks, RIGHT_EYE)
    result = BlinkDetector(threshold=0.20).detect(landmarks)
    assert result == BlinkEvent.LEFT
def test_right_blink_detected():
    """Right eye closed with left eye open is reported as a right blink."""
    landmarks = _face()
    _set_eye_open(landmarks, LEFT_EYE)
    _set_eye_closed(landmarks, RIGHT_EYE)
    result = BlinkDetector(threshold=0.20).detect(landmarks)
    assert result == BlinkEvent.RIGHT
def test_both_blink_detected():
    """Both eyes closed is reported as a both-eye blink."""
    landmarks = _face()
    for eye in (LEFT_EYE, RIGHT_EYE):
        _set_eye_closed(landmarks, eye)
    result = BlinkDetector(threshold=0.20).detect(landmarks)
    assert result == BlinkEvent.BOTH

View file

@ -0,0 +1,57 @@
import numpy as np
from merlin.features.gaze import GazeEstimator, GazeDirection
# Local copies of the landmark ids GazeEstimator reads.
_LEFT_IRIS, _RIGHT_IRIS = 468, 473  # iris centers
_LEFT_INNER, _RIGHT_OUTER = 133, 263  # ends of the reference span
def _face(n: int = 478) -> np.ndarray:
    """Return an all-zero float32 landmark array with n rows of (x, y, z)."""
    blank = np.zeros(shape=(n, 3), dtype=np.float32)
    return blank
def _set_gaze_center(face: np.ndarray) -> None:
    """Place both iris centers on the midpoint of the span → center gaze."""
    face[_LEFT_INNER] = [0.3, 0.5, 0.0]
    face[_RIGHT_OUTER] = [0.7, 0.5, 0.0]
    # Computed from the stored float32 rows, so the irises match the midpoint
    # the estimator will derive from the same rows.
    midpoint = (face[_LEFT_INNER] + face[_RIGHT_OUTER]) / 2.0
    for iris_id in (_LEFT_IRIS, _RIGHT_IRIS):
        face[iris_id] = midpoint.copy()
def _set_gaze_left(face: np.ndarray) -> None:
    """Place both iris centers near the low-x end of the span → left gaze."""
    face[_LEFT_INNER] = [0.3, 0.5, 0.0]
    face[_RIGHT_OUTER] = [0.7, 0.5, 0.0]
    # x = 0.35 sits 0.15 below the span midpoint (0.5) on a span 0.4 long.
    for iris_id in (_LEFT_IRIS, _RIGHT_IRIS):
        face[iris_id] = [0.35, 0.5, 0.0]
def test_center_gaze_label():
    """Irises on the span midpoint are labelled "center"."""
    landmarks = _face()
    _set_gaze_center(landmarks)
    assert GazeEstimator().estimate(landmarks).label == "center"
def test_left_gaze_label():
    """Irises shifted toward low x are labelled "left"."""
    landmarks = _face()
    _set_gaze_left(landmarks)
    assert GazeEstimator().estimate(landmarks).label == "left"
def test_zero_eye_width_returns_center():
    """Degenerate case: all landmarks at same point → center."""
    direction = GazeEstimator().estimate(_face())
    assert direction.dx == 0.0 and direction.dy == 0.0
def test_gazeresult_is_frozen():
    """Assigning to a GazeDirection field must raise."""
    # Local import: this test module does not import pytest at the top.
    import pytest

    direction = GazeDirection(dx=0.1, dy=0.2)
    with pytest.raises((AttributeError, TypeError)):
        direction.dx = 0.5