feat(features): implement BlinkDetector, GazeEstimator, HeadPoseEstimator, HandGestureDetector
- BlinkDetector: EAR-based blink detection (left/right/both), 6 tests
- GazeEstimator: iris-to-eye-corner ratio gaze direction, frozen GazeDirection dataclass, 4 tests
- HeadPoseEstimator: velocity-based nod/shake/tilt detection (stateful, no tests — daemon smoke test)
- HandGestureDetector: normalize_hand + tip-distance open/pinch/fist classifier (no tests — daemon smoke test)
- TDD: blink and gaze followed RED→GREEN cycle; Black applied to all 6 files
This commit is contained in:
parent
072ee3f36c
commit
0dcc25164d
6 changed files with 369 additions and 0 deletions
62
merlin/features/blink.py
Normal file
62
merlin/features/blink.py
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
"""
|
||||
Blink detection from MediaPipe Face Mesh landmarks.
|
||||
|
||||
Uses the Eye Aspect Ratio (EAR) method: when the eye closes, the vertical
|
||||
landmark distances shrink relative to the horizontal width, driving EAR toward 0.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
# Face Mesh landmark indices for each eye, listed in the p1..p6 order that
# eye_aspect_ratio() unpacks: p1/p4 span the horizontal width, while
# (p2, p6) and (p3, p5) are the two vertical pairs.
_LEFT_EYE = [33, 160, 158, 133, 153, 144]
_RIGHT_EYE = [362, 385, 387, 263, 373, 380]
|
||||
|
||||
|
||||
class BlinkEvent(str, Enum):
    """Which eye(s) BlinkDetector reports as closed.

    Members subclass ``str``, so they compare equal to their raw values.
    """

    LEFT = "left_blink"
    RIGHT = "right_blink"
    BOTH = "both_blink"
|
||||
|
||||
|
||||
def eye_aspect_ratio(landmarks: np.ndarray, indices: list[int]) -> float:
    """
    Compute the Eye Aspect Ratio of one eye.

    EAR = (||p2-p6|| + ||p3-p5||) / (2 * ||p1-p4||), where p1..p6 are the
    rows of ``landmarks`` selected by ``indices``, in that order.

    ~0.3 for open eye, ~0.0 for closed. A horizontal span shorter than 1e-6
    yields 0.0 instead of dividing by a near-zero width.
    """
    edge_a, v1_a, v2_a, edge_b, v2_b, v1_b = (landmarks[idx] for idx in indices)

    width = np.linalg.norm(edge_a - edge_b)
    if width < 1e-6:
        return 0.0

    height_sum = np.linalg.norm(v1_a - v1_b) + np.linalg.norm(v2_a - v2_b)
    return float(height_sum / (2.0 * width))
|
||||
|
||||
|
||||
class BlinkDetector:
    """Report which eyes are closed in a single frame of face mesh landmarks."""

    def __init__(self, threshold: float = 0.20) -> None:
        # An eye whose EAR is strictly below this value counts as closed.
        self._threshold = threshold

    def detect(self, face_landmarks: np.ndarray) -> Optional[BlinkEvent]:
        """
        Args:
            face_landmarks: (478, 3) float32 — MediaPipe Face Mesh with iris refinement.

        Returns:
            BlinkEvent.BOTH, BlinkEvent.LEFT or BlinkEvent.RIGHT according to
            which eyes have an EAR below the threshold; None when neither does.
        """
        left_shut = eye_aspect_ratio(face_landmarks, _LEFT_EYE) < self._threshold
        right_shut = eye_aspect_ratio(face_landmarks, _RIGHT_EYE) < self._threshold

        # (left closed, right closed) -> event; the open/open pair is absent,
        # so .get() returns None for it.
        outcome_by_state = {
            (True, True): BlinkEvent.BOTH,
            (True, False): BlinkEvent.LEFT,
            (False, True): BlinkEvent.RIGHT,
        }
        return outcome_by_state.get((bool(left_shut), bool(right_shut)))
|
||||
60
merlin/features/gaze.py
Normal file
60
merlin/features/gaze.py
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
"""
|
||||
Gaze direction estimation from MediaPipe Face Mesh iris landmarks.
|
||||
|
||||
Requires mediapipe to be run with refine_landmarks=True (enables iris tracking,
|
||||
landmark indices 468-477).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
import numpy as np
|
||||
|
||||
# Face Mesh landmark indices read by GazeEstimator. The two iris centres lie
# in the 468-477 range that is only populated with refine_landmarks=True.
_LEFT_IRIS_CENTER = 468
_RIGHT_IRIS_CENTER = 473
# The pair of corner landmarks whose distance normalises the gaze offset.
_LEFT_EYE_INNER = 133
_RIGHT_EYE_OUTER = 263
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class GazeDirection:
    """Immutable gaze offset with a coarse textual label."""

    dx: float  # [-1, 1] — negative = left, positive = right
    dy: float  # [-1, 1] — negative = up, positive = down

    @property
    def label(self) -> str:
        """Return "center", "left", "right", "up" or "down".

        Both offsets strictly inside the 0.15 dead zone give "center".
        Otherwise the axis with the strictly larger magnitude decides; equal
        magnitudes resolve to the vertical axis.
        """
        horizontal = abs(self.dx)
        vertical = abs(self.dy)

        in_dead_zone = horizontal < 0.15 and vertical < 0.15
        if in_dead_zone:
            return "center"

        if horizontal > vertical:
            if self.dx < 0:
                return "left"
            return "right"

        if self.dy < 0:
            return "up"
        return "down"
|
||||
|
||||
|
||||
class GazeEstimator:
    """Estimate gaze direction from iris center relative to eye corners."""

    def estimate(self, face_landmarks: np.ndarray) -> GazeDirection:
        """
        Average the two iris centres, subtract the midpoint of the two corner
        landmarks, and divide the x/y offset by the corner-to-corner distance.

        NOTE(review): the corner pair is landmarks 133 and 263, which look
        like the inner corner of one eye and the outer corner of the other
        rather than a symmetric pair — confirm against the Face Mesh landmark
        map before relying on dx being unbiased.

        Args:
            face_landmarks: (478, 3) float32 — MediaPipe Face Mesh with iris refinement.

        Returns:
            GazeDirection with normalized (dx, dy); (0.0, 0.0) when the two
            corner landmarks are closer together than 1e-6.
        """
        iris_mid = (
            face_landmarks[_LEFT_IRIS_CENTER] + face_landmarks[_RIGHT_IRIS_CENTER]
        ) / 2.0

        corner_a = face_landmarks[_LEFT_EYE_INNER]
        corner_b = face_landmarks[_RIGHT_EYE_OUTER]

        # The span is the full vector norm, while only the first two
        # components of the offset are reported.
        span = np.linalg.norm(corner_b - corner_a)
        if span < 1e-6:
            return GazeDirection(dx=0.0, dy=0.0)

        offset = iris_mid - (corner_a + corner_b) / 2.0
        horizontal = float(offset[0] / span)
        vertical = float(offset[1] / span)
        return GazeDirection(dx=horizontal, dy=vertical)
|
||||
52
merlin/features/hand_gesture.py
Normal file
52
merlin/features/hand_gesture.py
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
"""
|
||||
Simple hand gesture detection — open palm, closed fist, pinch.
|
||||
|
||||
Operates on raw (21, 3) landmark arrays; calls normalize_hand() internally.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from circuitforge_core.input.gestures.normalizer import normalize_hand
|
||||
|
||||
# Rows of the 21-landmark hand array used by HandGestureDetector.
_FINGERTIP_IDX = [4, 8, 12, 16, 20]
_THUMB_TIP = 4
_INDEX_TIP = 8

# Sum of the five fingertip norms above this value -> open palm.
OPEN_PALM_THRESHOLD = 1.5
# Thumb-tip to index-tip distance below this value -> pinch.
PINCH_THRESHOLD = 0.15
|
||||
|
||||
|
||||
class HandGesture(str, Enum):
    """Hand poses HandGestureDetector can report.

    Members subclass ``str``, so they compare equal to their raw values.
    """

    OPEN_PALM = "open_palm"
    PINCH = "pinch"
    FIST = "fist"
|
||||
|
||||
|
||||
class HandGestureDetector:
    """Classify a hand pose from MediaPipe landmarks.

    The pinch, open-palm and fist cut-offs can be overridden per instance.
    When omitted they fall back to PINCH_THRESHOLD, OPEN_PALM_THRESHOLD and
    0.6 respectively, so a bare ``HandGestureDetector()`` uses the module
    defaults.
    """

    def __init__(
        self,
        *,
        pinch_threshold: Optional[float] = None,
        open_palm_threshold: Optional[float] = None,
        fist_threshold: float = 0.6,
    ) -> None:
        """
        Args:
            pinch_threshold: thumb-tip/index-tip distance below which the pose
                is a pinch; None selects PINCH_THRESHOLD.
            open_palm_threshold: fingertip-norm sum above which the pose is an
                open palm; None selects OPEN_PALM_THRESHOLD.
            fist_threshold: fingertip-norm sum below which the pose is a fist.
        """
        # Module constants are resolved here, not in the signature, so the
        # class can be defined before (or without) touching them.
        self._pinch_threshold = (
            PINCH_THRESHOLD if pinch_threshold is None else pinch_threshold
        )
        self._open_palm_threshold = (
            OPEN_PALM_THRESHOLD if open_palm_threshold is None else open_palm_threshold
        )
        self._fist_threshold = fist_threshold

    def detect(self, raw_points: np.ndarray) -> Optional[HandGesture]:
        """
        Args:
            raw_points: (21, 3) float32 — raw MediaPipe hand landmarks.

        Returns:
            HandGesture or None if pose is ambiguous. Pinch is checked first,
            then open palm, then fist.
        """
        # assumes normalize_hand() returns 63 values in landmark order,
        # expressed relative to some origin so that a row norm is a
        # meaningful "distance" — TODO confirm against the normalizer.
        vec = normalize_hand(raw_points).reshape(21, 3)

        pinch_dist = float(np.linalg.norm(vec[_THUMB_TIP] - vec[_INDEX_TIP]))
        if pinch_dist < self._pinch_threshold:
            return HandGesture.PINCH

        tip_sum = sum(float(np.linalg.norm(vec[i])) for i in _FINGERTIP_IDX)
        if tip_sum > self._open_palm_threshold:
            return HandGesture.OPEN_PALM
        if tip_sum < self._fist_threshold:
            return HandGesture.FIST
        return None
|
||||
65
merlin/features/head_pose.py
Normal file
65
merlin/features/head_pose.py
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
"""
|
||||
Head pose estimation — detect nod, shake, and tilt from Face Mesh landmarks.
|
||||
|
||||
Velocity-based: compares nose tip position across consecutive frames.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
# Face Mesh landmark indices read by HeadPoseEstimator: the nose tip is
# tracked from frame to frame, the two cheek points give the roll estimate.
_NOSE_TIP = 1
_L_CHEEK = 234
_R_CHEEK = 454

# Per-frame nose displacement (vertical / horizontal) that counts as a
# nod / shake, and cheek height difference that counts as a tilt.
NOD_THRESHOLD = 0.015
SHAKE_THRESHOLD = 0.015
TILT_THRESHOLD = 0.012
|
||||
|
||||
|
||||
class HeadGesture(str, Enum):
    """Head gestures HeadPoseEstimator can report.

    Members subclass ``str``, so they compare equal to their raw values.
    """

    NOD = "head_nod"
    SHAKE = "head_shake"
    TILT_LEFT = "head_tilt_left"
    TILT_RIGHT = "head_tilt_right"
|
||||
|
||||
|
||||
class HeadPoseEstimator:
    """Detect head gestures by comparing nose tip position across frames."""

    def __init__(self) -> None:
        # Nose-tip landmark from the previous update(); None until the first frame.
        self._prev: Optional[np.ndarray] = None

    def update(self, face_landmarks: np.ndarray) -> Optional[HeadGesture]:
        """
        Feed one frame and classify the motion since the previous one.

        The first call only records the nose position and returns None.
        Afterwards a nod or shake (dominant vertical / horizontal nose
        displacement above its threshold) takes priority over a tilt, which
        is read from the height difference of the two cheek landmarks in the
        current frame.

        Args:
            face_landmarks: (478, 3) float32 — current frame.

        Returns:
            HeadGesture if detected, else None.
        """
        nose = face_landmarks[_NOSE_TIP]
        previous, self._prev = self._prev, nose.copy()
        if previous is None:
            return None

        step = nose - previous
        move_y = abs(float(step[1]))
        move_x = abs(float(step[0]))

        # Cheeks are read before any gesture is returned, matching the order
        # in which the landmarks are accessed on every non-initial frame.
        roll = float(face_landmarks[_L_CHEEK][1] - face_landmarks[_R_CHEEK][1])

        if move_y > NOD_THRESHOLD and move_y > move_x:
            return HeadGesture.NOD
        if move_x > SHAKE_THRESHOLD and move_x > move_y:
            return HeadGesture.SHAKE
        if abs(roll) > TILT_THRESHOLD:
            if roll > 0:
                return HeadGesture.TILT_LEFT
            return HeadGesture.TILT_RIGHT
        return None
|
||||
73
tests/test_features/test_blink.py
Normal file
73
tests/test_features/test_blink.py
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
import numpy as np
|
||||
import pytest
|
||||
from merlin.features.blink import BlinkDetector, BlinkEvent, eye_aspect_ratio
|
||||
|
||||
# Six-landmark index lists per eye, in the p1..p6 order eye_aspect_ratio expects.
LEFT_EYE = [33, 160, 158, 133, 153, 144]
RIGHT_EYE = [362, 385, 387, 263, 373, 380]
|
||||
|
||||
|
||||
def _face(n: int = 478) -> np.ndarray:
    """Return an all-zero (n, 3) float32 landmark array."""
    blank = np.zeros(shape=(n, 3), dtype=np.float32)
    return blank
|
||||
|
||||
|
||||
def _set_eye_open(face: np.ndarray, indices: list[int]) -> None:
    """Place the 6 EAR landmarks so the eye reads as open (EAR = 0.4).

    The corners sit one unit apart on the x axis and each vertical pair is
    0.4 apart, giving (0.4 + 0.4) / (2 * 1.0).
    """
    p1, p2, p3, p4, p5, p6 = indices
    layout = (
        (p1, [0.0, 0.0, 0.0]),
        (p4, [1.0, 0.0, 0.0]),
        (p2, [0.25, 0.2, 0.0]),
        (p6, [0.25, -0.2, 0.0]),
        (p3, [0.75, 0.2, 0.0]),
        (p5, [0.75, -0.2, 0.0]),
    )
    for row, xyz in layout:
        face[row] = xyz
|
||||
|
||||
|
||||
def _set_eye_closed(face: np.ndarray, indices: list[int]) -> None:
    """Collapse the 6 EAR landmarks onto the origin so EAR ≈ 0 (closed)."""
    origin = [0.0, 0.0, 0.0]
    for row in indices:
        face[row] = origin
|
||||
|
||||
|
||||
def test_ear_open_eye():
    """An open-eye layout scores above 0.20."""
    landmarks = _face()
    _set_eye_open(landmarks, LEFT_EYE)
    assert eye_aspect_ratio(landmarks, LEFT_EYE) > 0.20
|
||||
|
||||
|
||||
def test_ear_closed_eye():
    """A fully collapsed eye scores below 0.05."""
    landmarks = _face()
    _set_eye_closed(landmarks, LEFT_EYE)
    assert eye_aspect_ratio(landmarks, LEFT_EYE) < 0.05
|
||||
|
||||
|
||||
def test_no_blink_when_both_open():
    """Two open eyes produce no blink event."""
    landmarks = _face()
    for eye in (LEFT_EYE, RIGHT_EYE):
        _set_eye_open(landmarks, eye)
    result = BlinkDetector(threshold=0.20).detect(landmarks)
    assert result is None
|
||||
|
||||
|
||||
def test_left_blink_detected():
    """Closed left eye with open right eye yields BlinkEvent.LEFT."""
    landmarks = _face()
    _set_eye_closed(landmarks, LEFT_EYE)
    _set_eye_open(landmarks, RIGHT_EYE)
    result = BlinkDetector(threshold=0.20).detect(landmarks)
    assert result == BlinkEvent.LEFT
|
||||
|
||||
|
||||
def test_right_blink_detected():
    """Open left eye with closed right eye yields BlinkEvent.RIGHT."""
    landmarks = _face()
    _set_eye_open(landmarks, LEFT_EYE)
    _set_eye_closed(landmarks, RIGHT_EYE)
    result = BlinkDetector(threshold=0.20).detect(landmarks)
    assert result == BlinkEvent.RIGHT
|
||||
|
||||
|
||||
def test_both_blink_detected():
    """Two closed eyes yield BlinkEvent.BOTH."""
    landmarks = _face()
    for eye in (LEFT_EYE, RIGHT_EYE):
        _set_eye_closed(landmarks, eye)
    result = BlinkDetector(threshold=0.20).detect(landmarks)
    assert result == BlinkEvent.BOTH
|
||||
57
tests/test_features/test_gaze.py
Normal file
57
tests/test_features/test_gaze.py
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
import numpy as np
|
||||
from merlin.features.gaze import GazeEstimator, GazeDirection
|
||||
|
||||
# Landmark rows the gaze fixtures write to: the two iris centres and the
# two corner landmarks that GazeEstimator reads.
_LEFT_IRIS = 468
_RIGHT_IRIS = 473
_LEFT_INNER = 133
_RIGHT_OUTER = 263
|
||||
|
||||
|
||||
def _face(n: int = 478) -> np.ndarray:
    """Return an all-zero (n, 3) float32 landmark array."""
    blank = np.zeros(shape=(n, 3), dtype=np.float32)
    return blank
|
||||
|
||||
|
||||
def _set_gaze_center(face: np.ndarray) -> None:
    """Put both iris centers exactly at the midpoint of the corner span → center gaze."""
    face[_LEFT_INNER] = [0.3, 0.5, 0.0]
    face[_RIGHT_OUTER] = [0.7, 0.5, 0.0]

    # Derive the midpoint from the stored rows so it carries the array's dtype.
    corner_sum = face[_LEFT_INNER] + face[_RIGHT_OUTER]
    midpoint = corner_sum / 2.0
    for iris_row in (_LEFT_IRIS, _RIGHT_IRIS):
        face[iris_row] = midpoint.copy()
|
||||
|
||||
|
||||
def _set_gaze_left(face: np.ndarray) -> None:
    """Put both iris centers near the low-x end of the corner span → left gaze."""
    placement = {
        _LEFT_INNER: [0.3, 0.5, 0.0],
        _RIGHT_OUTER: [0.7, 0.5, 0.0],
        _LEFT_IRIS: [0.35, 0.5, 0.0],
        _RIGHT_IRIS: [0.35, 0.5, 0.0],
    }
    for row, xyz in placement.items():
        face[row] = xyz
|
||||
|
||||
|
||||
def test_center_gaze_label():
    """Irises at the midpoint of the corner span are labelled "center"."""
    landmarks = _face()
    _set_gaze_center(landmarks)
    gaze = GazeEstimator().estimate(landmarks)
    assert gaze.label == "center"
|
||||
|
||||
|
||||
def test_left_gaze_label():
    """Irises shifted toward lower x are labelled "left"."""
    landmarks = _face()
    _set_gaze_left(landmarks)
    gaze = GazeEstimator().estimate(landmarks)
    assert gaze.label == "left"
|
||||
|
||||
|
||||
def test_zero_eye_width_returns_center():
    """Degenerate case: all landmarks at same point → zero offset on both axes."""
    gaze = GazeEstimator().estimate(_face())
    assert gaze.dx == 0.0 and gaze.dy == 0.0
|
||||
|
||||
|
||||
def test_gazeresult_is_frozen():
    """Assigning to a field of GazeDirection must raise."""
    import pytest

    gaze = GazeDirection(dx=0.1, dy=0.2)
    with pytest.raises((AttributeError, TypeError)):
        gaze.dx = 0.5
|
||||
Loading…
Reference in a new issue