feat(features): implement BlinkDetector, GazeEstimator, HeadPoseEstimator, HandGestureDetector
- BlinkDetector: EAR-based blink detection (left/right/both), 6 tests - GazeEstimator: iris-to-eye-corner ratio gaze direction, frozen GazeDirection dataclass, 4 tests - HeadPoseEstimator: velocity-based nod/shake/tilt detection (stateful, no tests — daemon smoke test) - HandGestureDetector: normalize_hand + tip-distance open/pinch/fist classifier (no tests — daemon smoke test) - TDD: blink and gaze followed RED→GREEN cycle; Black applied to all 6 files
This commit is contained in:
parent
072ee3f36c
commit
0dcc25164d
6 changed files with 369 additions and 0 deletions
62
merlin/features/blink.py
Normal file
62
merlin/features/blink.py
Normal file
|
|
@ -0,0 +1,62 @@
|
||||||
|
"""
|
||||||
|
Blink detection from MediaPipe Face Mesh landmarks.
|
||||||
|
|
||||||
|
Uses the Eye Aspect Ratio (EAR) method: when the eye closes, the vertical
|
||||||
|
landmark distances shrink relative to the horizontal width, driving EAR toward 0.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
# Face Mesh landmark indices for the six EAR points of each eye, listed in the
# p1..p6 order that eye_aspect_ratio() unpacks them in.
_LEFT_EYE = [33, 160, 158, 133, 153, 144]
|
||||||
|
_RIGHT_EYE = [362, 385, 387, 263, 373, 380]
|
||||||
|
|
||||||
|
|
||||||
|
class BlinkEvent(str, Enum):
    """Which eye(s) BlinkDetector.detect() found closed.

    Members are also plain strings, so they compare equal to their values
    and can be emitted directly as event names.
    """

    # Only the first eye set is below the threshold.
    LEFT = "left_blink"
    # Only the second eye set is below the threshold.
    RIGHT = "right_blink"
    # Both eye sets are below the threshold in the same frame.
    BOTH = "both_blink"
|
||||||
|
|
||||||
|
|
||||||
|
def eye_aspect_ratio(landmarks: np.ndarray, indices: list[int]) -> float:
    """
    Compute the Eye Aspect Ratio for one eye.

    EAR = (||p2-p6|| + ||p3-p5||) / (2 * ||p1-p4||)

    Args:
        landmarks: landmark array indexed by landmark id.
        indices: exactly six landmark ids, in p1..p6 order.

    Returns:
        The ratio as a Python float (~0.3 for an open eye, ~0.0 for a closed
        one), or 0.0 when the p1-p4 width is degenerate.
    """
    corner_a, lid_a1, lid_b1, corner_b, lid_b2, lid_a2 = (
        landmarks[idx] for idx in indices
    )
    gap_a = np.linalg.norm(lid_a1 - lid_a2)
    gap_b = np.linalg.norm(lid_b1 - lid_b2)
    width = np.linalg.norm(corner_a - corner_b)
    # A collapsed eye span would divide by ~0; report it as fully closed.
    if width < 1e-6:
        return 0.0
    ratio = (gap_a + gap_b) / (2.0 * width)
    return float(ratio)
|
||||||
|
|
||||||
|
|
||||||
|
class BlinkDetector:
    """Detect left, right, or both-eye blinks from face mesh landmarks.

    An eye counts as closed when its Eye Aspect Ratio is strictly below the
    configured threshold. Each call is judged on the given frame alone; no
    state is carried between calls.
    """

    def __init__(self, threshold: float = 0.20) -> None:
        # EAR value below which an eye is treated as closed.
        self._threshold = threshold

    def detect(self, face_landmarks: np.ndarray) -> Optional[BlinkEvent]:
        """
        Classify the eye state of a single frame.

        Args:
            face_landmarks: (478, 3) float32 — MediaPipe Face Mesh with iris refinement.

        Returns:
            BlinkEvent.BOTH, BlinkEvent.LEFT or BlinkEvent.RIGHT depending on
            which eye(s) fall below the threshold, else None.
        """
        limit = self._threshold
        left_shut = eye_aspect_ratio(face_landmarks, _LEFT_EYE) < limit
        right_shut = eye_aspect_ratio(face_landmarks, _RIGHT_EYE) < limit

        # (left closed, right closed) -> event; both-open is absent and maps to None.
        outcomes = {
            (True, True): BlinkEvent.BOTH,
            (True, False): BlinkEvent.LEFT,
            (False, True): BlinkEvent.RIGHT,
        }
        return outcomes.get((bool(left_shut), bool(right_shut)))
|
||||||
60
merlin/features/gaze.py
Normal file
60
merlin/features/gaze.py
Normal file
|
|
@ -0,0 +1,60 @@
|
||||||
|
"""
|
||||||
|
Gaze direction estimation from MediaPipe Face Mesh iris landmarks.
|
||||||
|
|
||||||
|
Requires mediapipe to be run with refine_landmarks=True (enables iris tracking,
|
||||||
|
landmark indices 468-477).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
# Face Mesh landmark indices read by GazeEstimator.estimate().
# The two iris entries fall in the 468-477 range that is only present when
# MediaPipe runs with refine_landmarks=True.
_LEFT_IRIS_CENTER = 468
|
||||||
|
_RIGHT_IRIS_CENTER = 473
|
||||||
|
# End points of the horizontal span the iris midpoint is measured against.
_LEFT_EYE_INNER = 133
|
||||||
|
_RIGHT_EYE_OUTER = 263
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class GazeDirection:
    """Immutable gaze offset with a coarse textual label."""

    dx: float  # [-1, 1] — negative = left, positive = right
    dy: float  # [-1, 1] — negative = up, positive = down

    @property
    def label(self) -> str:
        """Return "center", "left", "right", "up" or "down".

        Offsets smaller than 0.15 on both axes count as "center". Otherwise
        the axis with the strictly larger magnitude decides; on a tie the
        vertical axis is used.
        """
        horizontal = abs(self.dx)
        vertical = abs(self.dy)
        if horizontal < 0.15 and vertical < 0.15:
            return "center"
        if horizontal > vertical:
            if self.dx < 0:
                return "left"
            return "right"
        if self.dy < 0:
            return "up"
        return "down"
|
||||||
|
|
||||||
|
|
||||||
|
class GazeEstimator:
    """Estimate gaze direction from iris center relative to eye corners.

    The midpoint of the two iris centers is compared with the midpoint of a
    horizontal reference span, and the offset is divided by the span length.

    NOTE(review): the span runs from landmark 133 to landmark 263. If those
    are the inner corner of one eye and the outer corner of the other (as the
    constant names suggest), the span midpoint is not symmetric about the
    iris midpoint — confirm that a neutral gaze really yields dx close to 0.
    """

    def estimate(self, face_landmarks: np.ndarray) -> GazeDirection:
        """
        Compute the gaze offset for one frame.

        Args:
            face_landmarks: (478, 3) float32 — MediaPipe Face Mesh with iris refinement.

        Returns:
            GazeDirection with (dx, dy) expressed in units of the reference
            span length; (0.0, 0.0) when that span is degenerate.
        """
        iris_mid = (
            face_landmarks[_LEFT_IRIS_CENTER] + face_landmarks[_RIGHT_IRIS_CENTER]
        ) / 2.0

        span_start = face_landmarks[_LEFT_EYE_INNER]
        span_end = face_landmarks[_RIGHT_EYE_OUTER]
        span = np.linalg.norm(span_end - span_start)
        # Without a usable span there is nothing to normalize by; report neutral gaze.
        if span < 1e-6:
            return GazeDirection(dx=0.0, dy=0.0)

        offset = iris_mid - (span_start + span_end) / 2.0
        horizontal = float(offset[0] / span)
        vertical = float(offset[1] / span)
        return GazeDirection(dx=horizontal, dy=vertical)
|
||||||
52
merlin/features/hand_gesture.py
Normal file
52
merlin/features/hand_gesture.py
Normal file
|
|
@ -0,0 +1,52 @@
|
||||||
|
"""
|
||||||
|
Simple hand gesture detection — open palm, closed fist, pinch.
|
||||||
|
|
||||||
|
Operates on raw (21, 3) landmark arrays; calls normalize_hand() internally.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from circuitforge_core.input.gestures.normalizer import normalize_hand
|
||||||
|
|
||||||
|
# Indices into the (21, 3) hand landmark array used by HandGestureDetector.
# _FINGERTIP_IDX lists the five points whose magnitudes are summed.
_FINGERTIP_IDX = [4, 8, 12, 16, 20]
|
||||||
|
# The pinch check measures the distance between these two points.
_THUMB_TIP = 4
|
||||||
|
_INDEX_TIP = 8
|
||||||
|
|
||||||
|
# Summed fingertip magnitude above which the pose is classified as an open palm.
OPEN_PALM_THRESHOLD = 1.5
|
||||||
|
# Thumb-to-index distance below which the pose is classified as a pinch.
PINCH_THRESHOLD = 0.15
|
||||||
|
|
||||||
|
|
||||||
|
class HandGesture(str, Enum):
    """Hand poses that HandGestureDetector.detect() can report.

    Members are also plain strings and compare equal to their values.
    """

    # Summed fingertip magnitude is above the open-palm threshold.
    OPEN_PALM = "open_palm"
    # Thumb tip and index tip are closer than the pinch threshold.
    PINCH = "pinch"
    # Summed fingertip magnitude is below the fist cut-off.
    FIST = "fist"
|
||||||
|
|
||||||
|
|
||||||
|
class HandGestureDetector:
    """Classify a hand pose from MediaPipe landmarks.

    The three cut-offs can be overridden per instance. A threshold left as
    None falls back to the module-level OPEN_PALM_THRESHOLD / PINCH_THRESHOLD
    constants, or to DEFAULT_FIST_THRESHOLD for the fist check, so a detector
    built with no arguments uses the module defaults.
    """

    # Default cut-off on the summed fingertip magnitudes below which the
    # pose is classified as a fist.
    DEFAULT_FIST_THRESHOLD = 0.6

    def __init__(
        self,
        *,
        open_palm_threshold: Optional[float] = None,
        pinch_threshold: Optional[float] = None,
        fist_threshold: Optional[float] = None,
    ) -> None:
        """
        Args:
            open_palm_threshold: summed fingertip magnitude above which the
                pose is an open palm; None uses OPEN_PALM_THRESHOLD.
            pinch_threshold: thumb-to-index distance below which the pose is
                a pinch; None uses PINCH_THRESHOLD.
            fist_threshold: summed fingertip magnitude below which the pose
                is a fist; None uses DEFAULT_FIST_THRESHOLD.
        """
        self._open_palm_threshold = open_palm_threshold
        self._pinch_threshold = pinch_threshold
        self._fist_threshold = fist_threshold

    def detect(self, raw_points: np.ndarray) -> Optional[HandGesture]:
        """
        Classify one frame of hand landmarks.

        The checks run in a fixed order — pinch, then open palm, then fist —
        and the first match wins.

        Args:
            raw_points: (21, 3) float32 — raw MediaPipe hand landmarks.

        Returns:
            HandGesture or None if pose is ambiguous.
        """
        # Defaults are resolved per call so the module-level constants are
        # still read at detection time when no override was given.
        pinch_limit = (
            PINCH_THRESHOLD if self._pinch_threshold is None else self._pinch_threshold
        )
        open_limit = (
            OPEN_PALM_THRESHOLD
            if self._open_palm_threshold is None
            else self._open_palm_threshold
        )
        fist_limit = (
            self.DEFAULT_FIST_THRESHOLD
            if self._fist_threshold is None
            else self._fist_threshold
        )

        # presumably normalize_hand() re-expresses the points relative to a
        # hand-local origin and scale, which is what makes the per-point norm
        # below a meaningful "extension" measure — TODO confirm.
        vec = normalize_hand(raw_points).reshape(21, 3)
        tip_sum = sum(float(np.linalg.norm(vec[i])) for i in _FINGERTIP_IDX)

        pinch_dist = float(np.linalg.norm(vec[_THUMB_TIP] - vec[_INDEX_TIP]))
        if pinch_dist < pinch_limit:
            return HandGesture.PINCH
        if tip_sum > open_limit:
            return HandGesture.OPEN_PALM
        if tip_sum < fist_limit:
            return HandGesture.FIST
        # Between the fist and open-palm cut-offs the pose is left unclassified.
        return None
|
||||||
65
merlin/features/head_pose.py
Normal file
65
merlin/features/head_pose.py
Normal file
|
|
@ -0,0 +1,65 @@
|
||||||
|
"""
|
||||||
|
Head pose estimation — detect nod, shake, and tilt from Face Mesh landmarks.
|
||||||
|
|
||||||
|
Nod and shake are velocity-based: the nose tip position is compared across consecutive frames. Tilt is read from the cheek height difference in the current frame.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
# Face Mesh landmark indices read by HeadPoseEstimator.update().
# The nose tip is tracked frame to frame; the two cheek points give the roll.
_NOSE_TIP = 1
|
||||||
|
_L_CHEEK = 234
|
||||||
|
_R_CHEEK = 454
|
||||||
|
|
||||||
|
# Minimum per-frame nose displacement (in landmark coordinate units) along y
# for a nod and along x for a shake.
NOD_THRESHOLD = 0.015
|
||||||
|
SHAKE_THRESHOLD = 0.015
|
||||||
|
# Minimum absolute cheek height difference for a tilt.
TILT_THRESHOLD = 0.012
|
||||||
|
|
||||||
|
|
||||||
|
class HeadGesture(str, Enum):
    """Head gestures that HeadPoseEstimator.update() can report.

    Members are also plain strings and compare equal to their values.
    """

    # Nose tip moved mostly along y by more than the nod threshold.
    NOD = "head_nod"
    # Nose tip moved mostly along x by more than the shake threshold.
    SHAKE = "head_shake"
    # Cheek height difference is positive and above the tilt threshold.
    TILT_LEFT = "head_tilt_left"
    # Cheek height difference is negative and beyond the tilt threshold.
    TILT_RIGHT = "head_tilt_right"
|
||||||
|
|
||||||
|
|
||||||
|
class HeadPoseEstimator:
    """Detect head gestures by comparing nose tip position across frames.

    The estimator is stateful: it remembers the nose tip of the previous
    frame, so frames must be fed in order through update().
    """

    def __init__(self) -> None:
        # Nose tip of the last frame seen; None until the first update().
        self._prev: Optional[np.ndarray] = None

    def update(self, face_landmarks: np.ndarray) -> Optional[HeadGesture]:
        """
        Feed one frame and report the gesture it completes, if any.

        Nod and shake are decided from the nose tip displacement since the
        previous frame; tilt is decided from the cheek height difference in
        the current frame. Checks run in the order nod, shake, tilt.

        Args:
            face_landmarks: (478, 3) float32 — current frame.

        Returns:
            HeadGesture if detected, else None. The first frame only primes
            the internal state and always yields None.
        """
        nose_now = face_landmarks[_NOSE_TIP]
        # Store a copy so later mutation of the caller's array cannot alter history.
        nose_before, self._prev = self._prev, nose_now.copy()
        if nose_before is None:
            return None

        shift = nose_now - nose_before
        speed_y = abs(float(shift[1]))
        speed_x = abs(float(shift[0]))

        roll = float(face_landmarks[_L_CHEEK][1] - face_landmarks[_R_CHEEK][1])

        # The dominant axis must win strictly; equal x/y motion is neither nod nor shake.
        if speed_y > NOD_THRESHOLD and speed_y > speed_x:
            return HeadGesture.NOD
        if speed_x > SHAKE_THRESHOLD and speed_x > speed_y:
            return HeadGesture.SHAKE
        if abs(roll) > TILT_THRESHOLD:
            if roll > 0:
                return HeadGesture.TILT_LEFT
            return HeadGesture.TILT_RIGHT
        return None
|
||||||
73
tests/test_features/test_blink.py
Normal file
73
tests/test_features/test_blink.py
Normal file
|
|
@ -0,0 +1,73 @@
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
from merlin.features.blink import BlinkDetector, BlinkEvent, eye_aspect_ratio
|
||||||
|
|
||||||
|
# Six-point EAR landmark index lists (p1..p6) used to build synthetic eyes;
# the values duplicate the private lists in merlin/features/blink.py.
LEFT_EYE = [33, 160, 158, 133, 153, 144]
|
||||||
|
RIGHT_EYE = [362, 385, 387, 263, 373, 380]
|
||||||
|
|
||||||
|
|
||||||
|
def _face(n: int = 478) -> np.ndarray:
    """Return an all-zero (n, 3) float32 landmark array to fill in per test."""
    shape = (n, 3)
    return np.zeros(shape, dtype=np.float32)
|
||||||
|
|
||||||
|
|
||||||
|
def _set_eye_open(face: np.ndarray, indices: list[int]) -> None:
    """Write a wide-open synthetic eye into ``face`` in place.

    The eye is 1.0 wide with both lid pairs 0.4 apart, so
    EAR = (0.4 + 0.4) / (2 * 1.0) = 0.40.
    """
    p1, p2, p3, p4, p5, p6 = indices
    placements = (
        (p1, [0.0, 0.0, 0.0]),  # one end of the horizontal span
        (p4, [1.0, 0.0, 0.0]),  # other end of the horizontal span
        (p2, [0.25, 0.2, 0.0]),  # first lid pair: p2 / p6
        (p6, [0.25, -0.2, 0.0]),
        (p3, [0.75, 0.2, 0.0]),  # second lid pair: p3 / p5
        (p5, [0.75, -0.2, 0.0]),
    )
    for landmark_idx, xyz in placements:
        face[landmark_idx] = xyz
|
||||||
|
|
||||||
|
|
||||||
|
def _set_eye_closed(face: np.ndarray, indices: list[int]) -> None:
    """Write a nearly shut synthetic eye into ``face`` in place.

    The eye keeps a real width of 1.0 while both lid pairs are only 0.02
    apart, so EAR = (0.02 + 0.02) / (2 * 1.0) = 0.02. Keeping the width
    non-zero means the EAR formula itself produces the small value, instead
    of the zero-width fallback that a fully collapsed eye would trigger.
    """
    p1, p2, p3, p4, p5, p6 = indices
    placements = (
        (p1, [0.0, 0.0, 0.0]),  # one end of the horizontal span
        (p4, [1.0, 0.0, 0.0]),  # other end of the horizontal span
        (p2, [0.25, 0.01, 0.0]),  # first lid pair: p2 / p6
        (p6, [0.25, -0.01, 0.0]),
        (p3, [0.75, 0.01, 0.0]),  # second lid pair: p3 / p5
        (p5, [0.75, -0.01, 0.0]),
    )
    for landmark_idx, xyz in placements:
        face[landmark_idx] = xyz
|
||||||
|
|
||||||
|
|
||||||
|
def test_ear_open_eye():
    """An eye built by _set_eye_open scores above the 0.20 blink threshold."""
    landmarks = _face()
    _set_eye_open(landmarks, LEFT_EYE)
    assert eye_aspect_ratio(landmarks, LEFT_EYE) > 0.20
|
||||||
|
|
||||||
|
|
||||||
|
def test_ear_closed_eye():
    """An eye built by _set_eye_closed scores an EAR below 0.05."""
    landmarks = _face()
    _set_eye_closed(landmarks, LEFT_EYE)
    assert eye_aspect_ratio(landmarks, LEFT_EYE) < 0.05
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_blink_when_both_open():
    """With both eyes open the detector reports no event."""
    landmarks = _face()
    for eye in (LEFT_EYE, RIGHT_EYE):
        _set_eye_open(landmarks, eye)
    outcome = BlinkDetector(threshold=0.20).detect(landmarks)
    assert outcome is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_left_blink_detected():
    """Left eye closed and right eye open yields BlinkEvent.LEFT."""
    landmarks = _face()
    _set_eye_closed(landmarks, LEFT_EYE)
    _set_eye_open(landmarks, RIGHT_EYE)
    outcome = BlinkDetector(threshold=0.20).detect(landmarks)
    assert outcome == BlinkEvent.LEFT
|
||||||
|
|
||||||
|
|
||||||
|
def test_right_blink_detected():
    """Left eye open and right eye closed yields BlinkEvent.RIGHT."""
    landmarks = _face()
    _set_eye_open(landmarks, LEFT_EYE)
    _set_eye_closed(landmarks, RIGHT_EYE)
    outcome = BlinkDetector(threshold=0.20).detect(landmarks)
    assert outcome == BlinkEvent.RIGHT
|
||||||
|
|
||||||
|
|
||||||
|
def test_both_blink_detected():
    """Both eyes closed yields BlinkEvent.BOTH."""
    landmarks = _face()
    for eye in (LEFT_EYE, RIGHT_EYE):
        _set_eye_closed(landmarks, eye)
    outcome = BlinkDetector(threshold=0.20).detect(landmarks)
    assert outcome == BlinkEvent.BOTH
|
||||||
57
tests/test_features/test_gaze.py
Normal file
57
tests/test_features/test_gaze.py
Normal file
|
|
@ -0,0 +1,57 @@
|
||||||
|
import numpy as np
|
||||||
|
from merlin.features.gaze import GazeEstimator, GazeDirection
|
||||||
|
|
||||||
|
# Landmark indices these tests write to; the values duplicate the private
# constants in merlin/features/gaze.py.
_LEFT_IRIS = 468
|
||||||
|
_RIGHT_IRIS = 473
|
||||||
|
_LEFT_INNER = 133
|
||||||
|
_RIGHT_OUTER = 263
|
||||||
|
|
||||||
|
|
||||||
|
def _face(n: int = 478) -> np.ndarray:
    """Return an all-zero (n, 3) float32 landmark array to fill in per test."""
    shape = (n, 3)
    return np.zeros(shape, dtype=np.float32)
|
||||||
|
|
||||||
|
|
||||||
|
def _set_gaze_center(face: np.ndarray) -> None:
    """Place both iris centers exactly at the midpoint of the eye span.

    The span runs from x=0.3 to x=0.7 at y=0.5, so the offset is zero on
    both axes and the gaze label is "center".
    """
    face[_LEFT_INNER] = [0.3, 0.5, 0.0]
    face[_RIGHT_OUTER] = [0.7, 0.5, 0.0]
    midpoint = (face[_LEFT_INNER] + face[_RIGHT_OUTER]) / 2.0
    for iris_idx in (_LEFT_IRIS, _RIGHT_IRIS):
        face[iris_idx] = midpoint
|
||||||
|
|
||||||
|
|
||||||
|
def _set_gaze_left(face: np.ndarray) -> None:
    """Place both iris centers at x=0.35, left of the span midpoint.

    The span runs from x=0.3 to x=0.7 at y=0.5 (midpoint x=0.5), so the
    horizontal offset is negative and there is no vertical offset.
    """
    placements = (
        (_LEFT_INNER, [0.3, 0.5, 0.0]),
        (_RIGHT_OUTER, [0.7, 0.5, 0.0]),
        (_LEFT_IRIS, [0.35, 0.5, 0.0]),
        (_RIGHT_IRIS, [0.35, 0.5, 0.0]),
    )
    for landmark_idx, xyz in placements:
        face[landmark_idx] = xyz
|
||||||
|
|
||||||
|
|
||||||
|
def test_center_gaze_label():
    """Iris centers at the span midpoint are labelled "center"."""
    landmarks = _face()
    _set_gaze_center(landmarks)
    gaze = GazeEstimator().estimate(landmarks)
    assert gaze.label == "center"
|
||||||
|
|
||||||
|
|
||||||
|
def test_left_gaze_label():
    """Iris centers shifted toward smaller x are labelled "left"."""
    landmarks = _face()
    _set_gaze_left(landmarks)
    gaze = GazeEstimator().estimate(landmarks)
    assert gaze.label == "left"
|
||||||
|
|
||||||
|
|
||||||
|
def test_zero_eye_width_returns_center():
    """Degenerate case: all landmarks at the same point → zero offset."""
    gaze = GazeEstimator().estimate(_face())
    assert (gaze.dx, gaze.dy) == (0.0, 0.0)
|
||||||
|
|
||||||
|
|
||||||
|
def test_gazeresult_is_frozen():
    """Assigning to a field of a GazeDirection instance must raise."""
    import pytest

    frozen = GazeDirection(dx=0.1, dy=0.2)
    with pytest.raises((AttributeError, TypeError)):
        frozen.dx = 0.5
|
||||||
Loading…
Reference in a new issue