#!/usr/bin/env python3
# capture_sequence.py
# Record N short sequences per label with MediaPipe Holistic and build per-frame features:
# RightHand(63) + LeftHand(63) + Face(468*3=1404) + Pose(33*4=132) + Face-relative hand extras(8) = 1670 dims
# Requirements: numpy, opencv-python, mediapipe
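# For reference, the per-frame vector is laid out in the order implied by the
# concatenation in main(); the slice offsets below are a sketch for anyone
# splitting the 1670-D features back apart downstream:
#   [0:63]       right hand (21*3, normalized)
#   [63:126]     left hand  (21*3, normalized)
#   [126:1530]   face mesh  (468*3, normalized)
#   [1530:1662]  pose       (33*4, normalized; visibility kept)
#   [1662:1670]  face-relative extras (right wrist/tip XY, then left wrist/tip XY)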
import argparse, os, time, math, re # stdlib: args, filesystem, timing, trig, regex
from pathlib import Path # pathlib for portable paths
import numpy as np, cv2, mediapipe as mp # core libs: arrays, webcam/GUI, landmarks
mp_holistic = mp.solutions.holistic # alias to the Holistic solution entry
# ---------- geometry / normalization ----------
def _angle(v):
    """
    Return atan2(y, x) of a 2D vector.
    Used to compute the orientation of a segment in the image plane.
    """
    return math.atan2(v[1], v[0]) # angle in radians for rotation normalization
def _rot2d(t):
    """
    Build a 2×2 rotation matrix for angle t (radians).
    Used to rotate landmark sets into a canonical frame.
    """
    c, s = math.cos(t), math.sin(t) # precompute cos/sin for speed
    return np.array([[c, -s], [s, c]], dtype=np.float32) # standard 2D rotation matrix
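# Illustrative only (not executed): composing _angle and _rot2d sends an arbitrary
# 2D direction onto a chosen axis, which is how the normalizers below remove rotation.
# The vector value is hypothetical.
#   v = np.array([1.0, 1.0], np.float32)       # stand-in segment direction
#   v @ _rot2d(-_angle(v)).T                   # lands on +X: ≈ [1.414, 0.0]
#   v @ _rot2d(math.pi/2 - _angle(v)).T        # lands on +Y, as normalize_hand does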
def normalize_hand(pts, handed=None):
    """
    Normalize a (21,3) hand landmark array:
      1) translate so wrist (idx 0) is at origin
      2) mirror X for left hands so both hands look like right
      3) rotate so vector from wrist->middle-MCP (idx 9) points +Y
      4) scale by max pairwise XY distance so size is comparable across frames
    Returns: (21,3) float32
    """
    pts = pts.astype(np.float32).copy() # make float32 copy (avoid mutating caller)
    pts[:, :2] -= pts[0, :2] # translate: wrist to origin (stabilizes position)
    if handed and str(handed).lower().startswith("left"):
        pts[:, 0] *= -1.0 # mirror X for left hand to canonicalize handedness
    v = pts[9, :2] # vector from wrist→middle MCP (index 9)
    R = _rot2d(math.pi/2 - _angle(v)) # rotate so this vector points up (+Y)
    pts[:, :2] = pts[:, :2] @ R.T # apply rotation to XY (keep Z as-is for now)
    xy = pts[:, :2] # convenience view
    d = np.linalg.norm(xy[None, :, :] - xy[:, None, :], axis=-1).max() # max pairwise XY distance (scale)
    d = 1.0 if d < 1e-6 else float(d) # avoid divide-by-zero on degenerate frames
    pts[:, :2] /= d; pts[:, 2] /= d # isotropic scale XY and Z by same factor
    return pts # return normalized hand landmarks
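# Illustrative only (not executed): a quick sanity check of the invariants this
# normalization promises. The random array is a hypothetical stand-in for the
# (21,3) landmarks MediaPipe returns.
#   fake = np.random.rand(21, 3).astype(np.float32)
#   canon = normalize_hand(fake, handed="Left")
#   canon.shape                                  # (21, 3)
#   canon[0, :2]                                 # wrist XY at the origin: [0., 0.]
#   abs(canon[9, 0]) < 1e-5                      # middle-MCP lies on the +Y axis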
def normalize_face(face):
    """
    Normalize a (468,3) face mesh:
      1) center at midpoint between outer eye corners (33, 263)
      2) scale by inter-ocular distance
      3) rotate so eye-line is horizontal
    Returns: (468,3) float32
    """
    f = face.astype(np.float32).copy() # safe copy
    left = f[33, :2]; right = f[263, :2] # outer eye corners per MediaPipe indexing
    center = 0.5 * (left + right) # center between eyes anchors the face
    f[:, :2] -= center[None, :] # translate to center
    eye_vec = right - left # vector from left→right eye
    eye_dist = float(np.linalg.norm(eye_vec)) or 1.0 # scale factor; avoid zero
    f[:, :2] /= eye_dist; f[:, 2] /= eye_dist # scale all dims consistently
    R = _rot2d(-_angle(eye_vec)) # rotate so eye line aligns with +X
    f[:, :2] = f[:, :2] @ R.T # apply rotation to XY
    return f
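# Illustrative only (not executed): after normalize_face the two eye anchors end up
# symmetric about the origin on the X axis, one inter-ocular distance apart.
#   f = normalize_face(face_pts)                 # face_pts: (468,3) from MediaPipe
#   f[33, :2], f[263, :2]                        # ≈ [-0.5, 0.] and [0.5, 0.]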
def normalize_pose(pose):
    """
    Normalize a (33,4) pose landmark array (x,y,z,visibility):
      1) center at shoulder midpoint (11,12)
      2) scale by shoulder width
      3) rotate so shoulders are horizontal
    Visibility channel ([:,3]) is preserved as-is.
    Returns: (33,4) float32
    """
    p = pose.astype(np.float32).copy() # copy to avoid mutating input
    ls = p[11, :2]; rs = p[12, :2] # left/right shoulder in XY
    center = 0.5 * (ls + rs) # mid-shoulder anchor
    p[:, :2] -= center[None, :] # translate to center
    sw_vec = rs - ls # shoulder vector (scale + rotation anchor)
    sw = float(np.linalg.norm(sw_vec)) or 1.0 # shoulder width (avoid zero)
    p[:, :2] /= sw; p[:, 2] /= sw # scale pose consistently
    R = _rot2d(-_angle(sw_vec)) # rotate so shoulders are horizontal
    p[:, :2] = p[:, :2] @ R.T # apply rotation to XY
    return p
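# Illustrative only (not executed): the same idea for the pose — after normalization
# the shoulders sit at roughly [-0.5, 0] and [0.5, 0], and column 3 (visibility) is untouched.
#   p = normalize_pose(pose_arr)                 # pose_arr: (33,4) from MediaPipe
#   p[11, :2], p[12, :2]                         # ≈ [-0.5, 0.] and [0.5, 0.]
#   np.array_equal(p[:, 3], pose_arr[:, 3])      # True for float32 input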
def face_frame_transform(face_pts):
    """
    Compute a transform that maps image XY into the normalized face frame
    (same definition as in normalize_face).
    Returns:
      center  : (2,) eye midpoint
      eye_dist: scalar inter-ocular distance
      R       : 2×2 rotation aligning eye-line to +X
    Use downstream as: v' = ((v - center)/eye_dist) @ R.T
    """
    left = face_pts[33, :2]; right = face_pts[263, :2] # reference points: eye corners
    center = 0.5*(left + right) # face center
    eye_vec = right - left # direction of eye line
    eye_dist = float(np.linalg.norm(eye_vec)) or 1.0 # scale of face
    theta = _angle(eye_vec) # angle of eye line
    R = _rot2d(-theta) # rotation to align with +X
    return center, eye_dist, R
def to_face_frame(pt_xy, center, eye_dist, R):
    """
    Transform a 2D point from image space into the normalized face frame.
    Inputs are from face_frame_transform().
    """
    v = (pt_xy - center) / eye_dist # translate + scale
    return (v @ R.T).astype(np.float32) # rotate into face frame
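# Illustrative only (not executed): how the pair above is used to express an arbitrary
# image-space point relative to the face. hand_pts stands in for a (21,3) hand array.
#   center, eye_dist, R = face_frame_transform(face_pts)
#   to_face_frame(face_pts[33, :2], center, eye_dist, R)   # eye anchor → ≈ [-0.5, 0.]
#   to_face_frame(hand_pts[8, :2], center, eye_dist, R)    # index fingertip in face units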
# ---------- utils ----------
def next_idx(folder: Path, prefix="clip_"):
    """
    Scan a folder for files like 'clip_###.npz' and return the next index.
    Keeps your saved clips sequential without collisions.
    """
    pat = re.compile(rf"^{re.escape(prefix)}(\d+)\.npz$") # matches clip index
    mx = 0 # track max index seen
    if folder.exists(): # only if folder exists
        for n in os.listdir(folder): # iterate files
            m = pat.match(n) # regex match
            if m: mx = max(mx, int(m.group(1))) # update max on matches
    return mx + 1 # next available index
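# Illustrative only (not executed, hypothetical folder contents): with clip_001.npz,
# clip_002.npz and clip_007.npz already present, the next clip is numbered 8 —
# gaps are not back-filled.
#   next_idx(Path("sequences/train/A"))          # -> 8 in the scenario above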
def countdown(cap, seconds=3):
    """
    Show a full-screen countdown overlay before recording starts.
    Press 'q' to abort during countdown.
    """
    for i in range(seconds, 0, -1): # 3..2..1
        start = time.time() # ensure ~1s display per number
        while time.time() - start < 1.0:
            ok, frame = cap.read() # read a frame
            if not ok: continue # skip if camera hiccups
            h, w = frame.shape[:2] # frame size for centering text
            text = str(i) # the digit to render
            (tw, th), _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 5, 10) # size of big number
            cv2.putText(frame, text, ((w - tw)//2, (h + th)//2),
                        cv2.FONT_HERSHEY_SIMPLEX, 5, (0,0,255), 10, cv2.LINE_AA) # draw big red numeral
            msg = "Starting in..." # helper message above the number
            (mw, mh), _ = cv2.getTextSize(msg, cv2.FONT_HERSHEY_SIMPLEX, 1.2, 3)
            cv2.putText(frame, msg, ((w - mw)//2, (h//2) - th - 20),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0,255,255), 3, cv2.LINE_AA)
            cv2.imshow("sequence capture", frame) # show overlay
            if cv2.waitKey(1) & 0xFF == ord('q'): # allow abort
                cap.release(); cv2.destroyAllWindows()
                raise SystemExit("Aborted during countdown")
def draw_progress_bar(img, frac_remaining, bar_h=16, margin=12):
    """
    Draw a simple progress bar at the top of the frame.
    frac_remaining in [0,1] indicates time left in the clip.
    """
    h, w = img.shape[:2] # image dimensions
    x0, x1 = margin, w - margin # horizontal extent
    y0, y1 = margin, margin + bar_h # vertical extent
    cv2.rectangle(img, (x0, y0), (x1, y1), (40, 40, 40), -1) # dark background bar
    cv2.rectangle(img, (x0, y0), (x1, y1), (90, 90, 90), 2) # border
    rem_w = int((x1 - x0) * max(0.0, min(1.0, frac_remaining))) # filled width clamped
    if rem_w > 0:
        cv2.rectangle(img, (x0, y0), (x0 + rem_w, y1), (0, 200, 0), -1) # green fill
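# Illustrative only (not executed): frac_remaining counts down from 1.0 (clip just
# started, bar full) to 0.0 (time up, bar empty); main() passes
#   draw_progress_bar(fr, (end_t - now) / max(1e-6, args.seconds))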
# ---------- holistic wrapper ----------
class HolisticDetector:
    """
    Thin wrapper around MediaPipe Holistic to fix configuration once and expose process().
    """
    def __init__(self, det_conf=0.5, track_conf=0.5, model_complexity=1):
        # Build the Holistic detector with steady defaults; smooth_landmarks helps temporal stability.
        self.h = mp_holistic.Holistic(
            static_image_mode=False, # realtime video stream
            model_complexity=model_complexity, # 0=fastest, 2=most accurate
            smooth_landmarks=True, # temporal smoothing reduces jitter
            enable_segmentation=False, # not needed; saves compute
            refine_face_landmarks=False, # faster; we only need coarse face
            min_detection_confidence=det_conf, # detection threshold
            min_tracking_confidence=track_conf, # tracking threshold
        )

    def process(self, rgb):
        """
        Run landmark detection on an RGB frame and return MediaPipe results object.
        """
        return self.h.process(rgb) # delegate to MP
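# Illustrative only (not executed): minimal use of the wrapper on a single BGR frame,
# mirroring what the capture loop in main() does per frame; the one-off grab is hypothetical.
#   det = HolisticDetector(det_conf=0.5, track_conf=0.5, model_complexity=1)
#   ok, frame_bgr = cv2.VideoCapture(0).read()
#   res = det.process(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB))
#   res.right_hand_landmarks, res.face_landmarks, res.pose_landmarks  # each is None when not detected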
# ---------- main ----------
def main():
    """
    CLI entry: capture N clips of length --seconds for a given --label and --split,
    save per-frame 1670-D features into sequences/<split>/<label>/clip_XXX.npz.
    """
    ap = argparse.ArgumentParser() # CLI flag parsing
    ap.add_argument("--label", required=True, help="Class label (e.g., A, B, Mother, Father, etc.)")
    ap.add_argument("--split", required=True, choices=["train","val"])
    ap.add_argument("--seconds", type=float, default=0.8)
    ap.add_argument("--camera", type=int, default=0)
    ap.add_argument("--width", type=int, default=640)
    ap.add_argument("--height", type=int, default=480)
    ap.add_argument("--count", type=int, default=None)
    ap.add_argument("--det-thresh", type=float, default=0.5)
    ap.add_argument("--holistic-complexity", type=int, default=1, choices=[0,1,2])
    args = ap.parse_args() # finalize args
    L = args.label.strip() # normalized label string
    if len(L) == 0 or ("/" in L or "\\" in L): # basic validation to keep clean paths
        raise SystemExit("Use a non-empty label without slashes")
    if args.count is None: # default count per split for convenience
        args.count = 100 if args.split == "train" else 20
    out_dir = Path("sequences") / args.split / L # where clip_*.npz will go
    out_dir.mkdir(parents=True, exist_ok=True) # ensure directory exists
    idx = next_idx(out_dir) # next clip index to use
    det = HolisticDetector(args.det_thresh, args.det_thresh, args.holistic_complexity) # detector
    cap = cv2.VideoCapture(args.camera) # open camera device
    if not cap.isOpened(): # fail early if missing
        raise SystemExit(f"Could not open camera {args.camera}")
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, args.width) # set capture width
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, args.height) # set capture height
    print(f"Recording {args.count} clips for {L}/{args.split}, {args.seconds}s each. (R+L hands + face + pose + face-relative extras)")
    countdown(cap, 3) # give operator time to get ready
    for n in range(args.count): # loop over requested clips
        seq_X = [] # holds per-frame features
        start_t = time.time(); end_t = start_t + args.seconds # fixed-length recording window
        while True: # per-frame capture loop
            now = time.time()
            if now >= end_t: break # stop after desired duration
            ok, fr = cap.read() # grab a frame
            if not ok: break # camera yielded nothing; end clip
            rgb = cv2.cvtColor(fr, cv2.COLOR_BGR2RGB) # MediaPipe expects RGB
            res = det.process(rgb) # run landmark detection
            # hands
            right_pts = left_pts = None # initialize as missing
            if res.right_hand_landmarks is not None: # if right detected…
                right_pts = np.array([[lm.x, lm.y, lm.z]
                                      for lm in res.right_hand_landmarks.landmark], np.float32) # (21,3)
            if res.left_hand_landmarks is not None: # if left detected…
                left_pts = np.array([[lm.x, lm.y, lm.z]
                                     for lm in res.left_hand_landmarks.landmark], np.float32) # (21,3)
            # face
            face_pts = None
            if res.face_landmarks is not None: # 468 face landmarks
                face_pts = np.array([[lm.x, lm.y, lm.z] for lm in res.face_landmarks.landmark], np.float32)
            # pose
            pose_arr = None
            if res.pose_landmarks is not None: # 33 pose landmarks with visibility
                pose_arr = np.array([[lm.x, lm.y, lm.z, lm.visibility]
                                     for lm in res.pose_landmarks.landmark], np.float32)
            # Build feature: require face present and at least one hand (pose optional)
            if face_pts is not None and (right_pts is not None or left_pts is not None):
                f_norm = normalize_face(face_pts) # canonicalize face geometry → (468,3)
                # transform pieces to express hand positions in face frame
                f_center, f_scale, f_R = face_frame_transform(face_pts) # face frame for extras

                def hand_face_extras(hand_pts):
                    """
                    For a hand, return [wrist_x, wrist_y, tip_x, tip_y] in face frame.
                    If hand missing, returns zeros. Keeps coarse spatial relation to face.
                    """
                    if hand_pts is None:
                        return np.zeros(4, np.float32) # missing hand → zeros to keep dims fixed
                    wrist_xy = hand_pts[0, :2] # wrist point
                    tip_xy = hand_pts[8, :2] # index fingertip (salient for pointing)
                    w = to_face_frame(wrist_xy, f_center, f_scale, f_R) # project to face frame
                    t = to_face_frame(tip_xy, f_center, f_scale, f_R)
                    return np.array([w[0], w[1], t[0], t[1]], np.float32) # pack features

                rh_ex = hand_face_extras(right_pts) # (4,) right extras
                lh_ex = hand_face_extras(left_pts) # (4,) left extras
                rh = normalize_hand(right_pts, "Right").reshape(-1) if right_pts is not None else np.zeros(63, np.float32) # hand (63,)
                lh = normalize_hand(left_pts, "Left").reshape(-1) if left_pts is not None else np.zeros(63, np.float32) # hand (63,)
                p_norm = normalize_pose(pose_arr).reshape(-1) if pose_arr is not None else np.zeros(33*4, np.float32) # pose (132,)
                feat = np.concatenate([rh, lh, f_norm.reshape(-1), p_norm, rh_ex, lh_ex], axis=0) # (1670,) full feature
                seq_X.append(feat) # push this frame's feature vector
            # optional fingertip markers for visual feedback (raw MediaPipe landmarks are already normalized image coords)
            if right_pts is not None:
                pt = right_pts[8, :2] # index fingertip in [0,1] image coords
                cv2.circle(fr, (int(fr.shape[1]*pt[0]), int(fr.shape[0]*pt[1])), 6, (0,255,0), -1) # green dot
            if left_pts is not None:
                pt = left_pts[8, :2]
                cv2.circle(fr, (int(fr.shape[1]*pt[0]), int(fr.shape[0]*pt[1])), 6, (255,0,0), -1) # blue dot (BGR)
            # UI overlay (progress + label)
            frac_remaining = (end_t - now) / max(1e-6, args.seconds) # progress bar fraction
            draw_progress_bar(fr, frac_remaining, bar_h=16, margin=12)
            cv2.putText(fr, f"{L} {args.split} Clip {n+1}/{args.count}",
                        (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0,255,0), 2, cv2.LINE_AA)
            cv2.imshow("sequence capture", fr) # show live preview
            if cv2.waitKey(1) & 0xFF == ord('q'): # allow stopping whole session
                cap.release(); cv2.destroyAllWindows(); return
        # After clip duration, save if we collected any valid frames
        if seq_X:
            X = np.stack(seq_X, 0).astype(np.float32) # (T, 1670) stack into array
            path = out_dir / f"clip_{idx:03d}.npz" # next filename
            np.savez_compressed(path, X=X) # compressed .npz with key 'X'
            print(f"💾 saved {path} frames={X.shape[0]} dims={X.shape[1]}")
            idx += 1 # advance index
        else:
            print("⚠️ Not enough frames with face + any hand; skipped clip.") # guardrail
    print("✅ Done recording.") # session complete
    cap.release(); cv2.destroyAllWindows() # clean up resources
if __name__ == "__main__":
    main() # run CLI
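# Illustrative only (not executed): a typical session and how to read a saved clip back.
# The label "A" and the clip filename are examples, not files this script ships with.
#
#   python capture_sequence.py --label A --split train --seconds 0.8 --count 100
#
#   import numpy as np
#   X = np.load("sequences/train/A/clip_001.npz")["X"]   # (T, 1670) float32, one row per frame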