Initial commit: handshapes multiclass project
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
capture_sequence.py (new executable file, 259 lines added)
@@ -0,0 +1,259 @@
#!/usr/bin/env python3
# capture_sequence.py
# Record N short sequences per label with MediaPipe Holistic and build per-frame features:
# RightHand(63) + LeftHand(63) + Face(468*3=1404) + Pose(33*4=132) + Face-relative hand extras(8) = 1670 dims
# Requirements: numpy, opencv-python, mediapipe
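#
# Example invocation (illustrative values; the flags are defined in main() below):
#   python capture_sequence.py --label A --split train --seconds 0.8 --camera 0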

import argparse, os, time, math, re
from pathlib import Path
import numpy as np, cv2, mediapipe as mp

mp_holistic = mp.solutions.holistic

# ---------- geometry / normalization ----------
def _angle(v):
    return math.atan2(v[1], v[0])

def _rot2d(t):
    c, s = math.cos(t), math.sin(t)
    return np.array([[c, -s], [s, c]], dtype=np.float32)

def normalize_hand(pts, handed=None):
    """Hand (21,3) → translate wrist, mirror left, rotate middle-MCP to +Y, scale by max pairwise distance."""
    pts = pts.astype(np.float32).copy()
    pts[:, :2] -= pts[0, :2]
    if handed and str(handed).lower().startswith("left"):
        pts[:, 0] *= -1.0
    v = pts[9, :2]                      # wrist -> middle-MCP direction
    R = _rot2d(math.pi/2 - _angle(v))   # rotate so that direction lands on +Y
    pts[:, :2] = pts[:, :2] @ R.T
    xy = pts[:, :2]
    d = np.linalg.norm(xy[None, :, :] - xy[:, None, :], axis=-1).max()
    d = 1.0 if d < 1e-6 else float(d)
    pts[:, :2] /= d; pts[:, 2] /= d
    return pts  # (21,3)
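
# Quick sanity example (a hedged sketch, not executed here): a hand whose only
# nonzero landmark is the middle MCP offset from the wrist along +Y should map
# to the wrist at the origin with the MCP on the unit +Y axis:
#   pts = np.zeros((21, 3), np.float32); pts[9, 1] = 0.1
#   out = normalize_hand(pts)  # expect out[0, :2] ≈ (0, 0) and out[9, :2] ≈ (0, 1)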

def normalize_face(face):
    """Face (468,3) → center at eye midpoint, scale by inter-ocular, rotate eye-line horizontal."""
    f = face.astype(np.float32).copy()
    left = f[33, :2]; right = f[263, :2]  # outer eye corners
    center = 0.5 * (left + right)
    f[:, :2] -= center[None, :]
    eye_vec = right - left
    eye_dist = float(np.linalg.norm(eye_vec)) or 1.0
    f[:, :2] /= eye_dist; f[:, 2] /= eye_dist
    R = _rot2d(-_angle(eye_vec))
    f[:, :2] = f[:, :2] @ R.T
    return f  # (468,3)

def normalize_pose(pose):
    """
    Pose (33,4: x,y,z,vis) → center at shoulder midpoint, scale by shoulder width, rotate shoulders horizontal.
    Keep visibility ([:,3]) as-is.
    """
    p = pose.astype(np.float32).copy()
    ls = p[11, :2]; rs = p[12, :2]  # left/right shoulder
    center = 0.5 * (ls + rs)
    p[:, :2] -= center[None, :]
    sw_vec = rs - ls
    sw = float(np.linalg.norm(sw_vec)) or 1.0
    p[:, :2] /= sw; p[:, 2] /= sw
    R = _rot2d(-_angle(sw_vec))
    p[:, :2] = p[:, :2] @ R.T
    return p  # (33,4)

def face_frame_transform(face_pts):
    """
    Return (center, eye_dist, R) to map image XY into the normalized face frame (same frame as normalize_face).
    Use: v' = ((v - center) / eye_dist) @ R.T
    """
    left = face_pts[33, :2]; right = face_pts[263, :2]
    center = 0.5 * (left + right)
    eye_vec = right - left
    eye_dist = float(np.linalg.norm(eye_vec)) or 1.0
    # same rotation as normalize_face (R = rot(-theta), applied after centering/scaling),
    # so points mapped here land in the identical face-aligned frame
    theta = _angle(eye_vec)
    R = _rot2d(-theta)
    return center, eye_dist, R

def to_face_frame(pt_xy, center, eye_dist, R):
    v = (pt_xy - center) / eye_dist
    return (v @ R.T).astype(np.float32)
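
# Consistency check (hedged sketch): mapping a raw face landmark through the face
# frame should match normalize_face's XY output for that same landmark:
#   c, s, R = face_frame_transform(face_pts)
#   np.allclose(to_face_frame(face_pts[10, :2], c, s, R),
#               normalize_face(face_pts)[10, :2], atol=1e-5)  # -> True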

# ---------- utils ----------
def next_idx(folder: Path, prefix="clip_"):
    pat = re.compile(rf"^{re.escape(prefix)}(\d+)\.npz$")
    mx = 0
    if folder.exists():
        for n in os.listdir(folder):
            m = pat.match(n)
            if m: mx = max(mx, int(m.group(1)))
    return mx + 1
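
# Example: a folder holding clip_001.npz and clip_007.npz gives next_idx(folder) == 8;
# gaps are not reused, numbering continues past the highest existing index.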

def countdown(cap, seconds=3):
    for i in range(seconds, 0, -1):
        start = time.time()
        while time.time() - start < 1.0:
            ok, frame = cap.read()
            if not ok: continue
            h, w = frame.shape[:2]
            text = str(i)
            (tw, th), _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 5, 10)
            cv2.putText(frame, text, ((w - tw)//2, (h + th)//2),
                        cv2.FONT_HERSHEY_SIMPLEX, 5, (0,0,255), 10, cv2.LINE_AA)
            msg = "Starting in..."
            (mw, mh), _ = cv2.getTextSize(msg, cv2.FONT_HERSHEY_SIMPLEX, 1.2, 3)
            cv2.putText(frame, msg, ((w - mw)//2, (h//2) - th - 20),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0,255,255), 3, cv2.LINE_AA)
            cv2.imshow("sequence capture", frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                cap.release(); cv2.destroyAllWindows(); raise SystemExit("Aborted during countdown")

def draw_progress_bar(img, frac_remaining, bar_h=16, margin=12):
    h, w = img.shape[:2]
    x0, x1 = margin, w - margin
    y0, y1 = margin, margin + bar_h
    cv2.rectangle(img, (x0, y0), (x1, y1), (40, 40, 40), -1)
    cv2.rectangle(img, (x0, y0), (x1, y1), (90, 90, 90), 2)
    rem_w = int((x1 - x0) * max(0.0, min(1.0, frac_remaining)))
    if rem_w > 0:
        cv2.rectangle(img, (x0, y0), (x0 + rem_w, y1), (0, 200, 0), -1)

# ---------- holistic wrapper ----------
class HolisticDetector:
    def __init__(self, det_conf=0.5, track_conf=0.5, model_complexity=1):
        self.h = mp_holistic.Holistic(
            static_image_mode=False,
            model_complexity=model_complexity,
            smooth_landmarks=True,
            enable_segmentation=False,
            refine_face_landmarks=False,
            min_detection_confidence=det_conf,
            min_tracking_confidence=track_conf,
        )
    def process(self, rgb):
        return self.h.process(rgb)

# ---------- main ----------
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--label", required=True, help="Class label (e.g., A, B, Mother, Father, etc.)")
    ap.add_argument("--split", required=True, choices=["train", "val"])
    ap.add_argument("--seconds", type=float, default=0.8)
    ap.add_argument("--camera", type=int, default=0)
    ap.add_argument("--width", type=int, default=640)
    ap.add_argument("--height", type=int, default=480)
    ap.add_argument("--count", type=int, default=None)
    ap.add_argument("--det-thresh", type=float, default=0.5)
    ap.add_argument("--holistic-complexity", type=int, default=1, choices=[0, 1, 2])
    args = ap.parse_args()

    L = args.label.strip()
    if len(L) == 0 or ("/" in L or "\\" in L):
        raise SystemExit("Use a non-empty label without slashes")

    if args.count is None:
        args.count = 100 if args.split == "train" else 20

    out_dir = Path("sequences") / args.split / L
    out_dir.mkdir(parents=True, exist_ok=True)
    idx = next_idx(out_dir)

    det = HolisticDetector(args.det_thresh, args.det_thresh, args.holistic_complexity)

    cap = cv2.VideoCapture(args.camera)
    if not cap.isOpened(): raise SystemExit(f"Could not open camera {args.camera}")
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, args.width)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, args.height)

    print(f"Recording {args.count} clips for {L}/{args.split}, {args.seconds}s each. (R+L hands + face + pose + face-relative extras)")
    countdown(cap, 3)
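
    # Rough arithmetic (assuming a typical ~30 FPS webcam): the default 0.8 s window
    # yields about 24 frames per clip; frames lacking a face plus at least one hand
    # are dropped below, so saved clips may hold fewer rows.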

    for n in range(args.count):
        seq_X = []
        start_t = time.time(); end_t = start_t + args.seconds

        while True:
            now = time.time()
            if now >= end_t: break
            ok, fr = cap.read()
            if not ok: break

            rgb = cv2.cvtColor(fr, cv2.COLOR_BGR2RGB)
            res = det.process(rgb)

            # hands
            right_pts = left_pts = None
            if res.right_hand_landmarks is not None:
                right_pts = np.array([[lm.x, lm.y, lm.z] for lm in res.right_hand_landmarks.landmark], np.float32)
            if res.left_hand_landmarks is not None:
                left_pts = np.array([[lm.x, lm.y, lm.z] for lm in res.left_hand_landmarks.landmark], np.float32)

            # face
            face_pts = None
            if res.face_landmarks is not None:
                face_pts = np.array([[lm.x, lm.y, lm.z] for lm in res.face_landmarks.landmark], np.float32)

            # pose
            pose_arr = None
            if res.pose_landmarks is not None:
                pose_arr = np.array([[lm.x, lm.y, lm.z, lm.visibility] for lm in res.pose_landmarks.landmark], np.float32)

            # Build feature: require face present and at least one hand (pose optional)
            if face_pts is not None and (right_pts is not None or left_pts is not None):
                f_norm = normalize_face(face_pts)  # (468,3)

                # transform pieces to express hand positions in the face frame
                f_center, f_scale, f_R = face_frame_transform(face_pts)

                def hand_face_extras(hand_pts):
                    if hand_pts is None:
                        return np.zeros(4, np.float32)
                    wrist_xy = hand_pts[0, :2]
                    tip_xy = hand_pts[8, :2]
                    w = to_face_frame(wrist_xy, f_center, f_scale, f_R)
                    t = to_face_frame(tip_xy, f_center, f_scale, f_R)
                    return np.array([w[0], w[1], t[0], t[1]], np.float32)

                rh_ex = hand_face_extras(right_pts)  # 4
                lh_ex = hand_face_extras(left_pts)   # 4

                rh = normalize_hand(right_pts, "Right").reshape(-1) if right_pts is not None else np.zeros(63, np.float32)
                lh = normalize_hand(left_pts, "Left").reshape(-1) if left_pts is not None else np.zeros(63, np.float32)
                p_norm = normalize_pose(pose_arr).reshape(-1) if pose_arr is not None else np.zeros(33*4, np.float32)

                feat = np.concatenate([rh, lh, f_norm.reshape(-1), p_norm, rh_ex, lh_ex], axis=0)  # (1670,)
                seq_X.append(feat)
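
            # Layout of `feat`, derived from the component sizes above:
            #   [0:63]       right hand (21*3, normalized)
            #   [63:126]     left hand (21*3, normalized)
            #   [126:1530]   face (468*3, normalized)
            #   [1530:1662]  pose (33*4, normalized xyz + visibility)
            #   [1662:1670]  wrist/index-tip XY of each hand in the face frame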

            # optional fingertip markers for visual feedback (drawn from the raw
            # landmark coords in [0,1]; the wrist-centered normalized coords would
            # land off-screen when scaled by the frame size)
            if right_pts is not None:
                pt = right_pts[8, :2]
                cv2.circle(fr, (int(fr.shape[1]*pt[0]), int(fr.shape[0]*pt[1])), 6, (0,255,0), -1)
            if left_pts is not None:
                pt = left_pts[8, :2]
                cv2.circle(fr, (int(fr.shape[1]*pt[0]), int(fr.shape[0]*pt[1])), 6, (255,0,0), -1)

            # UI
            frac_remaining = (end_t - now) / max(1e-6, args.seconds)
            draw_progress_bar(fr, frac_remaining, bar_h=16, margin=12)
            cv2.putText(fr, f"{L} {args.split} Clip {n+1}/{args.count}",
                        (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0,255,0), 2, cv2.LINE_AA)
            cv2.imshow("sequence capture", fr)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                cap.release(); cv2.destroyAllWindows(); return

        if seq_X:
            X = np.stack(seq_X, 0).astype(np.float32)  # (T, 1670)
            path = out_dir / f"clip_{idx:03d}.npz"
            np.savez_compressed(path, X=X)
            print(f"💾 saved {path} frames={X.shape[0]} dims={X.shape[1]}")
            idx += 1
        else:
            print("⚠️ Not enough frames with face + any hand; skipped clip.")

    print("✅ Done recording.")
    cap.release(); cv2.destroyAllWindows()
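
# Downstream loading (hedged sketch, not part of this script): each saved clip
# holds one float32 array under the key "X":
#   data = np.load("sequences/train/A/clip_001.npz")
#   X = data["X"]  # shape (T, 1670), T = frames kept during the clip window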

if __name__ == "__main__":
    main()