Initial commit: handshapes multiclass project
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
227
infer_seq_webcam.py
Executable file
@@ -0,0 +1,227 @@
#!/usr/bin/env python3
"""
Live webcam inference for two hands + full face + pose + face-relative hand extras (1670 dims/frame).
Works for letters (A..Z) or word classes (e.g., Mother, Father).
Optionally detects the sequence W → E → B to open a URL.
"""

import os, math, argparse, time, webbrowser

# Quiet logs (set before importing mediapipe so the TF/glog env vars take effect)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"; os.environ["GLOG_minloglevel"] = "2"

import numpy as np
import cv2
import torch
import mediapipe as mp
import absl.logging; absl.logging.set_verbosity(absl.logging.ERROR)

cv2.setLogLevel(0)

mp_holistic = mp.solutions.holistic

# ---------- normalization ----------
def _angle(v):
    return math.atan2(v[1], v[0])

def _rot2d(t):
    c, s = math.cos(t), math.sin(t)
    return np.array([[c, -s], [s, c]], dtype=np.float32)
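
# Canonicalize a 21-landmark hand: translate the wrist (landmark 0) to the
# origin, mirror left hands so both share one chirality, rotate so the
# middle-finger MCP (landmark 9) points up, then scale by the max pairwise
# landmark distance so the result is translation/rotation/scale invariant.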
def normalize_hand(pts, handed=None):
    pts = pts.astype(np.float32).copy()
    pts[:, :2] -= pts[0, :2]
    if handed and str(handed).lower().startswith("left"): pts[:, 0] *= -1.0
    v = pts[9, :2]; R = _rot2d(math.pi/2 - _angle(v))
    pts[:, :2] = pts[:, :2] @ R.T
    xy = pts[:, :2]; d = np.linalg.norm(xy[None, :, :] - xy[:, None, :], axis=-1).max()
    d = 1.0 if d < 1e-6 else float(d)
    pts[:, :2] /= d; pts[:, 2] /= d
    return pts
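
# Face canonicalization: center between the outer eye corners (FaceMesh
# landmarks 33 and 263), scale by the inter-ocular distance, and rotate the
# eye line horizontal.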
def normalize_face(face):
    f = face.astype(np.float32).copy()
    left, right = f[33, :2], f[263, :2]
    center = 0.5*(left+right)
    f[:, :2] -= center[None, :]
    eye_vec = right - left; eye_dist = float(np.linalg.norm(eye_vec)) or 1.0
    f[:, :2] /= eye_dist; f[:, 2] /= eye_dist
    R = _rot2d(-_angle(eye_vec)); f[:, :2] = f[:, :2] @ R.T
    return f
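
# Pose canonicalization: center between the shoulders (pose landmarks 11/12),
# scale by shoulder width, and rotate the shoulder line horizontal.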
def normalize_pose(pose):
    p = pose.astype(np.float32).copy()
    ls, rs = p[11, :2], p[12, :2]
    center = 0.5*(ls+rs); p[:, :2] -= center[None, :]
    sw_vec = rs - ls; sw = float(np.linalg.norm(sw_vec)) or 1.0
    p[:, :2] /= sw; p[:, 2] /= sw
    R = _rot2d(-_angle(sw_vec)); p[:, :2] = p[:, :2] @ R.T
    return p
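
# Face-frame transform: returns (center, scale, rotation) so arbitrary
# image-space points (here, hand landmarks) can be expressed in the same
# canonical face coordinates used by normalize_face.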
def face_frame_transform(face_pts):
    left = face_pts[33, :2]; right = face_pts[263, :2]
    center = 0.5*(left + right)
    eye_vec = right - left
    eye_dist = float(np.linalg.norm(eye_vec)) or 1.0
    R = _rot2d(-_angle(eye_vec))
    return center, eye_dist, R

def to_face_frame(pt_xy, center, eye_dist, R):
    v = (pt_xy - center) / eye_dist
    return (v @ R.T).astype(np.float32)

# ---------- model ----------
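# Bidirectional GRU over the T-frame feature window; the classifier head reads
# the last timestep's concatenated forward/backward hidden state (2*hidden dims).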
class SeqGRU(torch.nn.Module):
    def __init__(self, input_dim, hidden=128, num_classes=26):
        super().__init__()
        self.gru = torch.nn.GRU(input_dim, hidden, batch_first=True, bidirectional=True)
        self.head = torch.nn.Sequential(
            torch.nn.Linear(hidden*2, 128), torch.nn.ReLU(), torch.nn.Dropout(0.2),
            torch.nn.Linear(128, num_classes),
        )

    def forward(self, x):
        h, _ = self.gru(x); return self.head(h[:, -1, :])

# ---------- main ----------
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--camera", type=int, default=0)
    ap.add_argument("--threshold", type=float, default=0.35)
    ap.add_argument("--smooth", type=float, default=0.1, help="EMA time constant (seconds); 0 disables")
    ap.add_argument("--width", type=int, default=640)
    ap.add_argument("--height", type=int, default=480)
    ap.add_argument("--holistic-complexity", type=int, default=1, choices=[0, 1, 2])
    ap.add_argument("--det-thresh", type=float, default=0.5)
    ap.add_argument("--url", type=str, default="https://www.google.com")
    args = ap.parse_args()
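
    # The checkpoint is a dict saved by the training script; the keys read below
    # ("classes", "frames", "model", "X_mean", "X_std") are what this loader expects.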
    state = torch.load(args.model, map_location="cpu", weights_only=False)
    classes = state["classes"]
    T = int(state.get("frames", 32))
    X_mean = state["X_mean"].cpu().numpy().astype(np.float32)
    X_std = state["X_std"].cpu().numpy().astype(np.float32) + 1e-6
    input_dim = X_mean.shape[-1]  # expected 1670

    device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
    model = SeqGRU(input_dim=input_dim, hidden=128, num_classes=len(classes)).to(device)
    model.load_state_dict(state["model"]); model.eval()

    hol = mp_holistic.Holistic(
        static_image_mode=False,
        model_complexity=args.holistic_complexity,
        smooth_landmarks=True,
        enable_segmentation=False,
        refine_face_landmarks=False,
        min_detection_confidence=args.det_thresh,
        min_tracking_confidence=args.det_thresh,
    )

    cap = cv2.VideoCapture(args.camera)
    if not cap.isOpened(): raise SystemExit(f"❌ Could not open camera {args.camera}")
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, args.width); cap.set(cv2.CAP_PROP_FRAME_HEIGHT, args.height)

    print(f"✅ Loaded {args.model} frames={T} classes={classes} input_dim={input_dim}")
    print("Press 'q' to quit.")

    seq_buffer, ema_probs = [], None
    last_ts = time.time()
    last_emitted = None
    history = []

    while True:
        ok, frame = cap.read()
        if not ok: break
        now = time.time(); dt = max(1e-6, now - last_ts); last_ts = now

        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        res = hol.process(rgb)

        overlay = "No face/hand"
        current = None

        # hands
        right_pts = left_pts = None
        if res.right_hand_landmarks is not None:
            right_pts = np.array([[lm.x, lm.y, lm.z] for lm in res.right_hand_landmarks.landmark], np.float32)
        if res.left_hand_landmarks is not None:
            left_pts = np.array([[lm.x, lm.y, lm.z] for lm in res.left_hand_landmarks.landmark], np.float32)

        # face
        face_pts = None
        if res.face_landmarks is not None:
            face_pts = np.array([[lm.x, lm.y, lm.z] for lm in res.face_landmarks.landmark], np.float32)

        # pose
        pose_arr = None
        if res.pose_landmarks is not None:
            pose_arr = np.array([[lm.x, lm.y, lm.z, lm.visibility] for lm in res.pose_landmarks.landmark], np.float32)
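
        # Features are built only while the face and at least one hand are visible;
        # otherwise the sequence buffer and EMA are reset (else-branch below).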
        if face_pts is not None and (right_pts is not None or left_pts is not None):
            f_norm = normalize_face(face_pts)
            f_center, f_scale, f_R = face_frame_transform(face_pts)

            def hand_face_extras(hand_pts):
                if hand_pts is None:
                    return np.zeros(4, np.float32)
                wrist_xy = hand_pts[0, :2]
                tip_xy = hand_pts[8, :2]
                w = to_face_frame(wrist_xy, f_center, f_scale, f_R)
                t = to_face_frame(tip_xy, f_center, f_scale, f_R)
                return np.array([w[0], w[1], t[0], t[1]], np.float32)

            rh_ex = hand_face_extras(right_pts)
            lh_ex = hand_face_extras(left_pts)

            rh = normalize_hand(right_pts, "Right").reshape(-1) if right_pts is not None else np.zeros(63, np.float32)
            lh = normalize_hand(left_pts, "Left").reshape(-1) if left_pts is not None else np.zeros(63, np.float32)
            p_norm = normalize_pose(pose_arr).reshape(-1) if pose_arr is not None else np.zeros(33*4, np.float32)
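
            # Per-frame feature layout: right hand 63 + left hand 63 + face 468*3 = 1404
            # + pose 33*4 = 132 + 4 face-frame extras per hand = 1670 dims total.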
            feat = np.concatenate([rh, lh, f_norm.reshape(-1), p_norm, rh_ex, lh_ex], axis=0)  # (1670,)
            seq_buffer.append(feat)
            if len(seq_buffer) > T: seq_buffer.pop(0)

            if len(seq_buffer) == T:
                X = np.stack(seq_buffer, 0)
                Xn = (X - X_mean) / X_std
                xt = torch.from_numpy(Xn).float().unsqueeze(0).to(device)
                with torch.no_grad():
                    probs = torch.softmax(model(xt), dim=1)[0].cpu().numpy()
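
                # Frame-rate-independent EMA: alpha = 1 - exp(-dt/tau) keeps the
                # smoothing time constant at --smooth seconds regardless of FPS.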
                if args.smooth > 0:
                    alpha = 1.0 - math.exp(-dt / args.smooth)
                    ema_probs = probs if ema_probs is None else (1.0 - alpha) * ema_probs + alpha * probs
                    use = ema_probs
                else:
                    use = probs

                top_idx = int(np.argmax(use)); top_p = float(use[top_idx]); top_cls = classes[top_idx]
                overlay = f"{top_cls} {top_p*100:.1f}%"
                if top_p >= args.threshold: current = top_cls
        else:
            seq_buffer, ema_probs = [], None

        # Emit on change & optional "WEB" sequence trigger
        if current is not None and current != last_emitted:
            print(f"Detected: {current}")
            last_emitted = current
            history.append(current)
            if len(history) > 3: history.pop(0)
            if history == ["W", "E", "B"]:
                print("🚀 Detected WEB! Opening browser…")
                try: webbrowser.open(args.url)
                except Exception as e: print(f"⚠️ Browser open failed: {e}")
                history.clear()

        # Overlay
        buf = f"buf={len(seq_buffer)}/{T}"
        if ema_probs is not None:
            ti = int(np.argmax(ema_probs)); tp = float(ema_probs[ti]); tc = classes[ti]
            buf += f" top={tc} {tp:.2f}"
        cv2.putText(frame, overlay, (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 1.1, (0, 255, 0), 2)
        cv2.putText(frame, buf, (20, 75), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
        cv2.imshow("ASL demo (R+L hands + face + pose + extras)", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'): break

    cap.release(); cv2.destroyAllWindows()

if __name__ == "__main__":
    main()
|
||||