From 41b3b95c426eab23f749ad534fd6ed7e637f7aa1 Mon Sep 17 00:00:00 2001
From: jared
Date: Mon, 19 Jan 2026 22:19:15 -0500
Subject: [PATCH] Initial commit: ASL handshape recognition project

Co-Authored-By: Claude Opus 4.5
---
 .gitignore                     |  10 ++
 1prep.sh                       |  12 ++
 2train.sh                      |  14 +++
 3demo.sh                       |   5 +
 eval_val.py                    |  62 ++++++++++
 infer_webcam-multi.py          | 220 +++++++++++++++++++++++++++++++++
 infer_webcam.py                | 137 ++++++++++++++++++++
 make_all_letter_directories.sh |  14 +++
 prep_landmarks_binary.py       | 137 ++++++++++++++++++++
 train_mlp.py                   | 127 +++++++++++++++++++
 webcam_capture.py              | 145 ++++++++++++++++++++++
 11 files changed, 883 insertions(+)
 create mode 100644 .gitignore
 create mode 100755 1prep.sh
 create mode 100755 2train.sh
 create mode 100755 3demo.sh
 create mode 100644 eval_val.py
 create mode 100644 infer_webcam-multi.py
 create mode 100755 infer_webcam.py
 create mode 100755 make_all_letter_directories.sh
 create mode 100755 prep_landmarks_binary.py
 create mode 100755 train_mlp.py
 create mode 100755 webcam_capture.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..384f32c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+# Ignore everything
+*
+
+# But not these files...
+!.gitignore
+!*.py
+!*.sh
+
+# And not directories (so git can recurse into them)
+!*/
diff --git a/1prep.sh b/1prep.sh
new file mode 100755
index 0000000..f12870f
--- /dev/null
+++ b/1prep.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Single letters:
+python prep_landmarks_binary.py --letter A
+# etc for B-Z
+
+# OR all letters at once:
+# for L in {A..Z}; do
+#   python prep_landmarks_binary.py --letter "$L"
+# done
+
diff --git a/2train.sh b/2train.sh
new file mode 100755
index 0000000..5b370e7
--- /dev/null
+++ b/2train.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Train single letters:
+python train_mlp.py --letter A --epochs 40 --batch 64
+# etc for B-Z
+
+# Each run saves: asl_<LETTER>_mlp.pt
+
+# OR all letters at once:
+# for L in {A..Z}; do
+#   python train_mlp.py --letter "$L" --epochs 40 --batch 64
+# done
+
diff --git a/3demo.sh b/3demo.sh
new file mode 100755
index 0000000..4f94c12
--- /dev/null
+++ b/3demo.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+python infer_webcam.py --letter A
+
diff --git a/eval_val.py b/eval_val.py
new file mode 100644
index 0000000..79a6c25
--- /dev/null
+++ b/eval_val.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+"""
+Evaluate the trained per-letter model on the saved val split.
+Prints confusion matrix and a classification report.
+
+Usage:
+    python eval_val.py --letter A
+"""
+import argparse, json
+import numpy as np
+from sklearn.metrics import confusion_matrix, classification_report
+import torch
+import torch.nn as nn
+
+class MLP(nn.Module):
+    def __init__(self, in_dim, num_classes):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(in_dim,128), nn.ReLU(), nn.Dropout(0.2),
+            nn.Linear(128,64), nn.ReLU(), nn.Dropout(0.1),
+            nn.Linear(64,num_classes),
+        )
+    def forward(self, x): return self.net(x)
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--letter", required=True, help="Target letter (A–Z)")
+    args = ap.parse_args()
+    L = args.letter.upper()
+
+    # Load val split and classes
+    X = np.load(f"landmarks_{L}/val_X.npy")
+    y = np.load(f"landmarks_{L}/val_y.npy")
+    classes = json.load(open(f"landmarks_{L}/class_names.json"))
+
+    # Load checkpoint (disable weights-only safety; handle tensor/ndarray)
+    state = torch.load(f"asl_{L}_mlp.pt", map_location="cpu", weights_only=False)
+    X_mean = state["X_mean"]
+    X_std = state["X_std"]
+    if isinstance(X_mean, torch.Tensor): X_mean = X_mean.cpu().numpy()
+    if isinstance(X_std, torch.Tensor): X_std = X_std.cpu().numpy()
+    X_mean = np.asarray(X_mean, dtype=np.float32)
+    X_std = np.asarray(X_std, dtype=np.float32) + 1e-6
+
+    model = MLP(X.shape[1], len(classes))
+    model.load_state_dict(state["model"])
+    model.eval()
+
+    # Normalize and predict
+    Xn = (X - X_mean) / X_std
+    with torch.no_grad():
+        probs = torch.softmax(model(torch.from_numpy(Xn).float()), dim=1).numpy()
+    pred = probs.argmax(axis=1)
+
+    print("Classes:", classes)  # e.g., ['Not_A','A']
+    print("\nConfusion matrix (rows=true, cols=pred):")
+    print(confusion_matrix(y, pred))
+    print("\nReport:")
+    print(classification_report(y, pred, target_names=classes, digits=3))
+
+if __name__ == "__main__":
+    main()
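
Reading eval_val.py's output: class_names.json stores [Not_<LETTER>, <LETTER>], so row/column 0 of the confusion matrix is the negative class and row/column 1 is the letter itself. A hypothetical run for A might print

    Confusion matrix (rows=true, cols=pred):
    [[180   7]
     [  5  92]]

meaning 7 Not_A samples were falsely accepted and 5 true A samples were missed; the A row of the classification report is derived from the same counts (precision 92/99 ≈ 0.93, recall 92/97 ≈ 0.95).
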
diff --git a/infer_webcam-multi.py b/infer_webcam-multi.py
new file mode 100644
index 0000000..5bd1432
--- /dev/null
+++ b/infer_webcam-multi.py
@@ -0,0 +1,220 @@
+#!/usr/bin/env python3
+"""
+infer_webcam-multi.py
+Live multi-letter inference from webcam using multiple per-letter binary models.
+
+Examples:
+    # Detect A, B, C using default filenames asl_A_mlp.pt, asl_B_mlp.pt, asl_C_mlp.pt
+    python infer_webcam-multi.py --letters A,B,C
+
+    # Same but with a confidence threshold for accepting any letter
+    python infer_webcam-multi.py --letters A,B,C --threshold 0.8
+
+    # Explicit model paths (overrides --letters)
+    python infer_webcam-multi.py --models asl_A_mlp.pt asl_B_mlp.pt --threshold 0.75
+
+Press 'q' to quit.
+"""
+import os, math, argparse
+import numpy as np
+import cv2
+import torch
+import mediapipe as mp
+
+# ---------- geometry helpers ----------
+def _angle(v): return math.atan2(v[1], v[0])
+def _rot2d(t):
+    c, s = math.cos(t), math.sin(t)
+    return np.array([[c, -s], [s, c]], dtype=np.float32)
+
+def normalize_landmarks(pts, handedness_label=None):
+    pts = pts.astype(np.float32).copy()
+    # translate wrist to origin
+    pts[:, :2] -= pts[0, :2]
+    # mirror left→right
+    if handedness_label and handedness_label.lower().startswith("left"):
+        pts[:, 0] *= -1.0
+    # rotate wrist→middle_mcp to +Y
+    v = pts[9, :2]
+    R = _rot2d(math.pi/2 - _angle(v))
+    pts[:, :2] = pts[:, :2] @ R.T
+    # scale by max pairwise distance
+    xy = pts[:, :2]
+    d = np.linalg.norm(xy[None,:,:] - xy[:,None,:], axis=-1).max()
+    d = 1.0 if d < 1e-6 else float(d)
+    pts[:, :2] /= d; pts[:, 2] /= d
+    return pts.reshape(-1)
+
+# ---------- MLP ----------
+class MLP(torch.nn.Module):
+    def __init__(self, in_dim, num_classes):
+        super().__init__()
+        self.net = torch.nn.Sequential(
+            torch.nn.Linear(in_dim, 128),
+            torch.nn.ReLU(),
+            torch.nn.Dropout(0.2),
+            torch.nn.Linear(128, 64),
+            torch.nn.ReLU(),
+            torch.nn.Dropout(0.1),
+            torch.nn.Linear(64, num_classes),
+        )
+    def forward(self, x): return self.net(x)
+
+# ---------- Utilities ----------
+def load_model_bundle(model_path):
+    """
+    Load a single per-letter model checkpoint and return a dict bundle with:
+      - 'model': torch.nn.Module (eval, on device)
+      - 'classes': list of class names, e.g. ['Not_A', 'A']
+      - 'pos_index': index of the positive (letter) class in 'classes'
+      - 'X_mean', 'X_std': np arrays (1, 63)
+      - 'letter': inferred letter string for display (e.g., 'A')
+    """
+    state = torch.load(model_path, map_location="cpu", weights_only=False)
+    classes = state["classes"]
+    # identify the "letter" class: prefer anything not starting with "Not_"
+    # fallback: last class
+    pos_idx = None
+    for i, c in enumerate(classes):
+        if not c.lower().startswith("not_"):
+            pos_idx = i
+            break
+    if pos_idx is None:
+        pos_idx = len(classes) - 1
+
+    # letter name (strip Not_ if needed)
+    letter_name = classes[pos_idx]
+    if letter_name.lower().startswith("not_"):
+        letter_name = letter_name[4:]
+
+    X_mean = state["X_mean"]; X_std = state["X_std"]
+    if isinstance(X_mean, torch.Tensor): X_mean = X_mean.cpu().numpy()
+    if isinstance(X_std, torch.Tensor): X_std = X_std.cpu().numpy()
+    X_mean = np.asarray(X_mean, dtype=np.float32)
+    X_std = np.asarray(X_std, dtype=np.float32) + 1e-6
+
+    model = MLP(63, len(classes))
+    model.load_state_dict(state["model"])
+    model.eval()
+
+    return {
+        "path": model_path,
+        "model": model,
+        "classes": classes,
+        "pos_index": pos_idx,
+        "X_mean": X_mean,
+        "X_std": X_std,
+        "letter": letter_name,
+    }
+
+def put_text(img, text, org, scale=1.1, color=(0,255,0), thick=2):
+    cv2.putText(img, text, org, cv2.FONT_HERSHEY_SIMPLEX, scale, color, thick, cv2.LINE_AA)
+
+# ---------- Main ----------
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--letters", help="Comma-separated letters, e.g. A,B,C (uses asl_<LETTER>_mlp.pt)")
+    ap.add_argument("--models", nargs="+", help="Explicit model paths (overrides --letters)")
+    ap.add_argument("--threshold", type=float, default=0.5,
+                    help="Reject threshold on positive-class probability (default: 0.5)")
+    ap.add_argument("--camera", type=int, default=0, help="OpenCV camera index (default: 0)")
+    ap.add_argument("--width", type=int, default=640, help="Requested capture width (default: 640)")
+    ap.add_argument("--height", type=int, default=480, help="Requested capture height (default: 480)")
+    args = ap.parse_args()
+
+    model_paths = []
+    if args.models:
+        model_paths = args.models
+    elif args.letters:
+        for L in [s.strip().upper() for s in args.letters.split(",") if s.strip()]:
+            model_paths.append(f"asl_{L}_mlp.pt")
+    else:
+        raise SystemExit("Please provide --letters A,B,C or --models path1.pt path2.pt ...")
+
+    # Check files
+    for p in model_paths:
+        if not os.path.exists(p):
+            raise SystemExit(f"❌ Model file not found: {p}")
+
+    # Device
+    device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
+
+    # Load bundles
+    bundles = [load_model_bundle(p) for p in model_paths]
+    for b in bundles:
+        b["model"].to(device)
+    print("✅ Loaded models:", ", ".join(f"{b['letter']}({os.path.basename(b['path'])})" for b in bundles))
+
+    # MediaPipe Hands
+    hands = mp.solutions.hands.Hands(
+        static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5
+    )
+
+    # Camera
+    cap = cv2.VideoCapture(args.camera)
+    if not cap.isOpened():
+        raise SystemExit(f"❌ Could not open camera index {args.camera}")
+    cap.set(cv2.CAP_PROP_FRAME_WIDTH, args.width)
+    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, args.height)
+
+    print("Press 'q' to quit.")
+    while True:
+        ok, frame = cap.read()
+        if not ok:
+            break
+
+        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        res = hands.process(rgb)
+
+        overlay = frame.copy()
+        label_text = "No hand"
+        scoreboard = []
+        if res.multi_hand_landmarks:
+            ih = res.multi_hand_landmarks[0]
+            handed = None
+            if res.multi_handedness:
+                handed = res.multi_handedness[0].classification[0].label
+
+            pts = np.array([[lm.x, lm.y, lm.z] for lm in ih.landmark], dtype=np.float32)
+            feat = normalize_landmarks(pts, handedness_label=handed)
+
+            # Evaluate each model
+            best_letter, best_prob = None, -1.0
+            for b in bundles:
+                X_mean = b["X_mean"].flatten()
+                X_std = b["X_std"].flatten()
+                xn = (feat - X_mean) / X_std
+                xt = torch.from_numpy(xn).float().unsqueeze(0).to(device)
+                with torch.no_grad():
+                    probs = torch.softmax(b["model"](xt), dim=1)[0].cpu().numpy()
+                p_pos = float(probs[b["pos_index"]])
+                scoreboard.append((b["letter"], p_pos))
+                if p_pos > best_prob:
+                    best_prob = p_pos
+                    best_letter = b["letter"]
+
+            # Compose label based on threshold
+            if best_prob >= args.threshold:
+                label_text = f"{best_letter} {best_prob*100:.1f}%"
+            else:
+                label_text = f"Unknown ({best_letter} {best_prob*100:.1f}%)"
+
+            # Sort scoreboard desc and show top 3
+            scoreboard.sort(key=lambda x: x[1], reverse=True)
+            y0 = 80
+            put_text(overlay, "Scores:", (20, y0), scale=0.9, color=(0,255,255), thick=2)
+            y = y0 + 30
+            for i, (L, p) in enumerate(scoreboard[:3]):
+                put_text(overlay, f"{L}: {p*100:.1f}%", (20, y), scale=0.9, color=(0,255,0), thick=2)
+                y += 28
+
+        put_text(overlay, label_text, (20, 40), scale=1.2, color=(0,255,0), thick=3)
+        cv2.imshow("ASL multi-letter demo", overlay)
+        if cv2.waitKey(1) & 0xFF == ord('q'):
+            break
+
+    cap.release()
+    cv2.destroyAllWindows()
+
+if __name__ == "__main__":
+    main()
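
The per-letter comparison above can also be exercised without a camera, which is handy for checking model files. A minimal offline sketch (assuming asl_A_mlp.pt and asl_B_mlp.pt have been trained and landmarks_A/ was produced by prep_landmarks_binary.py; the module is loaded by path because the hyphenated file name is not importable directly):

    import importlib.util
    import numpy as np
    import torch

    # Load infer_webcam-multi.py by path and reuse its load_model_bundle helper.
    spec = importlib.util.spec_from_file_location("infer_multi", "infer_webcam-multi.py")
    infer_multi = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(infer_multi)

    bundles = [infer_multi.load_model_bundle(p) for p in ("asl_A_mlp.pt", "asl_B_mlp.pt")]
    feat = np.load("landmarks_A/val_X.npy")[0]  # one already-normalized (63,) feature vector

    scores = {}
    for b in bundles:
        xn = (feat - b["X_mean"].flatten()) / b["X_std"].flatten()
        with torch.no_grad():
            probs = torch.softmax(b["model"](torch.from_numpy(xn).float().unsqueeze(0)), dim=1)[0]
        scores[b["letter"]] = float(probs[b["pos_index"]])

    best = max(scores, key=scores.get)
    print(scores, "->", best if scores[best] >= 0.5 else f"Unknown (best: {best})")

Worth keeping in mind: each checkpoint's softmax is calibrated only against its own Not_<LETTER> negatives, so comparing positive-class probabilities across letters is a heuristic; the --threshold guard exists to reject frames where even the best model is unsure.
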
diff --git a/infer_webcam.py b/infer_webcam.py
new file mode 100755
index 0000000..33b4818
--- /dev/null
+++ b/infer_webcam.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+"""
+infer_webcam.py
+Live webcam demo: detect a hand with MediaPipe, normalize landmarks,
+classify with a trained MLP model.
+
+Examples:
+    python infer_webcam.py --letter A   # loads asl_A_mlp.pt
+    python infer_webcam.py --letter B   # loads asl_B_mlp.pt
+    python infer_webcam.py --model /path/to/asl_A_mlp.pt
+
+Press 'q' to quit.
+"""
+import os, math, argparse
+import numpy as np
+import cv2
+import torch
+import mediapipe as mp
+
+# ---------- geometry helpers ----------
+def _angle(v): return math.atan2(v[1], v[0])
+def _rot2d(t):
+    c, s = math.cos(t), math.sin(t)
+    return np.array([[c, -s], [s, c]], dtype=np.float32)
+
+def normalize_landmarks(pts, handedness_label=None):
+    pts = pts.astype(np.float32).copy()
+    # translate wrist to origin
+    pts[:, :2] -= pts[0, :2]
+    # mirror left→right
+    if handedness_label and handedness_label.lower().startswith("left"):
+        pts[:, 0] *= -1.0
+    # rotate wrist→middle_mcp to +Y
+    v = pts[9, :2]
+    R = _rot2d(math.pi/2 - _angle(v))
+    pts[:, :2] = pts[:, :2] @ R.T
+    # scale by max pairwise distance
+    xy = pts[:, :2]
+    d = np.linalg.norm(xy[None,:,:] - xy[:,None,:], axis=-1).max()
+    d = 1.0 if d < 1e-6 else float(d)
+    pts[:, :2] /= d; pts[:, 2] /= d
+    return pts.reshape(-1)
+
+# ---------- model ----------
+class MLP(torch.nn.Module):
+    def __init__(self, in_dim, num_classes):
+        super().__init__()
+        self.net = torch.nn.Sequential(
+            torch.nn.Linear(in_dim, 128),
+            torch.nn.ReLU(),
+            torch.nn.Dropout(0.2),
+            torch.nn.Linear(128, 64),
+            torch.nn.ReLU(),
+            torch.nn.Dropout(0.1),
+            torch.nn.Linear(64, num_classes),
+        )
+    def forward(self, x): return self.net(x)
+
+# ---------- main ----------
+def main():
+    ap = argparse.ArgumentParser()
+    grp = ap.add_mutually_exclusive_group(required=True)
+    grp.add_argument("--letter", help="Target letter (A–Z). Loads asl_<LETTER>_mlp.pt")
+    grp.add_argument("--model", help="Path to trained .pt model (alternative to --letter)")
+    ap.add_argument("--camera", type=int, default=0, help="OpenCV camera index (default: 0)")
+    args = ap.parse_args()
+
+    # Resolve model path
+    model_path = args.model
+    if model_path is None:
+        letter = args.letter.upper()
+        model_path = f"asl_{letter}_mlp.pt"
+
+    if not os.path.exists(model_path):
+        raise SystemExit(f"❌ Model file not found: {model_path}")
+
+    # Load state (allowing tensors or numpy inside; disable weights-only safety)
+    state = torch.load(model_path, map_location="cpu", weights_only=False)
+    classes = state["classes"]
+    X_mean = state["X_mean"]
+    X_std = state["X_std"]
+
+    # Convert X_mean/X_std to numpy no matter how they were saved
+    if isinstance(X_mean, torch.Tensor): X_mean = X_mean.cpu().numpy()
+    if isinstance(X_std, torch.Tensor): X_std = X_std.cpu().numpy()
+    X_mean = np.asarray(X_mean, dtype=np.float32)
+    X_std = np.asarray(X_std, dtype=np.float32) + 1e-6
+
+    model = MLP(63, len(classes))
+    model.load_state_dict(state["model"])
+    model.eval()
+
+    device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
+    model.to(device)
+
+    hands = mp.solutions.hands.Hands(
+        static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5
+    )
+
+    cap = cv2.VideoCapture(args.camera)
+    if not cap.isOpened():
+        raise SystemExit(f"❌ Could not open camera index {args.camera}")
+
+    print(f"✅ Loaded {model_path} with classes {classes}")
+    print("Press 'q' to quit.")
+    while True:
+        ok, frame = cap.read()
+        if not ok: break
+        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        res = hands.process(rgb)
+
+        label_text = "No hand"
+        if res.multi_hand_landmarks:
+            ih = res.multi_hand_landmarks[0]
+            handed = None
+            if res.multi_handedness:
+                handed = res.multi_handedness[0].classification[0].label
+            pts = np.array([[lm.x, lm.y, lm.z] for lm in ih.landmark], dtype=np.float32)
+            feat = normalize_landmarks(pts, handedness_label=handed)
+            # standardize
+            xn = (feat - X_mean.flatten()) / X_std.flatten()
+            xt = torch.from_numpy(xn).float().unsqueeze(0).to(device)
+            with torch.no_grad():
+                probs = torch.softmax(model(xt), dim=1)[0].cpu().numpy()
+            idx = int(probs.argmax())
+            label_text = f"{classes[idx]} {probs[idx]*100:.1f}%"
+
+        cv2.putText(frame, label_text, (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 1.1, (0,255,0), 2)
+        cv2.imshow("ASL handshape demo", frame)
+        if cv2.waitKey(1) & 0xFF == ord('q'):
+            break
+
+    cap.release()
+    cv2.destroyAllWindows()
+
+if __name__ == "__main__":
+    main()
diff --git a/make_all_letter_directories.sh b/make_all_letter_directories.sh
new file mode 100755
index 0000000..90f81c7
--- /dev/null
+++ b/make_all_letter_directories.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# Create train/val directories for per-letter binary datasets:
+#   data/asl/train/{A,Not_A,...,Z,Not_Z} and same under val/
+
+set -euo pipefail
+
+for split in train val; do
+  for L in {A..Z}; do
+    mkdir -p "data/asl/$split/$L"
+    mkdir -p "data/asl/$split/Not_$L"
+  done
+done
+
+echo "✅ Created data/asl/train|val/{A,Not_A,...,Z,Not_Z}"
diff --git a/prep_landmarks_binary.py b/prep_landmarks_binary.py
new file mode 100755
index 0000000..418b5a9
--- /dev/null
+++ b/prep_landmarks_binary.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+"""
+Prepare landmarks for a single binary task (Letter vs Not_Letter).
+
+Data layout (per letter):
+data/asl/
+  train/
+    A/
+    Not_A/
+  val/
+    A/
+    Not_A/
+
+Usage (no outdir needed):
+    python prep_landmarks_binary.py --letter A
+    # -> saves into landmarks_A/
+
+Optional:
+    python prep_landmarks_binary.py --letter B --data /path/to/dataset
+"""
+import os, argparse, json, math
+from pathlib import Path
+import numpy as np
+import cv2
+import mediapipe as mp
+
+# ---------- geometry helpers ----------
+def _angle(v): return math.atan2(v[1], v[0])
+
+def _rot2d(t):
+    c, s = math.cos(t), math.sin(t)
+    return np.array([[c, -s], [s, c]], dtype=np.float32)
+
+def normalize_landmarks(pts, handed=None):
+    """
+    pts: (21,3) in MediaPipe normalized image coords.
+    Steps:
+      1) translate wrist to origin
+      2) mirror left->right (canonicalize)
+      3) rotate wrist->middle_mcp to +Y
+      4) scale by max pairwise XY distance
+    returns: (63,) float32
+    """
+    pts = pts.astype(np.float32).copy()
+    # 1) translate
+    pts[:, :2] -= pts[0, :2]
+    # 2) canonicalize left/right
+    if handed and handed.lower().startswith("left"):
+        pts[:, 0] *= -1.0
+    # 3) rotate
+    v = pts[9, :2]                     # middle MCP
+    R = _rot2d(math.pi/2 - _angle(v))  # align to +Y
+    pts[:, :2] = pts[:, :2] @ R.T
+    # 4) scale
+    xy = pts[:, :2]
+    d = np.linalg.norm(xy[None,:,:] - xy[:,None,:], axis=-1).max()
+    d = 1.0 if d < 1e-6 else float(d)
+    pts[:, :2] /= d; pts[:, 2] /= d
+    return pts.reshape(-1)
+
+# ---------- extraction ----------
+def collect(split_dir: Path, pos_name: str, neg_name: str, min_det_conf: float):
+    X, y, paths = [], [], []
+    total, used = 0, 0
+
+    hands = mp.solutions.hands.Hands(
+        static_image_mode=True,
+        max_num_hands=1,
+        min_detection_confidence=min_det_conf
+    )
+
+    for label, cls in [(1, pos_name), (0, neg_name)]:
+        cls_dir = split_dir / cls
+        if not cls_dir.exists():
+            continue
+        for p in cls_dir.rglob("*"):
+            if not p.is_file() or p.suffix.lower() not in {".jpg",".jpeg",".png",".bmp",".webp"}:
+                continue
+            total += 1
+            bgr = cv2.imread(str(p))
+            if bgr is None:
+                continue
+            rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
+            res = hands.process(rgb)
+            if not res.multi_hand_landmarks:
+                continue
+
+            ih = res.multi_hand_landmarks[0]
+            handed = None
+            if res.multi_handedness:
+                handed = res.multi_handedness[0].classification[0].label  # "Left"/"Right"
+
+            pts = np.array([[lm.x, lm.y, lm.z] for lm in ih.landmark], dtype=np.float32)
+            feat = normalize_landmarks(pts, handed)
+            X.append(feat); y.append(label); paths.append(str(p)); used += 1
+
+    X = np.stack(X) if X else np.zeros((0,63), np.float32)
+    y = np.array(y, dtype=np.int64)
+    print(f"Split '{split_dir.name}': found {total}, used {used} (hands detected).")
+    return X, y, paths
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--letter", required=True, help="Target letter (A–Z)")
+    ap.add_argument("--data", default="data/asl", help="Root with train/ and val/ (default: data/asl)")
+    ap.add_argument("--outdir", default=None, help="Output dir (default: landmarks_<LETTER>)")
+    ap.add_argument("--min_det_conf", type=float, default=0.5, help="MediaPipe min detection confidence")
+    args = ap.parse_args()
+
+    L = args.letter.upper()
+    pos_name = L
+    neg_name = f"Not_{L}"
+    outdir = args.outdir or f"landmarks_{L}"
+    os.makedirs(outdir, exist_ok=True)
+
+    train_dir = Path(args.data) / "train"
+    val_dir = Path(args.data) / "val"
+
+    Xtr, ytr, ptr = collect(train_dir, pos_name, neg_name, args.min_det_conf)
+    Xva, yva, pva = collect(val_dir, pos_name, neg_name, args.min_det_conf)
+
+    # Save arrays + metadata
+    np.save(f"{outdir}/train_X.npy", Xtr)
+    np.save(f"{outdir}/train_y.npy", ytr)
+    np.save(f"{outdir}/val_X.npy", Xva)
+    np.save(f"{outdir}/val_y.npy", yva)
+
+    with open(f"{outdir}/class_names.json","w") as f:
+        json.dump([neg_name, pos_name], f)  # index 0: Not_L, index 1: L
+
+    open(f"{outdir}/train_paths.txt","w").write("\n".join(ptr))
+    open(f"{outdir}/val_paths.txt","w").write("\n".join(pva))
+
+    print(f"✅ Saved {L}: train {Xtr.shape}, val {Xva.shape}, classes={[neg_name, pos_name]} → {outdir}")
+
+if __name__ == "__main__":
+    main()
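
normalize_landmarks is what makes the feature vector comparable across frames: it removes XY translation, in-plane rotation, and overall scale (and mirrors left hands) before flattening to 63 values. A quick sanity check of that invariance, using a synthetic landmark array rather than real MediaPipe output (a sketch; run from the repo root so the import resolves):

    import math
    import numpy as np
    from prep_landmarks_binary import normalize_landmarks  # importing also pulls in cv2/mediapipe

    rng = np.random.default_rng(0)
    pts = rng.uniform(0.2, 0.8, size=(21, 3)).astype(np.float32)  # fake (21, 3) "hand"

    # Same hand, but rotated in the image plane, uniformly scaled, and shifted.
    theta, s, t = 0.7, 1.8, np.array([0.3, -0.1], dtype=np.float32)
    R = np.array([[math.cos(theta), -math.sin(theta)],
                  [math.sin(theta),  math.cos(theta)]], dtype=np.float32)
    pts2 = pts.copy()
    pts2[:, :2] = (pts2[:, :2] @ R.T) * s + t
    pts2[:, 2] *= s  # keep z on the same scale as x/y

    f1 = normalize_landmarks(pts, handed=None)
    f2 = normalize_landmarks(pts2, handed=None)
    print(f1.shape, np.abs(f1 - f2).max())  # (63,) and a difference near zero
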
diff --git a/train_mlp.py b/train_mlp.py
new file mode 100755
index 0000000..2e8f76f
--- /dev/null
+++ b/train_mlp.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+"""
+train_mlp.py
+Train a small MLP on landmarks for a single letter (binary: Letter vs Not_Letter).
+
+Expected workflow:
+    python prep_landmarks_binary.py --letter A   # saves landmarks_A/
+    python train_mlp.py --letter A --epochs 40 --batch 64
+    python infer_webcam.py --letter A
+"""
+import os, json, argparse
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.utils.data import TensorDataset, DataLoader
+
+def get_device():
+    return torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
+
+class MLP(nn.Module):
+    def __init__(self, in_dim, num_classes):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(in_dim, 128),
+            nn.ReLU(),
+            nn.Dropout(0.2),
+            nn.Linear(128, 64),
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            nn.Linear(64, num_classes),
+        )
+    def forward(self, x): return self.net(x)
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--letter", required=True, help="Target letter (A–Z)")
+    ap.add_argument("--epochs", type=int, default=40)
+    ap.add_argument("--batch", type=int, default=64)
+    ap.add_argument("--lr", type=float, default=1e-3)
+    ap.add_argument("--landmarks", default=None,
+                    help="Landmarks folder (default: landmarks_<LETTER>)")
+    ap.add_argument("--out", default=None,
+                    help="Output filename (default: asl_<LETTER>_mlp.pt)")
+    args = ap.parse_args()
+
+    letter = args.letter.upper()
+    landmarks_dir = args.landmarks or f"landmarks_{letter}"
+    out_file = args.out or f"asl_{letter}_mlp.pt"
+
+    # Load data
+    trX = np.load(os.path.join(landmarks_dir, "train_X.npy"))
+    trY = np.load(os.path.join(landmarks_dir, "train_y.npy"))
+    vaX = np.load(os.path.join(landmarks_dir, "val_X.npy"))
+    vaY = np.load(os.path.join(landmarks_dir, "val_y.npy"))
+    with open(os.path.join(landmarks_dir, "class_names.json")) as f:
+        classes = json.load(f)
+
+    print(f"Letter: {letter}")
+    print(f"Loaded: train {trX.shape} val {vaX.shape} classes={classes}")
+
+    # Standardize using train mean/std
+    X_mean_np = trX.mean(axis=0, keepdims=True).astype(np.float32)
+    X_std_np = (trX.std(axis=0, keepdims=True) + 1e-6).astype(np.float32)
+    trXn = (trX - X_mean_np) / X_std_np
+    vaXn = (vaX - X_mean_np) / X_std_np
+
+    # Torch datasets
+    tr_ds = TensorDataset(torch.from_numpy(trXn).float(), torch.from_numpy(trY).long())
+    va_ds = TensorDataset(torch.from_numpy(vaXn).float(), torch.from_numpy(vaY).long())
+    tr_dl = DataLoader(tr_ds, batch_size=args.batch, shuffle=True)
+    va_dl = DataLoader(va_ds, batch_size=args.batch, shuffle=False)
+
+    device = get_device()
+    model = MLP(in_dim=trX.shape[1], num_classes=len(classes)).to(device)
+    criterion = nn.CrossEntropyLoss()
+    opt = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=1e-4)
+    sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=args.epochs)
+
+    best_acc, best_state = 0.0, None
+
+    for epoch in range(1, args.epochs + 1):
+        # Train
+        model.train()
+        tot, correct, loss_sum = 0, 0, 0.0
+        for xb, yb in tr_dl:
+            xb, yb = xb.to(device), yb.to(device)
+            opt.zero_grad(set_to_none=True)
+            logits = model(xb)
+            loss = criterion(logits, yb)
+            loss.backward()
+            opt.step()
+            loss_sum += loss.item() * yb.size(0)
+            correct += (logits.argmax(1) == yb).sum().item()
+            tot += yb.size(0)
+        tr_loss = loss_sum / max(1, tot)
+        tr_acc = correct / max(1, tot)
+
+        # Validate
+        model.eval()
+        vtot, vcorrect = 0, 0
+        with torch.no_grad():
+            for xb, yb in va_dl:
+                xb, yb = xb.to(device), yb.to(device)
+                logits = model(xb)
+                vcorrect += (logits.argmax(1) == yb).sum().item()
+                vtot += yb.size(0)
+        va_acc = vcorrect / max(1, vtot)
+        sched.step()
+
+        print(f"Epoch {epoch:02d}: train_loss={tr_loss:.4f} train_acc={tr_acc:.3f} val_acc={va_acc:.3f}")
+
+        if va_acc > best_acc:
+            best_acc = va_acc
+            # Save stats as **tensors** (future-proof for torch.load safety)
+            best_state = {
+                "model": model.state_dict(),
+                "classes": classes,
+                "X_mean": torch.from_numpy(X_mean_np),  # tensor
+                "X_std": torch.from_numpy(X_std_np),    # tensor
+            }
+            torch.save(best_state, out_file)
+            print(f"  ✅ Saved best → {out_file} (val_acc={best_acc:.3f})")
+
+    print("Done. Best val_acc:", best_acc)
+
+if __name__ == "__main__":
+    main()
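
The checkpoint written above is a plain dict with the keys "model", "classes", "X_mean" and "X_std", which is exactly what eval_val.py and both webcam scripts read back. A small inspection sketch (assuming asl_A_mlp.pt has already been trained):

    import torch

    state = torch.load("asl_A_mlp.pt", map_location="cpu", weights_only=False)
    print(sorted(state.keys()))          # ['X_mean', 'X_std', 'classes', 'model']
    print(state["classes"])              # e.g. ['Not_A', 'A']
    print(tuple(state["X_mean"].shape))  # (1, 63): standardization stats, saved as tensors
    n_params = sum(p.numel() for p in state["model"].values())
    print(f"MLP state_dict holds {n_params} parameters")
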
diff --git a/webcam_capture.py b/webcam_capture.py
new file mode 100755
index 0000000..8d20076
--- /dev/null
+++ b/webcam_capture.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+"""
+webcam_capture.py
+Show webcam preview and, given --letter L, count down 5s, then capture frames
+every --interval seconds until --count images are saved.
+Saves PNGs to ./captures as L001.PNG, L002.PNG, ...
+
+Usage:
+    python webcam_capture.py --letter A
+    python webcam_capture.py --letter B --camera 1
+    python webcam_capture.py --letter C --count 10 --interval 1
+
+    # Default: 5 captures at 2s spacing, 640x480
+    python webcam_capture.py --letter A
+
+    # Ten captures, 1s apart
+    python webcam_capture.py --letter B --count 10 --interval 1
+
+    # USB camera index 1, HD override
+    python webcam_capture.py --letter C --camera 1 --width 1280 --height 720 --count 8 --interval 1.5
+"""
+
+import argparse
+import os
+import re
+import time
+from pathlib import Path
+
+import cv2
+
+COUNTDOWN_SECONDS = 5
+
+def next_sequence_number(captures_dir: Path, letter: str) -> int:
+    """Return next available sequence number for files like 'A001.PNG'."""
+    pattern = re.compile(rf"^{re.escape(letter)}(\d{{3}})\.PNG$", re.IGNORECASE)
+    max_idx = 0
+    if captures_dir.exists():
+        for name in os.listdir(captures_dir):
+            m = pattern.match(name)
+            if m:
+                try:
+                    idx = int(m.group(1))
+                    if idx > max_idx:
+                        max_idx = idx
+                except ValueError:
+                    pass
+    return max_idx + 1
+
+def draw_text(img, text, org, scale=1.4, color=(0, 255, 0), thickness=2):
+    cv2.putText(img, text, org, cv2.FONT_HERSHEY_SIMPLEX, scale, color, thickness, cv2.LINE_AA)
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--letter", required=True, help="Target letter A–Z. Output files like A001.PNG")
+    ap.add_argument("--camera", type=int, default=0, help="OpenCV camera index (default: 0)")
+    ap.add_argument("--width", type=int, default=640, help="Requested capture width (default: 640)")
+    ap.add_argument("--height", type=int, default=480, help="Requested capture height (default: 480)")
+    ap.add_argument("--count", type=int, default=5, help="Number of captures to take (default: 5)")
+    ap.add_argument("--interval", type=float, default=2.0, help="Seconds between captures (default: 2.0)")
+    args = ap.parse_args()
+
+    letter = args.letter.upper().strip()
+    if not (len(letter) == 1 and "A" <= letter <= "Z"):
+        raise SystemExit("Please pass a single letter A–Z to --letter (e.g., --letter A)")
+    if args.count <= 0:
+        raise SystemExit("--count must be >= 1")
+    if args.interval <= 0:
+        raise SystemExit("--interval must be > 0")
+
+    captures_dir = Path("./captures")
+    captures_dir.mkdir(parents=True, exist_ok=True)
+    start_idx = next_sequence_number(captures_dir, letter)
+
+    cap = cv2.VideoCapture(args.camera)
+    if not cap.isOpened():
+        raise SystemExit(f"❌ Could not open camera index {args.camera}")
+
+    # Try to set resolution (best-effort)
+    cap.set(cv2.CAP_PROP_FRAME_WIDTH, args.width)
+    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, args.height)
+
+    window_title = f"Capture {letter} (press 'q' to quit)"
+    print(f"Showing webcam. Countdown {COUNTDOWN_SECONDS}s, then capturing {args.count} frame(s) every {args.interval}s...")
+    print(f"Saving to: {captures_dir.resolve()} as {letter}NNN.PNG starting at index {start_idx:03d}")
+
+    countdown_done_at = time.time() + COUNTDOWN_SECONDS
+    # Absolute times when we want to capture (after countdown)
+    capture_times = [countdown_done_at + i * args.interval for i in range(args.count)]
+    capture_taken = [False] * args.count
+    captures_made = 0
+    idx = start_idx
+
+    while True:
+        ok, frame = cap.read()
+        if not ok:
+            print("⚠️ Frame grab failed; ending.")
+            break
+
+        now = time.time()
+
+        # Countdown overlay
+        if now < countdown_done_at:
+            remaining = int(round(countdown_done_at - now))
+            overlay = frame.copy()
+            draw_text(overlay, f"Starting in: {remaining}s", (30, 60), scale=2.0, color=(0, 255, 255), thickness=3)
+            draw_text(overlay, f"Letter: {letter}", (30, 120), scale=1.2, color=(0, 255, 0), thickness=2)
+            cv2.imshow(window_title, overlay)
+        else:
+            # Check if it's time for any pending captures
+            for i, tcap in enumerate(capture_times):
+                if (not capture_taken[i]) and now >= tcap:
+                    filename = f"{letter}{idx:03d}.PNG"
+                    out_path = captures_dir / filename
+                    cv2.imwrite(str(out_path), frame)
+                    capture_taken[i] = True
+                    captures_made += 1
+                    idx += 1
+                    print(f"📸 Saved {out_path.name}")
+
+            # Overlay progress
+            elapsed_after = now - countdown_done_at
+            total_duration = args.interval * (args.count - 1) if args.count > 1 else 0
+            remaining_after = max(0.0, total_duration - elapsed_after)
+            overlay = frame.copy()
+            draw_text(overlay, f"Capturing {letter}… {captures_made}/{args.count}", (30, 60),
+                      scale=1.5, color=(0, 255, 0), thickness=3)
+            draw_text(overlay, f"Time left: {int(round(remaining_after))}s", (30, 110),
+                      scale=1.2, color=(0, 255, 255), thickness=2)
+            cv2.imshow(window_title, overlay)
+
+            # If finished all captures, keep preview up until user quits
+            if captures_made >= args.count:
+                draw_text(overlay, "Done! Press 'q' to close.", (30, 160),
+                          scale=1.2, color=(0, 200, 255), thickness=2)
+                cv2.imshow(window_title, overlay)
+
+        # Quit on 'q'
+        if cv2.waitKey(1) & 0xFF == ord('q'):
+            break
+
+    cap.release()
+    cv2.destroyAllWindows()
+
+if __name__ == "__main__":
+    main()
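
One follow-up the scripts above stop short of: the --threshold used by the webcam demos can be chosen from the saved val split instead of guessed. A sketch for a single letter (assuming landmarks_A/ and asl_A_mlp.pt exist; the MLP class is reused from eval_val.py):

    import json
    import numpy as np
    import torch
    from eval_val import MLP  # same architecture as the training script

    L = "A"
    X = np.load(f"landmarks_{L}/val_X.npy")
    y = np.load(f"landmarks_{L}/val_y.npy")
    classes = json.load(open(f"landmarks_{L}/class_names.json"))

    state = torch.load(f"asl_{L}_mlp.pt", map_location="cpu", weights_only=False)
    mean = np.asarray(state["X_mean"], dtype=np.float32)
    std = np.asarray(state["X_std"], dtype=np.float32) + 1e-6
    model = MLP(X.shape[1], len(classes))
    model.load_state_dict(state["model"])
    model.eval()

    with torch.no_grad():
        p_pos = torch.softmax(model(torch.from_numpy((X - mean) / std).float()), dim=1)[:, 1].numpy()

    for t in (0.5, 0.7, 0.8, 0.9):
        pred = (p_pos >= t).astype(int)
        tp = int(((pred == 1) & (y == 1)).sum())
        fp = int(((pred == 1) & (y == 0)).sum())
        fn = int(((pred == 0) & (y == 1)).sum())
        prec = tp / max(1, tp + fp)
        rec = tp / max(1, tp + fn)
        print(f"threshold {t:.1f}: precision {prec:.3f} recall {rec:.3f}")

Raising the threshold trades recall for precision; whichever value looks acceptable here is a reasonable setting for --threshold in infer_webcam-multi.py.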