From 816e34cb171e8462e72cecc4a5b740a17719e4e9 Mon Sep 17 00:00:00 2001
From: jared
Date: Mon, 19 Jan 2026 22:27:20 -0500
Subject: [PATCH] Initial commit: handshapes multiclass project

Co-Authored-By: Claude Opus 4.5
---
 .gitignore                                    |  14 +
 README.md                                     | 255 ++++++++++++++
 capture_sequence.py                           | 259 ++++++++++++++
 doc/capture_sequence.py                       | 323 ++++++++++++++++++
 doc/eval_val.py                               |  70 ++++
 doc/infer_seq_webcam.py                       | 249 ++++++++++++++
 doc/prep_sequence_resampled.py                |  93 +++++
 doc/train_seq.py                              | 137 ++++++++
 eval_val.py                                   |  61 ++++
 first_attempt_landmark_hands/README.md        | 216 ++++++++++++
 .../capture_sequence.py                       | 176 ++++++++++
 first_attempt_landmark_hands/eval_val.py      |  60 ++++
 .../infer_seq_webcam.py                       | 198 +++++++++++
 first_attempt_landmark_hands/make_seq_dirs.sh |  19 ++
 .../prep_sequence_resampled.py                |  71 ++++
 first_attempt_landmark_hands/train_seq.py     | 136 ++++++++
 first_attempt_landmark_hands/what_to_do.txt   |  24 ++
 infer_seq_webcam.py                           | 227 ++++++++++++
 make_seq_dirs.sh                              |  19 ++
 prep_sequence_resampled.py                    |  77 +++++
 train_seq.py                                  | 120 +++++++
 what_to_do.txt                                |  16 +
 22 files changed, 2820 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100755 capture_sequence.py
 create mode 100644 doc/capture_sequence.py
 create mode 100644 doc/eval_val.py
 create mode 100644 doc/infer_seq_webcam.py
 create mode 100644 doc/prep_sequence_resampled.py
 create mode 100644 doc/train_seq.py
 create mode 100755 eval_val.py
 create mode 100644 first_attempt_landmark_hands/README.md
 create mode 100755 first_attempt_landmark_hands/capture_sequence.py
 create mode 100755 first_attempt_landmark_hands/eval_val.py
 create mode 100755 first_attempt_landmark_hands/infer_seq_webcam.py
 create mode 100755 first_attempt_landmark_hands/make_seq_dirs.sh
 create mode 100755 first_attempt_landmark_hands/prep_sequence_resampled.py
 create mode 100755 first_attempt_landmark_hands/train_seq.py
 create mode 100644 first_attempt_landmark_hands/what_to_do.txt
 create mode 100755 infer_seq_webcam.py
 create mode 100755 make_seq_dirs.sh
 create mode 100755 prep_sequence_resampled.py
 create mode 100755 train_seq.py
 create mode 100644 what_to_do.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2f3839e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,14 @@
+# Ignore everything
+*
+
+# But not directories (so git can traverse into them)
+!*/
+
+# Allow these file types
+!*.py
+!*.txt
+!*.md
+!*.sh
+
+# Don't ignore .gitignore itself
+!.gitignore

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..aaba275
--- /dev/null
+++ b/README.md
@@ -0,0 +1,255 @@
# Handshapes Multiclass (Holistic) — README

A small end-to-end pipeline that records MediaPipe **Holistic** landmarks, builds fixed-length sequences, trains a **bidirectional GRU** classifier, evaluates it, and runs a **live webcam demo** that recognizes classes such as words (“Mother”, “Father”, “Go”) or letters.

---

## Quick Start

```bash
# 0) Create class folders
./make_seq_dirs.sh Mother Father Go

# 1) Capture clips (per class; adjust counts as you like)
python capture_sequence.py --label Mother --split train --seconds 0.8 --count 100
python capture_sequence.py --label Mother --split val --seconds 0.8 --count 20
python capture_sequence.py --label Father --split train --seconds 0.8 --count 100
python capture_sequence.py --label Father --split val --seconds 0.8 --count 20
python capture_sequence.py --label Go --split train --seconds 0.8 --count 100
python capture_sequence.py --label Go --split val --seconds 0.8 --count 20

# 2) Build fixed-length dataset (32 frames/clip)
python prep_sequence_resampled.py --in sequences --out landmarks_seq32 --frames 32

# 3) Train, evaluate, and run live inference
python train_seq.py --landmarks landmarks_seq32 --out asl_seq32_gru_mother_father_go.pt
python eval_val.py --landmarks landmarks_seq32 --model asl_seq32_gru_mother_father_go.pt
python infer_seq_webcam.py --model asl_seq32_gru_mother_father_go.pt --threshold 0.35 --smooth 0.1
```

Folder layout after capture:

```
sequences/
  train/
    Mother/ clip_001.npz ...
    Father/ clip_001.npz ...
    Go/ clip_001.npz ...
  val/
    Mother/ ...
    Father/ ...
    Go/ ...
```

---

## Feature Representation (per frame)

From MediaPipe **Holistic**:

* **Right hand** 21×(x,y,z) → 63
* **Left hand** 21×(x,y,z) → 63
* **Face** 468×(x,y,z) → 1,404
* **Pose** 33×(x,y,z,visibility) → 132
* **Face-relative hand extras**: wrist (x,y) + index tip (x,y) for each hand, expressed in the face-normalized frame → 8

**Total** = **1,670 dims** per frame.

### Normalization (high level)

* Hands: translate to the wrist, mirror left → right, rotate so the middle-finger MCP points +Y, scale by the maximum pairwise landmark distance.
* Face: center at the eye midpoint, scale by the inter-ocular distance, rotate so the eye line is horizontal.
* Pose: center at the shoulder midpoint, scale by the shoulder width, rotate so the shoulders are horizontal.
* Extras: per-hand wrist/tip positions projected into the face frame, so the model retains *where* the hand is relative to the face (critical for signs like **Mother** vs **Father**).

---

## How the Pipeline Works

### 1) `make_seq_dirs.sh`

Creates the directory scaffolding under `sequences/` for any labels you pass (letters or words).

* **Usage:** `./make_seq_dirs.sh Mother Father Go`
* **Why:** Keeps data organized as `train/` and `val/` folders per class.

---

### 2) `capture_sequence.py`

Records short clips from your webcam and saves per-frame **feature vectors** into compressed `.npz` files.

**Key behaviors**

* Uses **MediaPipe Holistic** to extract right/left hands, the full face mesh, and pose.
* Computes normalized features plus the face-relative extras.
* Writes each clip as `sequences//
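
The hand-normalization recipe above is easiest to see in code. Below is a minimal NumPy sketch of those steps (translate to the wrist, mirror the left hand onto the right, rotate the wrist-to-middle-MCP vector onto +Y, scale by the maximum pairwise distance). It is an illustration under stated assumptions, not this repo's actual code: the name `normalize_hand` is hypothetical, and only the MediaPipe hand-model landmark indices (0 = wrist, 9 = middle-finger MCP) come from the real API.

```python
import numpy as np

def normalize_hand(pts: np.ndarray, is_left: bool) -> np.ndarray:
    """pts: (21, 3) MediaPipe hand landmarks -> normalized (21, 3)."""
    pts = pts.copy()
    if is_left:
        pts[:, 0] *= -1.0                 # mirror left hand onto the right
    pts -= pts[0]                         # translate: wrist -> origin
    # Rotate in the x/y plane so the wrist->middle-MCP vector points along +Y.
    dx, dy = pts[9, 0], pts[9, 1]
    theta = np.arctan2(dx, dy)            # angle of that vector away from +Y
    c, s = np.cos(theta), np.sin(theta)
    rot = np.array([[c, -s], [s, c]])
    pts[:, :2] = pts[:, :2] @ rot.T
    # Scale so the largest landmark-to-landmark distance is ~1.
    diffs = pts[:, None, :] - pts[None, :, :]
    scale = np.linalg.norm(diffs, axis=-1).max()
    return pts / max(scale, 1e-6)
```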
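
In the same spirit, here is a hypothetical sketch of how a per-frame vector and a saved clip could be assembled to match the 1,670-dim layout in the Feature Representation section. `normalize_face`, `normalize_pose`, and `face_frame_extras` are stand-in names for the other normalizers described above, and the `.npz` key name is a guess; none of these are guaranteed to match the repo.

```python
import numpy as np

def frame_features(right, left, face, pose) -> np.ndarray:
    """right/left: (21, 3), face: (468, 3), pose: (33, 4) -> (1670,) float32."""
    parts = [
        normalize_hand(right, is_left=False).ravel(),  # 63
        normalize_hand(left, is_left=True).ravel(),    # 63
        normalize_face(face).ravel(),                  # 1404
        normalize_pose(pose).ravel(),                  # 132
        face_frame_extras(right, left, face),          # 8: wrist + index tip (x, y) per hand
    ]
    return np.concatenate(parts).astype(np.float32)    # 63+63+1404+132+8 = 1670

def save_clip(path: str, frames: list) -> None:
    """frames: list of (1670,) vectors -> compressed (T, 1670) archive."""
    # The 'features' key name is an assumption, not taken from the repo.
    np.savez_compressed(path, features=np.stack(frames))
```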
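
Finally, step 2 of the Quick Start converts variable-length clips into fixed 32-frame sequences. One plausible implementation of that resampling, and only a guess at what `prep_sequence_resampled.py --frames` actually does, is linear interpolation along the time axis:

```python
import numpy as np

def resample_clip(features: np.ndarray, n_frames: int = 32) -> np.ndarray:
    """features: (T, D) -> (n_frames, D), linearly interpolated in time."""
    t_src = np.linspace(0.0, 1.0, num=len(features))
    t_dst = np.linspace(0.0, 1.0, num=n_frames)
    out = np.empty((n_frames, features.shape[1]), dtype=features.dtype)
    for d in range(features.shape[1]):     # interpolate each feature dim
        out[:, d] = np.interp(t_dst, t_src, features[:, d])
    return out
```

Resampling like this keeps the start and end of a gesture aligned across clips regardless of how many frames the webcam delivered during the 0.8-second capture window.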