commit 8bcc62b0459281f5ca688e32d7be691927aa65ad Author: jared Date: Mon Jan 19 22:38:40 2026 -0500 Initial commit: MediaPipe landmarks demo HTML demos for face, hand, gesture, and posture tracking using MediaPipe. Includes Python CLI tools for processing video files. Co-Authored-By: Claude Opus 4.5 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..77ac754 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.venv/ +__pycache__/ +*.pyc diff --git a/Training_handshape_B.md b/Training_handshape_B.md new file mode 100644 index 0000000..5a92afb --- /dev/null +++ b/Training_handshape_B.md @@ -0,0 +1,179 @@ +Let’s add a custom gesture for the **ASL letter “B”** (flat hand, fingers together, thumb folded across the palm) using MediaPipe **Gesture Recognizer (Model Maker)**. + +# Plan (what you’ll build) + +* A custom model with a new class label, e.g. `ASL_B`, plus the required `none` class. +* A small, labeled image dataset (Model Maker will extract hand landmarks for you). +* A trained `.task` file you can drop into your Python/JS app and allowlist. + +--- + +# 1) Pick labels + +Use: + +* `ASL_B` ← your new gesture +* `none` ← anything that’s not one of your target gestures (mandatory) + +Folder layout: + +``` +dataset/ + ASL_B/ + ...images... + none/ + ...images... +``` + +--- + +# 2) Collect the right data (what to capture) + +Target handshape for **B**: + +* **Fingers**: index–pinky fully extended and **pressed together** +* **Thumb**: folded across palm (not sticking out to the side) +* **Palm**: facing camera (front) and also a few angles + +Suggested minimums (per label): + +| Bucket | Shots | +| --------------------------------------------------- | -------------------- | +| Distances: close (\~40–60 cm), medium (\~80–120 cm) | 80 | +| View angles: front, \~30°, \~60° yaw | 80 | +| Rotations: slight roll/tilt | 40 | +| Lighting: bright, dim, backlit | 40 | +| Backgrounds: plain wall, cluttered office/outdoor | 40 | +| Hands: left & right (both) | included across all | +| Skin tones / several people | as many as practical | + +Do **at least \~300–500** `ASL_B` images to start. +For **`none`**, include: open palm (“High-Five”), slightly spread fingers, thumbs-up, fist, pointing, random objects/background frames, other ASL letters—especially **Open\_Palm** look-alikes so the model learns “not B”. + +Quick ways to get images: + +* Record short clips on laptop/phone and extract frames (e.g., 2 fps). +* Ask 3–5 colleagues to contribute a short 10–20s clip each. 
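As frames and photos accumulate, a quick per-label count helps confirm the suggested minimums above before training (and after the frame extraction shown next). A minimal sketch — the script name, `MIN_IMAGES` threshold, and extension list are illustrative assumptions, not part of Model Maker:

```python
# count_dataset.py — sanity-check image counts per label before training.
from pathlib import Path

DATA_DIR = Path("dataset")
MIN_IMAGES = 300                       # rough per-label starting point (see above)
IMAGE_EXTS = {".jpg", ".jpeg", ".png"}

for label_dir in sorted(p for p in DATA_DIR.iterdir() if p.is_dir()):
    n = sum(1 for f in label_dir.rglob("*") if f.suffix.lower() in IMAGE_EXTS)
    note = "" if n >= MIN_IMAGES else "  <-- consider collecting more"
    print(f"{label_dir.name:>10}: {n:5d} images{note}")
```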
+ +Frame extraction example: + +```bash +# Extract 2 frames/sec from a video into dataset/ASL_B/ +ffmpeg -i b_sign.mov -vf fps=2 dataset/ASL_B/b_%05d.jpg +# Do the same for negatives into dataset/none/ +``` + +--- + +# 3) Train with Model Maker (Python) + +Create and activate a venv, then: + +```bash +pip install --upgrade pip +pip install mediapipe-model-maker +``` + +Training script (save as `train_asl_b.py` and run it): + +```python +from mediapipe_model_maker import gesture_recognizer as gr + +DATA_DIR = "dataset" +EXPORT_DIR = "exported_model" + +# Load & auto-preprocess (runs hand detection, keeps images with a detected hand) +data = gr.Dataset.from_folder( + dirname=DATA_DIR, + hparams=gr.HandDataPreprocessingParams( # you can tweak these if needed + min_detection_confidence=0.5 + ) +) + +# Split +train_data, rest = data.split(0.8) +validation_data, test_data = rest.split(0.5) + +# Hyperparameters (start small; bump epochs if needed) +hparams = gr.HParams( + export_dir=EXPORT_DIR, + epochs=12, + batch_size=16, + learning_rate=0.001, +) + +# Optional model head size & dropout +options = gr.GestureRecognizerOptions( + hparams=hparams, + model_options=gr.ModelOptions(layer_widths=[128, 64], dropout_rate=0.1) +) + +model = gr.GestureRecognizer.create( + train_data=train_data, + validation_data=validation_data, + options=options +) + +# Evaluate +loss, acc = model.evaluate(test_data, batch_size=1) +print(f"Test loss={loss:.4f}, acc={acc:.4f}") + +# Export .task +model.export_model() # writes exported_model/gesture_recognizer.task +print("Exported:", EXPORT_DIR + "/gesture_recognizer.task") +``` + +Tips: + +* If many `ASL_B` images get dropped at load time (no hand detected), back up the camera a little or ensure the whole hand is visible. +* If `none` is weak, add more “near-miss” negatives: open palm with fingers slightly apart, thumb slightly out, partial occlusions. + +--- + +# 4) Plug it into your app + +**Python (Tasks API example):** + +```python +import mediapipe as mp +BaseOptions = mp.tasks.BaseOptions +GestureRecognizer = mp.tasks.vision.GestureRecognizer +GestureRecognizerOptions = mp.tasks.vision.GestureRecognizerOptions +VisionRunningMode = mp.tasks.vision.RunningMode +ClassifierOptions = mp.tasks.components.processors.ClassifierOptions + +options = GestureRecognizerOptions( + base_options=BaseOptions(model_asset_path="exported_model/gesture_recognizer.task"), + running_mode=VisionRunningMode.LIVE_STREAM, + custom_gesture_classifier_options=ClassifierOptions( + score_threshold=0.6, # tighten until false positives drop + category_allowlist=["ASL_B"] # only report your class + ), +) +recognizer = GestureRecognizer.create_from_options(options) +``` + +**Web (JS):** + +```js +const recognizer = await GestureRecognizer.createFromOptions(fileset, { + baseOptions: { modelAssetPath: "exported_model/gesture_recognizer.task" }, + runningMode: "LIVE_STREAM", + customGesturesClassifierOptions: { + scoreThreshold: 0.6, + categoryAllowlist: ["ASL_B"] + } +}); +``` + +--- + +# 5) Troubleshooting & tuning + +* **False positives with Open Palm:** Add more `none` examples where fingers are together but **thumb is visible** to the side. The model needs to see “almost B but not B.” +* **Left vs right hand:** Include both in training. If you only trained on right hands, left hands may underperform. +* **Distance issues:** If far-away hands fail, capture more medium/far shots. Landmarks get noisier when small. 
+* **Thresholds:** Raise `score_threshold` to reduce spurious detections; lower it if you miss true B’s. +* **Confusion matrix:** If accuracy is fine but live results wobble, collect more from the exact camera/lighting you’ll use. + +--- diff --git a/face.html b/face.html new file mode 100644 index 0000000..47d3310 --- /dev/null +++ b/face.html @@ -0,0 +1,435 @@ + + + + + + + + + + Face Landmarker + + + + + + + + +

Face landmark detection using the MediaPipe FaceLandmarker task

+ + + + + + diff --git a/face_landmarker.task b/face_landmarker.task new file mode 100644 index 0000000..c50c845 Binary files /dev/null and b/face_landmarker.task differ diff --git a/fingers_positions.sh b/fingers_positions.sh new file mode 100755 index 0000000..33ecdf6 --- /dev/null +++ b/fingers_positions.sh @@ -0,0 +1 @@ +python hand_landmarker_cli.py --image hand.png --model hand_landmarker.task --out annotated.png diff --git a/gesture.html b/gesture.html new file mode 100644 index 0000000..6795891 --- /dev/null +++ b/gesture.html @@ -0,0 +1,290 @@ + + + + + + MediaPipe Hand Gesture Recognizer — Single File Demo + + + + + + + + + + + + diff --git a/gesture.sh b/gesture.sh new file mode 100755 index 0000000..56a4c7d --- /dev/null +++ b/gesture.sh @@ -0,0 +1,5 @@ +export GLOG_minloglevel=2 +export TF_CPP_MIN_LOG_LEVEL=3 +python recognize_gesture.py --image ily.png --model gesture_recognizer.task 2>/dev/null + + diff --git a/gesture_recognizer.task b/gesture_recognizer.task new file mode 100644 index 0000000..130d1d0 Binary files /dev/null and b/gesture_recognizer.task differ diff --git a/hand.png b/hand.png new file mode 100644 index 0000000..62df8a7 Binary files /dev/null and b/hand.png differ diff --git a/hand_landmarker.task b/hand_landmarker.task new file mode 100644 index 0000000..0d53faf Binary files /dev/null and b/hand_landmarker.task differ diff --git a/hand_landmarker_cli.py b/hand_landmarker_cli.py new file mode 100755 index 0000000..7fc0827 --- /dev/null +++ b/hand_landmarker_cli.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +""" +Hand Landmarks on a static image using MediaPipe Tasks. + +Usage: + python hand_landmarker_cli.py --image hand.png --model hand_landmarker.task --max_hands 2 --out annotated.png + +What it does: + • Loads the MediaPipe Hand Landmarker model (.task file) + • Runs landmark detection on a single image + • Prints handedness and 21 landmark coords for each detected hand + • Saves an annotated image with landmarks and connections +""" + +import argparse +import sys +from pathlib import Path + +import cv2 +import numpy as np +import mediapipe as mp + +# MediaPipe Tasks API aliases +BaseOptions = mp.tasks.BaseOptions +HandLandmarker = mp.tasks.vision.HandLandmarker +HandLandmarkerOptions = mp.tasks.vision.HandLandmarkerOptions +VisionRunningMode = mp.tasks.vision.RunningMode + +# Landmark connection topology (same as mp.solutions.hands.HAND_CONNECTIONS, copied to avoid extra dependency) +HAND_CONNECTIONS = [ + (0,1),(1,2),(2,3),(3,4), # Thumb + (0,5),(5,6),(6,7),(7,8), # Index + (5,9),(9,10),(10,11),(11,12), # Middle + (9,13),(13,14),(14,15),(15,16), # Ring + (13,17),(17,18),(18,19),(19,20), # Pinky + (0,17) # Palm base to pinky base +] + +def draw_landmarks(image_bgr: np.ndarray, landmarks_norm: list): + """ + Draws landmarks and connections on a BGR image. + `landmarks_norm` is a list of normalized (x,y,z) MediaPipe landmarks (0..1). 
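    `z` is a relative depth (wrist is the origin; smaller values are closer to the camera); only x and y are used for drawing.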
+ """ + h, w = image_bgr.shape[:2] + + # Convert normalized to pixel coords + pts = [] + for lm in landmarks_norm: + x = int(lm.x * w) + y = int(lm.y * h) + pts.append((x, y)) + + # Draw connections + for a, b in HAND_CONNECTIONS: + if 0 <= a < len(pts) and 0 <= b < len(pts): + cv2.line(image_bgr, pts[a], pts[b], (0, 255, 0), 2, cv2.LINE_AA) + + # Draw keypoints + for i, (x, y) in enumerate(pts): + cv2.circle(image_bgr, (x, y), 3, (255, 255, 255), -1, cv2.LINE_AA) + cv2.circle(image_bgr, (x, y), 2, (0, 0, 255), -1, cv2.LINE_AA) + +def main(): + ap = argparse.ArgumentParser(description="MediaPipe Hand Landmarker (static image)") + ap.add_argument("--image", required=True, help="Path to an input image (e.g., hand.jpg)") + ap.add_argument("--model", default="hand_landmarker.task", help="Path to MediaPipe .task model") + ap.add_argument("--max_hands", type=int, default=2, help="Maximum hands to detect") + ap.add_argument("--out", default="annotated.png", help="Output path for annotated image") + args = ap.parse_args() + + img_path = Path(args.image) + if not img_path.exists(): + print(f"[ERROR] Image not found: {img_path}", file=sys.stderr) + sys.exit(1) + + model_path = Path(args.model) + if not model_path.exists(): + print(f"[ERROR] Model not found: {model_path}", file=sys.stderr) + print("Download the model bundle (.task) and point --model to it.", file=sys.stderr) + sys.exit(2) + + # Load image for MP and for drawing + mp_image = mp.Image.create_from_file(str(img_path)) + image_bgr = cv2.imread(str(img_path)) + if image_bgr is None: + print(f"[ERROR] Could not read image with OpenCV: {img_path}", file=sys.stderr) + sys.exit(3) + + # Configure and run the landmarker + options = HandLandmarkerOptions( + base_options=BaseOptions(model_asset_path=str(model_path)), + running_mode=VisionRunningMode.IMAGE, + num_hands=args.max_hands, + min_hand_detection_confidence=0.5, + min_hand_presence_confidence=0.5, + min_tracking_confidence=0.5 + ) + + with HandLandmarker.create_from_options(options) as landmarker: + result = landmarker.detect(mp_image) + + # Print results + if not result.hand_landmarks: + print("No hands detected.") + else: + for i, (handedness, lms, world_lms) in enumerate( + zip(result.handedness, result.hand_landmarks, result.hand_world_landmarks) + ): + label = handedness[0].category_name if handedness else "Unknown" + score = handedness[0].score if handedness else 0.0 + print(f"\nHand #{i+1}: {label} (score {score:.3f})") + for idx, lm in enumerate(lms): + print(f" L{idx:02d}: x={lm.x:.3f} y={lm.y:.3f} z={lm.z:.3f}") + + # Draw + draw_landmarks(image_bgr, lms) + # Put label + cv2.putText(image_bgr, f"{label}", (10, 30 + i*30), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0,255,0), 2, cv2.LINE_AA) + + # Save annotated image + cv2.imwrite(str(args.out), image_bgr) + print(f"\nSaved annotated image to: {args.out}") + +if __name__ == "__main__": + main() diff --git a/holistic.html b/holistic.html new file mode 100644 index 0000000..d109499 --- /dev/null +++ b/holistic.html @@ -0,0 +1,262 @@ + + + + + MediaPipe Holistic — Main Output Only + + + + + +
+ + + +
+ +
+ + +
+
+
Loading
+
+ + + + + + +
+ + +
+ + + + + + + + + + + diff --git a/ily.png b/ily.png new file mode 100644 index 0000000..c2de311 Binary files /dev/null and b/ily.png differ diff --git a/landmarks.png b/landmarks.png new file mode 100644 index 0000000..7935331 Binary files /dev/null and b/landmarks.png differ diff --git a/marker.html b/marker.html new file mode 100644 index 0000000..dfa0369 --- /dev/null +++ b/marker.html @@ -0,0 +1,268 @@ + + + + + + MediaPipe Hand Landmarker — Single File Demo + + + + + + + + + + + + + +

Demo: Continuous hand landmark detection from your webcam

+

Hold your hand in front of your webcam for real-time hand landmark detection.
Click ENABLE WEBCAM below and grant access to the webcam if prompted.

+ +
+ +
+ + +
+
+ + + + + diff --git a/more_info.txt b/more_info.txt new file mode 100644 index 0000000..daf7426 --- /dev/null +++ b/more_info.txt @@ -0,0 +1,2 @@ +https://ai.google.dev/edge/mediapipe/solutions/vision/hand_landmarker +https://ai.google.dev/edge/mediapipe/solutions/customization/gesture_recognizer \ No newline at end of file diff --git a/posture.html b/posture.html new file mode 100644 index 0000000..dd9069e --- /dev/null +++ b/posture.html @@ -0,0 +1,298 @@ + + + + + + + + Pose Landmarker — Single File Demo + + + + + + + + + +

Pose detection using the MediaPipe PoseLandmarker task

+ + + + + + diff --git a/process_mp4_facial.py b/process_mp4_facial.py new file mode 100755 index 0000000..6ace09a --- /dev/null +++ b/process_mp4_facial.py @@ -0,0 +1,151 @@ +import cv2 +import mediapipe as mp +from mediapipe.tasks import python +from mediapipe.tasks.python import vision +import numpy as np +from mediapipe.framework.formats import landmark_pb2 +import argparse +import os +import csv + +# --- NEW: Helper function to create the landmark-to-feature map --- +def create_landmark_map(): + """Creates a mapping from landmark index to facial feature name.""" + landmark_map = {} + + # Define the connection groups from MediaPipe's face_mesh solutions + connection_groups = { + 'lips': mp.solutions.face_mesh.FACEMESH_LIPS, + 'left_eye': mp.solutions.face_mesh.FACEMESH_LEFT_EYE, + 'right_eye': mp.solutions.face_mesh.FACEMESH_RIGHT_EYE, + 'left_eyebrow': mp.solutions.face_mesh.FACEMESH_LEFT_EYEBROW, + 'right_eyebrow': mp.solutions.face_mesh.FACEMESH_RIGHT_EYEBROW, + 'face_oval': mp.solutions.face_mesh.FACEMESH_FACE_OVAL, + 'left_iris': mp.solutions.face_mesh.FACEMESH_LEFT_IRIS, + 'right_iris': mp.solutions.face_mesh.FACEMESH_RIGHT_IRIS, + } + + # Populate the map by iterating through the connection groups + for part_name, connections in connection_groups.items(): + for connection in connections: + landmark_map[connection[0]] = part_name + landmark_map[connection[1]] = part_name + + return landmark_map + +# --- Helper Function to Draw Landmarks --- +def draw_landmarks_on_image(rgb_image, detection_result): + """Draws face landmarks on a single image frame.""" + face_landmarks_list = detection_result.face_landmarks + annotated_image = np.copy(rgb_image) + + # Loop through the detected faces to visualize. + for face_landmarks in face_landmarks_list: + face_landmarks_proto = landmark_pb2.NormalizedLandmarkList() + face_landmarks_proto.landmark.extend([ + landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z) for landmark in face_landmarks + ]) + + mp.solutions.drawing_utils.draw_landmarks( + image=annotated_image, + landmark_list=face_landmarks_proto, + connections=mp.solutions.face_mesh.FACEMESH_TESSELATION, + landmark_drawing_spec=None, + connection_drawing_spec=mp.solutions.drawing_styles + .get_default_face_mesh_tesselation_style()) + mp.solutions.drawing_utils.draw_landmarks( + image=annotated_image, + landmark_list=face_landmarks_proto, + connections=mp.solutions.face_mesh.FACEMESH_CONTOURS, + landmark_drawing_spec=None, + connection_drawing_spec=mp.solutions.drawing_styles + .get_default_face_mesh_contours_style()) + mp.solutions.drawing_utils.draw_landmarks( + image=annotated_image, + landmark_list=face_landmarks_proto, + connections=mp.solutions.face_mesh.FACEMESH_IRISES, + landmark_drawing_spec=None, + connection_drawing_spec=mp.solutions.drawing_styles + .get_default_face_mesh_iris_connections_style()) + + return annotated_image + + +def main(): + parser = argparse.ArgumentParser(description='Process a video to detect and draw face landmarks.') + parser.add_argument('input_video', help='The path to the input video file.') + args = parser.parse_args() + + input_video_path = args.input_video + base_name, extension = os.path.splitext(input_video_path) + output_video_path = f"{base_name}_annotated{extension}" + output_csv_path = f"{base_name}_landmarks.csv" + + # --- NEW: Create the landmark map --- + landmark_to_part_map = create_landmark_map() + + # --- Configuration & Setup --- + model_path = 'face_landmarker.task' + base_options = 
python.BaseOptions(model_asset_path=model_path) + options = vision.FaceLandmarkerOptions(base_options=base_options, + output_face_blendshapes=True, + output_facial_transformation_matrixes=True, + num_faces=1) + detector = vision.FaceLandmarker.create_from_options(options) + + # --- Video and CSV Setup --- + cap = cv2.VideoCapture(input_video_path) + if not cap.isOpened(): + print(f"Error: Could not open video file {input_video_path}") + return + + frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fps = int(cap.get(cv2.CAP_PROP_FPS)) + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height)) + + # Open CSV file for writing + with open(output_csv_path, 'w', newline='') as csvfile: + csv_writer = csv.writer(csvfile) + # NEW: Write the updated header row + csv_writer.writerow(['frame', 'face', 'landmark_index', 'face_part', 'x', 'y', 'z']) + + print(f"Processing video: {input_video_path} 📹") + frame_number = 0 + while(cap.isOpened()): + ret, frame = cap.read() + if not ret: + break + + frame_number += 1 + rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame) + detection_result = detector.detect(mp_image) + + # Write landmark data to CSV + if detection_result.face_landmarks: + for face_index, face_landmarks in enumerate(detection_result.face_landmarks): + for landmark_index, landmark in enumerate(face_landmarks): + # NEW: Look up the face part name from the map + face_part = landmark_to_part_map.get(landmark_index, 'unknown') + # NEW: Write the new column to the CSV row + csv_writer.writerow([frame_number, face_index, landmark_index, face_part, landmark.x, landmark.y, landmark.z]) + + # Draw landmarks on the frame for the video + annotated_frame = draw_landmarks_on_image(rgb_frame, detection_result) + bgr_annotated_frame = cv2.cvtColor(annotated_frame, cv2.COLOR_RGB2BGR) + out.write(bgr_annotated_frame) + + # Release everything when the job is finished + cap.release() + out.release() + cv2.destroyAllWindows() + + print(f"\n✅ Processing complete.") + print(f"Annotated video saved to: {output_video_path}") + print(f"Landmarks CSV saved to: {output_csv_path}") + + +if __name__ == '__main__': + main() diff --git a/process_mp4_holistic.py b/process_mp4_holistic.py new file mode 100755 index 0000000..f628c6b --- /dev/null +++ b/process_mp4_holistic.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 +""" +holistic_mp4.py +Process an MP4 with MediaPipe Holistic: + - Saves annotated video + - Exports CSV of face/pose/hand landmarks per frame + +Usage: + python holistic_mp4.py /path/to/input.mp4 + python holistic_mp4.py /path/to/input.mp4 --out-video out.mp4 --out-csv out.csv --show +""" + +import argparse +import csv +import os +import sys +from pathlib import Path + +import cv2 +import mediapipe as mp + +mp_holistic = mp.solutions.holistic +mp_drawing = mp.solutions.drawing_utils +mp_styles = mp.solutions.drawing_styles + + +def parse_args(): + p = argparse.ArgumentParser(description="Run MediaPipe Holistic on an MP4 and export annotated video + CSV landmarks.") + p.add_argument("input", help="Input .mp4 file") + p.add_argument("--out-video", help="Output annotated MP4 path (default: _annotated.mp4)") + p.add_argument("--out-csv", help="Output CSV path for landmarks (default: _landmarks.csv)") + p.add_argument("--model-complexity", type=int, default=1, choices=[0, 1, 2], help="Holistic model complexity") + 
p.add_argument("--no-smooth", action="store_true", help="Disable smoothing (smoothing is ON by default)") + p.add_argument("--refine-face", action="store_true", help="Refine face landmarks (iris, lips).") + p.add_argument("--show", action="store_true", help="Show preview window while processing") + return p.parse_args() + + +def open_video_writer(cap, out_path): + # Properties from input + fps = cap.get(cv2.CAP_PROP_FPS) + if fps is None or fps <= 0: + fps = 30.0 # sensible fallback + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + + # Writer + fourcc = cv2.VideoWriter_fourcc(*"mp4v") + writer = cv2.VideoWriter(out_path, fourcc, float(fps), (width, height)) + if not writer.isOpened(): + raise RuntimeError(f"Failed to open VideoWriter at {out_path}") + return writer, fps, (width, height) + + +def write_landmarks_to_csv(writer, frame_idx, ts_ms, kind, landmarks, world_landmarks=None, handedness=None): + """ + landmarks: NormalizedLandmarkList (x,y,z, visibility?) -> face/hand have no visibility; pose has visibility. + world_landmarks: LandmarkList in meters (optional, pose_world_landmarks available). + handedness: "Left"|"Right"|None (we label hand sets by field name; not a confidence score here) + """ + if not landmarks: + return + + # index by position; world coords may be absent or differ in length + wl = world_landmarks.landmark if world_landmarks and getattr(world_landmarks, "landmark", None) else None + + for i, lm in enumerate(landmarks.landmark): + world_x = world_y = world_z = "" + if wl and i < len(wl): + world_x, world_y, world_z = wl[i].x, wl[i].y, wl[i].z + + # Some landmark types (pose) include visibility; others (face/hands) don't + vis = getattr(lm, "visibility", "") + writer.writerow([ + frame_idx, + int(ts_ms), + kind, # e.g., face, pose, left_hand, right_hand + i, + lm.x, lm.y, lm.z, + vis, + "", # presence not provided in Holistic landmarks + world_x, world_y, world_z, + handedness or "" + ]) + + +def main(): + args = parse_args() + in_path = Path(args.input) + if not in_path.exists(): + print(f"Input not found: {in_path}", file=sys.stderr) + sys.exit(1) + + out_video = Path(args.out_video) if args.out_video else in_path.with_name(in_path.stem + "_annotated.mp4") + out_csv = Path(args.out_csv) if args.out_csv else in_path.with_name(in_path.stem + "_landmarks.csv") + + cap = cv2.VideoCapture(str(in_path)) + if not cap.isOpened(): + print(f"Could not open video: {in_path}", file=sys.stderr) + sys.exit(1) + + writer, fps, (w, h) = open_video_writer(cap, str(out_video)) + + # Prepare CSV + out_csv.parent.mkdir(parents=True, exist_ok=True) + csv_file = open(out_csv, "w", newline="", encoding="utf-8") + csv_writer = csv.writer(csv_file) + csv_writer.writerow([ + "frame", "timestamp_ms", "type", "landmark_index", + "x", "y", "z", "visibility", "presence", + "world_x", "world_y", "world_z", "handedness" + ]) + + # Holistic configuration + holistic = mp_holistic.Holistic( + static_image_mode=False, + model_complexity=args.model_complexity, + smooth_landmarks=(not args.no_smooth), + refine_face_landmarks=args.refine_face, + enable_segmentation=False + ) + + try: + frame_idx = 0 + print(f"Processing: {in_path.name} -> {out_video.name}, {out_csv.name}") + while True: + ok, frame_bgr = cap.read() + if not ok: + break + + # Timestamp (ms) based on frame index and fps + ts_ms = (frame_idx / fps) * 1000.0 + + # Convert to RGB for MediaPipe + image_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB) + image_rgb.flags.writeable = False + 
results = holistic.process(image_rgb) + image_rgb.flags.writeable = True + + # Draw on a BGR copy for output + out_frame = frame_bgr + + # Face + if results.face_landmarks: + mp_drawing.draw_landmarks( + out_frame, + results.face_landmarks, + mp_holistic.FACEMESH_TESSELATION, + landmark_drawing_spec=None, + connection_drawing_spec=mp_styles.get_default_face_mesh_tesselation_style(), + ) + write_landmarks_to_csv(csv_writer, frame_idx, ts_ms, "face", results.face_landmarks) + + # Pose + if results.pose_landmarks: + mp_drawing.draw_landmarks( + out_frame, + results.pose_landmarks, + mp_holistic.POSE_CONNECTIONS, + landmark_drawing_spec=mp_styles.get_default_pose_landmarks_style() + ) + write_landmarks_to_csv( + csv_writer, frame_idx, ts_ms, "pose", + results.pose_landmarks, + world_landmarks=getattr(results, "pose_world_landmarks", None) + ) + + # Left hand + if results.left_hand_landmarks: + mp_drawing.draw_landmarks( + out_frame, + results.left_hand_landmarks, + mp_holistic.HAND_CONNECTIONS, + landmark_drawing_spec=mp_styles.get_default_hand_landmarks_style() + ) + write_landmarks_to_csv(csv_writer, frame_idx, ts_ms, "left_hand", results.left_hand_landmarks, handedness="Left") + + # Right hand + if results.right_hand_landmarks: + mp_drawing.draw_landmarks( + out_frame, + results.right_hand_landmarks, + mp_holistic.HAND_CONNECTIONS, + landmark_drawing_spec=mp_styles.get_default_hand_landmarks_style() + ) + write_landmarks_to_csv(csv_writer, frame_idx, ts_ms, "right_hand", results.right_hand_landmarks, handedness="Right") + + # Write frame + writer.write(out_frame) + + # Optional preview + if args.show: + cv2.imshow("Holistic (annotated)", out_frame) + if cv2.waitKey(1) & 0xFF == 27: # ESC + break + + # Lightweight progress + if frame_idx % 120 == 0: + print(f" frame {frame_idx}", end="\r", flush=True) + frame_idx += 1 + + print(f"\nDone.\n Video: {out_video}\n CSV: {out_csv}") + + finally: + holistic.close() + writer.release() + cap.release() + csv_file.close() + if args.show: + cv2.destroyAllWindows() + + +if __name__ == "__main__": + main() diff --git a/recognize_gesture.py b/recognize_gesture.py new file mode 100755 index 0000000..510c08b --- /dev/null +++ b/recognize_gesture.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +import argparse +import sys +import mediapipe as mp + +BaseOptions = mp.tasks.BaseOptions +VisionRunningMode = mp.tasks.vision.RunningMode +GestureRecognizer = mp.tasks.vision.GestureRecognizer +GestureRecognizerOptions = mp.tasks.vision.GestureRecognizerOptions + +def _first_category(item): + """ + Accepts either: + - a Classifications object with .categories + - a list of Category + - None / empty + Returns the first Category or None. 
+ """ + if item is None: + return None + # Shape 1: Classifications with .categories + cats = getattr(item, "categories", None) + if isinstance(cats, list): + return cats[0] if cats else None + # Shape 2: already a list[Category] + if isinstance(item, list): + return item[0] if item else None + return None + +def _len_safe(x): + return len(x) if isinstance(x, list) else 0 + +def main(): + parser = argparse.ArgumentParser(description="Recognize hand gestures in a still image with MediaPipe.") + parser.add_argument("-i", "--image", default="hand.jpg", help="Path to input image (default: hand.jpg)") + parser.add_argument("-m", "--model", default="gesture_recognizer.task", + help="Path to gesture_recognizer .task model (default: gesture_recognizer.task)") + parser.add_argument("--num_hands", type=int, default=2, help="Max hands to detect") + args = parser.parse_args() + + options = GestureRecognizerOptions( + base_options=BaseOptions(model_asset_path=args.model), + running_mode=VisionRunningMode.IMAGE, + num_hands=args.num_hands, + ) + + # Load the image + try: + mp_image = mp.Image.create_from_file(args.image) + except Exception as e: + print(f"Failed to load image '{args.image}': {e}", file=sys.stderr) + sys.exit(1) + + with GestureRecognizer.create_from_options(options) as recognizer: + result = recognizer.recognize(mp_image) + + if result is None: + print("No result returned.") + return + + n = max( + _len_safe(getattr(result, "gestures", [])), + _len_safe(getattr(result, "handedness", [])), + _len_safe(getattr(result, "hand_landmarks", [])), + ) + if n == 0: + print("No hands/gestures detected.") + return + + for i in range(n): + handed = None + if _len_safe(getattr(result, "handedness", [])) > i: + cat = _first_category(result.handedness[i]) + if cat: + handed = cat.category_name + + top_gesture = None + score = None + if _len_safe(getattr(result, "gestures", [])) > i: + cat = _first_category(result.gestures[i]) + if cat: + top_gesture = cat.category_name + score = cat.score + + header = f"Hand #{i+1}" + (f" ({handed})" if handed else "") + print(header + ":") + if top_gesture: + print(f" Gesture: {top_gesture} (score={score:.3f})") + else: + print(" Gesture: none") + + # If you want pixel landmark coordinates later: + # if _len_safe(getattr(result, "hand_landmarks", [])) > i: + # for j, lm in enumerate(result.hand_landmarks[i]): + # print(f" lm{j}: x={lm.x:.3f} y={lm.y:.3f} z={lm.z:.3f}") + +if __name__ == "__main__": + main() diff --git a/server_holistic.sh b/server_holistic.sh new file mode 100755 index 0000000..1e1c436 --- /dev/null +++ b/server_holistic.sh @@ -0,0 +1,2 @@ +echo "Go to: http://localhost:8001/holistic.html " +python -m http.server 8001 diff --git a/source_activate_venv.sh b/source_activate_venv.sh new file mode 100755 index 0000000..6243bb3 --- /dev/null +++ b/source_activate_venv.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# AlERT: source this script, don't run it directly. +# source source_activate_venv.sh + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + echo "This script must be sourced, not run directly." + echo "source source_activate_venv.sh" + exit 1 +fi + +# rest of your script here +echo "Script is being sourced. Continuing..." +source ./.venv/bin/activate