Files
slr_google_landmarks_demo/recognize_gesture.py
jared 8bcc62b045 Initial commit: MediaPipe landmarks demo
HTML demos for face, hand, gesture, and posture tracking using MediaPipe.
Includes Python CLI tools for processing video files.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-19 22:38:40 -05:00

99 lines
3.2 KiB
Python
Executable File

#!/usr/bin/env python3
import argparse
import sys
import mediapipe as mp
# Short module-level aliases for the MediaPipe Tasks classes that main() uses.
BaseOptions = mp.tasks.BaseOptions
VisionRunningMode = mp.tasks.vision.RunningMode
GestureRecognizer = mp.tasks.vision.GestureRecognizer
GestureRecognizerOptions = mp.tasks.vision.GestureRecognizerOptions
def _first_category(item):
    """Return the leading Category found in *item*, or None.

    Two input shapes are understood:
      - an object exposing a ``categories`` attribute that is a list
        (e.g. a Classifications object);
      - a plain list of Category objects.

    Anything else (None, an empty list, an unrelated object) yields None.
    """
    # Try the ``.categories`` list first, then fall back to *item* itself.
    # The first candidate that is a list decides the result, even when it
    # is empty.
    for candidate in (getattr(item, "categories", None), item):
        if isinstance(candidate, list):
            return candidate[0] if candidate else None
    return None
def _len_safe(x):
    """Return the length of *x* when it is a list; otherwise return 0."""
    if not isinstance(x, list):
        # None, tuples, strings and any other non-list value count as empty.
        return 0
    return len(x)
def main():
    """Recognize hand gestures in a still image and print a summary per hand.

    Command-line options:
        -i / --image    Path to the input image (default: hand.jpg).
        -m / --model    Path to the gesture_recognizer .task model
                        (default: gesture_recognizer.task).
        --num_hands     Maximum number of hands to detect (default: 2).

    For every detected hand a "Hand #N" header is printed (with the
    handedness label when one is available), followed by the top gesture
    and its score, or "none" when no gesture was recognized.

    Exits with status 1, after writing a message to stderr, when either the
    image or the model cannot be loaded.
    """
    parser = argparse.ArgumentParser(
        description="Recognize hand gestures in a still image with MediaPipe."
    )
    parser.add_argument(
        "-i", "--image", default="hand.jpg",
        help="Path to input image (default: hand.jpg)",
    )
    parser.add_argument(
        "-m", "--model", default="gesture_recognizer.task",
        help="Path to gesture_recognizer .task model (default: gesture_recognizer.task)",
    )
    parser.add_argument("--num_hands", type=int, default=2, help="Max hands to detect")
    args = parser.parse_args()

    options = GestureRecognizerOptions(
        base_options=BaseOptions(model_asset_path=args.model),
        running_mode=VisionRunningMode.IMAGE,
        num_hands=args.num_hands,
    )

    # Load the image
    try:
        mp_image = mp.Image.create_from_file(args.image)
    except Exception as e:
        print(f"Failed to load image '{args.image}': {e}", file=sys.stderr)
        sys.exit(1)

    # Build the recognizer separately from running it, so that a missing or
    # invalid model is reported the same way as a missing image instead of
    # surfacing as a raw traceback.
    try:
        recognizer = GestureRecognizer.create_from_options(options)
    except Exception as e:
        print(f"Failed to load model '{args.model}': {e}", file=sys.stderr)
        sys.exit(1)

    # The recognizer is a context manager; leaving the block closes it.
    with recognizer:
        result = recognizer.recognize(mp_image)

    if result is None:
        print("No result returned.")
        return

    # Read each result field once; the per-hand loop below only indexes them.
    gestures = getattr(result, "gestures", [])
    handedness = getattr(result, "handedness", [])
    hand_landmarks = getattr(result, "hand_landmarks", [])
    gesture_count = _len_safe(gestures)
    handedness_count = _len_safe(handedness)

    # The three lists are not assumed to have equal length, so iterate over
    # the longest one and guard each lookup individually.
    n = max(gesture_count, handedness_count, _len_safe(hand_landmarks))
    if n == 0:
        print("No hands/gestures detected.")
        return

    for i in range(n):
        handed = None
        if i < handedness_count:
            cat = _first_category(handedness[i])
            if cat:
                handed = cat.category_name

        top_gesture = None
        score = None
        if i < gesture_count:
            cat = _first_category(gestures[i])
            if cat:
                top_gesture = cat.category_name
                score = cat.score

        header = f"Hand #{i+1}" + (f" ({handed})" if handed else "")
        print(header + ":")
        if top_gesture:
            # A category may carry no score; formatting None with ".3f"
            # would raise TypeError, so fall back to a placeholder.
            score_text = f"{score:.3f}" if score is not None else "n/a"
            print(f" Gesture: {top_gesture} (score={score_text})")
        else:
            print(" Gesture: none")
        # If you want pixel landmark coordinates later:
        # if _len_safe(hand_landmarks) > i:
        #     for j, lm in enumerate(hand_landmarks[i]):
        #         print(f" lm{j}: x={lm.x:.3f} y={lm.y:.3f} z={lm.z:.3f}")
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()