# slr_google_landmarks_demo/recognize_gesture.py

#!/usr/bin/env python3
import argparse
import sys
import mediapipe as mp

# Short module-level aliases for the MediaPipe Tasks names used by main().
BaseOptions = mp.tasks.BaseOptions
VisionRunningMode = mp.tasks.vision.RunningMode
GestureRecognizer = mp.tasks.vision.GestureRecognizer
GestureRecognizerOptions = mp.tasks.vision.GestureRecognizerOptions

def _first_category(item):
    """
    Accepts either:
      - a Classifications object with .categories
      - a list of Category
      - None / empty
    Returns the first Category or None.
    """
    if item is None:
        return None
    # Shape 1: Classifications with .categories
    cats = getattr(item, "categories", None)
    if isinstance(cats, list):
        return cats[0] if cats else None
    # Shape 2: already a list[Category]
    if isinstance(item, list):
        return item[0] if item else None
    return None

def _len_safe(x):
    return len(x) if isinstance(x, list) else 0

def main():
    """Recognize hand gestures in one still image and print a per-hand summary.

    Command-line options:
      -i/--image    path to the input image (default: hand.jpg)
      -m/--model    path to the gesture recognizer .task model
                    (default: gesture_recognizer.task)
      --num_hands   maximum number of hands to detect (default: 2)

    For every detected hand, prints a header with the handedness label (when
    available) followed by the top gesture name and its score.

    Exits with status 1, with a message on stderr, when the image cannot be
    loaded or the recognizer cannot be created from the model file.
    """
    parser = argparse.ArgumentParser(description="Recognize hand gestures in a still image with MediaPipe.")
    parser.add_argument("-i", "--image", default="hand.jpg", help="Path to input image (default: hand.jpg)")
    parser.add_argument("-m", "--model", default="gesture_recognizer.task",
                        help="Path to gesture_recognizer .task model (default: gesture_recognizer.task)")
    parser.add_argument("--num_hands", type=int, default=2, help="Max hands to detect")
    args = parser.parse_args()

    options = GestureRecognizerOptions(
        base_options=BaseOptions(model_asset_path=args.model),
        running_mode=VisionRunningMode.IMAGE,
        num_hands=args.num_hands,
    )

    # Load the image
    try:
        mp_image = mp.Image.create_from_file(args.image)
    except Exception as e:
        print(f"Failed to load image '{args.image}': {e}", file=sys.stderr)
        sys.exit(1)

    # Create the recognizer separately from the `with` so that a missing or
    # invalid model is reported the same way as a missing image, instead of
    # surfacing as a raw traceback.
    try:
        recognizer = GestureRecognizer.create_from_options(options)
    except Exception as e:
        print(f"Failed to create gesture recognizer from model '{args.model}': {e}", file=sys.stderr)
        sys.exit(1)

    with recognizer:
        result = recognizer.recognize(mp_image)

    if result is None:
        print("No result returned.")
        return

    # Fetch the per-hand lists once; _len_safe treats a missing or non-list
    # attribute as empty, so the lists may legitimately differ in length.
    gestures = getattr(result, "gestures", None)
    handedness = getattr(result, "handedness", None)
    hand_landmarks = getattr(result, "hand_landmarks", None)
    num_gestures = _len_safe(gestures)
    num_handedness = _len_safe(handedness)

    n = max(num_gestures, num_handedness, _len_safe(hand_landmarks))
    if n == 0:
        print("No hands/gestures detected.")
        return

    for i in range(n):
        handed = None
        if i < num_handedness:
            cat = _first_category(handedness[i])
            if cat is not None:
                handed = cat.category_name

        top_gesture = None
        score = None
        if i < num_gestures:
            cat = _first_category(gestures[i])
            if cat is not None:
                top_gesture = cat.category_name
                score = cat.score

        header = f"Hand #{i+1}" + (f" ({handed})" if handed else "")
        print(header + ":")
        if top_gesture:
            # A category may carry no score; formatting None with ".3f"
            # would raise TypeError, so fall back to a placeholder.
            score_text = f"{score:.3f}" if score is not None else "n/a"
            print(f"  Gesture: {top_gesture} (score={score_text})")
        else:
            print("  Gesture: none")

        # To also print per-landmark coordinates later (MediaPipe documents
        # these as normalized to the image size, not pixels):
        # if _len_safe(hand_landmarks) > i:
        #     for j, lm in enumerate(hand_landmarks[i]):
        #         print(f"    lm{j}: x={lm.x:.3f} y={lm.y:.3f} z={lm.z:.3f}")

# Run the command-line entry point only when executed as a script, so that
# importing this module does not trigger argument parsing or recognition.
if __name__ == "__main__":
    main()