Tags: python, mediapipe

MediaPipe Gesture Recognition detects both hands, but the result object contains only one


I'm trying to detect multi-hand gestures using MediaPipe. I want to detect the gestures of both hands independently; both hands can show the same gesture or different gestures. In the code below, the function print_result prints the contents of the result object after inference has been run on the frame. The max_num_hands parameter has been set to 2 via with mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.5) as hands:

import cv2
import mediapipe as mp
import time

cap = cv2.VideoCapture(1)

BaseOptions = mp.tasks.BaseOptions
GestureRecognizer = mp.tasks.vision.GestureRecognizer
GestureRecognizerOptions = mp.tasks.vision.GestureRecognizerOptions
GestureRecognizerResult = mp.tasks.vision.GestureRecognizerResult
VisionRunningMode = mp.tasks.vision.RunningMode

# Callback function to print gesture recognition results
def print_result(result: GestureRecognizerResult, output_image: mp.Image, timestamp_ms: int):
    if result.gestures:
        # Get the category name of the recognized gesture
        category_name = result.gestures[0][0].category_name
        # print(category_name)
        print(result)
    else:
        print("No gestures recognized")

# Initialize MediaPipe drawing utils and hands module
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

# Configure options for the gesture recognizer
options = GestureRecognizerOptions(
    base_options=BaseOptions(model_asset_path='C:\\Users\\golut\\OneDrive\\Documents\\Projects\\Virtual Mouse\\models\\gesture_recognizer.task'),
    running_mode=VisionRunningMode.LIVE_STREAM,
    result_callback=print_result
)

# Create a gesture recognizer instance
with GestureRecognizer.create_from_options(options) as recognizer:
    print('Gesture recognizer created')

    while True:
        success, img = cap.read()
        if not success:
            print("Ignoring empty camera frame.")
            continue

        # Convert BGR image to RGB for MediaPipe processing
        rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Detect hand landmarks using MediaPipe Hands
        with mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.5) as hands:
            results = hands.process(rgb_img)

            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    # Draw hand landmarks on the image with specified color and thickness
                    mp_drawing.draw_landmarks(
                        img, hand_landmarks, mp_hands.HAND_CONNECTIONS,
                        mp_drawing.DrawingSpec(color=(0, 255, 0), thickness=2, circle_radius=2),
                        mp_drawing.DrawingSpec(color=(0, 0, 255), thickness=2, circle_radius=2)
                    )
    

        # Prepare image for gesture recognition
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_img)
        current_time_ms = int(time.time() * 1000)

        # Perform gesture recognition; in LIVE_STREAM mode recognize_async returns
        # nothing and results are delivered asynchronously to print_result
        recognizer.recognize_async(mp_image, current_time_ms)

        img = cv2.flip(img, 1)  # Flips the image horizontally
        cv2.imshow("Imshow", img)

        if cv2.waitKey(10) == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()

In the GestureRecognizerResult object we see a list handedness whose entries contain a category_name of either 'Left' or 'Right'.
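
For reference, here is a minimal sketch of the kind of callback output I'm after, assuming (per the MediaPipe Tasks API) that result.gestures and result.handedness are parallel lists with one entry per detected hand:

def print_result(result: GestureRecognizerResult, output_image: mp.Image, timestamp_ms: int):
    # Each detected hand contributes one entry to both lists
    for gesture_list, handedness_list in zip(result.gestures, result.handedness):
        hand = handedness_list[0].category_name  # 'Left' or 'Right'
        gesture = gesture_list[0].category_name  # e.g. 'Open_Palm'
        print(f"{hand}: {gesture}")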

The problem is that the gesture recognizer reports only one hand, either the left or the right, depending on which hand was detected first; the other hand is ignored. In MediaPipe's official try-it-out example, both hands shown to the camera with different gestures are recognized independently. Link to mediapipe demo

GestureRecognizerResult(gestures=[[Category(index=-1, score=0.7995390892028809,
 display_name='', category_name='Open_Palm')]], handedness=[[Category(index=0, score=0.9178019165992737, display_name='Right', category_name='Right')]], 
hand_landmarks=[[NormalizedLandmark(x=0.23192565143108368, y=0.8508237600326538, z=3.7175095712882467e-07, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.2964465022087097, y=0.807819128036499, z=-0.02174699306488037, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.3386477530002594, y=0.7381684184074402, z=-0.026875635609030724, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.3652242422103882, y=0.6717657446861267, z=-0.03148443624377251, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.39171433448791504, y=0.627888560295105, z=-0.03597773239016533, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.30005523562431335, y=0.6441321969032288, z=-0.002747688442468643, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.3194928765296936, y=0.5634738802909851, z=-0.015889683738350868, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.3276906907558441, y=0.5102080702781677, z=-0.0299211535602808, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.33434727787971497, y=0.46343517303466797, z=-0.04088740795850754, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.2615800201892853, y=0.6335919499397278, z=-0.002842121757566929, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.26276978850364685, y=0.5426733493804932, z=-0.014345655217766762, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.2621628940105438, y=0.48378312587738037, z=-0.028536789119243622, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.26235222816467285, y=0.43310630321502686, z=-0.03940063342452049, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.22592493891716003, y=0.6417601108551025, z=-0.006861940026283264, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.2230750024318695, y=0.5614591240882874, z=-0.01952073909342289, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.22449643909931183, y=0.5094373822212219, z=-0.029860520735383034, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.229284405708313, y=0.46403464674949646, z=-0.03746004030108452, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.19173786044120789, y=0.663299024105072, z=-0.0136506836861372, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.18222525715827942, y=0.604834794998169, z=-0.025881653651595116, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.18415895104408264, y=0.5673394799232483, z=-0.03144041821360588, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.19118154048919678, y=0.5324922800064087, z=-0.034897807985544205, visibility=0.0, presence=0.0)]], 
hand_world_landmarks=[[Landmark(x=-0.012245522812008858, y=0.09203963726758957, z=-0.0038926522247493267, visibility=0.0, presence=0.0), 
Landmark(x=0.021369636058807373, y=0.06962162256240845, z=-0.009559692814946175, visibility=0.0, presence=0.0), 
Landmark(x=0.042654991149902344, y=0.04227661341428757, z=-0.012077674269676208, visibility=0.0, presence=0.0), 
Landmark(x=0.0617685541510582, y=0.014768477529287338, z=-0.011491118930280209, visibility=0.0, presence=0.0), 
Landmark(x=0.07398916780948639, y=-0.012367911636829376, z=-0.0075836945325136185, visibility=0.0, presence=0.0), 
Landmark(x=0.025482138618826866, y=-0.0010876771993935108, z=0.006445789244025946, visibility=0.0, presence=0.0), 
Landmark(x=0.03543740138411522, y=-0.02912675403058529, z=-0.00173004565294832, visibility=0.0, presence=0.0), 
Landmark(x=0.040552493184804916, y=-0.0489623099565506, z=-0.007902431301772594, visibility=0.0, presence=0.0), 
Landmark(x=0.04358145594596863, y=-0.06487865746021271, z=-0.0319957509636879, visibility=0.0, presence=0.0), 
Landmark(x=0.0016808465588837862, y=-0.004498452879488468, z=0.006683729123324156, visibility=0.0, presence=0.0), 
Landmark(x=0.004972374066710472, y=-0.04138147830963135, z=-0.003927251789718866, visibility=0.0, presence=0.0), 
Landmark(x=0.00558849610388279, y=-0.06327502429485321, z=-0.020593348890542984, visibility=0.0, presence=0.0), 
Landmark(x=0.0066368915140628815, y=-0.08291880786418915, z=-0.039193443953990936, visibility=0.0, presence=0.0), 
Landmark(x=-0.018360454589128494, y=-0.0009643810335546732, z=-0.0038148483727127314, visibility=0.0, presence=0.0), 
Landmark(x=-0.015782665461301804, y=-0.03162727132439613, z=-0.013909644447267056, visibility=0.0, presence=0.0), 
Landmark(x=-0.013191262260079384, y=-0.05145301669836044, z=-0.028273196890950203, visibility=0.0, presence=0.0), 
Landmark(x=-0.009723789989948273, y=-0.0685187503695488, z=-0.04024944826960564, visibility=0.0, presence=0.0), 
Landmark(x=-0.035820234566926956, y=0.011946788057684898, z=-0.0120608601719141, visibility=0.0, presence=0.0), 
Landmark(x=-0.03725161403417587, y=-0.009996423497796059, z=-0.017715157940983772, visibility=0.0, presence=0.0), 
Landmark(x=-0.036166295409202576, y=-0.028470497578382492, z=-0.026987750083208084, visibility=0.0, presence=0.0), 
Landmark(x=-0.030654065310955048, y=-0.03972318768501282, z=-0.03699912130832672, visibility=0.0, presence=0.0)]])
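
(Each list in the result above, gestures, handedness, hand_landmarks and hand_world_landmarks, contains exactly one entry: only one hand made it into the result.)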

I want to achieve the same result as the demo: both hands recognized individually at once, each with its own gesture.


Solution

  • You can use code from my other answer that recognizes gestures of both hands (mediapipe==0.10.0):

    import cv2
    import mediapipe as mp
    from mediapipe.tasks import python
    import threading 
    
    class GestureRecognizer:
        def main(self):
            num_hands = 2
            model_path = "gesture_recognizer.task"
            GestureRecognizer = mp.tasks.vision.GestureRecognizer
            GestureRecognizerOptions = mp.tasks.vision.GestureRecognizerOptions
            VisionRunningMode = mp.tasks.vision.RunningMode
    
            self.lock = threading.Lock()
            self.current_gestures = []
            options = GestureRecognizerOptions(
                base_options=python.BaseOptions(model_asset_path=model_path),
                running_mode=VisionRunningMode.LIVE_STREAM,
                num_hands = num_hands,
                result_callback=self.__result_callback)
            recognizer = GestureRecognizer.create_from_options(options)
    
            timestamp = 0 
            mp_drawing = mp.solutions.drawing_utils
            mp_hands = mp.solutions.hands
            hands = mp_hands.Hands(
                    static_image_mode=False,
                    max_num_hands=num_hands,
                    min_detection_confidence=0.65,
                    min_tracking_confidence=0.65)
    
            cap = cv2.VideoCapture(0)
    
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                
                # MediaPipe expects RGB; keep the original BGR frame for OpenCV drawing
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = hands.process(rgb_frame)

                if results.multi_hand_landmarks:
                    for hand_landmarks in results.multi_hand_landmarks:
                        mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                    # Run gesture recognition once per frame on the RGB image
                    mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)
                    recognizer.recognize_async(mp_image, timestamp)
                    timestamp += 1 # must be monotonically increasing in LIVE_STREAM mode

                    self.put_gestures(frame)
    
                cv2.imshow('MediaPipe Hands', frame)
                if cv2.waitKey(1) & 0xFF == 27:
                    break
    
            cap.release()
    
        def put_gestures(self, frame):
            with self.lock:
                gestures = self.current_gestures
            y_pos = 50
            for hand_gesture_name in gestures:
                # Overlay each recognized gesture name on the frame
                cv2.putText(frame, hand_gesture_name, (10, y_pos), cv2.FONT_HERSHEY_SIMPLEX,
                            1, (0, 0, 255), 2, cv2.LINE_AA)
                y_pos += 50
    
        def __result_callback(self, result, output_image, timestamp_ms):
            with self.lock:  # guards against concurrent access from the main loop
                self.current_gestures = []
                if result is not None and any(result.gestures):
                    print("Recognized gestures:")
                    for single_hand_gesture_data in result.gestures:
                        gesture_name = single_hand_gesture_data[0].category_name
                        print(gesture_name)
                        self.current_gestures.append(gesture_name)
    
    if __name__ == "__main__":
        rec = GestureRecognizer()
        rec.main()
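
    The crucial difference from the question's code is the num_hands option (set to 2 here) passed to GestureRecognizerOptions: the gesture recognizer defaults to a single hand, so without it only the first detected hand appears in the result, regardless of what max_num_hands is set to on the separate mp_hands.Hands solution.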
    

    Demo:

    (demo screenshot omitted)