Tags: python, mediapipe

MediaPipe Gesture Recognition detects both hands, but the result object contains only one


I'm trying to detect multi-hand gestures using MediaPipe. I want to detect the gestures of both hands independently; both hands can show the same gesture or different gestures. In the code below, the function print_result prints the contents of the result object after inference has been run on the frame. The max_num_hands parameter has been set to 2 via with mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.5) as hands:

import cv2
import mediapipe as mp
import time

cap = cv2.VideoCapture(1)

BaseOptions = mp.tasks.BaseOptions
GestureRecognizer = mp.tasks.vision.GestureRecognizer
GestureRecognizerOptions = mp.tasks.vision.GestureRecognizerOptions
GestureRecognizerResult = mp.tasks.vision.GestureRecognizerResult
VisionRunningMode = mp.tasks.vision.RunningMode

# Callback function to print gesture recognition results
def print_result(result: GestureRecognizerResult, output_image: mp.Image, timestamp_ms: int):
    if result.gestures:
        # Get the category name of the recognized gesture
        category_name = result.gestures[0][0].category_name
        # print(category_name)
        print(result)
    else:
        print("No gestures recognized")

# Initialize MediaPipe drawing utils and hands module
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

# Configure options for the gesture recognizer
options = GestureRecognizerOptions(
    base_options=BaseOptions(model_asset_path='C:\\Users\\golut\\OneDrive\\Documents\\Projects\\Virtual Mouse\\models\\gesture_recognizer.task'),
    running_mode=VisionRunningMode.LIVE_STREAM,
    result_callback=print_result
)

# Create a gesture recognizer instance
with GestureRecognizer.create_from_options(options) as recognizer:
    print('Gesture recognizer created')

    while True:
        success, img = cap.read()
        if not success:
            print("Ignoring empty camera frame.")
            continue

        # Convert BGR image to RGB for MediaPipe processing
        rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Detect hand landmarks using MediaPipe Hands
        with mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.5) as hands:
            results = hands.process(rgb_img)

            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    # Draw hand landmarks on the image with specified color and thickness
                    mp_drawing.draw_landmarks(
                        img, hand_landmarks, mp_hands.HAND_CONNECTIONS,
                        mp_drawing.DrawingSpec(color=(0, 255, 0), thickness=2, circle_radius=2),
                        mp_drawing.DrawingSpec(color=(0, 0, 255), thickness=2, circle_radius=2)
                    )
    

        # Prepare image for gesture recognition
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_img)
        current_time_ms = int(time.time() * 1000)

        # Perform gesture recognition; in LIVE_STREAM mode recognize_async returns
        # nothing and results are delivered asynchronously to print_result
        recognizer.recognize_async(mp_image, current_time_ms)

        img = cv2.flip(img, 1)  # Flips the image horizontally
        cv2.imshow("Imshow", img)

        if cv2.waitKey(10) == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()

In the GestureRecognizerResult object we see a list handedness whose entries contain a category_name of either 'Left' or 'Right'.
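
For reference, here is a minimal sketch of the kind of callback output I'm after, assuming (per the MediaPipe Tasks API) that result.gestures and result.handedness are parallel lists with one entry per detected hand:

def print_result(result: GestureRecognizerResult, output_image: mp.Image, timestamp_ms: int):
    # Each detected hand contributes one entry to both lists
    for gesture_list, handedness_list in zip(result.gestures, result.handedness):
        hand = handedness_list[0].category_name  # 'Left' or 'Right'
        gesture = gesture_list[0].category_name  # e.g. 'Open_Palm'
        print(f"{hand}: {gesture}")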

The problem is that the gesture recognizer reports only one hand, either the left or the right, depending on which hand was detected first; the other hand is ignored. In MediaPipe's official try-it-out example, both hands shown to the camera with different gestures are recognized independently. Link to mediapipe demo

GestureRecognizerResult(gestures=[[Category(index=-1, score=0.7995390892028809,
 display_name='', category_name='Open_Palm')]], handedness=[[Category(index=0, score=0.9178019165992737, display_name='Right', category_name='Right')]], 
hand_landmarks=[[NormalizedLandmark(x=0.23192565143108368, y=0.8508237600326538, z=3.7175095712882467e-07, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.2964465022087097, y=0.807819128036499, z=-0.02174699306488037, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.3386477530002594, y=0.7381684184074402, z=-0.026875635609030724, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.3652242422103882, y=0.6717657446861267, z=-0.03148443624377251, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.39171433448791504, y=0.627888560295105, z=-0.03597773239016533, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.30005523562431335, y=0.6441321969032288, z=-0.002747688442468643, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.3194928765296936, y=0.5634738802909851, z=-0.015889683738350868, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.3276906907558441, y=0.5102080702781677, z=-0.0299211535602808, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.33434727787971497, y=0.46343517303466797, z=-0.04088740795850754, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.2615800201892853, y=0.6335919499397278, z=-0.002842121757566929, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.26276978850364685, y=0.5426733493804932, z=-0.014345655217766762, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.2621628940105438, y=0.48378312587738037, z=-0.028536789119243622, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.26235222816467285, y=0.43310630321502686, z=-0.03940063342452049, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.22592493891716003, y=0.6417601108551025, z=-0.006861940026283264, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.2230750024318695, y=0.5614591240882874, z=-0.01952073909342289, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.22449643909931183, y=0.5094373822212219, z=-0.029860520735383034, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.229284405708313, y=0.46403464674949646, z=-0.03746004030108452, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.19173786044120789, y=0.663299024105072, z=-0.0136506836861372, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.18222525715827942, y=0.604834794998169, z=-0.025881653651595116, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.18415895104408264, y=0.5673394799232483, z=-0.03144041821360588, visibility=0.0, presence=0.0), 
NormalizedLandmark(x=0.19118154048919678, y=0.5324922800064087, z=-0.034897807985544205, visibility=0.0, presence=0.0)]], 
hand_world_landmarks=[[Landmark(x=-0.012245522812008858, y=0.09203963726758957, z=-0.0038926522247493267, visibility=0.0, presence=0.0), 
Landmark(x=0.021369636058807373, y=0.06962162256240845, z=-0.009559692814946175, visibility=0.0, presence=0.0), 
Landmark(x=0.042654991149902344, y=0.04227661341428757, z=-0.012077674269676208, visibility=0.0, presence=0.0), 
Landmark(x=0.0617685541510582, y=0.014768477529287338, z=-0.011491118930280209, visibility=0.0, presence=0.0), 
Landmark(x=0.07398916780948639, y=-0.012367911636829376, z=-0.0075836945325136185, visibility=0.0, presence=0.0), 
Landmark(x=0.025482138618826866, y=-0.0010876771993935108, z=0.006445789244025946, visibility=0.0, presence=0.0), 
Landmark(x=0.03543740138411522, y=-0.02912675403058529, z=-0.00173004565294832, visibility=0.0, presence=0.0), 
Landmark(x=0.040552493184804916, y=-0.0489623099565506, z=-0.007902431301772594, visibility=0.0, presence=0.0), 
Landmark(x=0.04358145594596863, y=-0.06487865746021271, z=-0.0319957509636879, visibility=0.0, presence=0.0), 
Landmark(x=0.0016808465588837862, y=-0.004498452879488468, z=0.006683729123324156, visibility=0.0, presence=0.0), 
Landmark(x=0.004972374066710472, y=-0.04138147830963135, z=-0.003927251789718866, visibility=0.0, presence=0.0), 
Landmark(x=0.00558849610388279, y=-0.06327502429485321, z=-0.020593348890542984, visibility=0.0, presence=0.0), 
Landmark(x=0.0066368915140628815, y=-0.08291880786418915, z=-0.039193443953990936, visibility=0.0, presence=0.0), 
Landmark(x=-0.018360454589128494, y=-0.0009643810335546732, z=-0.0038148483727127314, visibility=0.0, presence=0.0), 
Landmark(x=-0.015782665461301804, y=-0.03162727132439613, z=-0.013909644447267056, visibility=0.0, presence=0.0), 
Landmark(x=-0.013191262260079384, y=-0.05145301669836044, z=-0.028273196890950203, visibility=0.0, presence=0.0), 
Landmark(x=-0.009723789989948273, y=-0.0685187503695488, z=-0.04024944826960564, visibility=0.0, presence=0.0), 
Landmark(x=-0.035820234566926956, y=0.011946788057684898, z=-0.0120608601719141, visibility=0.0, presence=0.0), 
Landmark(x=-0.03725161403417587, y=-0.009996423497796059, z=-0.017715157940983772, visibility=0.0, presence=0.0), 
Landmark(x=-0.036166295409202576, y=-0.028470497578382492, z=-0.026987750083208084, visibility=0.0, presence=0.0), 
Landmark(x=-0.030654065310955048, y=-0.03972318768501282, z=-0.03699912130832672, visibility=0.0, presence=0.0)]])
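
(Each list in the result above, gestures, handedness, hand_landmarks and hand_world_landmarks, contains exactly one entry: only one hand made it into the result.)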

I want to achieve the same result as the demo: both hands recognized individually at once, each with its own gesture.


Solution

  • You can use code from my other answer that recognizes gestures of both hands (mediapipe==0.10.0):

    import cv2
    import mediapipe as mp
    from mediapipe.tasks import python
    import threading 
    
    class GestureRecognizer:
        def main(self):
            num_hands = 2
            model_path = "gesture_recognizer.task"
            GestureRecognizer = mp.tasks.vision.GestureRecognizer
            GestureRecognizerOptions = mp.tasks.vision.GestureRecognizerOptions
            VisionRunningMode = mp.tasks.vision.RunningMode
    
            self.lock = threading.Lock()
            self.current_gestures = []
            options = GestureRecognizerOptions(
                base_options=python.BaseOptions(model_asset_path=model_path),
                running_mode=VisionRunningMode.LIVE_STREAM,
                num_hands = num_hands,
                result_callback=self.__result_callback)
            recognizer = GestureRecognizer.create_from_options(options)
    
            timestamp = 0 
            mp_drawing = mp.solutions.drawing_utils
            mp_hands = mp.solutions.hands
            hands = mp_hands.Hands(
                    static_image_mode=False,
                    max_num_hands=num_hands,
                    min_detection_confidence=0.65,
                    min_tracking_confidence=0.65)
    
            cap = cv2.VideoCapture(0)
    
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                
                # MediaPipe expects RGB; keep the original BGR frame for OpenCV drawing
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = hands.process(rgb_frame)

                if results.multi_hand_landmarks:
                    for hand_landmarks in results.multi_hand_landmarks:
                        mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                    # Run gesture recognition once per frame on the RGB image
                    mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)
                    recognizer.recognize_async(mp_image, timestamp)
                    timestamp += 1 # must be monotonically increasing in LIVE_STREAM mode

                    self.put_gestures(frame)
    
                cv2.imshow('MediaPipe Hands', frame)
                if cv2.waitKey(1) & 0xFF == 27:
                    break
    
            cap.release()
    
        def put_gestures(self, frame):
            with self.lock:
                gestures = self.current_gestures
            y_pos = 50
            for hand_gesture_name in gestures:
                # Overlay each recognized gesture name on the frame
                cv2.putText(frame, hand_gesture_name, (10, y_pos), cv2.FONT_HERSHEY_SIMPLEX,
                            1, (0, 0, 255), 2, cv2.LINE_AA)
                y_pos += 50
    
        def __result_callback(self, result, output_image, timestamp_ms):
            with self.lock:  # guards against concurrent access from the main loop
                self.current_gestures = []
                if result is not None and any(result.gestures):
                    print("Recognized gestures:")
                    for single_hand_gesture_data in result.gestures:
                        gesture_name = single_hand_gesture_data[0].category_name
                        print(gesture_name)
                        self.current_gestures.append(gesture_name)
    
    if __name__ == "__main__":
        rec = GestureRecognizer()
        rec.main()
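
    The crucial difference from the question's code is the num_hands option (set to 2 here) passed to GestureRecognizerOptions: the gesture recognizer defaults to a single hand, so without it only the first detected hand appears in the result, regardless of what max_num_hands is set to on the separate mp_hands.Hands solution.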
    

    Demo:

    (demo screenshot omitted)