I am trying to use MediaPipe for real-time gesture recognition from a webcam, with the gesture_recognizer.task model for inference. Here's my code:
import cv2
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

model_path = "gesture_recognizer.task"
base_options = python.BaseOptions(model_asset_path=model_path)

GestureRecognizer = mp.tasks.vision.GestureRecognizer
GestureRecognizerOptions = mp.tasks.vision.GestureRecognizerOptions
GestureRecognizerResult = mp.tasks.vision.GestureRecognizerResult
VisionRunningMode = mp.tasks.vision.RunningMode

def print_result(result: GestureRecognizerResult, output_image: mp.Image, timestamp_ms: int):
    print('gesture recognition result: {}'.format(result))

options = GestureRecognizerOptions(
    base_options=python.BaseOptions(model_asset_path=model_path),
    running_mode=VisionRunningMode.LIVE_STREAM,
    result_callback=print_result)
recognizer = GestureRecognizer.create_from_options(options)

mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=2,
    min_detection_confidence=0.65,
    min_tracking_confidence=0.65)

cap = cv2.VideoCapture(0)
while True:
    ret, frame = cap.read()
    if not ret:
        break
    i = 1  # left or right hand
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(frame)
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
    np_array = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            h, w, c = frame.shape
            mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
            mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=np_array)
            results = recognizer.recognize_async(mp_image)
    # show the prediction on the frame
    cv2.putText(mp_image, results, (10, 50), cv2.FONT_HERSHEY_SIMPLEX,
                1, (0,0,255), 2, cv2.LINE_AA)
    cv2.imshow('MediaPipe Hands', frame)
    if cv2.waitKey(1) & 0xFF == 27:
        break
cap.release()
I am getting a NameError: name 'mp_image' is not defined error on the line cv2.putText(mp_image, results, (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,255), 2, cv2.LINE_AA). At this point I am thoroughly confused about what I am doing, let alone what I am doing wrong. Please help!
In addition to all the changes mentioned in the comments, you should also provide a "monotonically increasing" timestamp to the recognize_async() function; in LIVE_STREAM mode MediaPipe rejects a frame whose timestamp is not greater than that of the previous call.
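The example below simply uses a frame counter for that. If you would rather use wall-clock milliseconds, here is a minimal sketch; the send_frame helper and the duplicate-timestamp guard are my own additions, not part of the MediaPipe API:

import time

start = time.monotonic()
last_ts = -1

def send_frame(recognizer, mp_image):
    global last_ts
    # recognize_async() expects an int timestamp in milliseconds that is
    # strictly larger than the one from the previous call in LIVE_STREAM mode.
    ts = int((time.monotonic() - start) * 1000)
    if ts <= last_ts:  # two frames landed in the same millisecond
        ts = last_ts + 1
    last_ts = ts
    recognizer.recognize_async(mp_image, ts)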
Since the result callback runs on a separate thread and cannot draw on the displayed frame directly, I have used a lock and a shared field to share gesture data between the threads. This is a fully functional example (mediapipe==0.10.0):
import cv2
import mediapipe as mp
from mediapipe.tasks import python
import threading

class GestureRecognizer:
    def main(self):
        num_hands = 2
        model_path = "gesture_recognizer.task"
        GestureRecognizer = mp.tasks.vision.GestureRecognizer
        GestureRecognizerOptions = mp.tasks.vision.GestureRecognizerOptions
        VisionRunningMode = mp.tasks.vision.RunningMode

        self.lock = threading.Lock()
        self.current_gestures = []
        options = GestureRecognizerOptions(
            base_options=python.BaseOptions(model_asset_path=model_path),
            running_mode=VisionRunningMode.LIVE_STREAM,
            num_hands=num_hands,
            result_callback=self.__result_callback)
        recognizer = GestureRecognizer.create_from_options(options)

        timestamp = 0
        mp_drawing = mp.solutions.drawing_utils
        mp_hands = mp.solutions.hands
        hands = mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=num_hands,
            min_detection_confidence=0.65,
            min_tracking_confidence=0.65)

        cap = cv2.VideoCapture(0)
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = hands.process(frame)
            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            np_array = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                    mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=np_array)
                    recognizer.recognize_async(mp_image, timestamp)
                    timestamp += 1  # must be monotonically increasing in LIVE_STREAM mode
            self.put_gestures(frame)
            cv2.imshow('MediaPipe Hands', frame)
            if cv2.waitKey(1) & 0xFF == 27:
                break
        cap.release()

    def put_gestures(self, frame):
        self.lock.acquire()
        gestures = self.current_gestures
        self.lock.release()
        y_pos = 50
        for hand_gesture_name in gestures:
            # show the prediction on the frame
            cv2.putText(frame, hand_gesture_name, (10, y_pos), cv2.FONT_HERSHEY_SIMPLEX,
                        1, (0, 0, 255), 2, cv2.LINE_AA)
            y_pos += 50

    def __result_callback(self, result, output_image, timestamp_ms):
        # print(f'gesture recognition result: {result}')
        self.lock.acquire()  # solves potential concurrency issues
        self.current_gestures = []
        if result is not None and any(result.gestures):
            print("Recognized gestures:")
            for single_hand_gesture_data in result.gestures:
                gesture_name = single_hand_gesture_data[0].category_name
                print(gesture_name)
                self.current_gestures.append(gesture_name)
        self.lock.release()

if __name__ == "__main__":
    rec = GestureRecognizer()
    rec.main()
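Side note: the result object passed to the callback also carries the hand landmarks the recognizer detected (result.hand_landmarks in mediapipe 0.10.x), so in principle you could drop the separate mp.solutions.hands pipeline and draw landmarks from the recognizer's own output. A rough sketch of such a callback; storing the landmarks in a second shared field (current_landmarks) is my own variation on the code above:

    def __result_callback(self, result, output_image, timestamp_ms):
        # "with self.lock:" is equivalent to the acquire()/release() pairs above.
        with self.lock:
            # One entry per detected hand; each per-hand gesture list is
            # sorted by score, so element [0] is the top prediction.
            self.current_gestures = [g[0].category_name for g in result.gestures]
            # The landmarks the recognizer itself used, one list per hand,
            # for the main loop to draw instead of running Hands separately.
            self.current_landmarks = list(result.hand_landmarks)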