I'm trying to write a simple script that detects faces from my webcam with the MediaPipe library, but I'm running into an issue: when I try to draw the bounding box on the mp_image, nothing appears. Can someone please help me?
#------ Import the libraries -----------#
import cv2
import mediapipe as mp
import numpy as np

#--------- Declare the detector --------#
detector = mp.tasks.vision.FaceDetector
dibujo = mp.solutions.drawing_utils

#---- Start the video capture ----#
cap = cv2.VideoCapture(0)

#------ Initialize configuration parameters -----#
BaseOptions = mp.tasks.BaseOptions
FaceDetectorOptions = mp.tasks.vision.FaceDetectorOptions
FaceDetectorResult = mp.tasks.vision.FaceDetectorResult
VisionRunningMode = mp.tasks.vision.RunningMode

#------- Create an instance for real-time detection ------#
def print_result(result: FaceDetectorResult, output_image, timestamp_ms: int):
    print('face detector result: {}'.format(result))
    # Check whether there are detections to draw
    if result.detections:
        # Draw each detected face
        for detection in result.detections:
            # PROBLEMS WITH THE DATA TYPES
            output_image = np.array(output_image)
            print(f"image = {type(output_image)}")
            det_pb2 = detection.to_pb2()
            dibujo.draw_detection(output_image, det_pb2)
            print('detected faces')

#----------- Start the configuration ----------------#
options = FaceDetectorOptions(
    base_options = BaseOptions(model_asset_path='model.tflite'),
    running_mode = VisionRunningMode.LIVE_STREAM,
    min_detection_confidence = 0.5,
    result_callback = print_result
)

# Get the frames per second (FPS) to compute the timestamp
fps = cap.get(cv2.CAP_PROP_FPS)
frame_number = 0

with detector.create_from_options(options) as rostros:
    while True:
        # Read from the video capture
        ret, frame = cap.read()
        # Eliminate movement error
        frame = cv2.flip(frame, 1)
        # Color correction
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # Compute frame_timestamp_ms
        frame_timestamp_ms = int((frame_number / fps) * 1000)
        frame_number += 1
        # Convert the frame to MediaPipe's mp.Image format
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb)
        # Send the frame for face detection
        rostros.detect_async(mp_image, frame_timestamp_ms)
        # Show the frames
        cv2.imshow("Camara", mp_image)
        # Read the keyboard
        t = cv2.waitKey(1)
        if t == 27:
            break

cap.release()
cv2.destroyAllWindows()
I've tried a few things, such as changing the image type and the detection type, but nothing seems to work.
Sorry for not responding earlier; I had some health issues and needed to rest.
Well, the first thing to highlight is that we changed the face detector's running_mode from LIVE_STREAM to IMAGE. It's not entirely clear to me, but the practical difference seems to be that in LIVE_STREAM the results are delivered asynchronously to a callback function, whereas in IMAGE mode we call the detector synchronously and process each frame's result directly inside the while loop (a sketch of the LIVE_STREAM alternative is shown after the options below, for comparison).
options = FaceDetectorOptions(
    base_options=BaseOptions(model_asset_path='model.tflite'),
    running_mode=VisionRunningMode.IMAGE,  # IMAGE MODE
    min_detection_confidence=0.5
)
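For comparison, this is roughly how the LIVE_STREAM setup from the question could be kept instead, with the drawing done inside the callback. This is only a sketch based on my reading of the Tasks API, not code I have run: the names on_result and latest_annotated are my own, the shared variable is a simplistic way to hand the annotated frame back to the loop, and visualize is the helper shown further down in this answer.

import cv2
import mediapipe as mp
import numpy as np
from drawing import visualize  # the helper defined further down in this answer

BaseOptions = mp.tasks.BaseOptions
FaceDetectorOptions = mp.tasks.vision.FaceDetectorOptions
VisionRunningMode = mp.tasks.vision.RunningMode

latest_annotated = None  # hypothetical shared variable: the callback writes it, the loop reads it

def on_result(result, output_image, timestamp_ms: int):
    """Hypothetical callback: convert the mp.Image to a NumPy array and draw on a copy."""
    global latest_annotated
    latest_annotated = visualize(np.copy(output_image.numpy_view()), result)

options = FaceDetectorOptions(
    base_options=BaseOptions(model_asset_path='model.tflite'),
    running_mode=VisionRunningMode.LIVE_STREAM,
    min_detection_confidence=0.5,
    result_callback=on_result
)

cap = cv2.VideoCapture(0)
fps = cap.get(cv2.CAP_PROP_FPS) or 30  # fall back to 30 if the camera does not report FPS
frame_number = 0

with mp.tasks.vision.FaceDetector.create_from_options(options) as detector:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        rgb = cv2.cvtColor(cv2.flip(frame, 1), cv2.COLOR_BGR2RGB)
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb)
        # Timestamps must increase monotonically in LIVE_STREAM mode
        detector.detect_async(mp_image, int((frame_number / fps) * 1000))
        frame_number += 1
        # Show the most recent annotated frame once the callback has produced one
        shown = latest_annotated if latest_annotated is not None else rgb
        cv2.imshow("Camara", cv2.cvtColor(shown, cv2.COLOR_RGB2BGR))
        if cv2.waitKey(1) == 27:
            break

cap.release()
cv2.destroyAllWindows()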
Another thing we modified was adding a function to visualize the results. I copied this function from the MediaPipe website; it was written by the MediaPipe developers themselves. It draws the bounding box on a copy of the image and then marks the keypoints. Before drawing each keypoint, its coordinates are passed to an internal helper, _normalized_to_pixel_coordinates: it checks that the x and y values really are normalized (between 0 and 1), returns None if they are not, and otherwise converts them to pixel coordinates, which are then used with OpenCV's circle method to draw the keypoints on the image (a small still-image test of this helper is shown right after the code).
from typing import Tuple, Union
import math
import cv2
import numpy as np

MARGIN = 10      # pixels
ROW_SIZE = 10    # pixels
FONT_SIZE = 1
FONT_THICKNESS = 1
TEXT_COLOR = (255, 0, 0)  # red


def _normalized_to_pixel_coordinates(
        normalized_x: float, normalized_y: float, image_width: int,
        image_height: int) -> Union[None, Tuple[int, int]]:
    """Converts normalized value pair to pixel coordinates."""

    # Checks if the float value is between 0 and 1.
    def is_valid_normalized_value(value: float) -> bool:
        return (value > 0 or math.isclose(0, value)) and (
            value < 1 or math.isclose(1, value))

    if not (is_valid_normalized_value(normalized_x) and
            is_valid_normalized_value(normalized_y)):
        # TODO: Draw coordinates even if it's outside of the image bounds.
        return None
    x_px = min(math.floor(normalized_x * image_width), image_width - 1)
    y_px = min(math.floor(normalized_y * image_height), image_height - 1)
    return x_px, y_px


def visualize(
        image,
        detection_result
) -> np.ndarray:
    """Draws bounding boxes and keypoints on the input image and returns it.

    Args:
        image: The input RGB image.
        detection_result: The list of all "Detection" entities to visualize.

    Returns:
        Image with bounding boxes.
    """
    annotated_image = image.copy()
    height, width, _ = image.shape

    for detection in detection_result.detections:
        # Draw bounding box
        bbox = detection.bounding_box
        start_point = bbox.origin_x, bbox.origin_y
        end_point = bbox.origin_x + bbox.width, bbox.origin_y + bbox.height
        cv2.rectangle(annotated_image, start_point, end_point, TEXT_COLOR, 3)

        # Draw keypoints
        for keypoint in detection.keypoints:
            keypoint_px = _normalized_to_pixel_coordinates(
                keypoint.x, keypoint.y, width, height)
            if keypoint_px is None:
                continue  # keypoint falls outside the normalized [0, 1] range
            color, thickness, radius = (0, 255, 0), 2, 2
            cv2.circle(annotated_image, keypoint_px, radius, color, thickness)

        # Draw label and score
        category = detection.categories[0]
        category_name = category.category_name
        category_name = '' if category_name is None else category_name
        probability = round(category.score, 2)
        result_text = category_name + ' (' + str(probability) + ')'
        text_location = (MARGIN + bbox.origin_x,
                         MARGIN + ROW_SIZE + bbox.origin_y)
        cv2.putText(annotated_image, result_text, text_location,
                    cv2.FONT_HERSHEY_PLAIN, FONT_SIZE, TEXT_COLOR, FONT_THICKNESS)

    return annotated_image
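If you want to sanity-check drawing.py on its own before wiring it into the webcam loop, a minimal still-image test could look like this. It is only a sketch: 'face.jpg' is a hypothetical test photo, and it assumes the same 'model.tflite' sits next to the script.

import cv2
import mediapipe as mp
import numpy as np
from drawing import visualize

BaseOptions = mp.tasks.BaseOptions
FaceDetectorOptions = mp.tasks.vision.FaceDetectorOptions
VisionRunningMode = mp.tasks.vision.RunningMode

options = FaceDetectorOptions(
    base_options=BaseOptions(model_asset_path='model.tflite'),
    running_mode=VisionRunningMode.IMAGE,
    min_detection_confidence=0.5
)

with mp.tasks.vision.FaceDetector.create_from_options(options) as detector:
    # Load the test image directly as an mp.Image (RGB)
    mp_image = mp.Image.create_from_file('face.jpg')  # hypothetical file name
    result = detector.detect(mp_image)
    # numpy_view() exposes the underlying RGB array; copy it before drawing on it
    annotated = visualize(np.copy(mp_image.numpy_view()), result)
    cv2.imshow("test", cv2.cvtColor(annotated, cv2.COLOR_RGB2BGR))
    cv2.waitKey(0)
    cv2.destroyAllWindows()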
Here is the full main code so that it can be verified by everyone:
import cv2
import mediapipe as mp
import numpy as np
from drawing import visualize

#--------- Declare the detector --------#
detector = mp.tasks.vision.FaceDetector

#---- Perform video capture ----#
cap = cv2.VideoCapture(0)

#------ Initialize configuration parameters -----#
BaseOptions = mp.tasks.BaseOptions
FaceDetectorOptions = mp.tasks.vision.FaceDetectorOptions
VisionRunningMode = mp.tasks.vision.RunningMode

#----------- Start configuration ----------------#
options = FaceDetectorOptions(
    base_options=BaseOptions(model_asset_path='model.tflite'),
    running_mode=VisionRunningMode.IMAGE,  # IMAGE MODE
    min_detection_confidence=0.5
)

with detector.create_from_options(options) as faces:
    while True:
        # Read from the video capture
        ret, frame = cap.read()
        if not ret:
            print("Error capturing the image.")
            break
        # Eliminate movement error
        frame = cv2.flip(frame, 1)
        # Color correction
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # Convert the frame to MediaPipe's mp.Image format
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb)
        # Perform face detection on the frame
        result = faces.detect(mp_image)  # Use detect instead of detect_async
        # Create a NumPy copy of the mp_image
        image_copy = np.copy(mp_image.numpy_view())
        # Draw the face markings on the image
        annotated_image = visualize(image_copy, result)
        # Convert from RGB to BGR for display
        bgr_annotated_image = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)
        cv2.imshow("image", bgr_annotated_image)
        # Read the keyboard input
        t = cv2.waitKey(1)
        if t == 27:  # If the ESC key is pressed, exit the loop
            break

cap.release()
cv2.destroyAllWindows()
You can read more on the official MediaPipe Face Detector page for Python:
https://ai.google.dev/edge/mediapipe/solutions/vision/face_detector/python?hl=pt-br