Tags: python, computer-vision, face-detection, mediapipe

I am encountering an issue with drawing boxes in Face Detection using MediaPipe


I'm trying to write a simple script that detects faces from my webcam with the MediaPipe library, but I'm running into an issue: when I try to draw the detection box on the mp_image, it never appears. Can someone please help me?

#------ Import the libraries -----------#
import cv2
import mediapipe as mp
import numpy as np


#--------- Declare the detector --------#
detector = mp.tasks.vision.FaceDetector
dibujo = mp.solutions.drawing_utils

#---- Perform the video capture ----#
cap = cv2.VideoCapture(0)

#------ Initialize configuration parameters -----#
BaseOptions = mp.tasks.BaseOptions
FaceDetectorOptions = mp.tasks.vision.FaceDetectorOptions
FaceDetectorResult = mp.tasks.vision.FaceDetectorResult
VisionRunningMode = mp.tasks.vision.RunningMode



#------- Create an instance for real-time detection ------#
def print_result(result: FaceDetectorResult, output_image, timestamp_ms: int):
    print('face detector result: {}'.format(result))

    # Check whether there are detections to draw
    if result.detections:
        # Draw each detected face
        for detection in result.detections:

            # PROBLEM WITH THE DATA TYPES
            output_image = np.array(output_image)
            print(f"image = {type(output_image)}")
            det_pb2 = detection.to_pb2()
            dibujo.draw_detection(output_image, det_pb2)
            print('detected faces')

#----------- Start the configuration ----------------#
options = FaceDetectorOptions(
    base_options = BaseOptions(model_asset_path='model.tflite'),
    running_mode = VisionRunningMode.LIVE_STREAM,
    min_detection_confidence = 0.5,
    result_callback = print_result
)


# Get the frame rate (FPS) to compute the timestamp
fps = cap.get(cv2.CAP_PROP_FPS)
frame_number = 0

with detector.create_from_options(options) as rostros:


    while True:

        # Read from the video capture
        ret, frame = cap.read()

        # Mirror the frame horizontally
        frame = cv2.flip(frame,1)

        # Color correction
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Compute frame_timestamp_ms
        frame_timestamp_ms = int((frame_number / fps) * 1000)
        frame_number += 1

        # Convert the frame to MediaPipe's mp.Image format
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb)
        # Send the frame for asynchronous face detection
        rostros.detect_async(mp_image,frame_timestamp_ms)


        # Show the frames
        cv2.imshow("Camara", mp_image)

        # Read the keyboard
        t = cv2.waitKey(1)
        if t == 27:
            break

cap.release()
cv2.destroyAllWindows()

I've tried a few things, such as changing the image type and the detection type, but nothing seems to work.


Solution

  • Sorry for not responding earlier; I had some health issues and needed to rest.

    Well, the first thing to highlight is that we changed the face detector's running_mode from LIVE_STREAM to IMAGE. The practical difference is that in LIVE_STREAM mode detect_async() returns immediately and the results arrive later in the result_callback, whereas in IMAGE mode detect() runs synchronously and returns the result for each frame right inside the while loop. This also explains why nothing appeared in the original code: the box was drawn on a local NumPy copy created inside the callback, which was never displayed, while the main loop kept showing the un-annotated mp_image.

    options = FaceDetectorOptions(
        base_options=BaseOptions(model_asset_path='model.tflite'),
        running_mode=VisionRunningMode.IMAGE,  # IMAGE MODE
        min_detection_confidence=0.5
    )
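
    For reference, if you did want to keep LIVE_STREAM mode, the drawing would have to happen on a frame that is actually displayed afterwards, since detect_async() only hands the result to the callback. Below is a minimal sketch of that idea; the on_result name and the shared latest_frame variable are my own, and the visualize helper defined further down is assumed to be saved as drawing.py:

    import cv2
    import mediapipe as mp
    import numpy as np
    from drawing import visualize  # the helper defined further down

    latest_frame = None  # last annotated frame produced by the callback

    def on_result(result, output_image: mp.Image, timestamp_ms: int):
        global latest_frame
        # Draw on a writable copy of the frame MediaPipe passes back to the callback
        latest_frame = visualize(np.copy(output_image.numpy_view()), result)

    options = mp.tasks.vision.FaceDetectorOptions(
        base_options=mp.tasks.BaseOptions(model_asset_path='model.tflite'),
        running_mode=mp.tasks.vision.RunningMode.LIVE_STREAM,
        min_detection_confidence=0.5,
        result_callback=on_result,
    )
    # In the capture loop you would call detector.detect_async(mp_image, frame_timestamp_ms)
    # and show latest_frame (converted back to BGR) with cv2.imshow once it is not None.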
    

    Another thing we modified was adding a function to visualize the results. I copied this function from the MediaPipe website; it was written by the MediaPipe developers themselves. It draws the bounding box on a copy of the image and then marks the keypoints. The keypoints come in normalized coordinates (values between 0 and 1), so they are first passed to an internal helper, _normalized_to_pixel_coordinates, which checks that the values really are valid normalized coordinates: if they are not, it returns None; otherwise it converts them to pixel coordinates, which are then used with OpenCV's cv2.circle to draw the keypoints on the image. For example, a keypoint at (x=0.5, y=0.25) in a 640x480 frame maps to pixel (320, 120).

    from typing import Tuple, Union
    import math
    import cv2
    import numpy as np
    
    MARGIN = 10  # pixels
    ROW_SIZE = 10  # pixels
    FONT_SIZE = 1
    FONT_THICKNESS = 1
    TEXT_COLOR = (255, 0, 0)  # red
    
    
    def _normalized_to_pixel_coordinates(
        normalized_x: float, normalized_y: float, image_width: int,
        image_height: int) -> Union[None, Tuple[int, int]]:
        """Converts normalized value pair to pixel coordinates."""
    
        # Checks if the float value is between 0 and 1.
        def is_valid_normalized_value(value: float) -> bool:
            return (value > 0 or math.isclose(0, value)) and (value < 1 or
                                                              math.isclose(1, value))
    
        if not (is_valid_normalized_value(normalized_x) and
                is_valid_normalized_value(normalized_y)):
            # TODO: Draw coordinates even if it's outside of the image bounds.
            return None
        x_px = min(math.floor(normalized_x * image_width), image_width - 1)
        y_px = min(math.floor(normalized_y * image_height), image_height - 1)
        return x_px, y_px
    
    
    def visualize(
        image,
        detection_result
    ) -> np.ndarray:
        """Draws bounding boxes and keypoints on the input image and returns it.
        
        Args:
            image: The input RGB image.
            detection_result: The list of all "Detection" entities to visualize.
            
        Returns:
            Image with bounding boxes.
        """
        annotated_image = image.copy()
        height, width, _ = image.shape
    
        for detection in detection_result.detections:
            # Draw bounding box
            bbox = detection.bounding_box
            start_point = bbox.origin_x, bbox.origin_y
            end_point = bbox.origin_x + bbox.width, bbox.origin_y + bbox.height
            cv2.rectangle(annotated_image, start_point, end_point, TEXT_COLOR, 3)
    
            # Draw keypoints
            for keypoint in detection.keypoints:
                keypoint_px = _normalized_to_pixel_coordinates(
                    keypoint.x, keypoint.y, width, height)
    
                color, thickness, radius = (0, 255, 0), 2, 2
                if keypoint_px is not None:  # skip keypoints that fall outside the image
                    cv2.circle(annotated_image, keypoint_px, radius, color, thickness)
    
            # Draw label and score
            category = detection.categories[0]
            category_name = category.category_name
            category_name = '' if category_name is None else category_name
            probability = round(category.score, 2)
            result_text = category_name + ' (' + str(probability) + ')'
            text_location = (MARGIN + bbox.origin_x,
                             MARGIN + ROW_SIZE + bbox.origin_y)
            cv2.putText(annotated_image, result_text, text_location,
                        cv2.FONT_HERSHEY_PLAIN, FONT_SIZE, TEXT_COLOR, FONT_THICKNESS)
    
        return annotated_image
    

    Here is the full main code so that everyone can verify it. The visualize helper above is saved in a module called drawing.py, which is why the main script imports it with from drawing import visualize:

    import cv2
    import mediapipe as mp
    import numpy as np
    from drawing import visualize
    
    #--------- Declare the detector --------#
    detector = mp.tasks.vision.FaceDetector
    
    #---- Perform video capture ----#
    cap = cv2.VideoCapture(0)
    
    #------ Initialize configuration parameters -----#
    BaseOptions = mp.tasks.BaseOptions
    FaceDetectorOptions = mp.tasks.vision.FaceDetectorOptions
    VisionRunningMode = mp.tasks.vision.RunningMode
    
    #----------- Start configuration ----------------#
    options = FaceDetectorOptions(
        base_options=BaseOptions(model_asset_path='model.tflite'),
        running_mode=VisionRunningMode.IMAGE,  # IMAGE MODE
        min_detection_confidence=0.5
    )
    
    with detector.create_from_options(options) as faces:
    
        while True:
    
            # Read from the video capture
            ret, frame = cap.read()
    
            if not ret:
                print("Error capturing the image.")
                break
    
            # Mirror the frame horizontally
            frame = cv2.flip(frame, 1)
    
            # Color correction
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    
            # Convert the frame to MediaPipe's mp.Image format
            mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb)
    
            # Perform face detection in the frame
            result = faces.detect(mp_image)  # Use detect instead of detect_async
    
            # Create a NumPy copy of the mp_image
            image_copy = np.copy(mp_image.numpy_view())
    
            # Draw the face markings on the image
            annotated_image = visualize(image_copy, result)
    
            # Convert from RGB back to BGR for OpenCV display
            bgr_annotated_image = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)
            cv2.imshow("image", bgr_annotated_image)
    
            # Read the keyboard input
            t = cv2.waitKey(1)
            if t == 27:  # If the ESC key is pressed, exit the loop
                break
    
    cap.release()
    cv2.destroyAllWindows()
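
    As a quick sanity check, the same visualize helper can also be run on a single still image in IMAGE mode, similar to what the official example shows. Here is a minimal sketch; face.jpg is a hypothetical test file and the model path is the same as above:

    import cv2
    import mediapipe as mp
    from drawing import visualize

    BaseOptions = mp.tasks.BaseOptions
    FaceDetectorOptions = mp.tasks.vision.FaceDetectorOptions
    VisionRunningMode = mp.tasks.vision.RunningMode

    options = FaceDetectorOptions(
        base_options=BaseOptions(model_asset_path='model.tflite'),
        running_mode=VisionRunningMode.IMAGE,
        min_detection_confidence=0.5
    )

    with mp.tasks.vision.FaceDetector.create_from_options(options) as detector:
        mp_image = mp.Image.create_from_file('face.jpg')   # loaded as SRGB
        result = detector.detect(mp_image)                  # synchronous detection
        annotated = visualize(mp_image.numpy_view().copy(), result)
        cv2.imshow('test', cv2.cvtColor(annotated, cv2.COLOR_RGB2BGR))
        cv2.waitKey(0)
        cv2.destroyAllWindows()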
    

    You can read more on the official MediaPipe Face Detector page for Python:

    https://ai.google.dev/edge/mediapipe/solutions/vision/face_detector/python?hl=pt-br