I am following a tutorial on OpenCV and trying to rewrite the following code: https://github.com/learncodebygaming/opencv_tutorials/tree/master/005_real_time
(specifically, the windowcapture.py file)
This file uses win32gui, win32ui, win32con to capture a given open window by window name and take a screenshot of it for cv2 processing later down the line.
I have attempted to recreate this functionality using Quartz for macOS using the following example: https://stackoverflow.com/a/48030215/14649706
So my own version of windowcapture.py looks like this:
import os
import subprocess
import tempfile

import cv2 as cv
import numpy as np
from PIL import Image
from Quartz import CGWindowListCopyWindowInfo, kCGNullWindowID, kCGWindowListOptionAll, CGRectNull, CGWindowListCreateImage, kCGWindowImageBoundsIgnoreFraming, kCGWindowListExcludeDesktopElements, CGImageGetDataProvider, CGDataProviderCopyData, CFDataGetBytePtr, CFDataGetLength
class WindowCapture:
    """Find a macOS window by title and capture screenshots of it.

    The window is looked up once, in the constructor, from the Quartz window
    list; its id, size and position are cached on the instance.
    """

    # properties
    window_name = None
    window = None
    window_id = None
    window_width = 0
    window_height = 0
    window_x = 0
    window_y = 0

    # constructor
    def __init__(self, given_window_name=None):
        """Look up the window whose title contains ``given_window_name``.

        When no name is given nothing is looked up and the instance keeps
        its class-level defaults.

        Raises:
            Exception: if a name is given but no window title contains it.
        """
        if given_window_name is None:
            return
        self.window_name = given_window_name
        self.window = self.get_window()
        if self.window is None:
            raise Exception('Unable to find window: {}'.format(given_window_name))
        self.window_id = self.get_window_id()
        self.window_width = self.get_window_width()
        self.window_height = self.get_window_height()
        self.window_x = self.get_window_pos_x()
        self.window_y = self.get_window_pos_y()

    # determine the window we want to capture
    def get_window(self):
        """Return the first window-info entry whose title contains
        ``self.window_name``, or ``None`` when there is no such window."""
        windows = CGWindowListCopyWindowInfo(kCGWindowListOptionAll, kCGNullWindowID)
        for window in windows:
            # Untitled windows are skipped instead of being compared against a
            # placeholder title, which could produce a spurious match.
            name = window.get('kCGWindowName')
            if name and self.window_name in name:
                return window
        return None

    def get_window_id(self):
        """Return the 'kCGWindowNumber' entry of the selected window."""
        return self.window['kCGWindowNumber']

    def get_window_width(self):
        """Return the window width from 'kCGWindowBounds', truncated to int."""
        return int(self.window['kCGWindowBounds']['Width'])

    def get_window_height(self):
        """Return the window height from 'kCGWindowBounds', truncated to int."""
        return int(self.window['kCGWindowBounds']['Height'])

    def get_window_pos_x(self):
        """Return the window X position from 'kCGWindowBounds', truncated to int."""
        return int(self.window['kCGWindowBounds']['X'])

    def get_window_pos_y(self):
        """Return the window Y position from 'kCGWindowBounds', truncated to int."""
        return int(self.window['kCGWindowBounds']['Y'])

    def get_image_from_window(self):
        """Capture the selected window with the ``screencapture`` tool.

        The screenshot is written to a private temporary directory, loaded
        into memory and removed again, so nothing in the working directory
        is created or overwritten.

        Returns:
            The array produced by ``cv.cvtColor`` from the loaded screenshot.

        Raises:
            RuntimeError: if no window was selected at construction time.
            subprocess.CalledProcessError: if ``screencapture`` exits with a
                non-zero status.
        """
        if self.window_id is None:
            raise RuntimeError('No window selected for capture')
        with tempfile.TemporaryDirectory() as scratch_dir:
            image_filename = os.path.join(scratch_dir, 'capture.png')
            # -x mutes sound and -l specifies windowId.
            # An argument list (no shell) keeps the id from being parsed by a shell.
            subprocess.run(
                ['screencapture', '-x', '-l', str(self.window_id), image_filename],
                check=True,
            )
            # np.array() forces the pixel data to be read before the file is
            # closed and the temporary directory is deleted.
            with Image.open(image_filename) as pil_image:
                image_as_numpy_array = np.array(pil_image)
        # PIL delivers red-first channel order while OpenCV works blue-first.
        # COLOR_RGB2BGR is the same channel swap as COLOR_BGR2RGB, named for
        # the direction actually taking place here.
        return cv.cvtColor(image_as_numpy_array, cv.COLOR_RGB2BGR)
My `get_image_from_window` method here works fine; I am able to use `cv.imshow('cv', screenshot)` to view the result:
import cv2 as cv
from time import time
from windowcapture import WindowCapture
# Capture source: the window whose title contains 'Blue Box Clicker'
wincap = WindowCapture('Blue Box Clicker')

# Timestamp of the previous iteration, used for the FPS read-out
loop_time = time()
while True:
    # Grab the latest frame of the game and show it
    screenshot = wincap.get_image_from_window()
    cv.imshow('cv', screenshot)

    # Report the loop rate, then restart the timer
    elapsed = time() - loop_time
    print('FPS {}'.format(1 / elapsed))
    loop_time = time()

    # waitKey(1) pumps the GUI event queue for 1 ms;
    # pressing 'q' with the output window focused leaves the loop
    pressed_key = cv.waitKey(1)
    if pressed_key == ord('q'):
        break

# The loop can only end via the 'q' key, so tear the preview window down here
cv.destroyAllWindows()
print('Done.')
However, I don't want to save the image to disk only to load it again. I believe this is very inefficient, and I would like to achieve the same functionality without writing the image file and then reopening it.
Similarly to how it is done here (in the GitHub link above):
def get_screenshot(self):
    """Copy the window contents into a bitmap and return them as an image array.

    Reads ``self.hwnd``, ``self.w``, ``self.h``, ``self.cropped_x`` and
    ``self.cropped_y``.

    Returns:
        A C-contiguous uint8 array of shape (self.h, self.w, 3).
    """
    # get the window image data
    wDC = win32gui.GetWindowDC(self.hwnd)
    dcObj = win32ui.CreateDCFromHandle(wDC)
    cDC = dcObj.CreateCompatibleDC()
    dataBitMap = win32ui.CreateBitmap()
    dataBitMap.CreateCompatibleBitmap(dcObj, self.w, self.h)
    cDC.SelectObject(dataBitMap)
    cDC.BitBlt((0, 0), (self.w, self.h), dcObj, (self.cropped_x, self.cropped_y), win32con.SRCCOPY)

    # convert the raw data into a format opencv can read
    #dataBitMap.SaveBitmapFile(cDC, 'debug.bmp')
    signedIntsArray = dataBitMap.GetBitmapBits(True)
    # np.fromstring is deprecated for binary input; np.frombuffer reads the
    # same bytes without copying them. The view is read-only, so the layout
    # is set with reshape() rather than by assigning to .shape.
    img = np.frombuffer(signedIntsArray, dtype='uint8').reshape((self.h, self.w, 4))

    # free resources
    dcObj.DeleteDC()
    cDC.DeleteDC()
    win32gui.ReleaseDC(self.hwnd, wDC)
    win32gui.DeleteObject(dataBitMap.GetHandle())

    # drop the alpha channel, or cv.matchTemplate() will throw an error like:
    #   error: (-215:Assertion failed) (depth == CV_8U || depth == CV_32F) && type == _templ.type()
    #   && _img.dims() <= 2 in function 'cv::matchTemplate'
    img = img[..., :3]

    # make image C_CONTIGUOUS to avoid errors that look like:
    #   File ... in draw_rectangles
    #   TypeError: an integer is required (got type tuple)
    # see the discussion here:
    # https://github.com/opencv/opencv/issues/14866#issuecomment-580207109
    # (the sliced view is not contiguous, so this also yields a fresh copy)
    img = np.ascontiguousarray(img)
    return img
How can I achieve this using Quartz?
I am on macOS (M1 Pro) and would really like to get this working.
At the moment, this program runs at around 12fps.
The program it is trying to capture is another python program (a simple pygame):
import pygame
import random
# Create the game window
pygame.init()
window_width = 640
window_height = 480
window = pygame.display.set_mode((window_width, window_height))
pygame.display.set_caption("Blue Box Clicker")

# Clock used to cap the frame rate
clock = pygame.time.Clock()

# Colours and box geometry
background_color = (0, 0, 0)
box_color = (0, 0, 255)
box_width = box_height = 50
box_x = box_y = 0

# Main loop: keeps going until the window is closed
running = True
while running:
    # Handle events
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            running = False
            continue
        if event.type != pygame.MOUSEBUTTONDOWN:
            continue

        mouse_x, mouse_y = pygame.mouse.get_pos()
        inside_x = box_x <= mouse_x <= box_x + box_width
        inside_y = box_y <= mouse_y <= box_y + box_height
        if inside_x and inside_y:
            # Correct click: move the box to a random spot that keeps it
            # entirely inside the window
            box_x = random.randint(0, window_width - box_width)
            box_y = random.randint(0, window_height - box_height)
        # An incorrect click changes nothing

    # Redraw the frame: background first, then the box on top
    window.fill(background_color)
    pygame.draw.rect(window, box_color, (box_x, box_y, box_width, box_height))
    pygame.display.update()

    # Limit the frame rate
    clock.tick(60)

# Clean up
pygame.quit()
I fixed this using the following code for my image capture:
def get_image_from_window(self):
    """Capture the window identified by ``self.window_id`` through Quartz.

    Returns:
        A C-contiguous uint8 array of shape (height, width, 3) holding the
        first three bytes of every 4-byte pixel of the captured image.

    Raises:
        RuntimeError: if Quartz does not return an image.
        ValueError: if the pixel buffer is too small for the reported
            image dimensions.
    """
    core_graphics_image = QZ.CGWindowListCreateImage(
        QZ.CGRectNull,
        QZ.kCGWindowListOptionIncludingWindow,
        self.window_id,
        QZ.kCGWindowImageBoundsIgnoreFraming | QZ.kCGWindowImageNominalResolution
    )
    # NOTE(review): presumably None comes back when the window has been closed
    # or screen-recording permission is missing - confirm on the target system.
    if core_graphics_image is None:
        raise RuntimeError('Unable to capture window id: {}'.format(self.window_id))

    bytes_per_row = QZ.CGImageGetBytesPerRow(core_graphics_image)
    width = QZ.CGImageGetWidth(core_graphics_image)
    height = QZ.CGImageGetHeight(core_graphics_image)

    core_graphics_data_provider = QZ.CGImageGetDataProvider(core_graphics_image)
    core_graphics_data = QZ.CGDataProviderCopyData(core_graphics_data_provider)

    np_raw_data = np.frombuffer(core_graphics_data, dtype=np.uint8)

    # Rows are bytes_per_row apart (they may carry padding), pixels are 4
    # bytes apart, and only the first 3 bytes of each pixel are kept. The
    # ndarray constructor validates shape and strides against the buffer
    # size, which as_strided() does not do.
    numpy_data = np.ndarray(shape=(height, width, 3),
                            dtype=np.uint8,
                            buffer=np_raw_data,
                            strides=(bytes_per_row, 4, 1))

    # Pack the strided view into one contiguous block for OpenCV
    final_output = np.ascontiguousarray(numpy_data, dtype=np.uint8)
    return final_output
This method returns the captured CGImage in a format that `cv2` can recognise and use for `matchTemplate`.
My full code looks like this:
#main.py
import cv2 as cv
from time import time
from windowcapture import WindowCapture
# Capture source: the window whose title contains 'Blue Box Clicker'
wincap = WindowCapture('Blue Box Clicker')

# Timestamp of the previous iteration, used for the FPS read-out
loop_time = time()
while True:
    # Grab the latest frame of the game
    screenshot = wincap.get_image_from_window()

    # Show the captured frame
    cv.imshow('Computer Vision', screenshot)

    # Report the loop rate, then restart the timer
    elapsed = time() - loop_time
    print('FPS {}'.format(1 / elapsed))
    loop_time = time()

    # waitKey(1) pumps the GUI event queue for 1 ms;
    # holding 'q' with the output window focused leaves the loop
    pressed_key = cv.waitKey(1)
    if pressed_key == ord('q'):
        break

# The loop can only end via the 'q' key, so tear the preview window down here
cv.destroyAllWindows()
print('Done.')
#windowcapture.py
import numpy as np
import Quartz as QZ
class WindowCapture:
    """Find a macOS window by title and capture its contents through Quartz.

    The window is looked up once, in the constructor; its id, size and
    position are cached on the instance.
    """

    # properties
    window_name = None
    window = None
    window_id = None
    window_width = 0
    window_height = 0
    window_x = 0
    window_y = 0

    # constructor
    def __init__(self, given_window_name=None):
        """Look up the window whose title contains ``given_window_name``.

        Raises:
            Exception: if no name is given, or no window title contains it.
        """
        if given_window_name is None:
            raise Exception('No window name given')
        self.window_name = given_window_name
        self.window = self.get_window()
        if self.window is None:
            raise Exception('Unable to find window: {}'.format(given_window_name))
        self.window_id = self.get_window_id()
        self.window_width = self.get_window_width()
        self.window_height = self.get_window_height()
        self.window_x = self.get_window_pos_x()
        self.window_y = self.get_window_pos_y()

    def get_window(self):
        """Return the first window-info entry whose title contains
        ``self.window_name``, or ``None`` when there is no such window."""
        windows = QZ.CGWindowListCopyWindowInfo(QZ.kCGWindowListOptionAll, QZ.kCGNullWindowID)
        for window in windows:
            # Untitled windows are skipped instead of being compared against a
            # placeholder title, which could produce a spurious match.
            name = window.get('kCGWindowName')
            if name and self.window_name in name:
                return window
        return None

    def get_window_id(self):
        """Return the 'kCGWindowNumber' entry of the selected window."""
        return self.window['kCGWindowNumber']

    def get_window_width(self):
        """Return the window width from 'kCGWindowBounds', truncated to int."""
        return int(self.window['kCGWindowBounds']['Width'])

    def get_window_height(self):
        """Return the window height from 'kCGWindowBounds', truncated to int."""
        return int(self.window['kCGWindowBounds']['Height'])

    def get_window_pos_x(self):
        """Return the window X position from 'kCGWindowBounds', truncated to int."""
        return int(self.window['kCGWindowBounds']['X'])

    def get_window_pos_y(self):
        """Return the window Y position from 'kCGWindowBounds', truncated to int."""
        return int(self.window['kCGWindowBounds']['Y'])

    def get_image_from_window(self):
        """Capture the selected window and return it as an image array.

        Returns:
            A C-contiguous uint8 array of shape (height, width, 3).

        Raises:
            RuntimeError: if Quartz does not return an image.
            ValueError: if the pixel buffer is too small for the reported
                image dimensions.
        """
        core_graphics_image = QZ.CGWindowListCreateImage(
            QZ.CGRectNull,
            QZ.kCGWindowListOptionIncludingWindow,
            self.window_id,
            QZ.kCGWindowImageBoundsIgnoreFraming | QZ.kCGWindowImageNominalResolution
        )
        # NOTE(review): presumably None comes back when the window has been
        # closed or screen-recording permission is missing - confirm.
        if core_graphics_image is None:
            raise RuntimeError('Unable to capture window: {}'.format(self.window_name))

        bytes_per_row = QZ.CGImageGetBytesPerRow(core_graphics_image)
        width = QZ.CGImageGetWidth(core_graphics_image)
        height = QZ.CGImageGetHeight(core_graphics_image)

        core_graphics_data_provider = QZ.CGImageGetDataProvider(core_graphics_image)
        core_graphics_data = QZ.CGDataProviderCopyData(core_graphics_data_provider)

        return self._pixels_to_array(core_graphics_data, width, height, bytes_per_row)

    @staticmethod
    def _pixels_to_array(pixel_data, width, height, bytes_per_row):
        """Convert a raw 4-bytes-per-pixel buffer into a (height, width, 3) array.

        Args:
            pixel_data: object exposing the buffer protocol with the image bytes.
            width: number of pixels per row.
            height: number of rows.
            bytes_per_row: distance in bytes between the starts of two
                consecutive rows (may exceed ``width * 4`` because of padding).

        Returns:
            A C-contiguous uint8 array with the first three bytes of every pixel.

        Raises:
            ValueError: if ``pixel_data`` is too small for the requested layout.
        """
        np_raw_data = np.frombuffer(pixel_data, dtype=np.uint8)
        # The ndarray constructor validates shape and strides against the
        # buffer size, which as_strided() does not do.
        numpy_data = np.ndarray(shape=(height, width, 3),
                                dtype=np.uint8,
                                buffer=np_raw_data,
                                strides=(bytes_per_row, 4, 1))
        # Pack the strided view into one contiguous block for OpenCV
        return np.ascontiguousarray(numpy_data, dtype=np.uint8)
This runs at an average of 60 fps on a MacBook Pro (with an M1 Pro processor).