I'm still new to Python and have been using ChatGPT as my tutor. I'm trying to automate a mini game: the script needs to read the single character (letter or number) shown on my screen and then, when the timing is right, press that key. I've gone through three different versions trying to find a solution, but I simply can't get it to recognize the character. I really hope someone can help me move forward!
Images:
import pyautogui
import pytesseract
from PIL import Image, ImageEnhance
import time
import re
def preprocess_image(image):
    """
    Turn a screenshot into a two-level image that keeps only its brightest pixels.

    Args:
        image (PIL.Image.Image): The input image.

    Returns:
        PIL.Image.Image: Grayscale image whose pixels are 255 where the
        contrast-boosted brightness exceeded 200 and 0 everywhere else.
    """
    # Grayscale conversion followed by a 5x contrast boost, so that bright
    # text is pushed towards pure white before the cut-off is applied.
    boosted = ImageEnhance.Contrast(image.convert("L")).enhance(5.0)

    def binarize(level):
        # Anything brighter than 200 becomes white; the rest becomes black.
        return 255 if level > 200 else 0

    return boosted.point(binarize)
def analyze_region(region, region_name):
    """
    Capture one screen region, OCR it, and return the characters that were read.

    The raw capture and its preprocessed version are both written to PNG files
    in the current directory so the OCR input can be inspected afterwards.

    Args:
        region (tuple): The region to capture as (x, y, width, height).
        region_name (str): Label used in the debug file names and log output.

    Returns:
        str: Every upper-case letter or digit found in the OCR output, joined
        together; an empty string when nothing was recognised.
    """
    capture = pyautogui.screenshot(region=region)
    capture.save(f"scanned_region_{region_name}_original.png")

    cleaned = preprocess_image(capture)
    cleaned.save(f"scanned_region_{region_name}_processed.png")

    # Page segmentation mode 10 plus a whitelist limited to A-Z and 0-9.
    ocr_options = '--psm 10 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    text = pytesseract.image_to_string(cleaned, config=ocr_options)

    # Filter once; the same list feeds both the debug line and the result.
    matches = re.findall(r'[A-Z0-9]', text)
    print(f"Raw OCR output from {region_name}: {text.strip()}\nFiltered characters: {matches}")  # Debugging output
    return ''.join(matches)
def _locate_color(screenshot, wanted_rgb):
    """Return (x, y) of the first pixel, scanning row by row, whose RGB equals wanted_rgb, else None."""
    width, height = screenshot.size
    pixels = screenshot.load()
    for y in range(height):
        for x in range(width):
            # Compare only the first three channels: a screenshot that carries
            # an alpha channel yields 4-tuples, which would never equal an RGB
            # triple if compared whole.
            if pixels[x, y][:3] == wanted_rgb:
                return x, y
    return None


def find_color_on_screen(target_color):
    """
    Wait until a colour appears on screen, then try to read a character from the bars.

    Full-screen screenshots are taken in a loop until one contains a pixel of
    the target colour. Once it is found, the top-bar and right-bar regions are
    analysed up to 10 times, 0.2 s apart, and the function reports the first
    non-empty result (top bar takes precedence) or a failure message. In
    every case it waits for Enter before returning.

    Args:
        target_color (tuple): RGB values of the target color (e.g., (255, 0, 0) for red).

    Outputs:
        Prints progress messages and the detected character(s), if any.
    """
    # Progress-bar regions as (x, y, width, height) in absolute screen pixels.
    top_bar_region = (1010, 103, 530, 77)
    right_bar_region = (1840, 120, 75, 695)

    wanted_rgb = tuple(target_color[:3])
    print(f"Scanning for color: {target_color} (RGB)...")

    while True:
        hit = _locate_color(pyautogui.screenshot(), wanted_rgb)
        if hit is None:
            continue  # colour not on screen yet; grab a fresh screenshot

        x, y = hit
        print(f"Color {target_color} found at pixel ({x}, {y})!")

        for _ in range(10):
            # Both regions are always analysed so both sets of debug images
            # are written on every attempt.
            top_bar_result = analyze_region(top_bar_region, "top_bar")
            right_bar_result = analyze_region(right_bar_region, "right_bar")

            for label, result in (("top bar", top_bar_result),
                                  ("right bar", right_bar_result)):
                if result:
                    print(f"Detected character(s) in {label}: {result}")
                    input("Press Enter to acknowledge and exit...")
                    return

            time.sleep(0.2)  # brief pause between attempts

        print("No valid capital letters or numbers detected within the time frame.")
        input("Press Enter to exit...")
        return
# Script entry point (example usage)
if __name__ == "__main__":
    # Colour to wait for, given as (R, G, B); (0, 141, 146) is 008D92 in hex.
    # Replace it with the RGB value you want to detect.
    find_color_on_screen((0, 141, 146))
Command Prompt:
Raw OCR output from top_bar:
Filtered characters: []
Raw OCR output from right_bar:
Filtered characters: []
No valid capital letters or numbers detected within the time frame.
Press Enter to exit...
I did some testing with pytesseract and found that it doesn't like the white borders around the letter.
I rewrote `preprocess_image()` to extract only the letter. It's probably not the best way of doing it, but it gives an idea of what pytesseract needs:
def preprocess_image(image: "Image.Image", top_right: bool) -> "Image.Image":
    """
    Redraw the pure-white pixels of a bar screenshot as black on white, without the border.

    Every pixel that is exactly (255, 255, 255) in the input is painted black
    on an otherwise white canvas, except the white pixels lying on the
    outermost white column (or row) at each end, which are dropped.

    Args:
        image (PIL.Image.Image): Screenshot of one bar region.
        top_right (bool): When true, white pixels in the left-most and
            right-most white columns are discarded; when false, white pixels
            in the top-most and bottom-most white rows are discarded.

    Returns:
        PIL.Image.Image: RGB image of the same size with the remaining white
        pixels drawn in black on a white background.
    """
    # Imported locally so the function does not rely on a module-level `np`.
    import numpy as np

    rgb = np.array(image.convert('RGB'))
    # Row/column indices of every pure-white pixel.
    rows, cols = np.nonzero(np.all(rgb == 255, axis=2))

    canvas = np.full(rgb.shape, 255, dtype=np.uint8)
    if cols.size:  # min()/max() are undefined when there are no white pixels
        along = cols if top_right else rows
        # Extremes are computed once instead of once per pixel.
        inner = (along != along.min()) & (along != along.max())
        canvas[rows[inner], cols[inner]] = 0
    return Image.fromarray(canvas)
It takes the positions of all the white pixels in the image (`[255, 255, 255]`) and keeps only those that are not at the first or last x position, which correspond to the white bars around the number (`if x != max(X) and x != min(X):`).
Here's an image of what it returns: code output
Sources :
Find the coordinates in an image where a specified colour is detected