Почему цифры не распознаются с помощью pytesseract на чистых изображениях

Почему цифры не распознаются с помощью pytesseract на чистых изображениях ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Почему цифры не распознаются с помощью pytesseract на чистых изображениях

Цитата

Сообщение Anonymous » 29 дек 2025, 20:41

Я хочу написать код Python, который сможет извлекать цифры для строк и столбцов в игре Nonogram. Для этого я уже написал код для распознавания прямоугольников. Что еще осталось сделать, так это написать код, проверяющий, есть ли в прямоугольнике цифра, и если да, то что это за цифры. Я хотел сделать это с помощью pytesseract, поскольку это казалось наиболее доступным способом.
Я написал для этого код, но теперь проблема в том, что код распознаёт цифры нестабильно. Не понимаю почему, ведь изображения достаточно чистые. Я использую черные цифры на белом фоне и пробовал как --psm 6, так и конфигурацию outputbase digits. Квадраты примерно 60 на 140 пикселей. Я использую Windows и не умею работать с Linux на своей машине, поэтому использовать tesstrain для пользовательского обучения на изображениях, которые я мог бы разметить самостоятельно, для меня практически невозможно.
Как исправить мой код, чтобы он работал с pytesseract? Если это невозможно, что может быть простой альтернативой для распознавания этих цифр? Когда я извлекаю изображения из игры Nonogram, шрифт цифр фиксирован, а их размер и точное расположение зависят от сложности игры (10x10, 15x15 или 20x20); обычно присутствует некоторый шум, но небольшой.
См. ниже код для воспроизведения распознавания изображений с исходным изображением и некоторыми распознаваемыми квадратами. Я использую Python 3.12.7 и использую pytesseract версии 5.5.0.20241111.

Код: Выделить всё

import os
import cv2
import numpy as np
import pytesseract

from typing import Tuple, List

Rect = Tuple[int, int, int, int]

pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

def crop_image(
obj: str | np.ndarray,
bounding_box: Tuple[int, int, int|None, int|None],
margin: int = 0
) -> np.ndarray:
# Check that first input is either string or numpy array and correctly formatted
if isinstance(obj, str):
if not os.path.exists(obj):
raise FileNotFoundError("Provided path does not exist.")
img: np.ndarray | None = cv2.imread(obj)
if img is None:
raise ValueError("File exists but cannot be read as an image.")
elif isinstance(obj, np.ndarray):
if obj.ndim not in (2, 3):
raise ValueError("Provided numpy array is not a valid image (must have 2 or 3 dimensions).")
img = obj
else:
raise TypeError("obj must be either a file path (str) or a numpy.ndarray image.")

# Check validity of bounding box
if not isinstance(bounding_box, tuple) or len(bounding_box) != 4:
raise ValueError("bounding_box must be a tuple of length 4 (x, y, w, h).")

# Extract info from bounding box and possibly update values
x, y, w, h = bounding_box
if not isinstance(x, int) or not isinstance(y, int):
raise TypeError("The first two elements of bounding_box (x, y) must be integers.")
if not isinstance(w, (int, type(None))) or not isinstance(h, (int, type(None))):
raise TypeError("The last two elements of bounding_box (w, h) must be integers or None.")

# Check validity of margin
if not isinstance(margin, int) or margin < 0:
raise ValueError("margin must be a non-negative integer.")

height, width = img.shape[:2]
w = width if w is None else w
h = height if h is None else h

# Crop image and show
x1 = max(0, x - margin)
y1 = max(0, y - margin)
x2 = min(width, x + w + margin)
y2 = min(height, y + h + margin)
if x1 >= x2 or y1 >= y2:
raise ValueError("Bounding box and margin result in an empty crop region.")
cropped_img: np.ndarray = img[y1:y2, x1:x2]
return cropped_img

def rect_contains_number(
    img: np.ndarray,
    rect: Tuple[int, int, int | None, int | None],
    *,
    scale: float = 1.5,
    border: int = 10,
    config: str = "--psm 6 -c tessedit_char_whitelist=0123456789",
) -> str:
    """Run digit-only OCR on the region of ``img`` described by ``rect``.

    The region is cropped, converted to grayscale, upscaled, surrounded by a
    white border and then passed to Tesseract.

    Args:
        img: Source image (grayscale, or BGR as produced by ``cv2.imread``).
        rect: ``(x, y, w, h)`` region; ``w`` / ``h`` may be ``None`` to use the
            full image width / height (forwarded unchanged to ``crop_image``).
        scale: Upscaling factor applied to both axes before OCR.
        border: Width in pixels of the white frame added around the crop.
            Tesseract tends to miss glyphs that touch the image edge.
            NOTE(review): assumes dark digits on a light background, as the
            surrounding script uses; confirm for other inputs.
        config: Extra Tesseract command-line options. The default keeps the
            original ``--psm 6`` page segmentation mode and restricts the
            output alphabet to the digits 0-9.

    Returns:
        The recognised text with surrounding whitespace removed, or ``""``
        when the crop is empty or Tesseract reports an OCR failure.

    Raises:
        Whatever ``crop_image`` raises for an invalid ``rect``. A missing
        Tesseract executable (``pytesseract.TesseractNotFoundError``) is
        deliberately *not* swallowed: it is a setup problem that would
        otherwise make every cell silently look empty.
    """
    roi_img = crop_image(img, rect, 0)

    # Defensive guard: nothing to recognise in an empty region.
    if roi_img.size == 0:
        print("somehow roi size is 0")
        return ""

    # Grayscale conversion (done before resizing so only one channel is scaled).
    if roi_img.ndim == 3:
        roi_gray = cv2.cvtColor(roi_img, cv2.COLOR_BGR2GRAY)
    else:
        roi_gray = roi_img

    # Upscaling; cubic interpolation keeps enlarged glyph edges smoother than
    # the default bilinear interpolation.
    roi_resized = cv2.resize(
        roi_gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC
    )

    # White frame so that no digit touches the edge of the image given to OCR.
    roi_padded = cv2.copyMakeBorder(
        roi_resized,
        border,
        border,
        border,
        border,
        cv2.BORDER_CONSTANT,
        value=255,
    )

    try:
        text = pytesseract.image_to_string(roi_padded, config=config)
    except (pytesseract.TesseractError, RuntimeError) as e:
        # TesseractError: the engine exited with an error status;
        # RuntimeError: pytesseract's timeout signal.
        print(f"OCR Error: {e}")
        return ""

    # Strip whitespace and return
    return text.strip()

# Driver script: run the digit check on the known sample images found in an
# "Images" folder under the current working directory and report, per image,
# whether the OCR result matches the expectation encoded in the file name.
base_dir: str = os.getcwd()  # named base_dir so the builtin dir() is not shadowed
images_dir: str = os.path.join(base_dir, "Images")

size_image_names: List[str] = ["Standing_square.jpeg", "Laying_square.jpeg", "NoNumber_square.jpeg"]
number_rects = []
for image_name_str in os.listdir(images_dir):
    print(image_name_str)
    # Only the known sample images are evaluated; everything else is skipped.
    if image_name_str not in size_image_names:
        continue
    image_path: str = os.path.join(images_dir, image_name_str)
    img: np.ndarray | None = cv2.imread(image_path)
    # cv2.imread returns None instead of raising when a file cannot be decoded.
    if img is None:
        print(f"Could not read {image_name_str} as an image; skipping.")
        continue
    gray: np.ndarray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # (0, 0, None, None) selects the whole image as the region of interest.
    rectangle_text = rect_contains_number(gray, (0, 0, None, None))
    img_contains_number = any(ch.isdigit() for ch in rectangle_text)
    if img_contains_number:
        number_rects.append([image_name_str, rectangle_text])
        if image_name_str == "NoNumber_square.jpeg":
            print(f"Found a digit in {image_name_str} while that should not be the case.")
        else:
            print(f"Correctly found that {image_name_str} does contain digits")
    else:
        if image_name_str in ["Standing_square.jpeg", "Laying_square.jpeg"]:
            print(f"Should have recognized a digit in {image_name_str}, but only text recognized is '{rectangle_text}'.")
        else:
            print(f"Correctly found that {image_name_str} does not have any digits.")

print(number_rects)

исходное изображение
Laying_square
Standing_square
NoNumber_square

Подробнее здесь: https://stackoverflow.com/questions/798 ... ean-images

1767030108

Anonymous

Я хочу написать код Python, который сможет извлекать цифры для строк и столбцов в игре Nonogram. Для этого я уже написал код для распознавания прямоугольников. Что еще осталось сделать, так это написать код, проверяющий, есть ли в прямоугольнике цифра, и если да, то что это за цифры. Я хотел сделать это с помощью pytesseract, поскольку это казалось наиболее доступным способом.
Я написал для этого код, но теперь проблема в том, что код распознаёт цифры нестабильно. Не понимаю почему, ведь изображения достаточно чистые. Я использую черные цифры на белом фоне и пробовал как --psm 6, так и конфигурацию outputbase digits. Квадраты примерно 60 на 140 пикселей. Я использую Windows и не умею работать с Linux на своей машине, поэтому использовать tesstrain для пользовательского обучения на изображениях, которые я мог бы разметить самостоятельно, для меня практически невозможно.
Как исправить мой код, чтобы он работал с pytesseract? Если это невозможно, что может быть простой альтернативой для распознавания этих цифр? Когда я извлекаю изображения из игры Nonogram, шрифт цифр фиксирован, а их размер и точное расположение зависят от сложности игры (10x10, 15x15 или 20x20); обычно присутствует некоторый шум, но небольшой.
См. ниже код для воспроизведения распознавания изображений с исходным изображением и некоторыми распознаваемыми квадратами.  Я использую Python 3.12.7 и использую pytesseract версии 5.5.0.20241111.
[code]import os
import cv2
import numpy as np
import pytesseract

from typing import Tuple, List

Rect = Tuple[int, int, int, int]

pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

def crop_image(
obj: str | np.ndarray,
bounding_box: Tuple[int, int, int|None, int|None],
margin: int = 0
) -> np.ndarray:
# Check that first input is either string or numpy array and correctly formatted
if isinstance(obj, str):
if not os.path.exists(obj):
raise FileNotFoundError("Provided path does not exist.")
img: np.ndarray | None = cv2.imread(obj)
if img is None:
raise ValueError("File exists but cannot be read as an image.")
elif isinstance(obj, np.ndarray):
if obj.ndim not in (2, 3):
raise ValueError("Provided numpy array is not a valid image (must have 2 or 3 dimensions).")
img = obj
else:
raise TypeError("obj must be either a file path (str) or a numpy.ndarray image.")

# Check validity of bounding box
if not isinstance(bounding_box, tuple) or len(bounding_box) != 4:
raise ValueError("bounding_box must be a tuple of length 4 (x, y, w, h).")

# Extract info from bounding box and possibly update values
x, y, w, h = bounding_box
if not isinstance(x, int) or not isinstance(y, int):
raise TypeError("The first two elements of bounding_box (x, y) must be integers.")
if not isinstance(w, (int, type(None))) or not isinstance(h, (int, type(None))):
raise TypeError("The last two elements of bounding_box (w, h) must be integers or None.")

# Check validity of margin
if not isinstance(margin, int) or margin < 0:
raise ValueError("margin must be a non-negative integer.")

height, width = img.shape[:2]
w = width if w is None else w
h = height if h is None else h

# Crop image and show
x1 = max(0, x - margin)
y1 = max(0, y - margin)
x2 = min(width, x + w + margin)
y2 = min(height, y + h + margin)
if x1 >= x2 or y1 >= y2:
raise ValueError("Bounding box and margin result in an empty crop region.")
cropped_img: np.ndarray = img[y1:y2, x1:x2]
return cropped_img

def rect_contains_number(
    img: np.ndarray,
    rect: Tuple[int, int, int | None, int | None],
    *,
    scale: float = 1.5,
    border: int = 10,
    config: str = "--psm 6 -c tessedit_char_whitelist=0123456789",
) -> str:
    """Run digit-only OCR on the region of ``img`` described by ``rect``.

    The region is cropped, converted to grayscale, upscaled, surrounded by a
    white border and then passed to Tesseract.

    Args:
        img: Source image (grayscale, or BGR as produced by ``cv2.imread``).
        rect: ``(x, y, w, h)`` region; ``w`` / ``h`` may be ``None`` to use the
            full image width / height (forwarded unchanged to ``crop_image``).
        scale: Upscaling factor applied to both axes before OCR.
        border: Width in pixels of the white frame added around the crop.
            Tesseract tends to miss glyphs that touch the image edge.
            NOTE(review): assumes dark digits on a light background, as the
            surrounding script uses; confirm for other inputs.
        config: Extra Tesseract command-line options. The default keeps the
            original ``--psm 6`` page segmentation mode and restricts the
            output alphabet to the digits 0-9.

    Returns:
        The recognised text with surrounding whitespace removed, or ``""``
        when the crop is empty or Tesseract reports an OCR failure.

    Raises:
        Whatever ``crop_image`` raises for an invalid ``rect``. A missing
        Tesseract executable (``pytesseract.TesseractNotFoundError``) is
        deliberately *not* swallowed: it is a setup problem that would
        otherwise make every cell silently look empty.
    """
    roi_img = crop_image(img, rect, 0)

    # Defensive guard: nothing to recognise in an empty region.
    if roi_img.size == 0:
        print("somehow roi size is 0")
        return ""

    # Grayscale conversion (done before resizing so only one channel is scaled).
    if roi_img.ndim == 3:
        roi_gray = cv2.cvtColor(roi_img, cv2.COLOR_BGR2GRAY)
    else:
        roi_gray = roi_img

    # Upscaling; cubic interpolation keeps enlarged glyph edges smoother than
    # the default bilinear interpolation.
    roi_resized = cv2.resize(
        roi_gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC
    )

    # White frame so that no digit touches the edge of the image given to OCR.
    roi_padded = cv2.copyMakeBorder(
        roi_resized,
        border,
        border,
        border,
        border,
        cv2.BORDER_CONSTANT,
        value=255,
    )

    try:
        text = pytesseract.image_to_string(roi_padded, config=config)
    except (pytesseract.TesseractError, RuntimeError) as e:
        # TesseractError: the engine exited with an error status;
        # RuntimeError: pytesseract's timeout signal.
        print(f"OCR Error: {e}")
        return ""

    # Strip whitespace and return
    return text.strip()

# Driver script: run the digit check on the known sample images found in an
# "Images" folder under the current working directory and report, per image,
# whether the OCR result matches the expectation encoded in the file name.
base_dir: str = os.getcwd()  # named base_dir so the builtin dir() is not shadowed
images_dir: str = os.path.join(base_dir, "Images")

size_image_names: List[str] = ["Standing_square.jpeg", "Laying_square.jpeg", "NoNumber_square.jpeg"]
number_rects = []
for image_name_str in os.listdir(images_dir):
    print(image_name_str)
    # Only the known sample images are evaluated; everything else is skipped.
    if image_name_str not in size_image_names:
        continue
    image_path: str = os.path.join(images_dir, image_name_str)
    img: np.ndarray | None = cv2.imread(image_path)
    # cv2.imread returns None instead of raising when a file cannot be decoded.
    if img is None:
        print(f"Could not read {image_name_str} as an image; skipping.")
        continue
    gray: np.ndarray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # (0, 0, None, None) selects the whole image as the region of interest.
    rectangle_text = rect_contains_number(gray, (0, 0, None, None))
    img_contains_number = any(ch.isdigit() for ch in rectangle_text)
    if img_contains_number:
        number_rects.append([image_name_str, rectangle_text])
        if image_name_str == "NoNumber_square.jpeg":
            print(f"Found a digit in {image_name_str} while that should not be the case.")
        else:
            print(f"Correctly found that {image_name_str} does contain digits")
    else:
        if image_name_str in ["Standing_square.jpeg", "Laying_square.jpeg"]:
            print(f"Should have recognized a digit in {image_name_str}, but only text recognized is '{rectangle_text}'.")
        else:
            print(f"Correctly found that {image_name_str} does not have any digits.")

print(number_rects)
[/code]
исходное изображение
Laying_square
Standing_square
NoNumber_square 

Подробнее здесь: [url]https://stackoverflow.com/questions/79856937/why-are-digits-not-recognized-with-pytesseract-in-clean-images[/url]