Tesseract OCR пропускает всю строку текста

Tesseract OCR пропускает всю строку текста ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Tesseract OCR пропускает всю строку текста

Цитата

Сообщение Anonymous » 14 ноя 2024, 06:45

import ocrmypdf
from pdf2image import convert_from_path
from PIL import Image, ImageEnhance, ImageFilter
import numpy as np
import img2pdf
import io
import cv2

def preprocess_image(img, contrast=2, threshold=150, invert=True):
    """Binarize a page image ahead of OCR.

    The image is converted to grayscale, contrast-enhanced, median-filtered
    and finally reduced to two levels with a fixed global threshold.

    Args:
        img: PIL image of a single page.
        contrast: Factor passed to ``ImageEnhance.Contrast.enhance``.
        threshold: Gray level used as the cut-off by ``cv2.threshold``.
        invert: When True (the default, which keeps the original behaviour)
            ``cv2.THRESH_BINARY_INV`` is used; when False
            ``cv2.THRESH_BINARY`` is used and the polarity is preserved.

    Returns:
        A new PIL image built from the thresholded array.
    """
    # Convert to grayscale
    img = img.convert('L')

    # Enhance contrast
    enhancer = ImageEnhance.Contrast(img)
    img = enhancer.enhance(contrast)

    # Apply noise reduction (optional)
    img = img.filter(ImageFilter.MedianFilter())

    # OpenCV works on arrays, so hand the pixels over as a NumPy array
    gray = np.array(img)

    # NOTE(review): with invert=True a dark-on-light scan comes out as light
    # text on a dark background. Tesseract's documentation recommends dark
    # text on a light background for 4.x and later, so invert=False may be
    # the better choice if text goes missing -- confirm on the real input.
    mode = cv2.THRESH_BINARY_INV if invert else cv2.THRESH_BINARY
    _, thresh_img = cv2.threshold(gray, threshold, 255, mode)
    return Image.fromarray(thresh_img)

# NOTE(review): `raw_input` (source PDF path) and `stg_pdf_path` (output PDF
# path) are not defined in this snippet; presumably they are set elsewhere.

# Step 1: Rasterize every page of the source PDF
RENDER_DPI = 300
images = convert_from_path(raw_input, dpi=RENDER_DPI)

# Step 2: Preprocess images
preprocessed_images = [preprocess_image(img) for img in images]

# Step 3: Save preprocessed images to a new PDF using BytesIO
preprocessed_pdf_path = 'preprocessed.pdf'

# Create a list to hold image streams
image_streams = []

for img in preprocessed_images:
    img_byte_arr = io.BytesIO()
    # PNG is lossless: JPEG compression would smear the hard black/white
    # edges produced by thresholding. The rendering resolution is stored in
    # the image so the resulting PDF page keeps its physical size instead of
    # falling back to img2pdf's default resolution.
    img.save(img_byte_arr, format='PNG', dpi=(RENDER_DPI, RENDER_DPI))
    img_byte_arr.seek(0)  # Move to the beginning of the BytesIO buffer
    image_streams.append(img_byte_arr)

# Convert the list of image streams to a PDF
with open(preprocessed_pdf_path, 'wb') as f:
    f.write(img2pdf.convert(image_streams))

# Step 4: Run ocrmypdf on the new PDF
ocrmypdf.ocr(preprocessed_pdf_path, stg_pdf_path, skip_text=True)

Я использую приведённый выше код для распознавания отсканированного PDF-файла; сам файл выглядит очень чистым и читаемым. Однако одна строка текста всегда пропускается (строка рядом с красной стрелкой на изображении ниже). Я провёл небольшое исследование, но не нашёл рабочего решения.

Подробнее здесь: https://stackoverflow.com/questions/79187344/tesseract-ocr-misses-entire-line-of-text

1731555953

Anonymous


[code]import ocrmypdf
from pdf2image import convert_from_path
from PIL import Image, ImageEnhance, ImageFilter
import numpy as np
import img2pdf
import io
import cv2

def preprocess_image(img, contrast=2, threshold=150, invert=True):
    """Binarize a page image ahead of OCR.

    The image is converted to grayscale, contrast-enhanced, median-filtered
    and finally reduced to two levels with a fixed global threshold.

    Args:
        img: PIL image of a single page.
        contrast: Factor passed to ``ImageEnhance.Contrast.enhance``.
        threshold: Gray level used as the cut-off by ``cv2.threshold``.
        invert: When True (the default, which keeps the original behaviour)
            ``cv2.THRESH_BINARY_INV`` is used; when False
            ``cv2.THRESH_BINARY`` is used and the polarity is preserved.

    Returns:
        A new PIL image built from the thresholded array.
    """
    # Convert to grayscale
    img = img.convert('L')

    # Enhance contrast
    enhancer = ImageEnhance.Contrast(img)
    img = enhancer.enhance(contrast)

    # Apply noise reduction (optional)
    img = img.filter(ImageFilter.MedianFilter())

    # OpenCV works on arrays, so hand the pixels over as a NumPy array
    gray = np.array(img)

    # NOTE(review): with invert=True a dark-on-light scan comes out as light
    # text on a dark background. Tesseract's documentation recommends dark
    # text on a light background for 4.x and later, so invert=False may be
    # the better choice if text goes missing -- confirm on the real input.
    mode = cv2.THRESH_BINARY_INV if invert else cv2.THRESH_BINARY
    _, thresh_img = cv2.threshold(gray, threshold, 255, mode)
    return Image.fromarray(thresh_img)

# NOTE(review): `raw_input` (source PDF path) and `stg_pdf_path` (output PDF
# path) are not defined in this snippet; presumably they are set elsewhere.

# Step 1: Rasterize every page of the source PDF
RENDER_DPI = 300
images = convert_from_path(raw_input, dpi=RENDER_DPI)

# Step 2: Preprocess images
preprocessed_images = [preprocess_image(img) for img in images]

# Step 3: Save preprocessed images to a new PDF using BytesIO
preprocessed_pdf_path = 'preprocessed.pdf'

# Create a list to hold image streams
image_streams = []

for img in preprocessed_images:
    img_byte_arr = io.BytesIO()
    # PNG is lossless: JPEG compression would smear the hard black/white
    # edges produced by thresholding. The rendering resolution is stored in
    # the image so the resulting PDF page keeps its physical size instead of
    # falling back to img2pdf's default resolution.
    img.save(img_byte_arr, format='PNG', dpi=(RENDER_DPI, RENDER_DPI))
    img_byte_arr.seek(0)  # Move to the beginning of the BytesIO buffer
    image_streams.append(img_byte_arr)

# Convert the list of image streams to a PDF
with open(preprocessed_pdf_path, 'wb') as f:
    f.write(img2pdf.convert(image_streams))

# Step 4: Run ocrmypdf on the new PDF
ocrmypdf.ocr(preprocessed_pdf_path, stg_pdf_path, skip_text=True)

[/code]
Я использую приведённый выше код для распознавания отсканированного PDF-файла; сам файл выглядит очень чистым и читаемым. Однако одна строка текста всегда пропускается (строка рядом с красной стрелкой на изображении ниже). Я провёл небольшое исследование, но не нашёл рабочего решения.
[img]https://i.sstatic.net/Lhx1E2Ed.jpg[/img]
 

Подробнее здесь: [url]https://stackoverflow.com/questions/79187344/tesseract-ocr-misses-entire-line-of-text[/url]