Ниже приведена упрощённая версия моего кода:
Код: Выделить всё
import io
from PIL import Image, ImageDraw
import fitz
import pytesseract
from pytesseract import Output
# Path to the Tesseract executable that pytesseract should invoke.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# PDF read by the script.
INPUT_PDF = "example.pdf"
# PDF written by the script, with the detected boxes drawn on every page.
OUTPUT_PDF = "boxed_example.pdf"
def process_pdf(pdf_path, *, dpi=300):
    """Rasterise a PDF, outline every Tesseract box in red, and return a new PDF.

    Each page is rendered to an RGB image, passed to
    ``pytesseract.image_to_data``, and every box in the result is drawn on
    the image. The annotated images are then saved as one multi-page PDF.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        dpi: Resolution used to render the pages and recorded as the
            resolution of the output PDF, so the two always agree.

    Returns:
        bytes: The contents of the generated image-only PDF.

    Raises:
        ValueError: If the document contains no pages, since there is
            nothing to write to the output PDF.
    """
    # PDF user space is 72 units per inch; scaling by dpi/72 renders at `dpi`.
    zoom = dpi / 72.0
    render_matrix = fitz.Matrix(zoom, zoom)

    boxed_pages = []
    # The context manager closes the document even if rendering or OCR raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap(matrix=render_matrix, alpha=False)
            # Build the PIL image directly from the raw RGB samples instead
            # of encoding the page to PNG and decoding it again.
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            data = pytesseract.image_to_data(img, output_type=Output.DICT)
            # NOTE(review): every row returned by Tesseract is drawn; this
            # presumably includes page/block/line-level boxes as well as
            # words. Filter on data["text"] or data["level"] if only word
            # boxes are wanted.
            draw = ImageDraw.Draw(img)
            for x, y, w, h in zip(
                data["left"], data["top"], data["width"], data["height"]
            ):
                draw.rectangle([x, y, x + w, y + h], outline="red", width=2)

            boxed_pages.append(img)

    if not boxed_pages:
        raise ValueError(f"PDF has no pages to process: {pdf_path!r}")

    first_page, *remaining_pages = boxed_pages
    output_buffer = io.BytesIO()
    first_page.save(
        output_buffer,
        "PDF",
        resolution=dpi,
        save_all=True,
        append_images=remaining_pages,
    )
    return output_buffer.getvalue()
# Annotate the input PDF and persist the resulting bytes to disk.
boxed_pdf_bytes = process_pdf(INPUT_PDF)

with open(OUTPUT_PDF, mode="wb") as out_file:
    out_file.write(boxed_pdf_bytes)

print(f"Boxed PDF saved as {OUTPUT_PDF}")
Я пробовал менять DPI и использовать разные режимы PSM, но ничего не помогло. Обнаруживается почти весь текст, за исключением страниц 2 и 11, которые не распознаются целиком.
Подробнее здесь: https://stackoverflow.com/questions/795 ... processing