Добавление числовой строки в отсканированные PDF-файлы

Добавление числовой строки в отсканированные PDF-файлы ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Добавление числовой строки в отсканированные PDF-файлы

Цитата

Сообщение Anonymous » 26 ноя 2024, 10:49

Я все еще ищу решение: кто-нибудь готов помочь. Это отсканированный PDF-файл, в который я хочу добавить несколько числовых строк. Я пробовал pytesseract, reportlab, но не получил необходимого результата.

Что я получаю в настоящее время

Вот мой обновленный код, он работает, но неточно

Код: Выделить всё

    import sys
from collections import defaultdict
from io import BytesIO
from pypdf import PdfReader, PdfWriter
from reportlab.pdfgen.canvas import Canvas
from pdf2image import convert_from_path
import pytesseract
from PIL import Image

# Set the path to Tesseract executable (Windows-specific, adjust as needed)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program
Files\Tesseract-OCR\tesseract.exe'

# Define path to Poppler on Windows
poppler_path = r'C:\Program Files\poppler-24.07.0\Library\bin'

def extract_text_and_positions_from_image(image):
"""Use Tesseract to extract text and their positions from an image."""
data = pytesseract.image_to_data(image,
output_type=pytesseract.Output.DICT)
return data

def add_line_numbers_to_scanned_pdf(pdf_file_path,
output_pdf_path):
"""Add line numbers to a scanned PDF using OCR."""
images = convert_from_path(pdf_file_path, 300,
poppler_path=poppler_path)  # Convert the scanned PDF to images

pdf_writer = PdfWriter()  # Writer to create a new PDF with
line numbers

for page_num, image in enumerate(images):
# Extract text and positions using OCR
ocr_data = extract_text_and_positions_from_image(image)

# Get the width and height of the image for proper scaling
image_width, image_height = image.size
overlay = BytesIO()
canvas = Canvas(overlay, pagesize=(image_width, image_height))

line_number = 0
previous_y = None
line_height = 15  # Approximate line height (adjust for spacing)

non_empty_lines = []  # Store non-empty line y-coordinates

# Collect non-empty lines based on OCR bounding boxes
for i in range(len(ocr_data['level'])):
y = int(ocr_data['top'][i])
text = ocr_data['text'][i]

# Skip empty or invalid text
if not text.strip():
continue

# Track non-empty lines based on their Y positions
if previous_y is None or abs(previous_y - y) >= line_height:
non_empty_lines.append(y)
previous_y = y

# Sort the y-positions to ensure numbering from top to bottom
non_empty_lines.sort(reverse=True)

# Draw line numbers starting from the 10th non-empty line
for i, y in enumerate(non_empty_lines):
# Start numbering from the 10th line and continue numbering every 10th line
if i >= 9 and (i + 1) % 10 == 0:
line_number += 10
canvas.setFont("Helvetica", 30)
# Align the line number on the right side of the page
canvas.drawString(image_width - 70, y, str(line_number))  # Adjust `image_width - 50` to align right

# Add page number in the top-right corner
canvas.setFont("Helvetica", 80)
canvas.drawString(image_width - 70, image_height - 60, str(page_num + 1))  # Page number
canvas.save()
overlay.seek(0)

# Convert PIL image to a PDF format
img_pdf = BytesIO()
image.save(img_pdf, format='PDF')
img_pdf.seek(0)

# Create a PdfReader object for both the image and the overlay
image_reader = PdfReader(img_pdf)
overlay_pdf = PdfReader(overlay)

# Get the first page from both
page = image_reader.pages[0]
page.merge_page(overlay_pdf.pages[0])  # Merge the overlay with the image page

pdf_writer.add_page(page)

# Save the final PDF with line numbers
with open(output_pdf_path,  'wb') as output_file:
pdf_writer.write(output_file)

print(f"Line numbers added to scanned PDF and saved as {output_pdf_path}.")

# Example usage
input_pdf = r'C:\PyCharm
Projects\AirRumi\media\20240703125513952.pdf'  # Path to the scanned PDF
output_pdf = r'C:\PyCharm
Projects\AirRumi\media\text_with_line_numbers.pdf'  # Output PDF

# For scanned PDFs
add_line_numbers_to_scanned_pdf(input_pdf, output_pdf)

Это обновленный код, вывод не совсем соответствует тексту

Подробнее здесь: https://stackoverflow.com/questions/790 ... anned-pdfs

1732607370

Anonymous

Я все еще ищу решение: кто-нибудь готов помочь. Это отсканированный PDF-файл, в который я хочу добавить несколько числовых строк.  Я пробовал pytesseract, reportlab, но не получил необходимого результата.
[img]https://i.sstatic.net/YFyty6ex.jpg[/img]

Что я получаю в настоящее время
[img] [/img]

Вот мой обновленный код, он работает, но неточно
[code]    import sys
from collections import defaultdict
from io import BytesIO
from pypdf import PdfReader, PdfWriter
from reportlab.pdfgen.canvas import Canvas
from pdf2image import convert_from_path
import pytesseract
from PIL import Image

# Set the path to Tesseract executable (Windows-specific, adjust as needed)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program
Files\Tesseract-OCR\tesseract.exe'

# Define path to Poppler on Windows
poppler_path = r'C:\Program Files\poppler-24.07.0\Library\bin'

def extract_text_and_positions_from_image(image):
"""Use Tesseract to extract text and their positions from an image."""
data = pytesseract.image_to_data(image,
output_type=pytesseract.Output.DICT)
return data

def add_line_numbers_to_scanned_pdf(pdf_file_path,
output_pdf_path):
"""Add line numbers to a scanned PDF using OCR."""
images = convert_from_path(pdf_file_path, 300,
poppler_path=poppler_path)  # Convert the scanned PDF to images

pdf_writer = PdfWriter()  # Writer to create a new PDF with
line numbers

for page_num, image in enumerate(images):
# Extract text and positions using OCR
ocr_data = extract_text_and_positions_from_image(image)

# Get the width and height of the image for proper scaling
image_width, image_height = image.size
overlay = BytesIO()
canvas = Canvas(overlay, pagesize=(image_width, image_height))

line_number = 0
previous_y = None
line_height = 15  # Approximate line height (adjust for spacing)

non_empty_lines = []  # Store non-empty line y-coordinates

# Collect non-empty lines based on OCR bounding boxes
for i in range(len(ocr_data['level'])):
y = int(ocr_data['top'][i])
text = ocr_data['text'][i]

# Skip empty or invalid text
if not text.strip():
continue

# Track non-empty lines based on their Y positions
if previous_y is None or abs(previous_y - y) >= line_height:
non_empty_lines.append(y)
previous_y = y

# Sort the y-positions to ensure numbering from top to bottom
non_empty_lines.sort(reverse=True)

# Draw line numbers starting from the 10th non-empty line
for i, y in enumerate(non_empty_lines):
# Start numbering from the 10th line and continue numbering every 10th line
if i >= 9 and (i + 1) % 10 == 0:
line_number += 10
canvas.setFont("Helvetica", 30)
# Align the line number on the right side of the page
canvas.drawString(image_width - 70, y, str(line_number))  # Adjust `image_width - 50` to align right

# Add page number in the top-right corner
canvas.setFont("Helvetica", 80)
canvas.drawString(image_width - 70, image_height - 60, str(page_num + 1))  # Page number
canvas.save()
overlay.seek(0)

# Convert PIL image to a PDF format
img_pdf = BytesIO()
image.save(img_pdf, format='PDF')
img_pdf.seek(0)

# Create a PdfReader object for both the image and the overlay
image_reader = PdfReader(img_pdf)
overlay_pdf = PdfReader(overlay)

# Get the first page from both
page = image_reader.pages[0]
page.merge_page(overlay_pdf.pages[0])  # Merge the overlay with the image page

pdf_writer.add_page(page)

# Save the final PDF with line numbers
with open(output_pdf_path,  'wb') as output_file:
pdf_writer.write(output_file)

print(f"Line numbers added to scanned PDF and saved as {output_pdf_path}.")

# Example usage
input_pdf = r'C:\PyCharm
Projects\AirRumi\media\20240703125513952.pdf'  # Path to the scanned PDF
output_pdf = r'C:\PyCharm
Projects\AirRumi\media\text_with_line_numbers.pdf'  # Output PDF

# For scanned PDFs
add_line_numbers_to_scanned_pdf(input_pdf, output_pdf)
[/code]
Это обновленный код, вывод не совсем соответствует тексту 

Подробнее здесь: [url]https://stackoverflow.com/questions/79077859/adding-number-line-on-scanned-pdfs[/url]