Я все еще ищу решение: кто-нибудь готов помочь. Это отсканированный PDF-файл, в который я хочу добавить несколько числовых строк. Я пробовал pytesseract, reportlab, но не получил необходимого результата.
import sys
from collections import defaultdict
from io import BytesIO
from pypdf import PdfReader, PdfWriter
from reportlab.pdfgen.canvas import Canvas
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
# Set the path to Tesseract executable (Windows-specific, adjust as needed)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program
Files\Tesseract-OCR\tesseract.exe'
# Define path to Poppler on Windows
poppler_path = r'C:\Program Files\poppler-24.07.0\Library\bin'
def extract_text_and_positions_from_image(image):
"""Use Tesseract to extract text and their positions from an image."""
data = pytesseract.image_to_data(image,
output_type=pytesseract.Output.DICT)
return data
def add_line_numbers_to_scanned_pdf(pdf_file_path,
output_pdf_path):
"""Add line numbers to a scanned PDF using OCR."""
images = convert_from_path(pdf_file_path, 300,
poppler_path=poppler_path) # Convert the scanned PDF to images
pdf_writer = PdfWriter() # Writer to create a new PDF with
line numbers
for page_num, image in enumerate(images):
# Extract text and positions using OCR
ocr_data = extract_text_and_positions_from_image(image)
# Get the width and height of the image for proper scaling
image_width, image_height = image.size
overlay = BytesIO()
canvas = Canvas(overlay, pagesize=(image_width, image_height))
line_number = 0
previous_y = None
line_height = 15 # Approximate line height (adjust for spacing)
non_empty_lines = [] # Store non-empty line y-coordinates
# Collect non-empty lines based on OCR bounding boxes
for i in range(len(ocr_data['level'])):
y = int(ocr_data['top'][i])
text = ocr_data['text'][i]
# Skip empty or invalid text
if not text.strip():
continue
# Track non-empty lines based on their Y positions
if previous_y is None or abs(previous_y - y) >= line_height:
non_empty_lines.append(y)
previous_y = y
# Sort the y-positions to ensure numbering from top to bottom
non_empty_lines.sort(reverse=True)
# Draw line numbers starting from the 10th non-empty line
for i, y in enumerate(non_empty_lines):
# Start numbering from the 10th line and continue numbering every 10th line
if i >= 9 and (i + 1) % 10 == 0:
line_number += 10
canvas.setFont("Helvetica", 30)
# Align the line number on the right side of the page
canvas.drawString(image_width - 70, y, str(line_number)) # Adjust `image_width - 50` to align right
# Add page number in the top-right corner
canvas.setFont("Helvetica", 80)
canvas.drawString(image_width - 70, image_height - 60, str(page_num + 1)) # Page number
canvas.save()
overlay.seek(0)
# Convert PIL image to a PDF format
img_pdf = BytesIO()
image.save(img_pdf, format='PDF')
img_pdf.seek(0)
# Create a PdfReader object for both the image and the overlay
image_reader = PdfReader(img_pdf)
overlay_pdf = PdfReader(overlay)
# Get the first page from both
page = image_reader.pages[0]
page.merge_page(overlay_pdf.pages[0]) # Merge the overlay with the image page
pdf_writer.add_page(page)
# Save the final PDF with line numbers
with open(output_pdf_path, 'wb') as output_file:
pdf_writer.write(output_file)
print(f"Line numbers added to scanned PDF and saved as {output_pdf_path}.")
# Example usage
input_pdf = r'C:\PyCharm
Projects\AirRumi\media\20240703125513952.pdf' # Path to the scanned PDF
output_pdf = r'C:\PyCharm
Projects\AirRumi\media\text_with_line_numbers.pdf' # Output PDF
# For scanned PDFs
add_line_numbers_to_scanned_pdf(input_pdf, output_pdf)
Это обновленный код, вывод не совсем соответствует тексту
Я все еще ищу решение: кто-нибудь готов помочь. Это отсканированный PDF-файл, в который я хочу добавить несколько числовых строк. Я пробовал pytesseract, reportlab, но не получил необходимого результата. [img]https://i.sstatic.net/YFyty6ex.jpg[/img]
Что я получаю в настоящее время [img] [/img]
Вот мой обновленный код, он работает, но неточно [code] import sys from collections import defaultdict from io import BytesIO from pypdf import PdfReader, PdfWriter from reportlab.pdfgen.canvas import Canvas from pdf2image import convert_from_path import pytesseract from PIL import Image
# Set the path to Tesseract executable (Windows-specific, adjust as needed) pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
# Define path to Poppler on Windows poppler_path = r'C:\Program Files\poppler-24.07.0\Library\bin'
def extract_text_and_positions_from_image(image): """Use Tesseract to extract text and their positions from an image.""" data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT) return data
def add_line_numbers_to_scanned_pdf(pdf_file_path, output_pdf_path): """Add line numbers to a scanned PDF using OCR.""" images = convert_from_path(pdf_file_path, 300, poppler_path=poppler_path) # Convert the scanned PDF to images
pdf_writer = PdfWriter() # Writer to create a new PDF with line numbers
for page_num, image in enumerate(images): # Extract text and positions using OCR ocr_data = extract_text_and_positions_from_image(image)
# Get the width and height of the image for proper scaling image_width, image_height = image.size overlay = BytesIO() canvas = Canvas(overlay, pagesize=(image_width, image_height))
line_number = 0 previous_y = None line_height = 15 # Approximate line height (adjust for spacing)
non_empty_lines = [] # Store non-empty line y-coordinates
# Collect non-empty lines based on OCR bounding boxes for i in range(len(ocr_data['level'])): y = int(ocr_data['top'][i]) text = ocr_data['text'][i]
# Skip empty or invalid text if not text.strip(): continue
# Track non-empty lines based on their Y positions if previous_y is None or abs(previous_y - y) >= line_height: non_empty_lines.append(y) previous_y = y
# Sort the y-positions to ensure numbering from top to bottom non_empty_lines.sort(reverse=True)
# Draw line numbers starting from the 10th non-empty line for i, y in enumerate(non_empty_lines): # Start numbering from the 10th line and continue numbering every 10th line if i >= 9 and (i + 1) % 10 == 0: line_number += 10 canvas.setFont("Helvetica", 30) # Align the line number on the right side of the page canvas.drawString(image_width - 70, y, str(line_number)) # Adjust `image_width - 50` to align right
# Add page number in the top-right corner canvas.setFont("Helvetica", 80) canvas.drawString(image_width - 70, image_height - 60, str(page_num + 1)) # Page number canvas.save() overlay.seek(0)
# Convert PIL image to a PDF format img_pdf = BytesIO() image.save(img_pdf, format='PDF') img_pdf.seek(0)
# Create a PdfReader object for both the image and the overlay image_reader = PdfReader(img_pdf) overlay_pdf = PdfReader(overlay)
# Get the first page from both page = image_reader.pages[0] page.merge_page(overlay_pdf.pages[0]) # Merge the overlay with the image page
pdf_writer.add_page(page)
# Save the final PDF with line numbers with open(output_pdf_path, 'wb') as output_file: pdf_writer.write(output_file)
print(f"Line numbers added to scanned PDF and saved as {output_pdf_path}.")
# Example usage input_pdf = r'C:\PyCharm Projects\AirRumi\media\20240703125513952.pdf' # Path to the scanned PDF output_pdf = r'C:\PyCharm Projects\AirRumi\media\text_with_line_numbers.pdf' # Output PDF
# For scanned PDFs add_line_numbers_to_scanned_pdf(input_pdf, output_pdf) [/code] Это обновленный код, вывод не совсем соответствует тексту