Это код, который извлекает текст рукописи из https://hal.science/hal-04206682/document:
Код: Выделить всё
import fitz # PyMuPDF
import requests
import io
def extract_text_from_pdf(pdf_file):
    """Extract the text of a two-column PDF, left column before right column.

    Each page is split at its horizontal midpoint. Text blocks are assigned
    to the left or right column by their bounding box, sorted top-to-bottom
    within each column, and emitted left column first.

    Args:
        pdf_file: In-memory PDF content passed to ``fitz.open`` as ``stream``
            (the calling script supplies an ``io.BytesIO``).

    Returns:
        A single string: the text of each block on its own line, with pages
        separated by a blank line.
    """
    full_text = []
    # Open the PDF from the stream; the context manager closes the document
    # even if extraction fails part-way through.
    with fitz.open(stream=pdf_file, filetype="pdf") as doc:
        for page in doc:
            blocks = page.get_text("dict")["blocks"]
            # Middle of the page, used as the boundary between the columns.
            mid_x = page.rect.width / 2
            left_column = []
            right_column = []
            for block in blocks:
                if "bbox" not in block:
                    continue
                x0, _, x1, _ = block["bbox"]
                if x1 <= mid_x:
                    # Entirely left of the midline.
                    left_column.append(block)
                elif x0 >= mid_x:
                    # Entirely right of the midline.
                    right_column.append(block)
                elif (x0 + x1) / 2 < mid_x:
                    # Straddles the midline (e.g. a full-width heading):
                    # assign by horizontal centre so its text is not dropped.
                    left_column.append(block)
                else:
                    right_column.append(block)
            # Sort blocks by their vertical position (top) within each column.
            left_column.sort(key=lambda b: b["bbox"][1])
            right_column.sort(key=lambda b: b["bbox"][1])
            # Extract text column by column; blocks without "lines" carry no
            # text spans and are skipped.
            page_text = []
            for column in (left_column, right_column):
                for block in column:
                    if "lines" not in block:
                        continue
                    spans = (
                        span["text"]
                        for line in block["lines"]
                        for span in line["spans"]
                    )
                    page_text.append(" ".join(spans).strip())
            # Combine text from both columns into the page text.
            full_text.append("\n".join(page_text))
    return "\n\n".join(full_text)
# Fetch the PDF from the URL
url = 'https://hal.science/hal-04206682/document'
try:
    # Without a timeout, requests may wait forever on an unresponsive server.
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Raise an error if the request failed
except requests.exceptions.RequestException as e:
    print(f"Error downloading the PDF: {e}")
else:
    # Download succeeded: load the PDF content into memory and extract text.
    pdf_file = io.BytesIO(response.content)
    text = extract_text_from_pdf(pdf_file)
    print(text)
Код: Выделить всё
Подробнее здесь: https://stackoverflow.com/questions/792 ... erences-or
Мобильная версия