Это функция Python, которая у меня есть. :
Код: Выделить всё
def extract_text_and_save_images_not_working(pdf_path):
doc = fitz.open(pdf_path)
full_text = ""
image_counter = 1 # Initialize the image counter before iterating through pages
for page_num in range(len(doc)): # Iterate through each page of the pdf document
page = doc.load_page(page_num) # Load the pdf page
blocks = page.get_text("dict")["blocks"] # The list of block dictionaries
for block in blocks: # Iterate through each block
if block['type'] == 0: # If the block is a text block
for line in block["lines"]: # Iterate through lines in the block
for span in line["spans"]: # Iterate through spans in the line
full_text += span["text"] + " " # Append text to full_text
full_text += "\n" # Add newline after each block
elif block['type'] == 1: # If the block is an image block
image_label = f"" # Label to insert in the extracted text in place of the corresponding image
full_text += f"{image_label}\n" # Insert image label at the image location
img = block['image']
xref = img[0]
print()
print(xref)
print()
base_image = doc.extract_image(xref) # Attempt to extract image
image_bytes = base_image["image"] # Get the image bytes
image_filename = f"image_{image_counter}.png"
with open(image_filename, "wb") as img_file: # Save the image
img_file.write(image_bytes)
image_counter += 1 # Increment counter for next image regardless of extraction success
doc.close() # Close the pdf document
return full_text
Код: Выделить всё
block['type'] == 0
Код: Выделить всё
block['type'] == 1). Если блок представляет собой изображение, то функция сохраняет изображение в том же каталоге работающего скрипта с этим именем f"image_{image_counter}.png"
Код: Выделить всё
f""
Теперь, когда я запускаю эту функцию, я получаю следующую ошибку:
Код: Выделить всё
Traceback (most recent call last):
File "c:\Users\xxxx\Desktop\X_Project\extract_images_from_pdf\extract_text_and_images_from_pdf.py", line 93, in
extracted_text = extract_text_and_save_images_not_working(pdf_path)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:\Users\xxxx\Desktop\X_Project\extract_images_from_pdf\extract_text_and_images_from_pdf.py", line 76, in extract_text_and_save_images_not_working
base_image = doc.extract_image(xref) # Attempt to extract image
^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\xxxx\Desktop\X_Project\extract_images_from_pdf\venv\Lib\site-packages\fitz\__init__.py", line 3894, in extract_image
raise ValueError( MSG_BAD_XREF)
ValueError: bad xref
Подробнее здесь: https://stackoverflow.com/questions/781 ... t-textdict