Извлечение изображений из столбца таблицы в PDF и сохранение их в Excel

Извлечение изображений из столбца таблицы в PDF и сохранение их в Excel ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Извлечение изображений из столбца таблицы в PDF и сохранение их в Excel

Цитата

Сообщение Anonymous » 03 дек 2024, 18:29

При использовании pdfplumber я не могу извлечь изображения из определённого столбца таблицы.
Для получения изображений я использую следующие пакеты:
import fitz # PyMuPDF
from PIL import Image

Мне нужно использовать pdfplumber и указанные выше пакеты, чтобы разместить изображения в строках Excel так же, как в исходном PDF-файле.
from flask import Flask, request, send_file, jsonify
import pdfplumber
import pandas as pd
import os
from flask_cors import CORS
import io
import fitz # PyMuPDF
from PIL import Image
from openpyxl import Workbook
from openpyxl.drawing.image import Image as ExcelImage
from openpyxl.utils.dataframe import dataframe_to_rows

# Create the Flask application object that the route decorators below attach
# to, then pass it to flask_cors.CORS to set up cross-origin handling.
app = Flask(__name__)
CORS(app)

@app.route('/')
def home():
    """Return a JSON welcome message for the API root."""
    return jsonify({'message': 'Welcome to the PDF to Excel converter API!'})

def _collect_table_rows(page, include_header):
    """Return the rows of every table on a pdfplumber page as one list.

    All tables of the page are stacked into a single DataFrame; when
    ``include_header`` is true the DataFrame's column labels are emitted as
    the first row.
    """
    frames = [pd.DataFrame(table) for table in page.extract_tables()]
    if not frames:
        return []
    combined = pd.concat(frames, ignore_index=True)
    return list(dataframe_to_rows(combined, index=False, header=include_header))


def _add_page_images(wb, pdf_file, page_num, output_dir):
    """Save each image of one page to disk and embed it on its own sheet.

    ``page_num`` is 1-based; images are written to
    ``<output_dir>/page_<n>_img_<m>.<ext>`` and anchored at ``A1`` of a new
    sheet named ``Page_<n>_Image_<m>``.
    """
    page_mupdf = pdf_file[page_num - 1]  # Page index is 0-based
    image_list = page_mupdf.get_images(full=True)
    for img_number, img in enumerate(image_list, start=1):
        xref = img[0]
        base_image = pdf_file.extract_image(xref)
        image_ext = base_image["ext"]
        image_filename = f'{output_dir}/page_{page_num}_img_{img_number}.{image_ext}'

        # Close the decoded image as soon as it has been written to disk.
        with Image.open(io.BytesIO(base_image["image"])) as image:
            image.save(image_filename)

        ws_images = wb.create_sheet(title=f"Page_{page_num}_Image_{img_number}")
        ws_images.add_image(ExcelImage(image_filename), 'A1')


@app.route('/upload', methods=['POST'])
def upload_file():
    """Convert an uploaded PDF into an Excel workbook and send it back.

    Expects a multipart POST with the PDF in the ``file`` field. The upload
    is stored under ``uploads/``; every table extracted by pdfplumber is
    appended to a ``Data`` sheet (column labels are written only for page 1),
    and every image extracted by PyMuPDF is saved under ``images/`` and
    embedded on its own sheet.

    Returns:
        The generated ``.xlsx`` file as an attachment on success; a JSON
        ``{'error': ...}`` body with status 400 when no usable file was sent,
        or status 500 when the conversion raises.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400
    file = request.files['file']
    if not file.filename:
        return jsonify({'error': 'No selected file'}), 400

    # The filename is client-controlled: keep only its last path component so
    # a name such as "../../x.pdf" cannot escape the uploads directory.
    safe_name = os.path.basename(file.filename.replace('\\', '/'))
    if safe_name in ('', '.', '..'):
        return jsonify({'error': 'No selected file'}), 400

    os.makedirs('uploads', exist_ok=True)
    pdf_path = os.path.join('uploads', safe_name)
    file.save(pdf_path)

    # Swap only the real extension. str.replace('.pdf', '.xlsx') would rewrite
    # every ".pdf" in the path and would leave names like "X.PDF" unchanged,
    # making the workbook overwrite the uploaded file.
    excel_path = os.path.splitext(pdf_path)[0] + '.xlsx'

    try:
        wb = Workbook()
        ws_data = wb.active
        ws_data.title = "Data"

        output_dir = 'images'
        os.makedirs(output_dir, exist_ok=True)

        with pdfplumber.open(pdf_path) as pdf:
            # Open the PyMuPDF document once for all pages and always close
            # it, instead of reopening (and leaking) it for every page.
            pdf_file = fitz.open(pdf_path)
            try:
                for page_num, page in enumerate(pdf.pages, start=1):
                    # Add headers only once for the first page
                    for row in _collect_table_rows(page, page_num == 1):
                        ws_data.append(row)
                    _add_page_images(wb, pdf_file, page_num, output_dir)
            finally:
                pdf_file.close()

        wb.save(excel_path)
        return send_file(excel_path, as_attachment=True)
    except Exception as e:
        # Top-level request boundary: record the traceback, then report the
        # failure to the client as JSON.
        app.logger.exception('PDF to Excel conversion failed')
        return jsonify({'error': str(e)}), 500

if __name__ == '__main__':
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs('uploads', exist_ok=True)
    # NOTE(review): debug=True combined with host='0.0.0.0' makes the
    # interactive debugger reachable from other machines; keep this for
    # local development only.
    app.run(debug=True, port=2000, host='0.0.0.0')

Подробнее здесь: https://stackoverflow.com/questions/78754139/extracting-images-from-a-table-column-in-a-pdf-and-saving-them-to-excel

1733239773

Anonymous


[img]https://i.sstatic.net/HfjCAnOy.png[/img]

При использовании pdfplumber я не могу извлечь изображения из определённого столбца таблицы.
Для получения изображений я использую следующие пакеты:
import fitz  # PyMuPDF
from PIL import Image

Мне нужно использовать pdfplumber и указанные выше пакеты, чтобы разместить изображения в строках Excel так же, как в исходном PDF-файле.
from flask import Flask, request, send_file, jsonify
import pdfplumber
import pandas as pd
import os
from flask_cors import CORS
import io
import fitz  # PyMuPDF
from PIL import Image
from openpyxl import Workbook
from openpyxl.drawing.image import Image as ExcelImage
from openpyxl.utils.dataframe import dataframe_to_rows

# Create the Flask application object that the route decorators below attach
# to, then pass it to flask_cors.CORS to set up cross-origin handling.
app = Flask(__name__)
CORS(app)

@app.route('/')
def home():
    """Return a JSON welcome message for the API root."""
    return jsonify({'message': 'Welcome to the PDF to Excel converter API!'})

def _collect_table_rows(page, include_header):
    """Return the rows of every table on a pdfplumber page as one list.

    All tables of the page are stacked into a single DataFrame; when
    ``include_header`` is true the DataFrame's column labels are emitted as
    the first row.
    """
    frames = [pd.DataFrame(table) for table in page.extract_tables()]
    if not frames:
        return []
    combined = pd.concat(frames, ignore_index=True)
    return list(dataframe_to_rows(combined, index=False, header=include_header))


def _add_page_images(wb, pdf_file, page_num, output_dir):
    """Save each image of one page to disk and embed it on its own sheet.

    ``page_num`` is 1-based; images are written to
    ``<output_dir>/page_<n>_img_<m>.<ext>`` and anchored at ``A1`` of a new
    sheet named ``Page_<n>_Image_<m>``.
    """
    page_mupdf = pdf_file[page_num - 1]  # Page index is 0-based
    image_list = page_mupdf.get_images(full=True)
    for img_number, img in enumerate(image_list, start=1):
        xref = img[0]
        base_image = pdf_file.extract_image(xref)
        image_ext = base_image["ext"]
        image_filename = f'{output_dir}/page_{page_num}_img_{img_number}.{image_ext}'

        # Close the decoded image as soon as it has been written to disk.
        with Image.open(io.BytesIO(base_image["image"])) as image:
            image.save(image_filename)

        ws_images = wb.create_sheet(title=f"Page_{page_num}_Image_{img_number}")
        ws_images.add_image(ExcelImage(image_filename), 'A1')


@app.route('/upload', methods=['POST'])
def upload_file():
    """Convert an uploaded PDF into an Excel workbook and send it back.

    Expects a multipart POST with the PDF in the ``file`` field. The upload
    is stored under ``uploads/``; every table extracted by pdfplumber is
    appended to a ``Data`` sheet (column labels are written only for page 1),
    and every image extracted by PyMuPDF is saved under ``images/`` and
    embedded on its own sheet.

    Returns:
        The generated ``.xlsx`` file as an attachment on success; a JSON
        ``{'error': ...}`` body with status 400 when no usable file was sent,
        or status 500 when the conversion raises.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400
    file = request.files['file']
    if not file.filename:
        return jsonify({'error': 'No selected file'}), 400

    # The filename is client-controlled: keep only its last path component so
    # a name such as "../../x.pdf" cannot escape the uploads directory.
    safe_name = os.path.basename(file.filename.replace('\\', '/'))
    if safe_name in ('', '.', '..'):
        return jsonify({'error': 'No selected file'}), 400

    os.makedirs('uploads', exist_ok=True)
    pdf_path = os.path.join('uploads', safe_name)
    file.save(pdf_path)

    # Swap only the real extension. str.replace('.pdf', '.xlsx') would rewrite
    # every ".pdf" in the path and would leave names like "X.PDF" unchanged,
    # making the workbook overwrite the uploaded file.
    excel_path = os.path.splitext(pdf_path)[0] + '.xlsx'

    try:
        wb = Workbook()
        ws_data = wb.active
        ws_data.title = "Data"

        output_dir = 'images'
        os.makedirs(output_dir, exist_ok=True)

        with pdfplumber.open(pdf_path) as pdf:
            # Open the PyMuPDF document once for all pages and always close
            # it, instead of reopening (and leaking) it for every page.
            pdf_file = fitz.open(pdf_path)
            try:
                for page_num, page in enumerate(pdf.pages, start=1):
                    # Add headers only once for the first page
                    for row in _collect_table_rows(page, page_num == 1):
                        ws_data.append(row)
                    _add_page_images(wb, pdf_file, page_num, output_dir)
            finally:
                pdf_file.close()

        wb.save(excel_path)
        return send_file(excel_path, as_attachment=True)
    except Exception as e:
        # Top-level request boundary: record the traceback, then report the
        # failure to the client as JSON.
        app.logger.exception('PDF to Excel conversion failed')
        return jsonify({'error': str(e)}), 500

if __name__ == '__main__':
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs('uploads', exist_ok=True)
    # NOTE(review): debug=True combined with host='0.0.0.0' makes the
    # interactive debugger reachable from other machines; keep this for
    # local development only.
    app.run(debug=True, port=2000, host='0.0.0.0')
 

Подробнее здесь: [url]https://stackoverflow.com/questions/78754139/extracting-images-from-a-table-column-in-a-pdf-and-saving-them-to-excel[/url]