Как извлечь таблицу с флажками из PDF в фрейм данных pandas

Как извлечь таблицу с флажками из PDF в фрейм данных pandas ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Как извлечь таблицу с флажками из PDF в фрейм данных pandas

Цитата

Сообщение Anonymous » 03 ноя 2025, 17:50

У меня есть код, который находит в PDF-файле таблицу, расположенную после определённого раздела, разбирает её содержимое и копирует его в фрейм данных pandas.
Теперь я хочу определять, отмечен ли флажок (то есть не пуст ли он) рядом с каждым элементом, извлечённым из таблицы.
Вот ссылка на PDF-файл
Вот мой код, который пока не может определить, отмечен флажок или нет.

Код: Выделить всё

import pandas as pd
import re
import fitz
from math import sqrt
from io import BytesIO

# Path of the PDF that the run statement at the bottom of the script parses.
PDF_FILE_NAME = "path/test_doc.pdf"
# Heading text that precedes the table; extract_table_text matches it
# case-insensitively.
SECTION_HEADER = "Section 3: Table"

# --- Helper Functions (Re-using the reliable text extraction) ---

def clean_item_text(text):
    """Strip a leading checkbox marker from a cell and tidy its whitespace.

    Args:
        text: Raw cell text. May be ``None``/NaN or an empty string.

    Returns:
        The text with any leading checkbox glyphs (☑, ☐, □, ■, ✓) or a
        stand-alone leading ``X``/``x`` mark removed and surrounding
        whitespace stripped. Missing or empty input yields ``""``.
    """
    if pd.isna(text) or text == "":
        return ""
    # The pattern is anchored at the start so that letters inside the item
    # name survive; an unanchored class would delete every X/x it finds
    # (e.g. "Box 1" -> "Bo1"). An X/x only counts as a mark when it stands
    # alone, i.e. is followed by whitespace or the end of the text.
    cleaned = re.sub(
        r"^(?:[\u2611\u2610\u25A1\u25A0\u2713]\s*|[Xx](?:\s+|$))+",
        "",
        str(text).strip(),
    )
    return cleaned.strip()

def extract_table_text(pdf_path, section_header, column_sizes=(3, 6, 6)):
    """Extract the table that follows *section_header* into a DataFrame.

    The text of every page is concatenated, the part between the requested
    heading and the next ``Section N:`` heading is split into non-empty
    lines, and those lines are distributed over the columns by position.

    Args:
        pdf_path: Path of the PDF document, opened with ``fitz.open``.
        section_header: Heading text that precedes the table; matched
            literally and case-insensitively.
        column_sizes: Number of item lines that belong to each column, in
            column order. The default ``(3, 6, 6)`` reproduces the layout
            this script was written for.

    Returns:
        A ``(df, headers)`` tuple: the DataFrame of cleaned item names (short
        columns padded with ``""``) and the list of column header strings.

    Raises:
        ValueError: If *column_sizes* is empty, the heading is not found, or
            too few lines follow it to hold the headers plus one item.
    """
    if not column_sizes:
        raise ValueError("column_sizes must name at least one column.")

    with fitz.open(pdf_path) as doc:
        full_text = "".join(page.get_text("text") for page in doc)
    # NOTE(review): "Sec$on" is presumably how the extractor renders the
    # heading's "ti" ligature in this document - confirm against the PDF.
    full_text = full_text.replace("Sec$on", "Section")

    section_match = re.search(re.escape(section_header), full_text, re.IGNORECASE)
    if not section_match:
        raise ValueError(f"Section '{section_header}' not found.")
    text_after_section = full_text[section_match.end():].strip()
    # Keep only the text up to the next "Section N:" heading.
    table_text = re.split(r"Section\s*\d+\s*:", text_after_section, maxsplit=1)[0]

    lines = [line.strip() for line in table_text.split("\n") if line.strip()]

    # The first two lines after the heading are not used; the column headers
    # start at index 2 and the item lines follow them directly.
    header_start = 2
    header_end = header_start + len(column_sizes)
    if len(lines) < header_end + 1:
        raise ValueError("Insufficient lines found for table structure.")

    headers = [line.strip('"').strip() for line in lines[header_start:header_end]]
    items_raw = lines[header_end:]

    # Slice consecutive runs of item lines, one run per column.
    columns = []
    start = 0
    for size in column_sizes:
        columns.append([clean_item_text(x) for x in items_raw[start:start + size]])
        start += size

    # Pad the shorter columns so that all of them have the same length.
    maxlen = max(len(column) for column in columns)
    for column in columns:
        column.extend([""] * (maxlen - len(column)))

    df = pd.DataFrame(dict(zip(headers, columns)))

    # Return both the DataFrame and the list of headers
    return df, headers

# --- OCR/Image Analysis Logic ---

def scan_checkbox_roi(pdf_path, df, all_headers):
    """Classify the region left of each item name as checked or unchecked.

    For every page, the first word whose cleaned text equals an item name is
    located, a small rectangle to its left is rendered with a 3x scaling
    matrix, and the share of dark pixels in that image decides the status.

    Args:
        pdf_path: Path of the PDF document, opened with ``fitz.open``.
        df: DataFrame holding the cleaned item names.
        all_headers: Names of the columns of *df* that contain item names.

    Returns:
        A dict mapping item name to ``"checked"``, ``"unchecked"``, or ``""``
        when no usable region could be rendered. Items that are never found
        as a single word on any page are absent from the dict.
    """
    mapping = {}

    # A set gives constant-time membership tests in the word loop below.
    all_items = {
        item
        for col in all_headers
        for item in df[col].dropna().tolist()
        if item != ""
    }

    print("=" * 60)
    print("IMAGE SCAN (OCR) ATTEMPT")
    print("=" * 60)

    dark_pixel_threshold = 0.9  # luminance below 90% of white counts as dark
    matrix = fitz.Matrix(3, 3)

    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Map each item name to the bbox (x0, y0, x1, y1) of its first
            # occurrence on this page.
            item_coords = {}
            for word in page.get_text("words"):
                text = clean_item_text(word[4])
                if text in all_items and text not in item_coords:
                    item_coords[text] = word[:4]

            for item_text, item_bbox in item_coords.items():
                # ROI: from 25 to 5 units left of the word, with 5 units of
                # vertical padding above and below it.
                roi_rect = fitz.Rect(item_bbox[0] - 25, item_bbox[1] - 5,
                                     item_bbox[0] - 5, item_bbox[3] + 5)

                pix = None
                if not roi_rect.is_empty:
                    pix = page.get_pixmap(matrix=matrix, clip=roi_rect)

                total_pixels = pix.width * pix.height if pix is not None else 0
                if total_pixels == 0:
                    # Nothing was rendered, so a ratio cannot be computed.
                    mapping[item_text] = ""
                    print(f"  ✗ '{item_text}' - Invalid ROI")
                    continue

                # Read the sample buffer once: PyMuPDF documents
                # `Pixmap.samples` as a bytes copy, so fetching it for every
                # pixel would copy the whole buffer each time.
                dark_pixel_count = _count_dark_pixels(
                    pix.samples, pix.n, dark_pixel_threshold
                )
                mark_ratio = dark_pixel_count / total_pixels

                # More than 5% dark pixels is treated as a mark.
                status = "checked" if mark_ratio > 0.05 else "unchecked"

                mapping[item_text] = status
                print(f"  ✓ '{item_text}' (Ratio: {mark_ratio:.3f}) -> {status}")

    return mapping


def _count_dark_pixels(samples, channels, threshold):
    """Count the pixels in *samples* whose luminance is below *threshold*."""
    if channels < 1:
        return 0
    dark = 0
    for i in range(0, len(samples) - channels + 1, channels):
        if channels >= 3:
            # Weighted RGB-to-luminance conversion, scaled to 0..1.
            luminance = (
                0.2126 * samples[i]
                + 0.7152 * samples[i + 1]
                + 0.0722 * samples[i + 2]
            ) / 255.0
        else:
            # Fewer than three channels: use the first one as the gray level.
            luminance = samples[i] / 255.0
        if luminance < threshold:
            dark += 1
    return dark

# --- Main Logic ---

def parse_pdf_for_table_with_checkboxes(pdf_file_path, section_header):
    """Build the item table with a status column next to every item column.

    Args:
        pdf_file_path: Path of the PDF document to parse.
        section_header: Heading text that precedes the table.

    Returns:
        A DataFrame in which every original column ``<name>`` is immediately
        followed by ``<name> Status``, holding the value that
        ``scan_checkbox_roi`` reported for the item, or ``""`` for blank
        cells and for items missing from the scan result.
    """
    # 1. Extract the clean item names and original headers
    df, original_data_cols = extract_table_text(pdf_file_path, section_header)

    # 2. Use the item names to guide the image scanning for status
    checkbox_map = scan_checkbox_roi(pdf_file_path, df, original_data_cols)

    def get_status(x):
        """Look up the scan status of one cell; blank cells stay blank."""
        if pd.isna(x) or x == "":
            return ""
        return checkbox_map.get(str(x).strip(), "")

    # 3. Add the status columns. The loop runs over the header list captured
    # before any column is added, so new status columns are never revisited.
    new_cols = []
    for col in original_data_cols:
        status_col = f"{col} Status"
        df[status_col] = df[col].map(get_status)
        new_cols.extend((col, status_col))

    return df[new_cols]

# Run the whole pipeline on the configured PDF and keep the resulting
# DataFrame in `result`.
result = parse_pdf_for_table_with_checkboxes(PDF_FILE_NAME, SECTION_HEADER)

Окончательный фрейм данных должен выглядеть так:

Код: Выделить всё

Col1 Col1_Status Col2 Col2_Status Col3 Col3_Status
Item1 Checked Item4 Checked Item10 Checked
Item2         Item5         Item11
Item3         Item6         Item12
Item7 Checked Item13 Checked
Item8         Item14
Item9         Item15 Checked

Но столбцы получаются немного смещёнными, и ни один крестик во флажках не обнаруживается.
Как мне решить эту проблему?

Подробнее здесь: https://stackoverflow.com/questions/798 ... -dataframe

1762181433

Anonymous

У меня есть код, который находит в PDF-файле таблицу, расположенную после определённого раздела, разбирает её содержимое и копирует его в фрейм данных pandas.
Теперь я хочу определять, отмечен ли флажок (то есть не пуст ли он) рядом с каждым элементом, извлечённым из таблицы.
Вот ссылка на PDF-файл
Вот мой код, который пока не может определить, отмечен флажок или нет.
[code]import pandas as pd
import re
import fitz
from math import sqrt
from io import BytesIO

# Path of the PDF that the run statement at the bottom of the script parses.
PDF_FILE_NAME = "path/test_doc.pdf"
# Heading text that precedes the table; extract_table_text matches it
# case-insensitively.
SECTION_HEADER = "Section 3: Table"

# --- Helper Functions (Re-using the reliable text extraction) ---

def clean_item_text(text):
    """Strip a leading checkbox marker from a cell and tidy its whitespace.

    Args:
        text: Raw cell text. May be ``None``/NaN or an empty string.

    Returns:
        The text with any leading checkbox glyphs (☑, ☐, □, ■, ✓) or a
        stand-alone leading ``X``/``x`` mark removed and surrounding
        whitespace stripped. Missing or empty input yields ``""``.
    """
    if pd.isna(text) or text == "":
        return ""
    # The pattern is anchored at the start so that letters inside the item
    # name survive; an unanchored class would delete every X/x it finds
    # (e.g. "Box 1" -> "Bo1"). An X/x only counts as a mark when it stands
    # alone, i.e. is followed by whitespace or the end of the text.
    cleaned = re.sub(
        r"^(?:[\u2611\u2610\u25A1\u25A0\u2713]\s*|[Xx](?:\s+|$))+",
        "",
        str(text).strip(),
    )
    return cleaned.strip()

def extract_table_text(pdf_path, section_header, column_sizes=(3, 6, 6)):
    """Extract the table that follows *section_header* into a DataFrame.

    The text of every page is concatenated, the part between the requested
    heading and the next ``Section N:`` heading is split into non-empty
    lines, and those lines are distributed over the columns by position.

    Args:
        pdf_path: Path of the PDF document, opened with ``fitz.open``.
        section_header: Heading text that precedes the table; matched
            literally and case-insensitively.
        column_sizes: Number of item lines that belong to each column, in
            column order. The default ``(3, 6, 6)`` reproduces the layout
            this script was written for.

    Returns:
        A ``(df, headers)`` tuple: the DataFrame of cleaned item names (short
        columns padded with ``""``) and the list of column header strings.

    Raises:
        ValueError: If *column_sizes* is empty, the heading is not found, or
            too few lines follow it to hold the headers plus one item.
    """
    if not column_sizes:
        raise ValueError("column_sizes must name at least one column.")

    with fitz.open(pdf_path) as doc:
        full_text = "".join(page.get_text("text") for page in doc)
    # NOTE(review): "Sec$on" is presumably how the extractor renders the
    # heading's "ti" ligature in this document - confirm against the PDF.
    full_text = full_text.replace("Sec$on", "Section")

    section_match = re.search(re.escape(section_header), full_text, re.IGNORECASE)
    if not section_match:
        raise ValueError(f"Section '{section_header}' not found.")
    text_after_section = full_text[section_match.end():].strip()
    # Keep only the text up to the next "Section N:" heading.
    table_text = re.split(r"Section\s*\d+\s*:", text_after_section, maxsplit=1)[0]

    lines = [line.strip() for line in table_text.split("\n") if line.strip()]

    # The first two lines after the heading are not used; the column headers
    # start at index 2 and the item lines follow them directly.
    header_start = 2
    header_end = header_start + len(column_sizes)
    if len(lines) < header_end + 1:
        raise ValueError("Insufficient lines found for table structure.")

    headers = [line.strip('"').strip() for line in lines[header_start:header_end]]
    items_raw = lines[header_end:]

    # Slice consecutive runs of item lines, one run per column.
    columns = []
    start = 0
    for size in column_sizes:
        columns.append([clean_item_text(x) for x in items_raw[start:start + size]])
        start += size

    # Pad the shorter columns so that all of them have the same length.
    maxlen = max(len(column) for column in columns)
    for column in columns:
        column.extend([""] * (maxlen - len(column)))

    df = pd.DataFrame(dict(zip(headers, columns)))

    # Return both the DataFrame and the list of headers
    return df, headers

# --- OCR/Image Analysis Logic ---

def scan_checkbox_roi(pdf_path, df, all_headers):
    """Classify the region left of each item name as checked or unchecked.

    For every page, the first word whose cleaned text equals an item name is
    located, a small rectangle to its left is rendered with a 3x scaling
    matrix, and the share of dark pixels in that image decides the status.

    Args:
        pdf_path: Path of the PDF document, opened with ``fitz.open``.
        df: DataFrame holding the cleaned item names.
        all_headers: Names of the columns of *df* that contain item names.

    Returns:
        A dict mapping item name to ``"checked"``, ``"unchecked"``, or ``""``
        when no usable region could be rendered. Items that are never found
        as a single word on any page are absent from the dict.
    """
    mapping = {}

    # A set gives constant-time membership tests in the word loop below.
    all_items = {
        item
        for col in all_headers
        for item in df[col].dropna().tolist()
        if item != ""
    }

    print("=" * 60)
    print("IMAGE SCAN (OCR) ATTEMPT")
    print("=" * 60)

    dark_pixel_threshold = 0.9  # luminance below 90% of white counts as dark
    matrix = fitz.Matrix(3, 3)

    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Map each item name to the bbox (x0, y0, x1, y1) of its first
            # occurrence on this page.
            item_coords = {}
            for word in page.get_text("words"):
                text = clean_item_text(word[4])
                if text in all_items and text not in item_coords:
                    item_coords[text] = word[:4]

            for item_text, item_bbox in item_coords.items():
                # ROI: from 25 to 5 units left of the word, with 5 units of
                # vertical padding above and below it.
                roi_rect = fitz.Rect(item_bbox[0] - 25, item_bbox[1] - 5,
                                     item_bbox[0] - 5, item_bbox[3] + 5)

                pix = None
                if not roi_rect.is_empty:
                    pix = page.get_pixmap(matrix=matrix, clip=roi_rect)

                total_pixels = pix.width * pix.height if pix is not None else 0
                if total_pixels == 0:
                    # Nothing was rendered, so a ratio cannot be computed.
                    mapping[item_text] = ""
                    print(f"  ✗ '{item_text}' - Invalid ROI")
                    continue

                # Read the sample buffer once: PyMuPDF documents
                # `Pixmap.samples` as a bytes copy, so fetching it for every
                # pixel would copy the whole buffer each time.
                dark_pixel_count = _count_dark_pixels(
                    pix.samples, pix.n, dark_pixel_threshold
                )
                mark_ratio = dark_pixel_count / total_pixels

                # More than 5% dark pixels is treated as a mark.
                status = "checked" if mark_ratio > 0.05 else "unchecked"

                mapping[item_text] = status
                print(f"  ✓ '{item_text}' (Ratio: {mark_ratio:.3f}) -> {status}")

    return mapping


def _count_dark_pixels(samples, channels, threshold):
    """Count the pixels in *samples* whose luminance is below *threshold*."""
    if channels < 1:
        return 0
    dark = 0
    for i in range(0, len(samples) - channels + 1, channels):
        if channels >= 3:
            # Weighted RGB-to-luminance conversion, scaled to 0..1.
            luminance = (
                0.2126 * samples[i]
                + 0.7152 * samples[i + 1]
                + 0.0722 * samples[i + 2]
            ) / 255.0
        else:
            # Fewer than three channels: use the first one as the gray level.
            luminance = samples[i] / 255.0
        if luminance < threshold:
            dark += 1
    return dark

# --- Main Logic ---

def parse_pdf_for_table_with_checkboxes(pdf_file_path, section_header):
    """Build the item table with a status column next to every item column.

    Args:
        pdf_file_path: Path of the PDF document to parse.
        section_header: Heading text that precedes the table.

    Returns:
        A DataFrame in which every original column ``<name>`` is immediately
        followed by ``<name> Status``, holding the value that
        ``scan_checkbox_roi`` reported for the item, or ``""`` for blank
        cells and for items missing from the scan result.
    """
    # 1. Extract the clean item names and original headers
    df, original_data_cols = extract_table_text(pdf_file_path, section_header)

    # 2. Use the item names to guide the image scanning for status
    checkbox_map = scan_checkbox_roi(pdf_file_path, df, original_data_cols)

    def get_status(x):
        """Look up the scan status of one cell; blank cells stay blank."""
        if pd.isna(x) or x == "":
            return ""
        return checkbox_map.get(str(x).strip(), "")

    # 3. Add the status columns. The loop runs over the header list captured
    # before any column is added, so new status columns are never revisited.
    new_cols = []
    for col in original_data_cols:
        status_col = f"{col} Status"
        df[status_col] = df[col].map(get_status)
        new_cols.extend((col, status_col))

    return df[new_cols]

# Run the whole pipeline on the configured PDF and keep the resulting
# DataFrame in `result`.
result = parse_pdf_for_table_with_checkboxes(PDF_FILE_NAME, SECTION_HEADER)
[/code]
Окончательный фрейм данных должен выглядеть так:
[code]Col1 Col1_Status Col2 Col2_Status Col3 Col3_Status
Item1 Checked Item4 Checked Item10 Checked
Item2         Item5         Item11
Item3         Item6         Item12
Item7 Checked Item13 Checked
Item8         Item14
Item9         Item15 Checked
[/code]
Но столбцы получаются немного смещёнными, и ни один крестик во флажках не обнаруживается.
Как мне решить эту проблему? 

Подробнее здесь: [url]https://stackoverflow.com/questions/79806769/how-to-extract-table-from-pdf-with-boxes-into-pandas-dataframe[/url]