У меня есть код, который обнаруживает таблицу в PDF-файле, которая появляется после определенного раздела, анализирует информацию в таблице и копирует ее в фрейм данных pandas.
Теперь я хочу указать, установлен ли флажок (то есть не пуст ли он) рядом с информацией, проанализированной из таблицы.
Вот ссылка на PDF-файл
Вот мой код, который пока не может определить, отмечен ли флажок.
import pandas as pd
import re
import fitz
from math import sqrt
from io import BytesIO
# Path of the PDF document handed to the parser by the run statement at the end of the script.
PDF_FILE_NAME = "path/test_doc.pdf"
# Heading text after which the table is searched for (matched case-insensitively).
SECTION_HEADER = "Section 3: Table"
# --- Helper Functions (Re-using the reliable text extraction) ---
def clean_item_text(text):
    """Strip leading checkbox markers from a cell and tidy its whitespace.

    Recognised markers are the glyphs ☑, ☐, □, ■ and ✓, plus a standalone
    ``X``/``x`` (one that is followed by whitespace or ends the string).
    Only markers at the start of the text are removed, so letters inside the
    item name (e.g. the ``x`` in "Box") are left alone.

    Args:
        text: Raw cell value; may be ``None``/NaN or an empty string.

    Returns:
        The item name without leading markers, or ``""`` for missing/empty
        input.
    """
    if pd.isna(text) or text == "":
        return ""
    # Anchored at the start and repeated so that stacked markers ("☑ X Item")
    # are all consumed; the lookahead keeps a leading "X" that is really the
    # first letter of a word.
    cleaned = re.sub(
        r"^(?:(?:[\u2611\u2610\u25A1\u25A0\u2713]|[Xx](?=\s|$))\s*)+",
        "",
        str(text).strip(),
    )
    return cleaned.strip()
def extract_table_text(pdf_path, section_header):
    """Build a three-column DataFrame from the table that follows a heading.

    The plain text of every page is concatenated, the first case-insensitive
    occurrence of ``section_header`` is located, and the text from there up to
    the next ``Section <n>:`` heading is treated as the table. Of the
    non-blank lines in that slice, the 3rd to 5th become the column headers
    and the rest are dealt out to the columns in fixed groups of 3, 6 and 6
    entries. Shorter columns are padded with empty strings.

    Args:
        pdf_path: Path of the PDF to open.
        section_header: Literal heading text that precedes the table.

    Returns:
        A ``(df, headers)`` tuple: the DataFrame of cleaned item names and the
        list of the three header strings.

    Raises:
        ValueError: If the heading is not found, or fewer than six non-blank
            lines follow it.
    """
    with fitz.open(pdf_path) as doc:
        full_text = "".join(page.get_text("text") for page in doc)

    # The extracted text may spell the heading as "Sec$on" (presumably a
    # ligature extraction artefact); normalise it before searching.
    full_text = full_text.replace("Sec$on", "Section")

    heading = re.search(re.escape(section_header), full_text, re.IGNORECASE)
    if heading is None:
        raise ValueError(f"Section '{section_header}' not found.")

    remainder = full_text[heading.end():].strip()
    table_text = re.split(r"Section\s*\d+\s*:", remainder, maxsplit=1)[0]

    lines = []
    for raw_line in table_text.split("\n"):
        stripped = raw_line.strip()
        if stripped:
            lines.append(stripped)
    if len(lines) < 6:
        raise ValueError("Insufficient lines found for table structure.")

    headers = [entry.strip('"').strip() for entry in lines[2:5]]
    items_raw = lines[5:]

    # Fixed layout of the expected table: 3 items, then 6, then 6.
    column_slices = (slice(0, 3), slice(3, 9), slice(9, 15))
    columns = [
        [clean_item_text(cell) for cell in items_raw[part]]
        for part in column_slices
    ]

    # Pad every column to the same length so the DataFrame can be built.
    longest = max(len(column) for column in columns)
    columns = [column + [""] * (longest - len(column)) for column in columns]

    df = pd.DataFrame(dict(zip(headers, columns)))
    return df, headers
# --- Image Analysis Logic (dark-pixel ratio of the region left of each item; no OCR is performed) ---
def scan_checkbox_roi(pdf_path, df, all_headers):
    """Classify the region left of each item name as checked or unchecked.

    For every page, the first word whose cleaned text equals one of the
    non-empty values in ``df[all_headers]`` is located. A rectangle spanning
    25 to 5 units left of that word (with 5 units of vertical margin) is
    rendered at 3x zoom, and the share of pixels whose luminance is below 0.9
    is computed. A share above 5 % is reported as ``"checked"``, otherwise
    ``"unchecked"``. Progress is printed for every item.

    Matching is done against individual entries of ``get_text("words")``, so
    presumably only single-word item names can be found -- verify against the
    PDF. If an item is found on several pages, the last page wins.

    Args:
        pdf_path: Path of the PDF to open.
        df: DataFrame holding the cleaned item names.
        all_headers: Column names of ``df`` whose values should be looked up.

    Returns:
        Dict mapping item name to ``"checked"``, ``"unchecked"`` or ``""``
        (the latter when the computed rectangle is empty).
    """
    mapping = {}

    # Unique, non-blank item names across all requested columns.
    wanted = {
        cell
        for header in all_headers
        for cell in df[header].dropna().tolist()
        if cell != ""
    }

    banner = "=" * 60
    print(banner)
    print("IMAGE SCAN (OCR) ATTEMPT")
    print(banner)

    with fitz.open(pdf_path) as doc:
        for page in doc:
            # First bounding box (x0, y0, x1, y1) seen on this page per item.
            first_hit = {}
            for word in page.get_text("words"):
                label = clean_item_text(word[4])
                if label in wanted:
                    first_hit.setdefault(label, word[:4])

            for item_text, (x0, y0, _x1, y1) in first_hit.items():
                # Region where the checkbox is expected: just left of the text.
                roi_rect = fitz.Rect(x0 - 25, y0 - 5, x0 - 5, y1 + 5)
                if roi_rect.is_empty:
                    mapping[item_text] = ""
                    print(f" ✗ '{item_text}' - Invalid ROI")
                    continue

                pix = page.get_pixmap(matrix=fitz.Matrix(3, 3), clip=roi_rect)
                samples = pix.samples
                pixel_total = pix.width * pix.height

                # Count pixels darker than 90 % luminance; only the first
                # three components of each pixel are read as R, G, B.
                dark = 0
                for offset in range(0, len(samples), pix.n):
                    r, g, b = samples[offset:offset + 3]
                    luminance = (0.2126 * r + 0.7152 * g + 0.0722 * b) / 255.0
                    if luminance < 0.9:
                        dark += 1

                mark_ratio = dark / pixel_total
                status = "checked" if mark_ratio > 0.05 else "unchecked"
                mapping[item_text] = status
                print(f" ✓ '{item_text}' (Ratio: {mark_ratio:.3f}) -> {status}")

    return mapping
# --- Main Logic ---
def parse_pdf_for_table_with_checkboxes(pdf_file_path, section_header):
    """Extract the table after a heading and add a status column per column.

    Args:
        pdf_file_path: Path of the PDF to parse.
        section_header: Heading text that precedes the table.

    Returns:
        DataFrame in which every original column is immediately followed by
        a ``"<column> Status"`` column holding the checkbox status looked up
        for each item (``""`` for blank cells or items without a status).
    """
    df, original_data_cols = extract_table_text(pdf_file_path, section_header)
    checkbox_map = scan_checkbox_roi(pdf_file_path, df, original_data_cols)

    def status_for(cell):
        # Blank or missing cells have no checkbox to report on.
        if pd.isna(cell) or cell == "":
            return ""
        return checkbox_map.get(str(cell).strip(), "")

    # Iterate over the header list returned by the extractor, not df.columns,
    # so the status columns added here are never processed themselves.
    ordered = []
    for header in original_data_cols:
        status_header = f"{header} Status"
        df[status_header] = df[header].map(status_for)
        ordered += [header, status_header]

    return df[ordered]
# Run the whole pipeline on the configured PDF; `result` is the DataFrame
# with each item column followed by its "<column> Status" column.
result = parse_pdf_for_table_with_checkboxes(PDF_FILE_NAME, SECTION_HEADER)
У меня есть код, который обнаруживает таблицу в PDF-файле, которая появляется после определенного раздела, анализирует информацию в таблице и копирует ее в фрейм данных pandas. Теперь я хочу указать, установлен ли флажок (то есть не пуст ли он) рядом с информацией, проанализированной из таблицы. Вот ссылка на PDF-файл. Вот мой код, который пока не может определить, отмечен ли флажок. [code]import pandas as pd import re import fitz from math import sqrt from io import BytesIO
# --- Helper Functions (Re-using the reliable text extraction) ---
def clean_item_text(text): """Removes leading symbols and cleans up whitespace.""" if pd.isna(text) or text == "": return "" # Pattern to find known symbols: ☑, ☐, □, ■, X, x, ✓, followed by optional space cleaned = re.sub(r"[\u2611\u2610\u25A1\u25A0Xx\u2713]\s*", "", str(text).strip()) return cleaned.strip()
def extract_table_text(pdf_path, section_header): """ Extracts the table data, but cleans the item text to get only the name. """ with fitz.open(pdf_path) as doc: text_pages = [page.get_text("text") for page in doc] full_text = "".join(text_pages) full_text = full_text.replace("Sec$on", "Section")
section_match = re.search(rf"{re.escape(section_header)}", full_text, re.IGNORECASE) if not section_match: raise ValueError(f"Section '{section_header}' not found.") section_start = section_match.end() text_after_section = full_text[section_start:].strip() table_text = re.split(r"Section\s*\d+\s*:", text_after_section, maxsplit=1)[0]
lines = [l.strip() for l in table_text.split("\n") if l.strip()]
if len(lines) < 6: raise ValueError("Insufficient lines found for table structure.")
headers = [l.strip('"').strip() for l in lines[2:5]] items_raw = lines[5:]
# Define column splits based on the provided data structure col1_raw, col2_raw, col3_raw = items_raw[0:3], items_raw[3:9], items_raw[9:15]
# Process raw lists to get cleaned text for the DF col1 = [clean_item_text(x) for x in col1_raw] col2 = [clean_item_text(x) for x in col2_raw] col3 = [clean_item_text(x) for x in col3_raw]
maxlen = max(len(col1), len(col2), len(col3)) for c in (col1, col2, col3): while len(c) < maxlen: c.append("")
# Return both the DataFrame and the list of headers return df, headers
# --- OCR/Image Analysis Logic ---
def scan_checkbox_roi(pdf_path, df, all_headers): """ Scans an image region (ROI) to the left of each item name to detect a mark. """ mapping = {}
# Flatten all items in the DataFrame to a list of unique names (and filter blanks) all_items = [item for col in all_headers for item in df[col].dropna().tolist() if item != ""] all_items = list(set(all_items))
with fitz.open(pdf_path) as doc: for page_num, page in enumerate(doc, 1):
# Find coordinates of all relevant items on the page words = page.get_text("words")
# Map item name to its bounding box (bbox) item_coords = {} for word in words: text = clean_item_text(word[4]) if text in all_items and text not in item_coords: item_coords[text] = word[:4] # (x0, y0, x1, y1)
# Process each found item for item_text, item_bbox in item_coords.items():
# Define ROI: A small rectangle to the left of the item text. # x0 = item_bbox[0] - 25, y0 = item_bbox[1] - 5 # x1 = item_bbox[0] - 5, y1 = item_bbox[3] + 5 roi_rect = fitz.Rect(item_bbox[0] - 25, item_bbox[1] - 5, item_bbox[0] - 5, item_bbox[3] + 5)
if not roi_rect.is_empty: # 1. Render the ROI to a Pixmap (Image) at high resolution matrix = fitz.Matrix(3, 3) pix = page.get_pixmap(matrix=matrix, clip=roi_rect)
# 2. Analyze Pixels for a Mark dark_pixel_threshold = 0.9 # 90% white threshold dark_pixel_count = 0 total_pixels = pix.width * pix.height
for i in range(0, len(pix.samples), pix.n): r, g, b = pix.samples[i:i+3] # Convert RGB to grayscale (luminance) luminance = (0.2126 * r + 0.7152 * g + 0.0722 * b) / 255.0
if luminance < dark_pixel_threshold: dark_pixel_count += 1
# 3. Determine Status mark_ratio = dark_pixel_count / total_pixels
if mark_ratio > 0.05: # If more than 5% of pixels are dark (mark detected) status = "checked" else: status = "unchecked"
def parse_pdf_for_table_with_checkboxes(pdf_file_path, section_header): # 1. Extract the clean item names and original headers df, original_data_cols = extract_table_text(pdf_file_path, section_header)
# 2. Use the item names to guide the image scanning for status checkbox_map = scan_checkbox_roi(pdf_file_path, df, original_data_cols)
# 3. Apply status to dataframe (FIXED LOGIC)
# Ensure we only iterate over the original columns before adding new ones
for col in original_data_cols: status_col = f"{col} Status" def get_status(x): if pd.isna(x) or x == "": return "" val = str(x).strip() return checkbox_map.get(val, "")
df[status_col] = df[col].map(get_status)
# Re-order columns using the clean, original column list new_cols = [] for h in original_data_cols: new_cols.append(h) new_cols.append(f"{h} Status")
return df[new_cols]
# Run result = parse_pdf_for_table_with_checkboxes(PDF_FILE_NAME, SECTION_HEADER) [/code] Окончательный фрейм данных должен выглядеть так: [code]Col1 Col1_Status Col2 Col2_Status Col3 Col3_Status Item1 Checked Item4 Checked Item10 Checked Item2 Item5 Item11 Item3 Item6 Item12 Item7 Checked Item13 Checked Item8 Item14 Item9 Item15 Checked [/code] Но столбцы немного смещены, и ни один крестик в полях не обнаруживается. Как мне решить эту проблему?