import fitz # PyMuPDF
import csv
# Path of the PDF the script reads datamap entries from and draws markers onto.
INPUT_PDF = "input.pdf"
# Path of the CSV file that receives the filtered rows.
OUTPUT_CSV = "points.csv"
# Path the marked-up copy of the PDF is saved to.
OUTPUT_PDF = "output.pdf"
# Dictionary key looked up inside each page's /PieceInfo -> /InDesign entry.
TARGET_KEY = "/PageItemUIDToLocationDataMap"
def extract_datamap_points(pdf_path, target_key=TARGET_KEY):
    """Collect datamap rows from the InDesign piece info of every page.

    For each page that has a ``/PieceInfo`` dictionary containing
    ``/InDesign``, the dictionary stored under ``target_key`` is walked.
    Each entry becomes one row::

        [page_number, id, type_value, c0, c1, c2, c3]

    where ``page_number`` is 1-based, ``id`` is the entry key with its
    leading ``/`` stripped and converted to ``int``, ``type_value`` is
    element 2 of the entry value, and ``c0..c3`` are elements 3 to 6.

    Args:
        pdf_path: Path of the PDF to open with pikepdf.
        target_key: Name of the datamap dictionary inside ``/InDesign``.

    Returns:
        A list of rows as described above. Entries that cannot be parsed
        are reported on stdout and skipped.
    """
    # Function-scope import: the module does not import pikepdf at the top,
    # so referencing it without this line raises NameError.
    import pikepdf

    out_rows = []
    with pikepdf.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            piece_info = page.get('/PieceInfo', None)
            if not piece_info or '/InDesign' not in piece_info:
                continue
            indesign = piece_info['/InDesign']
            if target_key not in indesign:
                continue
            for k, v in indesign[target_key].items():
                try:
                    id_ = int(str(k).lstrip('/'))
                    type_val = float(v[2])
                    coords = [float(val) for val in list(v)[3:7]]
                    # A short slice would yield a row that downstream code
                    # cannot unpack into seven fields; reject it here.
                    if len(coords) != 4:
                        raise ValueError(
                            f"expected 4 coordinates, got {len(coords)}"
                        )
                    out_rows.append([i + 1, id_, type_val] + coords)
                except Exception as e:
                    # Best effort: one malformed entry must not abort the
                    # extraction of the remaining entries.
                    print(f"Error parsing {k}:{v} ({e})")
    return out_rows
def get_pdf_page_count(pdf_path):
    """Return the number of pages in the PDF at ``pdf_path``.

    The file is opened with pikepdf and closed again before returning.
    """
    # Function-scope import: the module does not import pikepdf at the top,
    # so referencing it without this line raises NameError.
    import pikepdf

    with pikepdf.open(pdf_path) as pdf:
        return len(pdf.pages)
def process_rows(rows, max_pdf_pages, y_transform_base=420.945):
    """Remap page numbers and transform the coordinates of extracted rows.

    Args:
        rows: Iterable of ``[page, id, type, x1, y1, x2, y2]`` records.
        max_pdf_pages: Page count used by the page remapping; any page
            number greater than or equal to it is mapped to page 2.
        y_transform_base: Value each y coordinate is subtracted from.
            Defaults to the constant the script has always used.

    Returns:
        A list of ``[page, id, type, x1, y1, x2, y2, h]`` records where
        the coordinates are rounded to 3 decimals and ``h`` is the absolute
        difference of the transformed y values rounded to 1 decimal.
    """
    def remap_page(page):
        # Datamaps are read sequentially, so the page numbers are shifted:
        # the last page (or beyond) becomes page 2, page 1 stays put and
        # every other page moves one further back.
        if page >= max_pdf_pages:
            return 2
        return page + 1 if page > 1 else page

    processed_rows = []
    for page, id_, type_val, x1, y1, x2, y2 in rows:
        new_y1 = round(y_transform_base - y1, 3)
        new_y2 = round(y_transform_base - y2, 3)
        processed_rows.append([
            remap_page(page),
            id_,
            type_val,
            round(x1, 3),
            new_y1,
            round(x2, 3),
            new_y2,
            round(abs(new_y2 - new_y1), 1),
        ])
    return processed_rows
def sort_and_filter(rows):
    """Order the rows and keep only those with type 4 and height 17.

    Ordering is by page ascending, then y2 descending, then x1 ascending,
    then id ascending. The input is not modified; a new list is returned.
    """
    def ordering(record):
        # Negating y2 gives a descending order on that field.
        return (record[0], -record[6], record[3], record[1])

    ordered = list(rows)
    ordered.sort(key=ordering)
    # Index 2 holds the type, index 7 the height.
    return [record for record in ordered if record[2] == 4 and record[7] == 17]
def write_csv(csv_filename, rows):
    """Write a header line followed by ``rows`` to ``csv_filename``.

    The file is created or overwritten as UTF-8. ``newline=''`` leaves line
    termination to the csv module.
    """
    header = ['page', 'id', 'type', 'x1', 'y1', 'x2', 'y2', 'h']
    with open(csv_filename, 'w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(header)
        for record in rows:
            out.writerow(record)
def mark_points_on_pdf(input_pdf, output_pdf, rows):
    """Draw a small filled black circle for every row and save the result.

    Args:
        input_pdf: Path of the PDF to open with PyMuPDF.
        output_pdf: Path the modified document is saved to.
        rows: Records whose index 0 is the 1-based page number, index 3 the
            x position and index 6 the y position of the marker.
    """
    # The context manager closes the document even if drawing or saving
    # fails, so the file handle is not leaked.
    with fitz.open(input_pdf) as doc:
        for row in rows:
            page_num = int(row[0])
            cx = row[3]
            cy = row[6]
            page = doc[page_num - 1]  # rows carry 1-based page numbers
            # Flip the y value against the page height before drawing.
            pymupdf_y = page.rect.height - cy
            page.draw_circle(
                (cx, pymupdf_y), radius=2, color=(0, 0, 0), fill=(0, 0, 0)
            )
        doc.save(output_pdf)
if __name__ == "__main__":
    # Pipeline: extract -> transform -> sort/filter -> CSV -> marked PDF.
    points = extract_datamap_points(INPUT_PDF)
    # process_rows needs the document's page count for its page remapping,
    # so it has to be defined here before the call.
    total_pages = get_pdf_page_count(INPUT_PDF)
    processed_points = process_rows(points, total_pages)
    filtered_points = sort_and_filter(processed_points)
    write_csv(OUTPUT_CSV, filtered_points)
    mark_points_on_pdf(INPUT_PDF, OUTPUT_PDF, filtered_points)
    print(f"Done. Points: {len(filtered_points)}; Wrote {OUTPUT_CSV} and {OUTPUT_PDF}")
< /code>
pdftk: </p>
Распаковать потоки страниц PDF (uncompress), чтобы редактировать PDF в текстовом редакторе (например, Vim, Emacs): </p>
- pdftk doc.pdf output doc.unc.pdf uncompress
Подробнее здесь: https://stackoverflow.com/questions/797 ... s-from-pdf