Как извлечь информацию из файла w2 с помощью Python [закрыто]

Как извлечь информацию из файла w2 с помощью Python [закрыто] ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Как извлечь информацию из файла w2 с помощью Python [закрыто]

Цитата

Сообщение Anonymous » 04 янв 2025, 15:03

Я хочу извлечь информацию из файла w2, хранящегося в формате pdf.
Идея состоит в том, чтобы создать поля для каждого прямоугольника в файле w2 и очистить файл w2 для справки.
Почищенный файл w2 для справки
Я уже пробовал это:

Код: Выделить всё

import cv2
import numpy as np

def get_more_bboxes(image_path, min_area=50, min_box_area=350):
    """Detect contours in an image and draw boxes around the larger ones.

    The image is converted to grayscale, median-blurred, binarized with an
    inverted adaptive Gaussian threshold, then eroded once and dilated twice
    with a 3x3 kernel. External contours are extracted, contours whose area
    does not exceed ``min_area`` are dropped, and a rectangle is drawn for
    every remaining contour whose bounding box area exceeds ``min_box_area``.
    The annotated image is written to ``bbox.png`` and the number of contours
    that passed the ``min_area`` filter is printed.

    Args:
        image_path: Path of the image file to load with ``cv2.imread``.
        min_area: Contours with ``cv2.contourArea`` at or below this value
            are ignored.
        min_box_area: Bounding boxes with ``w * h`` at or below this value
            are not drawn.

    Returns:
        The loaded image with the rectangles drawn on it.

    Raises:
        OSError: If ``cv2.imread`` could not load ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising;
        # fail here with a clear message instead of inside cvtColor.
        raise OSError(f"Could not read image: {image_path!r}")
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # 1. Reduce noise before thresholding (cv2.GaussianBlur is an alternative).
    gray = cv2.medianBlur(gray, 5)

    # 2. Binarize. A global cv2.threshold(gray, 127, 255,
    #    cv2.THRESH_BINARY_INV) is another option worth trying.
    thresh = cv2.adaptiveThreshold(
        gray,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV,
        11,
        2,
    )

    # 3. Morphological clean-up: erode once, then dilate twice.
    #    cv2.morphologyEx with MORPH_OPEN / MORPH_CLOSE is an alternative.
    kernel = np.ones((3, 3), np.uint8)
    thresh = cv2.erode(thresh, kernel, iterations=1)
    thresh = cv2.dilate(thresh, kernel, iterations=2)

    # 4. Find external contours. The return tuple has two or three items
    #    depending on the OpenCV version; pick the contour list either way.
    cnts = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]

    # 5. Keep only contours that are large enough.
    filtered_cnts = [c for c in cnts if cv2.contourArea(c) > min_area]

    # 6. Draw a bounding box for each sufficiently large contour.
    for c in filtered_cnts:
        x, y, w, h = cv2.boundingRect(c)
        if h * w > min_box_area:
            cv2.rectangle(image, (x, y), (x + w, y + h), (36, 255, 12), 2)

    cv2.imwrite("bbox.png", image)
    print(f"Found {len(filtered_cnts)} contours!")
    return image

# Run the detector on a sample screenshot; the annotated copy is written to bbox.png.
get_more_bboxes("/content/Screenshot 2025-01-02 151226.png")

Используя этот метод, я получаю нестабильные ограничивающие рамки (bounding boxes). Есть ли способ получить более точные результаты?

Подробнее здесь: https://stackoverflow.com/questions/793 ... ing-python

1735992206

Anonymous

Я хочу извлечь информацию из файла w2, хранящегося в формате pdf.
Идея состоит в том, чтобы создать поля для каждого прямоугольника в файле w2 и очистить файл w2 для справки.
Почищенный файл w2 для справки
Я уже пробовал это:
[code]import cv2
import numpy as np

def get_more_bboxes(image_path, min_area=50, min_box_area=350):
    """Detect contours in an image and draw boxes around the larger ones.

    The image is converted to grayscale, median-blurred, binarized with an
    inverted adaptive Gaussian threshold, then eroded once and dilated twice
    with a 3x3 kernel. External contours are extracted, contours whose area
    does not exceed ``min_area`` are dropped, and a rectangle is drawn for
    every remaining contour whose bounding box area exceeds ``min_box_area``.
    The annotated image is written to ``bbox.png`` and the number of contours
    that passed the ``min_area`` filter is printed.

    Args:
        image_path: Path of the image file to load with ``cv2.imread``.
        min_area: Contours with ``cv2.contourArea`` at or below this value
            are ignored.
        min_box_area: Bounding boxes with ``w * h`` at or below this value
            are not drawn.

    Returns:
        The loaded image with the rectangles drawn on it.

    Raises:
        OSError: If ``cv2.imread`` could not load ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising;
        # fail here with a clear message instead of inside cvtColor.
        raise OSError(f"Could not read image: {image_path!r}")
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # 1. Reduce noise before thresholding (cv2.GaussianBlur is an alternative).
    gray = cv2.medianBlur(gray, 5)

    # 2. Binarize. A global cv2.threshold(gray, 127, 255,
    #    cv2.THRESH_BINARY_INV) is another option worth trying.
    thresh = cv2.adaptiveThreshold(
        gray,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV,
        11,
        2,
    )

    # 3. Morphological clean-up: erode once, then dilate twice.
    #    cv2.morphologyEx with MORPH_OPEN / MORPH_CLOSE is an alternative.
    kernel = np.ones((3, 3), np.uint8)
    thresh = cv2.erode(thresh, kernel, iterations=1)
    thresh = cv2.dilate(thresh, kernel, iterations=2)

    # 4. Find external contours. The return tuple has two or three items
    #    depending on the OpenCV version; pick the contour list either way.
    cnts = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]

    # 5. Keep only contours that are large enough.
    filtered_cnts = [c for c in cnts if cv2.contourArea(c) > min_area]

    # 6. Draw a bounding box for each sufficiently large contour.
    for c in filtered_cnts:
        x, y, w, h = cv2.boundingRect(c)
        if h * w > min_box_area:
            cv2.rectangle(image, (x, y), (x + w, y + h), (36, 255, 12), 2)

    cv2.imwrite("bbox.png", image)
    print(f"Found {len(filtered_cnts)} contours!")
    return image

# Run the detector on a sample screenshot; the annotated copy is written to bbox.png.
get_more_bboxes("/content/Screenshot 2025-01-02 151226.png")
[/code]
Используя этот метод, я получаю нестабильные ограничивающие рамки (bounding boxes). Есть ли способ получить более точные результаты?
 

Подробнее здесь: [url]https://stackoverflow.com/questions/79323734/how-to-extract-information-from-a-w2-file-using-python[/url]