PDF Scraping in Python — Python

Программы на Python
Anonymous
 PDF Scraping in Python

Сообщение Anonymous »

У меня возникают проблемы с извлечением определённых данных из PDF-файлов в Python. Ошибок в консоли нет, но в создаваемом CSV столбцы от «Owner First Name» до «Zip Code» либо заполняются неверными данными, либо остаются без данных. Эти шесть столбцов должны заполняться данными из левой части PDF, расположенными под номером участка по налоговой карте (Tax Map Parcel Number). В остальном всё работает правильно.
Фрагмент моего текущего CSV, в который я вручную внёс часть недостающих данных.
Вот ссылка на PDF: https://www.columbiacountyny.com/upload ... e_roll.pdf
import re
import csv
import pdfplumber

# Location of the PDF to parse. The directory is also reused for the CSV output.
root_path = "C:\\Users\\jfdal\\OneDrive\\Desktop\\2022"
file_name = "austerlitz_2022_fr.pdf"
file_path = f"{root_path}\\{file_name}"

# Regular expressions applied to the text of a single property section.
# Every pattern captures the value of interest in group 1.

# A run of letters/digits/whitespace enclosed by newlines. Because "\s" inside
# the character class also matches "\n", the capture may span several lines.
property_address_pattern = r"\n([A-Za-z0-9\s]+(?:\d{1,4}\s[A-Za-z0-9\s]+)?)\n"
# Decimal number following "ACRES". The integer part is unbounded (\d+) so that
# values of 1000 or more still match; a three-digit cap would make them look
# like "no acreage found".
acreage_pattern = r"ACRES\s+(\d+\.\d{1,2})"
# Digits/commas following the literal labels below.
value_pattern = r"FULL MARKET VALUE\s+([\d,]+)"
tax_pattern = r"COUNTY TAXABLE VALUE\s+([\d,]+)"
ag_tax_pattern = r"AG\s+DISTRIC\s+41720\s+([\d,]+)"
forest_tax_pattern = r"FOREST\s+LND\s+47460\s+([\d,]+)"
solar_tax_pattern = r"RPTL 487\s+([\d,]+)"

# Rows destined for the CSV, one dict per retained property.
results = []

# Read the text of every page into one string.
# Pages are joined with "\n" so the last line of one page is not fused with the
# first line of the next. `or ""` keeps the join from failing if a page yields
# no text (presumably None in some pdfplumber releases — confirm for the
# installed version).
with pdfplumber.open(file_path) as pdf:
    content = "\n".join(page.extract_text() or "" for page in pdf.pages)

# Remove page headers before splitting into property sections.
# re.S lets ".*?" cross line breaks, so everything from the first label through
# the end of the line containing "ACCOUNT NO." is dropped.
content = re.sub(r"TAX MAP PARCEL NUMBER PROPERTY LOCATION & CLASS.*?ACCOUNT NO\..*?\n", "", content, flags=re.S)

# Split content into property sections on runs of 50 or more asterisks.
properties = re.split(r"\*{50,}", content)

def parse_numeric(value_match):
    """Return group 1 of *value_match* with commas removed.

    Returns the string "0" when *value_match* is None, i.e. the pattern was
    not found in the property text.
    """
    return value_match.group(1).replace(",", "") if value_match else "0"


def _split_state_zip(text):
    """Split "<state> <zip>" into a (state, zip) pair; missing parts become "0"."""
    tokens = text.split()
    state = tokens[0] if tokens else "0"
    postal_code = tokens[1] if len(tokens) > 1 else "0"
    return state, postal_code


for prop in properties:
    try:
        # Apply the acreage filter first: sections without an "ACRES" value,
        # or with fewer than 100 acres, are never written to the output.
        acreage_match = re.search(acreage_pattern, prop)
        acreage = float(acreage_match.group(1).replace(",", "")) if acreage_match else 0
        if acreage < 100:
            continue

        lines = [line.strip() for line in prop.split("\n") if line.strip()]

        # Placeholders: "0" marks a field that could not be extracted.
        prop_id = "0"
        owner_first_name, owner_last_name = "0", "0"
        street_address, owner_town, owner_state, zip_code = "0", "0", "0", "0"

        # The Prop ID is expected at the start of the first non-empty line,
        # structured like "123.-1-45".
        if lines:
            prop_id_match = re.match(r"(\d{1,3}\.\-?\d{1,3}\-?\d{1,3}\.?\d{0,3})", lines[0])
            if prop_id_match:
                prop_id = prop_id_match.group(1)

        # Owner information is taken from lines carrying explicit labels.
        # NOTE(review): these labels look like page-header column titles. If
        # the header-stripping step earlier in the script removes them, neither
        # branch ever fires and the owner fields stay "0" — verify against the
        # extracted text; positional parsing may be required instead.
        for line in lines:
            if len(line.split()) <= 3:
                continue  # too short to hold a label plus data
            if "CURRENT OWNERS NAME" in line:
                owner_name_parts = line.replace("CURRENT OWNERS NAME", "").strip().split()
                owner_first_name = owner_name_parts[0] if owner_name_parts else "0"
                owner_last_name = owner_name_parts[-1] if len(owner_name_parts) > 1 else "0"
            elif "CURRENT OWNERS ADDRESS" in line:
                address_parts = line.replace("CURRENT OWNERS ADDRESS", "").strip().split(", ")
                if len(address_parts) == 3:
                    street_address, owner_town = address_parts[0], address_parts[1]
                    # Tolerate a state/ZIP segment with fewer than two tokens
                    # rather than raising IndexError and losing the whole row.
                    owner_state, zip_code = _split_state_zip(address_parts[2])

        # Remaining fields come from regex searches over the whole section.
        property_address_match = re.search(property_address_pattern, prop)
        full_market_value_match = re.search(value_pattern, prop)
        tax_value_match = re.search(tax_pattern, prop)
        ag_tax_match = re.search(ag_tax_pattern, prop)
        forest_tax_match = re.search(forest_tax_pattern, prop)
        solar_tax_match = re.search(solar_tax_pattern, prop)

        # Append extracted results
        results.append({
            "Property Town": "Austerlitz",
            "Prop ID": prop_id,
            "Property Address": property_address_match.group(1).strip() if property_address_match else "0",
            "Acreage": f"{acreage:.2f}",
            "Owner First Name": owner_first_name,
            "Owner Last Name": owner_last_name,
            "Street Address": street_address,
            "Town": owner_town,
            "State": owner_state,
            "Zip Code": zip_code,
            "Full Market Value": parse_numeric(full_market_value_match),
            "Tax Value": parse_numeric(tax_value_match),
            "AG Tax": parse_numeric(ag_tax_match),
            "Forest Tax": parse_numeric(forest_tax_match),
            "Solar Tax": parse_numeric(solar_tax_match),
        })
    except Exception as e:
        # Per-record best-effort boundary: report the failure and move on to
        # the next section instead of aborting the whole run.
        print(f"Error parsing property: {e}")

# Column order of the output file; keys match the dicts collected in `results`.
fieldnames = [
    "Property Town", "Prop ID", "Property Address", "Acreage", "Owner First Name", "Owner Last Name",
    "Street Address", "Town", "State", "Zip Code", "Full Market Value", "Tax Value", "AG Tax", "Forest Tax", "Solar Tax"
]

# The CSV is written into the same directory as the source PDF.
output_file = f"{root_path}\\austerlitz_results.csv"

# newline='' leaves line-ending handling to the csv module.
with open(output_file, 'w', newline='') as csv_handle:
    writer = csv.DictWriter(csv_handle, fieldnames=fieldnames)
    writer.writeheader()
    for row in results:
        writer.writerow(row)

print(f"Data extraction complete. Results saved to {output_file}")
< /code>
Я ожидаю, что код выдаст результат, похожий на данные, которые я ввёл вручную. Ячейки с нулями в столбцах от имени владельца до почтового индекса должны быть заполнены правильными данными.

Подробнее здесь: https://stackoverflow.com/questions/793 ... -in-python

Вернуться в «Python»