Вся строка из таблицы Excel удаляется в генераторе таблиц PythonPython

Программы на Python
Ответить
Anonymous
 Вся строка из таблицы Excel удаляется в генераторе таблиц Python

Сообщение Anonymous »

У меня есть файл в блокноте Jupyter, в котором несколько разных листов Excel с информацией о затратах на рабочую силу объединены в одну основную таблицу. Он более или менее делал именно то, что я хотел, но при проверке сумм числовых значений я обнаружил, что целая строка первой загруженной электронной таблицы была удалена. Я приведу снимок экрана со значениями, которые появляются в электронной таблице (выделенная строка — это удаляемая строка):
Изображение

Изображение

Изображение

Изображение

Изображение

И вот код:

Код: Выделить всё

# Step 1: Load files from Master Data folder

import os
import glob
import pandas as pd
import numpy as np
import re
import xlsxwriter

location = r"Z:\Intern Projects\Labor Cost Analysis\Master Data 2019-2025"

excel_files = (
glob.glob(location + r"\*.xlsx") +
glob.glob(location + r"\*.xlsm")
)

excel_files = [
f for f in excel_files
if not os.path.basename(f).startswith("~$")
]

print(excel_files)

Код: Выделить всё

# Step 2: Filter for all files from 2019 to 2023

filtered_files = [
f for f in excel_files
if re.search(r'QucikBooks 2019 Hourly Payroll Data|QucikBooks 2020 Hourly Payroll Data|QucikBooks 2021 Hourly Payroll Data|QucikBooks 2022 Hourly Payroll Data|QucikBooks 2023 Hourly Payroll Data|QucikBooks 2019 Salary Payroll Data|QucikBooks 2020 Salary Payroll Data|QucikBooks 2021 Salary Payroll Data|QucikBooks 2022 Salary Payroll Data|QucikBooks 2023 Salary Payroll Data'
, f)
]

print(filtered_files)

pd.ExcelFile(filtered_files[0]).sheet_names

Код: Выделить всё

# Use this to make sure the worksheets you need are being loaded. Each loaded file will appear in the output with the sheet names.

for f in filtered_files:
t = pd.ExcelFile(f)
print(os.path.basename(f), "->", t.sheet_names)

Код: Выделить всё

# Step 3: Match by Sheet Name. In this case by the 'Detail' sheets found in both files

dfs = []

for f in filtered_files:
filename = os.path.basename(f)

# Extract year from filename
match = re.search(r"\d{4}", filename)
if not match:
print(f"Year not found in filename: {filename}")
continue

year = int(match.group())
sheet_name = f"{year} Detail"

try:
df = pd.read_excel(f, sheet_name=sheet_name)
except ValueError:
print(f"Sheet '{sheet_name}' not found in {filename}")
continue

# First, normalize column names temporarily so we can safely reference them
df.columns = (
df.columns
.str.strip()
.str.lower()
.str.replace(r"[^\w]+", "_", regex=True)
.str.strip("_")
)

df = df.rename(columns={
"dt": "dt",           # dollars
"dt_hrs": "dt_hrs",    # hours
"chld1": "child1",    # Child Care (1)
"employee_name": "employee", # Employee name
"class": "dept", # Department or Class
"location": "work_location", # Work location
"regular_hours": "reg_hrs", # Regular hours
"regular_hourly_pay": "reg", # Regular hourly pay
"overtime_hours": "ot_hrs", # Overtime hours
"overtime_hourly_pay": "ot", # Overtime hourly pay
"overtime_x1_5_hourly": "ot_x1.5", # Overtime x1.5 hourly pay
"sick": "pto", # Paid during sick days goe sinto PTO column
"401k_emp": "_401k", # Employee 401k
"beam_dental": "beamd", # Beam dental insurance
"bonus_one_time_cash_award": "bonus", # Bonus (one-time cash award) will be included in general bonus column (along with any other bonus column).  This includes the proceeding bonus columns.
"bonus_one_time_cash_reward": "bonus", # Bonus (one-time cash award) will be included in general bonus column (along with any other bonus column). This includes the proceeding bonus columns.
"clerical_hourly_bonuses": "bonus", # Clerical bonus included
"clerical_bonuses": "bonus", # Clerical bonus included
"health_insurance_emp": "healt", # Employee health insurance
"life_ins": "life", # Life insurance
"roth_401k_emp": "_4roth",  # Employee Roth 401k
"4roth": "_4roth",  # Employee Roth 401k
"federal_withholding": "fitw", # Federal withholding
"medicare_employee": "med_ee", # Medicare employee
"med": "med_ee", # Medicare employee
"medicare_company": "med_er", # Medicare company
"med_r": "med_er", # Medicare company
"social_security_employee": "ss", # Social security employee
"ss": "ss", # Social security employee
"social_security_company": "ss_r", # Social security company
"ss_r": "ss_r", # Social security company
"id_1": "idaho_state_tax", # Idaho state tax
"advance_repayment": "advre", # Advance repayment
"net_pay": "net", # Net pay
"federal_unemployment": "futa", # Federal unemployment (FUTA)
"id_unemployment_company": "suta", # Unemployment company (SUTA)
"id_workforce_dev_fund": "suta", # Workforce development fund (SUTA)
"total_employer_taxes_and_contributions": "suta", # Total employer taxes and contributions (SUTA)
"id_withholding": "idaho_state_tax", # Idaho state tax
"vison": "vision" # Vision insurance

})

# Merge duplicate columns by summing them
if df.columns.duplicated().any():
print(f"{filename} has duplicate columns:", df.columns[df.columns.duplicated()].tolist())
# Sum numeric duplicates, keep first for text
df = (
df.T
.groupby(level=0)
.agg(lambda x: x.sum() if pd.api.types.is_numeric_dtype(x) else x.iloc[0])
.T
)

# Remove TOTAL / SUMMARY rows
if "employee" in df.columns:
df = df[~df["employee"].str.contains(
"total|summary|grand", case=False, na=False
)]

df["source_file"] = filename
df["year"] = year

# Ensure chk_date is datetime and strip time for THIS df
if "chk_date" in df.columns:
df["chk_date"] = pd.to_datetime(df["chk_date"], errors="coerce").dt.normalize()

# This is to drop the summed 'Total' rows that are in the original sheets. Will be appended to the dataframe.  r"\btotal\b" will target any version of the word total regardless of case.)
df_no_totals = df[
~df.astype(str).apply(
lambda row: row.str.contains(r"\btotal\b", case=False, na=False).any(),
axis=1
)
]

dfs.append(df_no_totals)

Код: Выделить всё

# Step 4: Begin aligning columns

column_map = {
# employee
"employee": "employee",
"employee_name": "employee",

# DT dollars and hours
"dt": "dt",
"dt_hrs": "dt_hrs",

# common hour variants
"reg_hours": "reg_hrs",
"regular_hours": "reg_hrs",
"ot_hours": "ot_hours",

# Child Care (1)
"child1": "child1",
"chld1": "child1",

# Medicare
"medicare_employee": "med_ee",
"med": "med_ee",
"medicare_company":"med_er",
"med_r": "med_er",

# 401k
"401k_emp": "_401k",
"401k": "_401k",

# Net pay
"net_pay": "net",

# FUTA
"federal_unemployment": "futa",

# SUTA
"id_unemployment_company": "suta",
"id_workforce_dev_fund": "suta",
"total_employer_taxes_and_contributions": "suta",

# Idaho state tax
"id_withholding": "idaho_state_tax",

# Vision insurance
"vison": "vision",
# PTO
"sick": "pto"

}

def standardize_columns(df):
return df.rename(columns=column_map)

dfs = [standardize_columns(df) for df in dfs]

Код: Выделить всё

for i, df in enumerate(dfs):
dupes = df.columns[df.columns.duplicated()]
if len(dupes) > 0:
print(f"DF {i} has duplicate columns:", dupes.tolist())

Код: Выделить всё

# Step 5: Concatenate

def clean_detail(df):
hour_cols = ["reg_hrs", "ot_hrs", "dt_hrs"]

existing = [c for c in hour_cols if c in df.columns]

# Only drop if ALL hour columns exist AND are NaN
if existing:
df = df.dropna(subset=existing, how="all")

return df

dfs = [clean_detail(df) for df in dfs]
# Actual concat
combined_2024_2025 = pd.concat(dfs, ignore_index=True, sort=False)

# Assure datetime is added correctly and stip the time portion from those values.

if "chk_date"  in combined_2024_2025.columns:
combined_2024_2025["chk_date"] = pd.to_datetime(combined_2024_2025["chk_date"], errors="coerce").dt.normalize()

# Combine idast, idsui, and idwd into one column for suta (state unemployment tax)
suta_cols = ["idast", "idsui", "idwd"]

existing_cols = [
c for c in suta_cols
if c in combined_2024_2025.columns
]

combined_2024_2025["suta"] = (
combined_2024_2025[existing_cols]
.fillna(0)
.sum(axis=1)
)

# Make an equivalent to the total gross pay column for 2024 and 2025
gross_wages_cols = ["reg", "dt", "ot", "bonus", "pto", "retro"]
existing = [c for c in gross_wages_cols if c in combined_2024_2025.columns]
combined_2024_2025["total_gross_wages"] = (
combined_2024_2025[existing]
.fillna(0)
.sum(axis=1)
)

# Drop deductions, SUTA (state unemployment benefits) and other columns that are not needed.
drop_col = [
"total_deductions",
"total_deductions_from_gross_pay",
"total_deductions_from_net_pay",
"idast", "idsui", "idwd",
"child_support_garnishment",
"total_taxes_withheld",
"wage_garnishment_other",
"total_earnings",
"telephone_deduction",
"employee_loan_repayment"

]

combined_2024_2025 = combined_2024_2025.drop(columns=drop_col, errors="ignore")

Код: Выделить всё

# Step 6: Light Data Cleaning (Before Export)

# Remove blank rows
master_2024_2025 = combined_2024_2025.dropna(how="all")

# Ensure numeric columns
numeric_cols = [
"process","chk_vchr", "net", "reg_hrs", "reg",
"dt", "ot_hours", "ot", "ot_x1.5", "_401er", "bonus",
"ermed", "grptl", "grptl_1","haltd", "haltd_1", "hsaer",
"hsaer_1", "pto", "total_hrs", "total_earnings", "fitw",
"med_ee", "medhi", "ss", "total_taxes", "med_er",
"ss_r", "idsat", "idsui", "idwd", "futa", "total_taxes_er",
"_401cu", "_401k", "_4roth", "advre", "beamd", "cell", "child1",
"garn1", "garn2", "healt", "sterl", "tcfob", "travr", "ulife",
"vis", "vison", "401cu", "401er", "401k", "_4roth", "adv", "advre",
"amount", "casha", "has", "hrs", "id", "idaho_state_tax", "idast",
"miscd", "ot", "ot_hrs", "pto_hrs", "reimb", "retro", "sturg",
"hsa", "slevy", "401k_co_match", "adjusted_gross_pay", "adjusted_gross_pay_hours",
"advance_repayment", "federal_unemployment", "net_pay_hours", "salary"
]
for col in numeric_cols:
if col in combined_2024_2025.columns:
combined_2024_2025[col] = (
combined_2024_2025[col]
.astype(str)
.str.replace(",", "", regex=False)
.str.replace("$", "", regex=False)
.str.strip()
.replace({"": np.nan})
.pipe(pd.to_numeric, errors="coerce")
)

# Columns that must come first
priority_cols = ["employee", "job_title", "dept", "work_location"]

# Keep only ones that actually exist (prevents errors)
priority_cols = [c for c in priority_cols if c in master_2024_2025.columns]

# All remaining columns in alphabetical order
other_cols = sorted(
[c for c in master_2024_2025.columns if c not in priority_cols]
)

# Reorder dataframe
master_2024_2025 = master_2024_2025[priority_cols + other_cols]

Код: Выделить всё

# Step 7:  Export Excel file

output_path = r"Z:\Intern Projects\Labor Cost Analysis\Master Data 2019-2025\master_detail_2019-2023.xlsx"
master_2024_2025.to_excel(output_path, index=False)

# Find the zero-based column index of 'chk_date'
cols = list(master_2024_2025.columns)
date_col_idx = cols.index("chk_date") if "chk_date" in cols else None

# This ensures the date fromatting is correct for the final export of the Excel file
with pd.ExcelWriter(output_path, engine="xlsxwriter", datetime_format="m/d/yyyy", date_format="m/d/yyyy") as writer:
master_2024_2025.to_excel(writer, index=False, sheet_name="Sheet1")
if date_col_idx is not None:
workbook  = writer.book
worksheet = writer.sheets["Sheet1"]
date_fmt  = workbook.add_format({"num_format": "m/d/yyyy"})
# Convert 0-based index to Excel letter range (e.g., B:B)
col_letter = chr(ord('A') + date_col_idx)
worksheet.set_column(f"{col_letter}:{col_letter}", 12, date_fmt)

print("Wrote:", output_path)

# When you are done be sure to check sums of numerical values as they must be accurate.
Я должен добавить, что эти 4 начальных столбца пусты, чтобы избежать раскрытия конфиденциальной информации. После отчаянного просмотра ChatGPT и Copilot он спросил, есть ли в этих столбцах какие-либо значения, помеченные как total, и я убедился, что их нет ни в одном из исходных файлов. Из-за этого я сомневаюсь, что это является причиной удаления этой конкретной строки. Любое направление приветствуется.

Подробнее здесь: https://stackoverflow.com/questions/798 ... -generator
Ответить

Быстрый ответ

Изменение регистра текста: 
Смайлики
:) :( :oops: :roll: :wink: :muza: :clever: :sorry: :angel: :read: *x)
Ещё смайлики…
   
К этому ответу прикреплено по крайней мере одно вложение.

Если вы не хотите добавлять вложения, оставьте поля пустыми.

Максимально разрешённый размер вложения: 15 МБ.

Вернуться в «Python»