У меня есть файл в блокноте Jupyter, в котором несколько разных листов Excel с информацией о затратах на рабочую силу объединены в одну основную таблицу. Он более или менее делал именно то, что я хотел, но при проверке сумм числовых значений я обнаружил, что целая строка первой загруженной электронной таблицы была удалена. Я приведу снимок экрана со значениями, которые появляются в электронной таблице (выделенная строка — это удаляемая строка):
# Step 1: Load files from Master Data folder
import os
import glob
import pandas as pd
import numpy as np
import re
import xlsxwriter
location = r"Z:\Intern Projects\Labor Cost Analysis\Master Data 2019-2025"
excel_files = (
glob.glob(location + r"\*.xlsx") +
glob.glob(location + r"\*.xlsm")
)
excel_files = [
f for f in excel_files
if not os.path.basename(f).startswith("~$")
]
print(excel_files)
# Use this to make sure the worksheets you need are being loaded. Each loaded file will appear in the output with the sheet names.
for f in filtered_files:
t = pd.ExcelFile(f)
print(os.path.basename(f), "->", t.sheet_names)
# Step 3: Match by Sheet Name. In this case by the 'Detail' sheets found in both files
dfs = []
for f in filtered_files:
filename = os.path.basename(f)
# Extract year from filename
match = re.search(r"\d{4}", filename)
if not match:
print(f"Year not found in filename: {filename}")
continue
year = int(match.group())
sheet_name = f"{year} Detail"
try:
df = pd.read_excel(f, sheet_name=sheet_name)
except ValueError:
print(f"Sheet '{sheet_name}' not found in {filename}")
continue
# First, normalize column names temporarily so we can safely reference them
df.columns = (
df.columns
.str.strip()
.str.lower()
.str.replace(r"[^\w]+", "_", regex=True)
.str.strip("_")
)
df = df.rename(columns={
"dt": "dt", # dollars
"dt_hrs": "dt_hrs", # hours
"chld1": "child1", # Child Care (1)
"employee_name": "employee", # Employee name
"class": "dept", # Department or Class
"location": "work_location", # Work location
"regular_hours": "reg_hrs", # Regular hours
"regular_hourly_pay": "reg", # Regular hourly pay
"overtime_hours": "ot_hrs", # Overtime hours
"overtime_hourly_pay": "ot", # Overtime hourly pay
"overtime_x1_5_hourly": "ot_x1.5", # Overtime x1.5 hourly pay
"sick": "pto", # Paid during sick days goe sinto PTO column
"401k_emp": "_401k", # Employee 401k
"beam_dental": "beamd", # Beam dental insurance
"bonus_one_time_cash_award": "bonus", # Bonus (one-time cash award) will be included in general bonus column (along with any other bonus column). This includes the proceeding bonus columns.
"bonus_one_time_cash_reward": "bonus", # Bonus (one-time cash award) will be included in general bonus column (along with any other bonus column). This includes the proceeding bonus columns.
"clerical_hourly_bonuses": "bonus", # Clerical bonus included
"clerical_bonuses": "bonus", # Clerical bonus included
"health_insurance_emp": "healt", # Employee health insurance
"life_ins": "life", # Life insurance
"roth_401k_emp": "_4roth", # Employee Roth 401k
"4roth": "_4roth", # Employee Roth 401k
"federal_withholding": "fitw", # Federal withholding
"medicare_employee": "med_ee", # Medicare employee
"med": "med_ee", # Medicare employee
"medicare_company": "med_er", # Medicare company
"med_r": "med_er", # Medicare company
"social_security_employee": "ss", # Social security employee
"ss": "ss", # Social security employee
"social_security_company": "ss_r", # Social security company
"ss_r": "ss_r", # Social security company
"id_1": "idaho_state_tax", # Idaho state tax
"advance_repayment": "advre", # Advance repayment
"net_pay": "net", # Net pay
"federal_unemployment": "futa", # Federal unemployment (FUTA)
"id_unemployment_company": "suta", # Unemployment company (SUTA)
"id_workforce_dev_fund": "suta", # Workforce development fund (SUTA)
"total_employer_taxes_and_contributions": "suta", # Total employer taxes and contributions (SUTA)
"id_withholding": "idaho_state_tax", # Idaho state tax
"vison": "vision" # Vision insurance
})
# Merge duplicate columns by summing them
if df.columns.duplicated().any():
print(f"{filename} has duplicate columns:", df.columns[df.columns.duplicated()].tolist())
# Sum numeric duplicates, keep first for text
df = (
df.T
.groupby(level=0)
.agg(lambda x: x.sum() if pd.api.types.is_numeric_dtype(x) else x.iloc[0])
.T
)
# Remove TOTAL / SUMMARY rows
if "employee" in df.columns:
df = df[~df["employee"].str.contains(
"total|summary|grand", case=False, na=False
)]
df["source_file"] = filename
df["year"] = year
# Ensure chk_date is datetime and strip time for THIS df
if "chk_date" in df.columns:
df["chk_date"] = pd.to_datetime(df["chk_date"], errors="coerce").dt.normalize()
# This is to drop the summed 'Total' rows that are in the original sheets. Will be appended to the dataframe. r"\btotal\b" will target any version of the word total regardless of case.)
df_no_totals = df[
~df.astype(str).apply(
lambda row: row.str.contains(r"\btotal\b", case=False, na=False).any(),
axis=1
)
]
dfs.append(df_no_totals)
# Step 5: Concatenate
def clean_detail(df):
hour_cols = ["reg_hrs", "ot_hrs", "dt_hrs"]
existing = [c for c in hour_cols if c in df.columns]
# Only drop if ALL hour columns exist AND are NaN
if existing:
df = df.dropna(subset=existing, how="all")
return df
dfs = [clean_detail(df) for df in dfs]
# Actual concat
combined_2024_2025 = pd.concat(dfs, ignore_index=True, sort=False)
# Assure datetime is added correctly and stip the time portion from those values.
if "chk_date" in combined_2024_2025.columns:
combined_2024_2025["chk_date"] = pd.to_datetime(combined_2024_2025["chk_date"], errors="coerce").dt.normalize()
# Combine idast, idsui, and idwd into one column for suta (state unemployment tax)
suta_cols = ["idast", "idsui", "idwd"]
existing_cols = [
c for c in suta_cols
if c in combined_2024_2025.columns
]
combined_2024_2025["suta"] = (
combined_2024_2025[existing_cols]
.fillna(0)
.sum(axis=1)
)
# Make an equivalent to the total gross pay column for 2024 and 2025
gross_wages_cols = ["reg", "dt", "ot", "bonus", "pto", "retro"]
existing = [c for c in gross_wages_cols if c in combined_2024_2025.columns]
combined_2024_2025["total_gross_wages"] = (
combined_2024_2025[existing]
.fillna(0)
.sum(axis=1)
)
# Drop deductions, SUTA (state unemployment benefits) and other columns that are not needed.
drop_col = [
"total_deductions",
"total_deductions_from_gross_pay",
"total_deductions_from_net_pay",
"idast", "idsui", "idwd",
"child_support_garnishment",
"total_taxes_withheld",
"wage_garnishment_other",
"total_earnings",
"telephone_deduction",
"employee_loan_repayment"
]
combined_2024_2025 = combined_2024_2025.drop(columns=drop_col, errors="ignore")
# Step 7: Export Excel file
output_path = r"Z:\Intern Projects\Labor Cost Analysis\Master Data 2019-2025\master_detail_2019-2023.xlsx"
master_2024_2025.to_excel(output_path, index=False)
# Find the zero-based column index of 'chk_date'
cols = list(master_2024_2025.columns)
date_col_idx = cols.index("chk_date") if "chk_date" in cols else None
# This ensures the date fromatting is correct for the final export of the Excel file
with pd.ExcelWriter(output_path, engine="xlsxwriter", datetime_format="m/d/yyyy", date_format="m/d/yyyy") as writer:
master_2024_2025.to_excel(writer, index=False, sheet_name="Sheet1")
if date_col_idx is not None:
workbook = writer.book
worksheet = writer.sheets["Sheet1"]
date_fmt = workbook.add_format({"num_format": "m/d/yyyy"})
# Convert 0-based index to Excel letter range (e.g., B:B)
col_letter = chr(ord('A') + date_col_idx)
worksheet.set_column(f"{col_letter}:{col_letter}", 12, date_fmt)
print("Wrote:", output_path)
# When you are done be sure to check sums of numerical values as they must be accurate.
Я должен добавить, что эти 4 начальных столбца пусты, чтобы избежать раскрытия конфиденциальной информации. После отчаянного просмотра ChatGPT и Copilot он спросил, есть ли в этих столбцах какие-либо значения, помеченные как total, и я убедился, что их нет ни в одном из исходных файлов. Из-за этого я сомневаюсь, что это является причиной удаления этой конкретной строки. Любое направление приветствуется.
У меня есть файл в блокноте Jupyter, в котором несколько разных листов Excel с информацией о затратах на рабочую силу объединены в одну основную таблицу. Он более или менее делал именно то, что я хотел, но при проверке сумм числовых значений я обнаружил, что целая строка первой загруженной электронной таблицы была удалена. Я приведу снимок экрана со значениями, которые появляются в электронной таблице (выделенная строка — это удаляемая строка): [img]https://i.sstatic.net/yPu3oQ0w.png[/img]
[img]https://i.sstatic.net/eAVQjwav.png[/img]
[img]https://i.sstatic.net/XWakCq1c.png[/img]
[img]https://i.sstatic.net/wivtaG4Y.png[/img]
[img]https://i.sstatic.net/xVfzYu2i.png[/img]
И вот код: [code]# Step 1: Load files from Master Data folder
import os import glob import pandas as pd import numpy as np import re import xlsxwriter
location = r"Z:\Intern Projects\Labor Cost Analysis\Master Data 2019-2025"
pd.ExcelFile(filtered_files[0]).sheet_names [/code] [code]# Use this to make sure the worksheets you need are being loaded. Each loaded file will appear in the output with the sheet names.
for f in filtered_files: t = pd.ExcelFile(f) print(os.path.basename(f), "->", t.sheet_names) [/code] [code]# Step 3: Match by Sheet Name. In this case by the 'Detail' sheets found in both files
dfs = []
for f in filtered_files: filename = os.path.basename(f)
# Extract year from filename match = re.search(r"\d{4}", filename) if not match: print(f"Year not found in filename: {filename}") continue
year = int(match.group()) sheet_name = f"{year} Detail"
try: df = pd.read_excel(f, sheet_name=sheet_name) except ValueError: print(f"Sheet '{sheet_name}' not found in {filename}") continue
# First, normalize column names temporarily so we can safely reference them df.columns = ( df.columns .str.strip() .str.lower() .str.replace(r"[^\w]+", "_", regex=True) .str.strip("_") )
df = df.rename(columns={ "dt": "dt", # dollars "dt_hrs": "dt_hrs", # hours "chld1": "child1", # Child Care (1) "employee_name": "employee", # Employee name "class": "dept", # Department or Class "location": "work_location", # Work location "regular_hours": "reg_hrs", # Regular hours "regular_hourly_pay": "reg", # Regular hourly pay "overtime_hours": "ot_hrs", # Overtime hours "overtime_hourly_pay": "ot", # Overtime hourly pay "overtime_x1_5_hourly": "ot_x1.5", # Overtime x1.5 hourly pay "sick": "pto", # Paid during sick days goe sinto PTO column "401k_emp": "_401k", # Employee 401k "beam_dental": "beamd", # Beam dental insurance "bonus_one_time_cash_award": "bonus", # Bonus (one-time cash award) will be included in general bonus column (along with any other bonus column). This includes the proceeding bonus columns. "bonus_one_time_cash_reward": "bonus", # Bonus (one-time cash award) will be included in general bonus column (along with any other bonus column). This includes the proceeding bonus columns. "clerical_hourly_bonuses": "bonus", # Clerical bonus included "clerical_bonuses": "bonus", # Clerical bonus included "health_insurance_emp": "healt", # Employee health insurance "life_ins": "life", # Life insurance "roth_401k_emp": "_4roth", # Employee Roth 401k "4roth": "_4roth", # Employee Roth 401k "federal_withholding": "fitw", # Federal withholding "medicare_employee": "med_ee", # Medicare employee "med": "med_ee", # Medicare employee "medicare_company": "med_er", # Medicare company "med_r": "med_er", # Medicare company "social_security_employee": "ss", # Social security employee "ss": "ss", # Social security employee "social_security_company": "ss_r", # Social security company "ss_r": "ss_r", # Social security company "id_1": "idaho_state_tax", # Idaho state tax "advance_repayment": "advre", # Advance repayment "net_pay": "net", # Net pay "federal_unemployment": "futa", # Federal unemployment (FUTA) "id_unemployment_company": "suta", # Unemployment company (SUTA) "id_workforce_dev_fund": "suta", # Workforce development fund (SUTA) "total_employer_taxes_and_contributions": "suta", # Total employer taxes and contributions (SUTA) "id_withholding": "idaho_state_tax", # Idaho state tax "vison": "vision" # Vision insurance
})
# Merge duplicate columns by summing them if df.columns.duplicated().any(): print(f"{filename} has duplicate columns:", df.columns[df.columns.duplicated()].tolist()) # Sum numeric duplicates, keep first for text df = ( df.T .groupby(level=0) .agg(lambda x: x.sum() if pd.api.types.is_numeric_dtype(x) else x.iloc[0]) .T )
# Remove TOTAL / SUMMARY rows if "employee" in df.columns: df = df[~df["employee"].str.contains( "total|summary|grand", case=False, na=False )]
df["source_file"] = filename df["year"] = year
# Ensure chk_date is datetime and strip time for THIS df if "chk_date" in df.columns: df["chk_date"] = pd.to_datetime(df["chk_date"], errors="coerce").dt.normalize()
# This is to drop the summed 'Total' rows that are in the original sheets. Will be appended to the dataframe. r"\btotal\b" will target any version of the word total regardless of case.) df_no_totals = df[ ~df.astype(str).apply( lambda row: row.str.contains(r"\btotal\b", case=False, na=False).any(), axis=1 ) ]
dfs.append(df_no_totals) [/code] [code]# Step 4: Begin aligning columns
existing = [c for c in hour_cols if c in df.columns]
# Only drop if ALL hour columns exist AND are NaN if existing: df = df.dropna(subset=existing, how="all")
return df
dfs = [clean_detail(df) for df in dfs] # Actual concat combined_2024_2025 = pd.concat(dfs, ignore_index=True, sort=False)
# Assure datetime is added correctly and stip the time portion from those values.
if "chk_date" in combined_2024_2025.columns: combined_2024_2025["chk_date"] = pd.to_datetime(combined_2024_2025["chk_date"], errors="coerce").dt.normalize()
# Combine idast, idsui, and idwd into one column for suta (state unemployment tax) suta_cols = ["idast", "idsui", "idwd"]
existing_cols = [ c for c in suta_cols if c in combined_2024_2025.columns ]
# Make an equivalent to the total gross pay column for 2024 and 2025 gross_wages_cols = ["reg", "dt", "ot", "bonus", "pto", "retro"] existing = [c for c in gross_wages_cols if c in combined_2024_2025.columns] combined_2024_2025["total_gross_wages"] = ( combined_2024_2025[existing] .fillna(0) .sum(axis=1) )
# Drop deductions, SUTA (state unemployment benefits) and other columns that are not needed. drop_col = [ "total_deductions", "total_deductions_from_gross_pay", "total_deductions_from_net_pay", "idast", "idsui", "idwd", "child_support_garnishment", "total_taxes_withheld", "wage_garnishment_other", "total_earnings", "telephone_deduction", "employee_loan_repayment"
output_path = r"Z:\Intern Projects\Labor Cost Analysis\Master Data 2019-2025\master_detail_2019-2023.xlsx" master_2024_2025.to_excel(output_path, index=False)
# Find the zero-based column index of 'chk_date' cols = list(master_2024_2025.columns) date_col_idx = cols.index("chk_date") if "chk_date" in cols else None
# This ensures the date fromatting is correct for the final export of the Excel file with pd.ExcelWriter(output_path, engine="xlsxwriter", datetime_format="m/d/yyyy", date_format="m/d/yyyy") as writer: master_2024_2025.to_excel(writer, index=False, sheet_name="Sheet1") if date_col_idx is not None: workbook = writer.book worksheet = writer.sheets["Sheet1"] date_fmt = workbook.add_format({"num_format": "m/d/yyyy"}) # Convert 0-based index to Excel letter range (e.g., B:B) col_letter = chr(ord('A') + date_col_idx) worksheet.set_column(f"{col_letter}:{col_letter}", 12, date_fmt)
print("Wrote:", output_path)
# When you are done be sure to check sums of numerical values as they must be accurate. [/code] Я должен добавить, что эти 4 начальных столбца пусты, чтобы избежать раскрытия конфиденциальной информации. После отчаянного просмотра ChatGPT и Copilot он спросил, есть ли в этих столбцах какие-либо значения, помеченные как total, и я убедился, что их нет ни в одном из исходных файлов. Из-за этого я сомневаюсь, что это является причиной удаления этой конкретной строки. Любое направление приветствуется.