Я пытаюсь объединить файлы Excel с затратами на рабочую силу за два разных года. Числовые данные одного из файлов удваиваются

Я пытаюсь объединить файлы Excel с затратами на рабочую силу за два разных года. Числовые данные одного из файлов удваиваются ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Я пытаюсь объединить файлы Excel с затратами на рабочую силу за два разных года. Числовые данные одного из файлов удваиваются

Цитата

Сообщение Anonymous » 05 фев 2026, 03:05

Я использую два файла Excel (по одному за 2024 и 2025 годы) и объединяю их с помощью Pandas. Проблему я обнаружил, когда сверял суммы, которые выводит моя программа, с суммами соответствующих столбцов на листах Excel. Для двух столбцов (с именами REG и DT) я выяснил, что в итог дважды вошла в точности общая сумма за 2024 год. Я несколько раз пробовал использовать Chat и Copilot, но, похоже, зашёл в тупик. Вот весь мой код (для справки, если это поможет: я работаю в Jupyter Notebook в VS Code):
# Step 1: Load files from Master Data folder

import os
import glob
import pandas as pd
import numpy as np
import re

# Folder that holds the yearly payroll workbooks.
location = r"Z:\Intern Projects\Labor Cost Analysis\Master Data 2019-2025"

# Collect every .xlsx / .xlsm workbook in the folder (xlsx matches first, as
# before). Names starting with "~$" are skipped; those are the temporary
# files Excel leaves next to a workbook while it is open.
excel_files = [
    path
    for pattern in (r"\*.xlsx", r"\*.xlsm")
    for path in glob.glob(location + pattern)
    if not os.path.basename(path).startswith("~$")
]

print(excel_files)

# Step 2: Filter files to just the 2024 and 2025 files

filtered_files = [
    f for f in excel_files
    if re.search(r"Rigby Produce Payroll Register Data (?:2024|2025)", f)
]

print(filtered_files)

# Show the sheet names of the first matching workbook. The context manager
# closes the workbook handle again, and the guard avoids an IndexError when
# no file matched the filter.
if filtered_files:
    with pd.ExcelFile(filtered_files[0]) as workbook:
        print(workbook.sheet_names)
else:
    print("No 2024/2025 payroll workbooks found in:", location)

# Step 3: Match by Sheet Name. In this case by the 'Detail' sheets found in both files

dfs = []

# Normalized column names that may carry the employee label. Both spellings
# are checked because column_map (Step 4) accepts either one as the employee
# column; checking only "employee_name" would leave the summary rows of a
# sheet that uses "employee" in the data, and their totals would be counted
# on top of the detail rows.
name_columns = ("employee_name", "employee")

for f in filtered_files:
    filename = os.path.basename(f)

    # Extract year from filename (first run of four digits)
    match = re.search(r"\d{4}", filename)
    if not match:
        print(f"Year not found in filename: {filename}")
        continue

    year = int(match.group())
    sheet_name = f"{year} Detail"

    try:
        df = pd.read_excel(f, sheet_name=sheet_name)
    except ValueError:
        print(f"Sheet '{sheet_name}' not found in {filename}")
        continue

    # Normalize column names so they can be referenced safely: cast to str
    # (headers may be numeric), trim, lower-case, collapse each run of
    # non-word characters into "_" and strip "_" from both ends.
    df.columns = (
        df.columns
        .astype(str)
        .str.strip()
        .str.lower()
        .str.replace(r"[^\w]+", "_", regex=True)
        .str.strip("_")
    )

    # Two different headers can normalize to the same name; such a collision
    # makes df[name] return several columns, so report it right away.
    collisions = df.columns[df.columns.duplicated()].unique().tolist()
    if collisions:
        print(f"Duplicate column names after normalization in {filename}: {collisions}")

    # Remove TOTAL / SUMMARY rows
    # NOTE(review): this is a plain substring match, so a real name that
    # contains "total", "summary" or "grand" is dropped too, and a totals row
    # whose label sits in a different column (or has no label) is not caught.
    # Confirm against the sheets.
    for name_col in name_columns:
        if name_col in df.columns:
            is_summary = df[name_col].astype(str).str.contains(
                "total|summary|grand", case=False, na=False
            )
            df = df[~is_summary]

    # assign() returns a new frame, so the provenance columns are not written
    # into a filtered slice (which would raise SettingWithCopyWarning).
    df = df.assign(source_file=filename, year=year)

    dfs.append(df)

# Step 4: Begin aligning columns

# Maps normalized column-name variants to one canonical name per concept.
column_map = {
    # employee
    "employee": "employee",
    "employee_name": "employee",

    # DT dollars and hours
    "dt": "dt",
    "dt_hrs": "dt_hrs",

    # common hour variants
    "reg_hours": "reg_hrs",
    "regular_hours": "reg_hrs",
    "ot_hours": "ot_hours",
}


def standardize_columns(df):
    """Return a copy of ``df`` with its columns renamed through ``column_map``.

    Columns that are not keys of ``column_map`` keep their name. When two
    source columns end up with the same name (for example ``reg_hours`` and
    ``regular_hours`` both present), a warning is emitted, because selecting
    that name afterwards yields several columns instead of one.

    Args:
        df: Frame whose column names are already normalized.

    Returns:
        A new frame; ``df`` itself is not modified.
    """
    import warnings

    renamed = df.rename(columns=column_map)
    clashes = renamed.columns[renamed.columns.duplicated()].unique().tolist()
    if clashes:
        warnings.warn(
            f"Columns merged into the same name by column_map: {clashes}",
            stacklevel=2,
        )
    return renamed

# Apply the canonical column names to every loaded frame.
dfs = [standardize_columns(df) for df in dfs]

# Sanity check: one line per frame with its position, year and row count.
print("Number of dataframes:", len(dfs))
for i, df in enumerate(dfs):
    # An empty frame has no first row, so its year is reported as None
    # instead of raising an IndexError.
    frame_year = df["year"].iloc[0] if len(df) else None
    print(i, frame_year, len(df))

(Этот вывод показывает два фрейма данных — по одному за 2024 и 2025 годы.)
# Step 6: Concatenate (Correctly)

def clean_detail(df, hour_cols=("reg_hrs", "ot_hours", "dt_hrs")):
    """Drop the rows that have no value in any of the hour columns.

    NOTE(review): no call to this helper is visible in this script; confirm
    whether it is meant to be applied to the frames before concatenation.

    Args:
        df: Detail frame with standardized column names.
        hour_cols: Names of the columns to inspect. Names that are not
            columns of ``df`` are ignored instead of raising ``KeyError``.

    Returns:
        A new frame without the rows in which every inspected column is
        missing. ``df`` is returned unchanged when none of ``hour_cols`` is
        present, since there is nothing to judge the rows by.
    """
    present = [col for col in hour_cols if col in df.columns]
    if not present:
        return df
    return df.dropna(subset=present, how="all")

# Actual concat
combined_2024_2025 = pd.concat(dfs, ignore_index=True, sort=False)

# Step 7: Light Data Cleaning (Before Export)

# Ensure numeric columns
# NOTE(review): "chk_date" and "process" look like date fields. Text values
# in any listed column are replaced with NaN by errors="coerce"; confirm the
# list before relying on the export.
numeric_cols = [
    "process", "chk_date",
    "chk_vchr", "net", "reg_hrs", "reg",
    "dt", "ot_hours", "ot", "_401er", "bonus",
    "ermed", "grptl", "haltd", "hsaer", "pto",
    "total_hrs", "total_earnings", "fitw", "med",
    "medhi", "ss", "total_taxes", "med_r",
    "ss_r", "idsat", "idsui", "idwd", "futa",
    "total_taxes_er", "_401cu", "_401k", "_4roth",
    "advre", "beamd", "cell", "chld1", "garn1",
    "healt", "sterl", "tcfob", "travr", "ulife",
    "vis", "vision", "total_deductions"
]
for col in numeric_cols:
    # Loading strips "_" from both ends of every column name, so an entry
    # such as "_401er" is also looked up without its leading underscore.
    target = next(
        (name for name in (col, col.strip("_"))
         if name in combined_2024_2025.columns),
        None,
    )
    if target is None:
        continue
    # Leave real datetime columns alone; to_numeric would turn them into
    # raw integer timestamps.
    if pd.api.types.is_datetime64_any_dtype(combined_2024_2025[target]):
        continue
    filled_before = combined_2024_2025[target].notna().sum()
    combined_2024_2025[target] = pd.to_numeric(
        combined_2024_2025[target], errors="coerce"
    )
    lost = filled_before - combined_2024_2025[target].notna().sum()
    if lost:
        # Make silent data loss visible, since the sums must stay accurate.
        print(f"{target}: {lost} non-numeric value(s) replaced with NaN")

# Remove blank rows. This runs after the coercion so that the exported frame
# carries the numeric values. "source_file" and "year" are filled in on every
# row during loading, so they are excluded from the blank test; with them
# included no row could ever count as empty.
data_cols = [
    col for col in combined_2024_2025.columns
    if col not in ("source_file", "year")
]
master_2024_2025 = combined_2024_2025
if data_cols:
    master_2024_2025 = combined_2024_2025.dropna(how="all", subset=data_cols)

output_path = r"Z:\Intern Projects\Labor Cost Analysis\Master Data 2019-2025\master_detail_2024_2025.xlsx"
master_2024_2025.to_excel(output_path, index=False)
# When you are done be sure to check sums of numerical values as they must be accurate.

Подробнее здесь: https://stackoverflow.com/questions/798 ... sts-one-of

1770249903

Anonymous

Я использую два файла Excel (по одному за 2024 и 2025 годы) и объединяю их с помощью Pandas. Проблему я обнаружил, когда сверял суммы, которые выводит моя программа, с суммами соответствующих столбцов на листах Excel. Для двух столбцов (с именами REG и DT) я выяснил, что в итог дважды вошла в точности общая сумма за 2024 год. Я несколько раз пробовал использовать Chat и Copilot, но, похоже, зашёл в тупик. Вот весь мой код (для справки, если это поможет: я работаю в Jupyter Notebook в VS Code):
# Step 1: Load files from Master Data folder

import os
import glob
import pandas as pd
import numpy as np
import re

# Folder that holds the yearly payroll workbooks.
location = r"Z:\Intern Projects\Labor Cost Analysis\Master Data 2019-2025"

# Collect every .xlsx / .xlsm workbook in the folder (xlsx matches first, as
# before). Names starting with "~$" are skipped; those are the temporary
# files Excel leaves next to a workbook while it is open.
excel_files = [
    path
    for pattern in (r"\*.xlsx", r"\*.xlsm")
    for path in glob.glob(location + pattern)
    if not os.path.basename(path).startswith("~$")
]

print(excel_files)

# Step 2: Filter files to just the 2024 and 2025 files

filtered_files = [
    f for f in excel_files
    if re.search(r"Rigby Produce Payroll Register Data (?:2024|2025)", f)
]

print(filtered_files)

# Show the sheet names of the first matching workbook. The context manager
# closes the workbook handle again, and the guard avoids an IndexError when
# no file matched the filter.
if filtered_files:
    with pd.ExcelFile(filtered_files[0]) as workbook:
        print(workbook.sheet_names)
else:
    print("No 2024/2025 payroll workbooks found in:", location)

# Step 3: Match by Sheet Name.  In this case by the 'Detail' sheets found in both files

dfs = []

# Normalized column names that may carry the employee label. Both spellings
# are checked because column_map (Step 4) accepts either one as the employee
# column; checking only "employee_name" would leave the summary rows of a
# sheet that uses "employee" in the data, and their totals would be counted
# on top of the detail rows.
name_columns = ("employee_name", "employee")

for f in filtered_files:
    filename = os.path.basename(f)

    # Extract year from filename (first run of four digits)
    match = re.search(r"\d{4}", filename)
    if not match:
        print(f"Year not found in filename: {filename}")
        continue

    year = int(match.group())
    sheet_name = f"{year} Detail"

    try:
        df = pd.read_excel(f, sheet_name=sheet_name)
    except ValueError:
        print(f"Sheet '{sheet_name}' not found in {filename}")
        continue

    # Normalize column names so they can be referenced safely: cast to str
    # (headers may be numeric), trim, lower-case, collapse each run of
    # non-word characters into "_" and strip "_" from both ends.
    df.columns = (
        df.columns
        .astype(str)
        .str.strip()
        .str.lower()
        .str.replace(r"[^\w]+", "_", regex=True)
        .str.strip("_")
    )

    # Two different headers can normalize to the same name; such a collision
    # makes df[name] return several columns, so report it right away.
    collisions = df.columns[df.columns.duplicated()].unique().tolist()
    if collisions:
        print(f"Duplicate column names after normalization in {filename}: {collisions}")

    # Remove TOTAL / SUMMARY rows
    # NOTE(review): this is a plain substring match, so a real name that
    # contains "total", "summary" or "grand" is dropped too, and a totals row
    # whose label sits in a different column (or has no label) is not caught.
    # Confirm against the sheets.
    for name_col in name_columns:
        if name_col in df.columns:
            is_summary = df[name_col].astype(str).str.contains(
                "total|summary|grand", case=False, na=False
            )
            df = df[~is_summary]

    # assign() returns a new frame, so the provenance columns are not written
    # into a filtered slice (which would raise SettingWithCopyWarning).
    df = df.assign(source_file=filename, year=year)

    dfs.append(df)

# Step 4: Begin aligning columns

# Maps normalized column-name variants to one canonical name per concept.
column_map = {
    # employee
    "employee": "employee",
    "employee_name": "employee",

    # DT dollars and hours
    "dt": "dt",
    "dt_hrs": "dt_hrs",

    # common hour variants
    "reg_hours": "reg_hrs",
    "regular_hours": "reg_hrs",
    "ot_hours": "ot_hours",
}


def standardize_columns(df):
    """Return a copy of ``df`` with its columns renamed through ``column_map``.

    Columns that are not keys of ``column_map`` keep their name. When two
    source columns end up with the same name (for example ``reg_hours`` and
    ``regular_hours`` both present), a warning is emitted, because selecting
    that name afterwards yields several columns instead of one.

    Args:
        df: Frame whose column names are already normalized.

    Returns:
        A new frame; ``df`` itself is not modified.
    """
    import warnings

    renamed = df.rename(columns=column_map)
    clashes = renamed.columns[renamed.columns.duplicated()].unique().tolist()
    if clashes:
        warnings.warn(
            f"Columns merged into the same name by column_map: {clashes}",
            stacklevel=2,
        )
    return renamed

# Apply the canonical column names to every loaded frame.
dfs = [standardize_columns(df) for df in dfs]

# Sanity check: one line per frame with its position, year and row count.
print("Number of dataframes:", len(dfs))
for i, df in enumerate(dfs):
    # An empty frame has no first row, so its year is reported as None
    # instead of raising an IndexError.
    frame_year = df["year"].iloc[0] if len(df) else None
    print(i, frame_year, len(df))

(Этот вывод показывает два фрейма данных — по одному за 2024 и 2025 годы.)
# Step 6: Concatenate (Correctly)

def clean_detail(df, hour_cols=("reg_hrs", "ot_hours", "dt_hrs")):
    """Drop the rows that have no value in any of the hour columns.

    NOTE(review): no call to this helper is visible in this script; confirm
    whether it is meant to be applied to the frames before concatenation.

    Args:
        df: Detail frame with standardized column names.
        hour_cols: Names of the columns to inspect. Names that are not
            columns of ``df`` are ignored instead of raising ``KeyError``.

    Returns:
        A new frame without the rows in which every inspected column is
        missing. ``df`` is returned unchanged when none of ``hour_cols`` is
        present, since there is nothing to judge the rows by.
    """
    present = [col for col in hour_cols if col in df.columns]
    if not present:
        return df
    return df.dropna(subset=present, how="all")

# Actual concat
combined_2024_2025 = pd.concat(dfs, ignore_index=True, sort=False)

# Step 7: Light Data Cleaning (Before Export)

# Ensure numeric columns
# NOTE(review): "chk_date" and "process" look like date fields. Text values
# in any listed column are replaced with NaN by errors="coerce"; confirm the
# list before relying on the export.
numeric_cols = [
    "process", "chk_date",
    "chk_vchr", "net", "reg_hrs", "reg",
    "dt", "ot_hours", "ot", "_401er", "bonus",
    "ermed", "grptl", "haltd", "hsaer", "pto",
    "total_hrs", "total_earnings", "fitw", "med",
    "medhi", "ss", "total_taxes", "med_r",
    "ss_r", "idsat", "idsui", "idwd", "futa",
    "total_taxes_er", "_401cu", "_401k", "_4roth",
    "advre", "beamd", "cell", "chld1", "garn1",
    "healt", "sterl", "tcfob", "travr", "ulife",
    "vis", "vision", "total_deductions"
]
for col in numeric_cols:
    # Loading strips "_" from both ends of every column name, so an entry
    # such as "_401er" is also looked up without its leading underscore.
    target = next(
        (name for name in (col, col.strip("_"))
         if name in combined_2024_2025.columns),
        None,
    )
    if target is None:
        continue
    # Leave real datetime columns alone; to_numeric would turn them into
    # raw integer timestamps.
    if pd.api.types.is_datetime64_any_dtype(combined_2024_2025[target]):
        continue
    filled_before = combined_2024_2025[target].notna().sum()
    combined_2024_2025[target] = pd.to_numeric(
        combined_2024_2025[target], errors="coerce"
    )
    lost = filled_before - combined_2024_2025[target].notna().sum()
    if lost:
        # Make silent data loss visible, since the sums must stay accurate.
        print(f"{target}: {lost} non-numeric value(s) replaced with NaN")

# Remove blank rows. This runs after the coercion so that the exported frame
# carries the numeric values. "source_file" and "year" are filled in on every
# row during loading, so they are excluded from the blank test; with them
# included no row could ever count as empty.
data_cols = [
    col for col in combined_2024_2025.columns
    if col not in ("source_file", "year")
]
master_2024_2025 = combined_2024_2025
if data_cols:
    master_2024_2025 = combined_2024_2025.dropna(how="all", subset=data_cols)

output_path = r"Z:\Intern Projects\Labor Cost Analysis\Master Data 2019-2025\master_detail_2024_2025.xlsx"
master_2024_2025.to_excel(output_path, index=False)
# When you are done be sure to check sums of numerical values as they must be accurate.
 

Подробнее здесь: [url]https://stackoverflow.com/questions/79882231/im-tying-to-combine-excel-files-from-two-different-years-of-labor-costs-one-of[/url]