Anonymous
Почему я получаю негативную точность для прогнозирования времени и как его решить?
Сообщение
Anonymous » 25 янв 2025, 11:12
Я пытаюсь запустить некоторое время прогнозирования серий на около 8 наборов данных. < /p>
Они выглядят так: < /p>
Код: Выделить всё
from pyspark.sql.functions import col, lit
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
data = [
("2022-07-01", 11767424, 25.774193548387096, 1),
("2022-08-01", 13331928, 25.677419354838708, 1),
("2022-09-01", 11194711, 19.633333333333333, 1),
("2022-10-01", 11506759, 17.967741935483872, 1),
("2022-11-01", 9525865, 12.933333333333334, 1),
("2022-12-01", 8438520, 7.96774193548387, 1),
("2023-01-01", 8811170, 8.806451612903226, 1),
("2023-02-01", 11417707, 10.464285714285714, 1),
("2023-03-01", 12539421, 10.935483870967742, 1),
("2023-04-01", 10824295, 13.5, 1),
("2023-05-01", 11651067, 17.538709677419355, 1.1),
("2023-06-01", 12870035, 24.043333333333333, 1),
("2023-07-01", 10717095, 22.032258064516128, 1),
("2023-08-01", 8262566, 22.483870967741936, 1),
("2023-09-01", 7564720, 23.53333333333333, 1),
("2023-10-01", 8540128, 17.580645161290324, 1),
("2023-11-01", 9278635, 11.366666666666667, 1),
("2023-12-01", 9293826, 10.580645161290322, 1),
("2024-01-01", 9628144, 8.483870967741936, 1.1),
("2024-02-01", 10127209, 11.96551724137931, 1),
("2024-03-01", 11405640, 12.419354838709678, 1),
("2024-04-01", 16075499, 14.26666666666667, 1.1),
("2024-05-01", 14236947, 18.64516129032258, 1),
("2024-06-01", 13946271, 20.7, 1),
("2024-07-01", 12968261, 22.548387096774192, 1),
("2024-08-01", 11624620, 24, 1),
("2024-09-01", 9705773, 19.3, 1),
("2024-10-01", 11821238, 16.096774193548388, 1),
("2024-11-01", 11457334, 11.166666666666666, 1),
("2024-12-01", 10537240, 9.66451612903226, 1)
]
schema = StructType([
StructField("date", StringType(), True),
StructField("total_invoiced_volume", IntegerType(), True),
StructField("monthly_avg_tmax", DoubleType(), True),
StructField("Week_Weight", DoubleType(), True)
])
gro_monthly_ev = spark.createDataFrame(data, schema)
display(gro_monthly_ev)
Все эти наборы данных хранятся в папке и создают серию алгоритмов прогнозирования для прогнозирования продаж через 3, 6, 9, 12 месяцев.
это код, который берет все наборы данных и применяет алгоритмы временного прогнозирования.
Код: Выделить всё
%python
import pandas as pd
import os
import glob
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_percentage_error, r2_score
from tqdm import tqdm
import numpy as np
# Define directories
input_dir = "/Workspace/Users/gab/path_to_merged_data"
output_dir = "/Workspace/Users/gab/forecast_results"
os.makedirs(output_dir, exist_ok=True)
# List all monthly datasets
file_list = [file for file in glob.glob(os.path.join(input_dir, "*.csv")) if "monthly" in file]
# Initialize results storage
results = []
future_forecasts = []
# Forecasting horizons
forecast_horizons = [3, 6, 9, 12]
# Define models
models = {
"Decision Tree": DecisionTreeRegressor(max_depth=4, random_state=42),
"Random Forest": RandomForestRegressor(max_depth=4, random_state=42),
"XGBoost": XGBRegressor(max_depth=4, random_state=42),
"LightGBM": LGBMRegressor(random_state=42)
}
# Process each dataset
for file in file_list:
# Load dataset
dataset_name = os.path.basename(file)
df = pd.read_csv(file)
# Ensure no missing values
if df.isnull().sum().sum() > 0:
continue
# Ensure date column is in datetime format
df["date"] = pd.to_datetime(df["date"])
# Process variables
target_col = "total_invoiced_volume"
df["monthly_avg_tmax"] = df["monthly_avg_tmax"].round(0).astype(int) # Convert decimals to integers for time series models
# Prepare train and test sets
df.sort_values(by="date", inplace=True)
test = df[df["date"] >= df["date"].max() - pd.DateOffset(months=3)]
train = df[df["date"] < df["date"].max() - pd.DateOffset(months=3)]
# Features and target
features = [col for col in df.columns if col not in ["date", target_col]]
X_train, y_train = train[features], train[target_col]
X_test, y_test = test[features], test[target_col]
# Train and forecast with each model
for model_name, model in tqdm(models.items(), desc=f"Processing {dataset_name}"):
# Train the model
model.fit(X_train, y_train)
# Predict on test data
y_pred = model.predict(X_test)
# Calculate metrics
mape = mean_absolute_percentage_error(y_test, y_pred)
accuracy = r2_score(y_test, y_pred)
# Store results
for horizon in forecast_horizons:
future_dates = pd.date_range(start=df["date"].max(), periods=horizon + 1, freq="MS")[1:]
future_X = X_test.iloc[:min(horizon, len(X_test)), :] # Ensure future_X has enough rows
future_y = model.predict(future_X)
results.append({
"model": model_name,
"dataset": dataset_name,
"horizon": horizon,
"mape": mape,
"accuracy": accuracy
})
# Store future forecast
for i, forecast_date in enumerate(future_dates[:len(future_y)]):
future_forecasts.append({
"model": model_name,
"forecast_date": forecast_date,
"forecast": future_y[i]
})
# Convert results to DataFrames
results_df = pd.DataFrame(results)
future_forecasts_df = pd.DataFrame(future_forecasts)
# Save results
results_file = os.path.join(output_dir, "forecast_results.csv")
future_forecasts_file = os.path.join(output_dir, "future_forecasts.csv")
results_df.to_csv(results_file, index=False)
future_forecasts_df.to_csv(future_forecasts_file, index=False)
print(f"Forecast results saved to {results_file}")
print(f"Future forecasts saved to {future_forecasts_file}")
< /code>
и выходы поставляются с отрицательными значениями: < /p>
model dataset horizon mape accuracy
XGBoost synthetic_dataset_1.csv 3 0.9345570675519763 -96.02492281747726
XGBoost synthetic_dataset_1.csv 6 0.6271307289552767 -70.91875294532318
XGBoost synthetic_dataset_1.csv 9 0.9917830267047274 -98.13387817239064
XGBoost synthetic_dataset_1.csv 12 1.8660074870488188 -78.53834752084283
LightGBM synthetic_dataset_1.csv 3 1.8211836390068303 -13.443149016458591
LightGBM synthetic_dataset_1.csv 6 0.6815504925684449 -155.64855098317562
LightGBM synthetic_dataset_1.csv 9 0.10735314290676172 -114.71200513051166
LightGBM synthetic_dataset_1.csv 12 1.2607188837495213 -79.2995414610846
Decision Tree synthetic_dataset_1.csv 3 1.086431157027433 -37.56293565488065
Decision Tree synthetic_dataset_1.csv 6 0.24353449685825834 -195.8726668013846
Decision Tree synthetic_dataset_1.csv 9 1.0238986600195292 -159.2508264827403
Decision Tree synthetic_dataset_1.csv 12 1.8688982557030902 -16.728460706610804
Random Forest synthetic_dataset_1.csv 3 0.9468312070925308 -171.5793613682497
Random Forest synthetic_dataset_1.csv 6 1.2874845795631709 -46.06559850787815
Random Forest synthetic_dataset_1.csv 9 0.8402783930768509 -63.063988951879764
Random Forest synthetic_dataset_1.csv 12 0.6203818639940742 -49.10659016208899
Могу ли я понять, почему он делает то, что делает, и как это решить?
Подробнее здесь:
https://stackoverflow.com/questions/793 ... o-solve-it
1737792747
Anonymous
Я пытаюсь запустить некоторое время прогнозирования серий на около 8 наборов данных. < /p> Они выглядят так: < /p> [code]from pyspark.sql.functions import col, lit from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType data = [ ("2022-07-01", 11767424, 25.774193548387096, 1), ("2022-08-01", 13331928, 25.677419354838708, 1), ("2022-09-01", 11194711, 19.633333333333333, 1), ("2022-10-01", 11506759, 17.967741935483872, 1), ("2022-11-01", 9525865, 12.933333333333334, 1), ("2022-12-01", 8438520, 7.96774193548387, 1), ("2023-01-01", 8811170, 8.806451612903226, 1), ("2023-02-01", 11417707, 10.464285714285714, 1), ("2023-03-01", 12539421, 10.935483870967742, 1), ("2023-04-01", 10824295, 13.5, 1), ("2023-05-01", 11651067, 17.538709677419355, 1.1), ("2023-06-01", 12870035, 24.043333333333333, 1), ("2023-07-01", 10717095, 22.032258064516128, 1), ("2023-08-01", 8262566, 22.483870967741936, 1), ("2023-09-01", 7564720, 23.53333333333333, 1), ("2023-10-01", 8540128, 17.580645161290324, 1), ("2023-11-01", 9278635, 11.366666666666667, 1), ("2023-12-01", 9293826, 10.580645161290322, 1), ("2024-01-01", 9628144, 8.483870967741936, 1.1), ("2024-02-01", 10127209, 11.96551724137931, 1), ("2024-03-01", 11405640, 12.419354838709678, 1), ("2024-04-01", 16075499, 14.26666666666667, 1.1), ("2024-05-01", 14236947, 18.64516129032258, 1), ("2024-06-01", 13946271, 20.7, 1), ("2024-07-01", 12968261, 22.548387096774192, 1), ("2024-08-01", 11624620, 24, 1), ("2024-09-01", 9705773, 19.3, 1), ("2024-10-01", 11821238, 16.096774193548388, 1), ("2024-11-01", 11457334, 11.166666666666666, 1), ("2024-12-01", 10537240, 9.66451612903226, 1) ] schema = StructType([ StructField("date", StringType(), True), StructField("total_invoiced_volume", IntegerType(), True), StructField("monthly_avg_tmax", DoubleType(), True), StructField("Week_Weight", DoubleType(), True) ]) gro_monthly_ev = spark.createDataFrame(data, schema) display(gro_monthly_ev) [/code] Все эти наборы данных хранятся в папке и создают серию алгоритмов прогнозирования для прогнозирования продаж через 3, 6, 9, 12 месяцев. это код, который берет все наборы данных и применяет алгоритмы временного прогнозирования. [code]%python import pandas as pd import os import glob from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestRegressor from xgboost import XGBRegressor from lightgbm import LGBMRegressor from sklearn.metrics import mean_absolute_percentage_error, r2_score from tqdm import tqdm import numpy as np # Define directories input_dir = "/Workspace/Users/gab/path_to_merged_data" output_dir = "/Workspace/Users/gab/forecast_results" os.makedirs(output_dir, exist_ok=True) # List all monthly datasets file_list = [file for file in glob.glob(os.path.join(input_dir, "*.csv")) if "monthly" in file] # Initialize results storage results = [] future_forecasts = [] # Forecasting horizons forecast_horizons = [3, 6, 9, 12] # Define models models = { "Decision Tree": DecisionTreeRegressor(max_depth=4, random_state=42), "Random Forest": RandomForestRegressor(max_depth=4, random_state=42), "XGBoost": XGBRegressor(max_depth=4, random_state=42), "LightGBM": LGBMRegressor(random_state=42) } # Process each dataset for file in file_list: # Load dataset dataset_name = os.path.basename(file) df = pd.read_csv(file) # Ensure no missing values if df.isnull().sum().sum() > 0: continue # Ensure date column is in datetime format df["date"] = pd.to_datetime(df["date"]) # Process variables target_col = "total_invoiced_volume" df["monthly_avg_tmax"] = df["monthly_avg_tmax"].round(0).astype(int) # Convert decimals to integers for time series models # Prepare train and test sets df.sort_values(by="date", inplace=True) test = df[df["date"] >= df["date"].max() - pd.DateOffset(months=3)] train = df[df["date"] < df["date"].max() - pd.DateOffset(months=3)] # Features and target features = [col for col in df.columns if col not in ["date", target_col]] X_train, y_train = train[features], train[target_col] X_test, y_test = test[features], test[target_col] # Train and forecast with each model for model_name, model in tqdm(models.items(), desc=f"Processing {dataset_name}"): # Train the model model.fit(X_train, y_train) # Predict on test data y_pred = model.predict(X_test) # Calculate metrics mape = mean_absolute_percentage_error(y_test, y_pred) accuracy = r2_score(y_test, y_pred) # Store results for horizon in forecast_horizons: future_dates = pd.date_range(start=df["date"].max(), periods=horizon + 1, freq="MS")[1:] future_X = X_test.iloc[:min(horizon, len(X_test)), :] # Ensure future_X has enough rows future_y = model.predict(future_X) results.append({ "model": model_name, "dataset": dataset_name, "horizon": horizon, "mape": mape, "accuracy": accuracy }) # Store future forecast for i, forecast_date in enumerate(future_dates[:len(future_y)]): future_forecasts.append({ "model": model_name, "forecast_date": forecast_date, "forecast": future_y[i] }) # Convert results to DataFrames results_df = pd.DataFrame(results) future_forecasts_df = pd.DataFrame(future_forecasts) # Save results results_file = os.path.join(output_dir, "forecast_results.csv") future_forecasts_file = os.path.join(output_dir, "future_forecasts.csv") results_df.to_csv(results_file, index=False) future_forecasts_df.to_csv(future_forecasts_file, index=False) print(f"Forecast results saved to {results_file}") print(f"Future forecasts saved to {future_forecasts_file}") < /code> и выходы поставляются с отрицательными значениями: < /p> model dataset horizon mape accuracy XGBoost synthetic_dataset_1.csv 3 0.9345570675519763 -96.02492281747726 XGBoost synthetic_dataset_1.csv 6 0.6271307289552767 -70.91875294532318 XGBoost synthetic_dataset_1.csv 9 0.9917830267047274 -98.13387817239064 XGBoost synthetic_dataset_1.csv 12 1.8660074870488188 -78.53834752084283 LightGBM synthetic_dataset_1.csv 3 1.8211836390068303 -13.443149016458591 LightGBM synthetic_dataset_1.csv 6 0.6815504925684449 -155.64855098317562 LightGBM synthetic_dataset_1.csv 9 0.10735314290676172 -114.71200513051166 LightGBM synthetic_dataset_1.csv 12 1.2607188837495213 -79.2995414610846 Decision Tree synthetic_dataset_1.csv 3 1.086431157027433 -37.56293565488065 Decision Tree synthetic_dataset_1.csv 6 0.24353449685825834 -195.8726668013846 Decision Tree synthetic_dataset_1.csv 9 1.0238986600195292 -159.2508264827403 Decision Tree synthetic_dataset_1.csv 12 1.8688982557030902 -16.728460706610804 Random Forest synthetic_dataset_1.csv 3 0.9468312070925308 -171.5793613682497 Random Forest synthetic_dataset_1.csv 6 1.2874845795631709 -46.06559850787815 Random Forest synthetic_dataset_1.csv 9 0.8402783930768509 -63.063988951879764 Random Forest synthetic_dataset_1.csv 12 0.6203818639940742 -49.10659016208899 [/code] Могу ли я понять, почему он делает то, что делает, и как это решить? Подробнее здесь: [url]https://stackoverflow.com/questions/79385143/why-do-i-get-negative-accuracy-rate-for-time-forecasting-and-how-to-solve-it[/url]