Подходит ли это для шаблона регрессионной модели

Подходит ли это для шаблона регрессионной модели ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Подходит ли это для шаблона регрессионной модели

Цитата

Сообщение Anonymous » 30 мар 2026, 00:09

Подходит ли этот шаблон для регрессионной модели?
.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")

# ─────────────────────────────────────────────
# 1. LOAD DATA
# ─────────────────────────────────────────────
# Both CSV files are read by relative path, i.e. from the current working
# directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)
print("\nFirst 5 rows:")
print(train_df.head())
print("\nData types & nulls:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
train_df.info()
print("\nBasic statistics:")
print(train_df.describe())

# ─────────────────────────────────────────────
# 2. EXPLORATORY DATA ANALYSIS (EDA)
# ─────────────────────────────────────────────
TARGET = "target"  # name of the column to predict; change to match the dataset

# --- 2b. Missing values ---
# NOTE(review): this step was truncated in the pasted source; only the tail
# `> 0].sort_values("Percent", ascending=False))` survived. It is rebuilt here
# from how `missing_df` is used in step 3a: indexed by column name, with a
# "Percent" column on a 0-100 scale. The "Count" column name is an assumption.
null_flags = train_df.isnull()
missing_df = pd.DataFrame(
    {
        "Count": null_flags.sum(),
        "Percent": null_flags.mean() * 100,
    }
)
print("\nMissing values:")
print(missing_df[missing_df["Count"] > 0].sort_values("Percent", ascending=False))

# --- 2c. Target distribution ---
# Histogram of the target column with a KDE curve drawn on top.
target_fig, target_ax = plt.subplots(figsize=(7, 4))
sns.histplot(train_df[TARGET], kde=True, color="steelblue", ax=target_ax)
target_ax.set_title(f"Target Distribution: {TARGET}")
target_fig.tight_layout()
plt.show()

# --- 2d. Correlation heatmap (numeric only) ---
# Pairwise correlations are computed over the numeric columns only.
numeric_cols = train_df.select_dtypes(include=[np.number]).columns
corr_matrix = train_df.loc[:, numeric_cols].corr()

# Boolean mask that is True on and above the diagonal; it is handed to the
# heatmap so those cells are not drawn.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

plt.figure(figsize=(12, 8))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
)
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()

# --- 2e. Top correlated features with target ---
# Rank the columns of corr_matrix by absolute correlation with the target.
# The target column is itself a row of corr_matrix, so it is part of the
# ranking as well.
target_corr = corr_matrix[TARGET]
top_corr = target_corr.abs().sort_values(ascending=False)
print("\nTop correlations with target:")
print(top_corr.iloc[:10])

# --- 2f. Pairplot of top features ---
# Scatter-matrix of the five highest-ranked columns, KDE on the diagonal.
top_features = top_corr.index[:5].tolist()
sns.pairplot(train_df[top_features], diag_kind="kde")
plt.suptitle("Pairplot - Top Correlated Features", y=1.02)
plt.show()

# --- 2g. Boxplots for outliers ---
# One boxplot per numeric column, limited to the first six so they fit the
# 2x3 subplot grid.
plt.figure(figsize=(14, 6))
for i, col in enumerate(numeric_cols[:6]):  # first 6 numeric cols
    # The loop body had lost its indentation in the paste; these three
    # statements are the per-column work.
    plt.subplot(2, 3, i + 1)  # subplot positions are 1-based
    sns.boxplot(y=train_df[col], color="lightcoral")
    plt.title(col)
plt.suptitle("Boxplots - Outlier Check")
plt.tight_layout()
plt.show()

# ─────────────────────────────────────────────
# 3. DATA CLEANING
# ─────────────────────────────────────────────

# --- 3a. Drop columns with too many nulls (threshold: 60%) ---
threshold = 0.6  # fraction of missing values above which a column is dropped

# missing_df["Percent"] is compared against threshold * 100, so it is read on
# a 0-100 scale; the index of missing_df supplies the column names.
too_sparse = missing_df["Percent"] > threshold * 100
cols_to_drop = missing_df.loc[too_sparse].index.tolist()
print(f"\nDropping columns (>{threshold*100}% missing): {cols_to_drop}")

train_df.drop(columns=cols_to_drop, inplace=True)

# Only drop from the test set the columns it actually contains.
present_in_test = [c for c in cols_to_drop if c in test_df.columns]
test_df.drop(columns=present_in_test, inplace=True)

# --- 3b. Drop manually irrelevant columns (IDs, free text, etc.) ---
# NOTE(review): the paste is cut off after this line; the code that actually
# uses cols_to_remove is not visible here.
cols_to_remove = ["id", "Id", "ID", "name", "Name"]  # candidate column names to drop

1774818561

Anonymous

Подходит ли этот шаблон для регрессионной модели?
.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")

# ─────────────────────────────────────────────
# 1. LOAD DATA
# ─────────────────────────────────────────────
# Both CSV files are read by relative path, i.e. from the current working
# directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)
print("\nFirst 5 rows:")
print(train_df.head())
print("\nData types & nulls:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
train_df.info()
print("\nBasic statistics:")
print(train_df.describe())

# ─────────────────────────────────────────────
# 2. EXPLORATORY DATA ANALYSIS (EDA)
# ─────────────────────────────────────────────
TARGET = "target"  # name of the column to predict; change to match the dataset

# --- 2b. Missing values ---
# NOTE(review): this step was truncated in the pasted source; only the tail
# `> 0].sort_values("Percent", ascending=False))` survived. It is rebuilt here
# from how `missing_df` is used in step 3a: indexed by column name, with a
# "Percent" column on a 0-100 scale. The "Count" column name is an assumption.
null_flags = train_df.isnull()
missing_df = pd.DataFrame(
    {
        "Count": null_flags.sum(),
        "Percent": null_flags.mean() * 100,
    }
)
print("\nMissing values:")
print(missing_df[missing_df["Count"] > 0].sort_values("Percent", ascending=False))

# --- 2c. Target distribution ---
# Histogram of the target column with a KDE curve drawn on top.
target_fig, target_ax = plt.subplots(figsize=(7, 4))
sns.histplot(train_df[TARGET], kde=True, color="steelblue", ax=target_ax)
target_ax.set_title(f"Target Distribution: {TARGET}")
target_fig.tight_layout()
plt.show()

# --- 2d. Correlation heatmap (numeric only) ---
# Pairwise correlations are computed over the numeric columns only.
numeric_cols = train_df.select_dtypes(include=[np.number]).columns
corr_matrix = train_df.loc[:, numeric_cols].corr()

# Boolean mask that is True on and above the diagonal; it is handed to the
# heatmap so those cells are not drawn.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

plt.figure(figsize=(12, 8))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
)
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()

# --- 2e. Top correlated features with target ---
# Rank the columns of corr_matrix by absolute correlation with the target.
# The target column is itself a row of corr_matrix, so it is part of the
# ranking as well.
target_corr = corr_matrix[TARGET]
top_corr = target_corr.abs().sort_values(ascending=False)
print("\nTop correlations with target:")
print(top_corr.iloc[:10])

# --- 2f. Pairplot of top features ---
# Scatter-matrix of the five highest-ranked columns, KDE on the diagonal.
top_features = top_corr.index[:5].tolist()
sns.pairplot(train_df[top_features], diag_kind="kde")
plt.suptitle("Pairplot - Top Correlated Features", y=1.02)
plt.show()

# --- 2g. Boxplots for outliers ---
# One boxplot per numeric column, limited to the first six so they fit the
# 2x3 subplot grid.
plt.figure(figsize=(14, 6))
for i, col in enumerate(numeric_cols[:6]):  # first 6 numeric cols
    # The loop body had lost its indentation in the paste; these three
    # statements are the per-column work.
    plt.subplot(2, 3, i + 1)  # subplot positions are 1-based
    sns.boxplot(y=train_df[col], color="lightcoral")
    plt.title(col)
plt.suptitle("Boxplots - Outlier Check")
plt.tight_layout()
plt.show()

# ─────────────────────────────────────────────
# 3. DATA CLEANING
# ─────────────────────────────────────────────

# --- 3a. Drop columns with too many nulls (threshold: 60%) ---
threshold = 0.6  # fraction of missing values above which a column is dropped

# missing_df["Percent"] is compared against threshold * 100, so it is read on
# a 0-100 scale; the index of missing_df supplies the column names.
too_sparse = missing_df["Percent"] > threshold * 100
cols_to_drop = missing_df.loc[too_sparse].index.tolist()
print(f"\nDropping columns (>{threshold*100}% missing): {cols_to_drop}")

train_df.drop(columns=cols_to_drop, inplace=True)

# Only drop from the test set the columns it actually contains.
present_in_test = [c for c in cols_to_drop if c in test_df.columns]
test_df.drop(columns=present_in_test, inplace=True)

# --- 3b. Drop manually irrelevant columns (IDs, free text, etc.) ---
# NOTE(review): the paste is cut off after this line; the code that actually
# uses cols_to_remove is not visible here.
cols_to_remove = ["id", "Id", "ID", "name", "Name"]        # candidate column names to drop