Подходит ли этот шаблон регрессионной модели?
.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")
# ─────────────────────────────────────────────
# 1. LOAD DATA
# ─────────────────────────────────────────────
# Both paths are relative, so the CSV files are looked up in the current
# working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)

print("\nFirst 5 rows:")
print(train_df.head())

print("\nData types & nulls:")
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() would append a stray "None" line to the output.
train_df.info()

print("\nBasic statistics:")
print(train_df.describe())
# ─────────────────────────────────────────────
# 2. EXPLORATORY DATA ANALYSIS (EDA)
# ─────────────────────────────────────────────
TARGET = "target"  # name of the column to predict; change to match the dataset

# --- 2a/2b. Missing values ---
# NOTE(review): the original statements of this step were lost from the
# listing (only the tail `> 0].sort_values("Percent", ascending=False))`
# survived). Reconstructed from that fragment; the "Count" column is an
# assumption, "Percent" is the column the surviving fragment sorts by.
missing_df = pd.DataFrame(
    {
        "Count": train_df.isnull().sum(),
        "Percent": train_df.isnull().mean() * 100,
    }
)
print("\nMissing values per column:")
print(missing_df[missing_df["Percent"] > 0].sort_values("Percent", ascending=False))

# --- 2c. Target distribution ---
plt.figure(figsize=(7, 4))
sns.histplot(train_df[TARGET], kde=True, color="steelblue")
plt.title(f"Target Distribution: {TARGET}")
plt.tight_layout()
plt.show()

# --- 2d. Correlation heatmap (numeric only) ---
plt.figure(figsize=(12, 8))
numeric_cols = train_df.select_dtypes(include=[np.number]).columns
corr_matrix = train_df[numeric_cols].corr()
# Mask the upper triangle (diagonal included): the matrix is symmetric, so
# the lower triangle already shows every pair once.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
)
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()

# --- 2e. Top correlated features with target ---
# Ranked by absolute value so strong negative correlations count as well.
# TARGET must be a numeric column, otherwise it is absent from corr_matrix.
top_corr = corr_matrix[TARGET].abs().sort_values(ascending=False)
print("\nTop correlations with target:")
print(top_corr.head(10))

# --- 2f. Pairplot of top features ---
# The target's correlation with itself is 1, so it normally heads this list
# and the plot shows the target next to its four strongest correlates.
top_features = top_corr.head(5).index.tolist()
sns.pairplot(train_df[top_features], diag_kind="kde")
plt.suptitle("Pairplot - Top Correlated Features", y=1.02)
plt.show()

# --- 2g. Boxplots for outliers ---
plt.figure(figsize=(14, 6))
# Only the first 6 numeric columns are drawn, to fill the 2x3 subplot grid.
for position, col in enumerate(numeric_cols[:6], start=1):
    plt.subplot(2, 3, position)
    sns.boxplot(y=train_df[col], color="lightcoral")
    plt.title(col)
plt.suptitle("Boxplots - Outlier Check")
plt.tight_layout()
plt.show()
# ─────────────────────────────────────────────
# 3. DATA CLEANING
# ─────────────────────────────────────────────
# --- 3a. Drop columns with too many nulls (threshold: 60%) ---
threshold = 0.6
# The share of missing values is computed directly from train_df so that
# this step does not depend on a table built during the EDA section.
missing_pct = train_df.isnull().mean() * 100
cols_to_drop = missing_pct[missing_pct > threshold * 100].index.tolist()
print(f"\nDropping columns (>{threshold*100}% missing): {cols_to_drop}")
train_df.drop(columns=cols_to_drop, inplace=True)
# The test frame may not contain every training column, so only the columns
# it actually has are dropped.
test_df.drop(columns=[c for c in cols_to_drop if c in test_df.columns], inplace=True)

# --- 3b. Drop manually irrelevant columns (IDs, free text, etc.) ---
cols_to_remove = ["id", "Id", "ID", "name", "Name"]
# NOTE(review): the listing is cut off right after this assignment; the
# statements that consume cols_to_remove are not visible here.
Подходит ли это для шаблона регрессионной модели ⇐ Python
Программы на Python
-
Anonymous
1774818561
Anonymous
Подходит ли этот шаблон регрессионной модели?
.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")
# ─────────────────────────────────────────────
# 1. LOAD DATA
# ─────────────────────────────────────────────
# Both paths are relative, so the CSV files are looked up in the current
# working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)

print("\nFirst 5 rows:")
print(train_df.head())

print("\nData types & nulls:")
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() would append a stray "None" line to the output.
train_df.info()

print("\nBasic statistics:")
print(train_df.describe())
# ─────────────────────────────────────────────
# 2. EXPLORATORY DATA ANALYSIS (EDA)
# ─────────────────────────────────────────────
TARGET = "target"  # name of the column to predict; change to match the dataset

# --- 2a/2b. Missing values ---
# NOTE(review): the original statements of this step were lost from the
# listing (only the tail `> 0].sort_values("Percent", ascending=False))`
# survived). Reconstructed from that fragment; the "Count" column is an
# assumption, "Percent" is the column the surviving fragment sorts by.
missing_df = pd.DataFrame(
    {
        "Count": train_df.isnull().sum(),
        "Percent": train_df.isnull().mean() * 100,
    }
)
print("\nMissing values per column:")
print(missing_df[missing_df["Percent"] > 0].sort_values("Percent", ascending=False))

# --- 2c. Target distribution ---
plt.figure(figsize=(7, 4))
sns.histplot(train_df[TARGET], kde=True, color="steelblue")
plt.title(f"Target Distribution: {TARGET}")
plt.tight_layout()
plt.show()

# --- 2d. Correlation heatmap (numeric only) ---
plt.figure(figsize=(12, 8))
numeric_cols = train_df.select_dtypes(include=[np.number]).columns
corr_matrix = train_df[numeric_cols].corr()
# Mask the upper triangle (diagonal included): the matrix is symmetric, so
# the lower triangle already shows every pair once.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
)
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()

# --- 2e. Top correlated features with target ---
# Ranked by absolute value so strong negative correlations count as well.
# TARGET must be a numeric column, otherwise it is absent from corr_matrix.
top_corr = corr_matrix[TARGET].abs().sort_values(ascending=False)
print("\nTop correlations with target:")
print(top_corr.head(10))

# --- 2f. Pairplot of top features ---
# The target's correlation with itself is 1, so it normally heads this list
# and the plot shows the target next to its four strongest correlates.
top_features = top_corr.head(5).index.tolist()
sns.pairplot(train_df[top_features], diag_kind="kde")
plt.suptitle("Pairplot - Top Correlated Features", y=1.02)
plt.show()

# --- 2g. Boxplots for outliers ---
plt.figure(figsize=(14, 6))
# Only the first 6 numeric columns are drawn, to fill the 2x3 subplot grid.
for position, col in enumerate(numeric_cols[:6], start=1):
    plt.subplot(2, 3, position)
    sns.boxplot(y=train_df[col], color="lightcoral")
    plt.title(col)
plt.suptitle("Boxplots - Outlier Check")
plt.tight_layout()
plt.show()
# ─────────────────────────────────────────────
# 3. DATA CLEANING
# ─────────────────────────────────────────────
# --- 3a. Drop columns with too many nulls (threshold: 60%) ---
threshold = 0.6
# The share of missing values is computed directly from train_df so that
# this step does not depend on a table built during the EDA section.
missing_pct = train_df.isnull().mean() * 100
cols_to_drop = missing_pct[missing_pct > threshold * 100].index.tolist()
print(f"\nDropping columns (>{threshold*100}% missing): {cols_to_drop}")
train_df.drop(columns=cols_to_drop, inplace=True)
# The test frame may not contain every training column, so only the columns
# it actually has are dropped.
test_df.drop(columns=[c for c in cols_to_drop if c in test_df.columns], inplace=True)

# --- 3b. Drop manually irrelevant columns (IDs, free text, etc.) ---
cols_to_remove = ["id", "Id", "ID", "name", "Name"]
# NOTE(review): the listing is cut off right after this assignment; the
# statements that consume cols_to_remove are not visible here.
Ответить
1 сообщение
• Страница 1 из 1
Перейти
- Кемерово-IT
- ↳ Javascript
- ↳ C#
- ↳ JAVA
- ↳ Elasticsearch aggregation
- ↳ Python
- ↳ Php
- ↳ Android
- ↳ Html
- ↳ Jquery
- ↳ C++
- ↳ IOS
- ↳ CSS
- ↳ Excel
- ↳ Linux
- ↳ Apache
- ↳ MySql
- Детский мир
- Для души
- ↳ Музыкальные инструменты даром
- ↳ Печатная продукция даром
- Внешняя красота и здоровье
- ↳ Одежда и обувь для взрослых даром
- ↳ Товары для здоровья
- ↳ Физкультура и спорт
- Техника - даром!
- ↳ Автомобилистам
- ↳ Компьютерная техника
- ↳ Плиты: газовые и электрические
- ↳ Холодильники
- ↳ Стиральные машины
- ↳ Телевизоры
- ↳ Телефоны, смартфоны, планшеты
- ↳ Швейные машинки
- ↳ Прочая электроника и техника
- ↳ Фототехника
- Ремонт и интерьер
- ↳ Стройматериалы, инструмент
- ↳ Мебель и предметы интерьера даром
- ↳ Сантехника
- Другие темы
- ↳ Разное даром
- ↳ Давай меняться!
- ↳ Отдам\возьму за копеечку
- ↳ Работа и подработка в Кемерове
- ↳ Давай с тобой поговорим...
Мобильная версия