Я создаю модель классификации, используя XGBoost с конвейером Scikit-learn, который включает предварительную обработку как числовых, так и категориальных признаков.
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score
# ==========================================
# 1. CONFIGURATION
# File locations and column names used throughout the script.
TRAIN_PATH = 'train.csv'
TEST_PATH = 'test.csv'
TARGET_COL = 'Heart Disease'
ID_COL = 'id'

# ==========================================
# 2. DATA LOADING
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

# Separate features and target. The identifier column is excluded from the
# feature matrix; it is referenced through ID_COL everywhere so that renaming
# the column only requires changing the constant above.
X = train_df.drop(columns=[TARGET_COL, ID_COL])
y = train_df[TARGET_COL]

# Keep the test IDs for the submission file, then drop them from the features.
test_ids = test_df[ID_COL]
X_test_submission = test_df.drop(columns=[ID_COL])
# ==========================================
# 3. PREPROCESSING PIPELINE
# Select every numeric dtype rather than only the 64-bit ones: a column that
# matches neither list below is not passed to any transformer, and the
# ColumnTransformer's default remainder='drop' would discard it silently.
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(include=['object', 'category']).columns

# Numerical transformer: impute missing values with the median, then scale.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical transformer: impute missing values with the most frequent
# category, then one-hot encode. handle_unknown='ignore' keeps categories that
# appear only at prediction time from raising an error.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Combine both branches; each one is applied only to its own column subset.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols),
    ]
)
# ==========================================
# 4. MODEL SELECTION & TRAINING
# Hold out 20% of the rows for local validation. stratify=y keeps the class
# proportions of the target the same in both parts, so the validation score is
# not distorted by an unlucky split of an imbalanced target.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Final pipeline: preprocessing followed by the XGBoost classifier. Keeping the
# preprocessing inside the pipeline means it is fitted on the training rows
# only, so nothing leaks from the validation rows.
# NOTE(review): recent XGBoost releases expect class labels encoded as
# 0..n_classes-1; if the target column holds strings, encode y before fitting
# -- confirm against the data.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=150,
        learning_rate=0.1,
        max_depth=5,
        random_state=42,
        eval_metric='logloss',
    )),
])

print("Training Classification Model...")
model.fit(X_train, y_train)

# Local evaluation on the held-out rows.
val_preds = model.predict(X_val)
acc = accuracy_score(y_val, val_preds)
print(f"Validation Accuracy: {acc:.4f}")
# ==========================================
# 5. GENERATE FINAL PREDICTIONS
print("Generating predictions for the test dataset...")

# Refit the whole pipeline (preprocessing + classifier) on all labelled rows,
# then predict the test set with the refitted model.
final_predictions = model.fit(X, y).predict(X_test_submission)

# Assemble the submission table: the saved test IDs next to their predictions.
submission_columns = {ID_COL: test_ids, TARGET_COL: final_predictions}
submission = pd.DataFrame(submission_columns)

# Write the table without the DataFrame index column.
submission.to_csv('classification_submission.csv', index=False)
print("Saved to 'classification_submission.csv'")
Как я могу дополнительно повысить точность на валидации (настройка гиперпараметров, конструирование признаков и т. д.), и является ли хорошей практикой переобучение модели на полном наборе данных перед прогнозированием?
Я создаю модель классификации, используя [b]XGBoost[/b] с [b]конвейером Scikit-learn[/b], который включает предварительную обработку как числовых, так и категориальных признаков. [code]import pandas as pd import numpy as np from xgboost import XGBClassifier from sklearn.model_selection import train_test_split from sklearn.pipeline import Pipeline from sklearn.compose import ColumnTransformer from sklearn.impute import SimpleImputer from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.metrics import accuracy_score
# ========================================== # 5. GENERATE FINAL PREDICTIONS print("Generating predictions for the test dataset...") # Retrain on FULL training data for maximum accuracy model.fit(X, y) final_predictions = model.predict(X_test_submission)
# Create submission file submission = pd.DataFrame({ ID_COL: test_ids, TARGET_COL: final_predictions }) submission.to_csv('classification_submission.csv', index=False) print("Saved to 'classification_submission.csv'") [/code] Как я могу дополнительно [b]повысить точность на валидации[/b] (настройка гиперпараметров, конструирование признаков и т. д.), и является ли хорошей практикой переобучение модели на полном наборе данных перед прогнозированием?