Я разбил код на отдельные функции для предварительной обработки, масштабирования и обучения модели.
Я хотел бы проверить, соответствует ли этот проект передовым практикам, особенно в отношении предварительной обработки и выбора модели.
Основные вопросы:
- Подходит ли модульная структура для рабочих процессов машинного обучения?
- Правилен ли мой подход к выбору признаков с использованием порога корреляции?
- Правилен ли порядок нормализации и разделения на обучающую и тестовую выборки?
- Уместно ли использовать линейную регрессию для задачи двоичной классификации?
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
def load_data(path: str = "/kaggle/input/heart-disease-uci/heart.csv") -> pd.DataFrame:
    """Read the CSV file at ``path`` into a DataFrame.

    Args:
        path: Location of the CSV file. Defaults to the Kaggle input path
            for ``heart.csv``.

    Returns:
        The parsed table, exactly as ``pandas.read_csv`` produces it.
    """
    return pd.read_csv(path)
def encode_binary_target(
    df: pd.DataFrame, target_column: str, mapping=None
) -> pd.DataFrame:
    """Return a copy of ``df`` with the target labels mapped to numbers.

    Labels that are not keys of ``mapping`` (for example a target that is
    already 0/1) are left untouched, so the function is safe to call on
    data that is encoded already.

    Args:
        df: Input table. It is not modified.
        target_column: Name of the column holding the class labels.
        mapping: Optional ``{label: code}`` dictionary. When omitted, the
            B/M convention ``{"B": 0, "M": 1}`` is used.

    Returns:
        A new DataFrame whose ``target_column`` holds the encoded labels.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    if target_column not in df.columns:
        raise KeyError(f"Target column {target_column!r} not found in DataFrame")

    if mapping is None:
        mapping = {"B": 0, "M": 1}

    # Work on a copy so the caller's frame is never mutated as a side effect.
    encoded = df.copy()
    # Element-wise lookup with a fallback to the original value keeps
    # unmapped labels as they are and lets pandas infer a numeric dtype
    # without relying on replace()'s implicit downcasting.
    encoded[target_column] = encoded[target_column].map(
        lambda label: mapping.get(label, label)
    )
    return encoded
def preprocess_data(
    df: pd.DataFrame, target_column: str, threshold: float = 0.5
) -> pd.DataFrame:
    """Clean the data and keep only columns strongly correlated with the target.

    Steps: drop rows with missing values, drop duplicate rows, encode the
    target via ``encode_binary_target``, cast it to float, then keep every
    numeric column whose absolute correlation with the target exceeds
    ``threshold``. The target correlates perfectly with itself, so it is
    normally part of the returned frame.

    NOTE(review): the correlation filter is computed on every row passed in.
    When the result is split into train/test afterwards, the selection has
    already seen the test rows. For an unbiased estimate, select features on
    the training split only.

    Args:
        df: Raw input table. It is not modified.
        target_column: Name of the label column.
        threshold: Minimum absolute correlation with the target that a
            column needs in order to be kept.

    Returns:
        A DataFrame restricted to the selected columns.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    if target_column not in df.columns:
        raise KeyError(f"Target column {target_column!r} not found in DataFrame")

    # Take an explicit copy: assigning into the result of dropna() /
    # drop_duplicates() can trigger pandas' SettingWithCopyWarning.
    cleaned = df.dropna().drop_duplicates().copy()

    # Encode binary labels (e.g. B/M) if present, then make the target float
    # for the downstream models.
    cleaned = encode_binary_target(cleaned, target_column)
    cleaned[target_column] = cleaned[target_column].astype(float)

    correlation = cleaned.corr(numeric_only=True)
    mask = correlation[target_column].abs() > threshold
    selected_features = correlation.columns[mask].tolist()
    print("Selected features:", selected_features)
    return cleaned[selected_features]
def scale_data(df: pd.DataFrame, target_column: str):
    """Separate features from the target and standardize the features.

    The ``StandardScaler`` is fitted on every row of ``df`` and then
    discarded, so it cannot be reused on new data. If ``df`` still contains
    rows that will later serve as a test set, their statistics take part in
    the fit.

    Args:
        df: Table holding both the feature columns and the target column.
        target_column: Name of the column to split off as the target.

    Returns:
        ``(X_scaled, y)``: the output of ``StandardScaler.fit_transform`` on
        the feature columns, and the untouched target Series.
    """
    features = df.drop(columns=[target_column])
    target = df[target_column]
    return StandardScaler().fit_transform(features), target
def normalize_data(df: pd.DataFrame, target_column: str):
    """Separate features from the target and min-max scale the features.

    The ``MinMaxScaler`` is fitted on every row of ``df`` and is not
    returned. If ``df`` still contains future test rows, their minima and
    maxima take part in the fit.

    Args:
        df: Table holding both the feature columns and the target column.
        target_column: Name of the column to split off as the target.

    Returns:
        ``(X_normalized, y)``: the output of ``MinMaxScaler.fit_transform``
        on the feature columns, and the untouched target Series.
    """
    feature_frame = df.drop(columns=[target_column])
    labels = df[target_column]
    min_max = MinMaxScaler()
    return min_max.fit_transform(feature_frame), labels
def train_test_split_data(X, y, test_size: float = 0.2, random_state: int = 42):
    """Partition features and target into training and test subsets.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        random_state: Passed to ``train_test_split`` as ``random_state``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    split_options = {"test_size": test_size, "random_state": random_state}
    train_features, test_features, train_target, test_target = train_test_split(
        X, y, **split_options
    )
    return train_features, test_features, train_target, test_target
def run_linear_regression(X_train, X_test, y_train, y_test):
    """Fit a ``LinearRegression`` on the training split and print the test MSE.

    NOTE(review): when ``y`` is a 0/1 class label, this fits a regression
    line to the labels. The predictions are unbounded real numbers, and the
    printed MSE is not comparable to the accuracy reported by the classifier
    runners. A classifier such as logistic regression is the usual linear
    choice for a binary target.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.
    """
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    test_mse = mean_squared_error(y_test, regressor.predict(X_test))
    print("Linear Regression MSE:", test_mse)
def run_decision_tree_classifier(
    X_train, X_test, y_train, y_test, random_state: int = 42
):
    """Fit a ``DecisionTreeClassifier`` and print its test accuracy.

    The tree is seeded so that repeated runs on the same split print the
    same accuracy. scikit-learn shuffles candidate features at each split,
    so an unseeded tree can differ from run to run.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training class labels.
        y_test: Test class labels.
        random_state: Seed forwarded to ``DecisionTreeClassifier``. The
            default matches the seed used by ``train_test_split_data``.
    """
    model = DecisionTreeClassifier(random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Decision Tree Accuracy:", acc)
def run_knn_classifier(X_train, X_test, y_train, y_test):
    """Fit a default ``KNeighborsClassifier`` and print its test accuracy.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training class labels.
        y_test: Test class labels.
    """
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    predicted_labels = knn.predict(X_test)
    print("KNN Accuracy:", accuracy_score(y_test, predicted_labels))
def main():
    """Run the full experiment: load, clean, split, scale, then evaluate.

    The data is split before scaling, and the scaler is fitted on the
    training rows only, so no statistic of the test rows influences the
    features the models are trained on.
    """
    # Set this to the correct target column for your dataset.
    # NOTE(review): a "diagnosis" column with B/M labels does not look like
    # it belongs to the default heart.csv path used by load_data(). Confirm
    # the column name matches the file that is actually loaded.
    target_column = "diagnosis"

    df_raw = load_data()
    df_processed = preprocess_data(df_raw, target_column=target_column)

    features = df_processed.drop(columns=[target_column])
    target = df_processed[target_column]

    # Split first. Fitting the scaler on the full table would leak the test
    # rows' min/max into training.
    X_train, X_test, y_train, y_test = train_test_split_data(features, target)

    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    # Reuse the training-set min/max. Test values may land slightly outside
    # [0, 1], which is expected.
    X_test = scaler.transform(X_test)

    run_linear_regression(X_train, X_test, y_train, y_test)
    run_decision_tree_classifier(X_train, X_test, y_train, y_test)
    run_knn_classifier(X_train, X_test, y_train, y_test)


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Мобильная версия