Используйте простые правила:
# Count missing values per column (isna is the canonical alias of isnull).
df.isna().sum()
✔ Числовые столбцы
# Impute missing numeric values with the column mean.
# Calling fillna(..., inplace=True) on a column selection is deprecated
# under pandas copy-on-write (it may operate on a temporary copy);
# assign the result back to the column instead.
df['age'] = df['age'].fillna(df['age'].mean())
✔ Категориальные столбцы
# Impute missing categorical values with the most frequent category.
# Same copy-on-write issue as with numeric imputation: chained
# fillna(..., inplace=True) is deprecated — assign the result back.
df['gender'] = df['gender'].fillna(df['gender'].mode()[0])
✔ Кодировка меток (для упорядоченных/простых категорий)
# Integer-encode a categorical column (suitable for ordered / simple categories).
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['gender'] = encoder.fit_transform(df['gender'])
✔ One-Hot Encoding (ЛУЧШИЙ для большинства случаев)
# One-hot encode every remaining categorical column; drop_first removes one
# redundant indicator per category (avoids the dummy-variable trap).
df = pd.get_dummies(df, drop_first=True)
Даже простые трюки могут повысить точность:
✔ Создавайте новые признаки
# New feature: combined score from two existing columns.
df['total'] = df['math'] + df['science']
✔ Комбинируйте признаки
# Interaction feature: elementwise product of two columns.
df['age_income'] = df['age'] * df['income']
✔ Биннинг
# Binning: discretize a continuous column into 3 equal-width intervals.
df['age_group'] = pd.cut(df['age'], bins=3)
Не придерживайтесь одной модели!
✔ Логистическая регрессия (базовый уровень)
# Baseline model: logistic regression.
# (The original fused the import and the assignment onto one line —
# "...LogisticRegressionmodel = ..." — which is a SyntaxError.)
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
✔ КНН
# k-nearest neighbours classifier.
# (Original fused the import and assignment into one invalid line.)
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=5)
✔ Дерево решений
# Decision tree with a depth cap to limit overfitting.
# (Original fused the import and assignment into one invalid line.)
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(max_depth=5)
✔ Случайный лес (часто ЛУЧШИЙ вариант)
# Random forest — a strong default for tabular data.
# (Original fused the import and assignment into one invalid line.)
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Hold out 20% for validation. No random_state is set, so the split differs
# on every run — presumably intentional for the example; confirm if reused.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
model.fit(X_train, y_train)
pred = model.predict(X_val)
# Validation accuracy.
print(accuracy_score(y_val, pred))
# Final predictions — assumes test_data received the same preprocessing
# as the training frame (TODO confirm upstream).
final_pred = model.predict(test_data)
Не теряйте времени → сначала создайте работающий конвейер
# Tuning example: more trees plus a depth cap to reduce overfitting.
RandomForestClassifier(n_estimators=200, max_depth=10)
Если точность обучения >> точность проверки → уменьшите сложность
Примените ТАКУЮ ЖЕ предварительную обработку к тестовым данным
# Minimal end-to-end tabular ML pipeline: load -> impute -> encode ->
# split -> fit -> evaluate.

# 1. Load
import pandas as pd
df = pd.read_csv('train.csv')

# 2. Handle missing values.
# df.mean() on a mixed-type frame raises TypeError in pandas >= 2.0;
# restrict the means to numeric columns. Assignment replaces the
# deprecated inplace=True pattern. Categorical NaNs are left as-is and
# become all-zero dummy rows in step 3.
df = df.fillna(df.mean(numeric_only=True))

# 3. Encode categorical columns (drop_first avoids a redundant dummy).
df = pd.get_dummies(df, drop_first=True)

# 4. Split features/target; hold out 20% for validation.
X = df.drop('target', axis=1)
y = df['target']
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42  # fixed seed for reproducibility
)

# 5. Model
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# 6. Evaluate on the held-out validation split.
from sklearn.metrics import accuracy_score
pred = model.predict(X_val)
print(accuracy_score(y_val, pred))
#################################
# One-hot encode the "course" column and append the indicator columns.
from sklearn.preprocessing import OneHotEncoder  # was missing: NameError without it

ohe = OneHotEncoder()
feature_array = ohe.fit_transform(train[["course"]]).toarray()
feature_labels = ohe.get_feature_names_out(["course"])
# Reuse train's index: without it, pd.concat aligns the 0..n-1 default
# index of the new frame against train's index and silently introduces
# NaN / duplicate rows whenever train has a non-default index.
new_features = pd.DataFrame(feature_array, columns=feature_labels, index=train.index)
# Drop the raw string column now that it is encoded — leaving it in
# would make a later model.fit fail on non-numeric data.
train = pd.concat([train.drop(columns=["course"]), new_features], axis=1)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder  # was missing in this section
from sklearn.ensemble import RandomForestClassifier

# Hold out 20% of the rows; fixed seed makes the split reproducible.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Separate the target from the features on each split. Because train_X and
# train_Y are derived from the same frame they always have equal length,
# so the original bare-expression length check and the
# `train_X = train_X.iloc[:len(train_Y)]` truncation hack are unnecessary.
train_Y = train["burnout_level"]
train_X = train.drop("burnout_level", axis=1)
test_X = test.drop("burnout_level", axis=1)
test_Y = test["burnout_level"]

# Encode 'gender' consistently across splits: fit on train only, then
# reuse the fitted mapping for test. The original called fit_transform on
# the test split too, which can assign different integer codes to the
# same category and silently corrupt evaluation (data-leakage pattern).
# NOTE(review): LabelEncoder is designed for targets; OrdinalEncoder is
# the usual choice for feature columns — kept as-is to preserve behavior.
le = LabelEncoder()
train_X["gender"] = le.fit_transform(train_X["gender"])
test_X["gender"] = le.transform(test_X["gender"])

model = RandomForestClassifier()
model.fit(train_X, train_Y)
Мобильная версия