В настоящее время я участвую в конкурсе Kaggle «Титаник — машинное обучение на случай катастроф».
Я применил целевое кодирование к 5 категориальным столбцам, как показано в программе ниже.
# Target-encode the categorical columns for the training rows and the test rows.
# Relies on df_all, KFold, np, pd and SEED being defined earlier in the notebook.

# The test slice starts at position 891, so the training rows are 0..890.
# The previous `df_all[:890]` silently dropped the row at position 890, which
# also shifted every later positional train/test split by one row.
n_train = 891
target_train = df_all.iloc[:n_train].copy()
# .copy() so the encoded columns added below are written to an independent
# frame rather than to a slice of df_all.
target_test = df_all.iloc[n_train:].copy()

target_x = target_train.drop(['Survived'], axis=1)
target_y = target_train[['PassengerId', 'Survived']]
cat_cols = ['Deck', 'People', 'Embarked', 'Fare_range', 'Age_group']

# Redo target encoding for each cross-validation fold.
kf = KFold(n_splits=5, shuffle=True, random_state=SEED-1)
for i, (tr_idx, va_idx) in enumerate(kf.split(target_x)):
    # Separate validation data from training data.
    tr_x, va_x = target_x.iloc[tr_idx].copy(), target_x.iloc[va_idx].copy()
    tr_y, va_y = target_y.iloc[tr_idx], target_y.iloc[va_idx]

    # Loop through the categorical variables and target-encode each one.
    for c in cat_cols:
        # Mean of the target per category, computed on the training part only.
        data_tmp = pd.DataFrame({c: tr_x[c], 'target': tr_y['Survived']})
        target_mean = data_tmp.groupby(c)['target'].mean()

        # Encode the validation part with the training-part means.
        cl_name = c + '_target'
        va_x.loc[:, cl_name] = va_x[c].map(target_mean)

        # Array that receives the out-of-fold encodings of the training part.
        tmp = np.repeat(np.nan, tr_x.shape[0])
        kf_encoding = KFold(n_splits=5, shuffle=True, random_state=SEED+1)
        for idx_1, idx_2 in kf_encoding.split(tr_x):
            # Category means from the inner-training rows only, so a row's own
            # target never contributes to its encoding.
            target_mean = data_tmp.iloc[idx_1].groupby(c)['target'].mean()
            # Store the encoded values for the held-out inner rows.
            tmp[idx_2] = tr_x[c].iloc[idx_2].map(target_mean)
        tr_x.loc[:, cl_name] = tmp

        # NOTE(review): nesting reconstructed from a flattened paste - confirm.
        # As written, the column is reset on every outer fold, so after the
        # loop only the encodings from the last outer fold remain in target_x.
        target_x[cl_name] = np.nan
        target_x.loc[tr_x.index, cl_name] = tr_x[cl_name]
        target_x.loc[va_x.index, cl_name] = va_x[cl_name]

# Encode the test rows with category means computed on all training rows.
for c in cat_cols:
    data_tmp = pd.DataFrame({c: target_x[c], 'target': target_y['Survived']})
    target_mean = data_tmp.groupby(c)['target'].mean()
    cl_name = c + '_target'
    target_test.loc[:, cl_name] = target_test[c].map(target_mean)

# Re-attach the labels (pd.merge joins on the shared 'PassengerId' column) and
# rebuild the combined frame with training rows first.
target_train = pd.merge(target_x, target_y).reset_index(drop=True)
df_all = pd.concat([target_train, target_test]).reset_index(drop=True)
Точность на обучающих и валидационных данных составляет около 0,8, но фактическая оценка после отправки решения на конкурс — около 0,6. Как это исправить?
Следующая программа предназначена для отладки.
#import libraries
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
!pip install optuna
import optuna
import string
import warnings
warnings.filterwarnings('ignore')
# Random seed constant.
SEED = 42

# --- Load the train and test files and stack them (train rows first) ---
df_train = pd.read_csv("../input/titanic/train.csv")
df_test = pd.read_csv("../input/titanic/test.csv")
df_all = pd.concat([df_train, df_test], axis=0).reset_index(drop=True)

# --- Missing value completion & feature engineering ---
# Age: fill gaps with the median Age of the same (Sex, Pclass) group.
df_all['Age'] = df_all.groupby(['Sex', 'Pclass'])['Age'].transform(
    lambda ages: ages.fillna(ages.median())
)

# Embarked: fill gaps with the most frequent value of the same (Sex, Pclass) group.
embarked_mode = df_all.groupby(['Sex', 'Pclass'])['Embarked'].transform(
    lambda ports: ports.mode().iloc[0]
)
df_all['Embarked'] = df_all['Embarked'].fillna(embarked_mode)

# Family: siblings/spouses + parents/children + the passenger.
df_all['Family'] = df_all['SibSp'] + df_all['Parch'] + 1

# Cabinmates: number of other rows sharing the same Ticket (0 for a unique ticket).
shares_ticket = df_all['Ticket'].duplicated(keep=False).astype(int)
ticket_size = df_all.groupby('Ticket')['Ticket'].transform('count')
df_all['Cabinmates'] = shares_ticket * ticket_size - 1
df_all.loc[df_all['Cabinmates'] == -1, 'Cabinmates'] = 0

# Fare: fill gaps with the median Fare of the same (Pclass, Cabinmates) group,
# then split the fare evenly across everyone on the ticket.
df_all['Fare'] = df_all.groupby(['Pclass', 'Cabinmates'])['Fare'].transform(
    lambda fares: fares.fillna(fares.median())
)
df_all['Personal_Expenses'] = df_all['Fare'] / (df_all['Cabinmates'] + 1)

# Sex: one-hot encode, then keep the female indicator under the name 'Sex' as 0/1.
df_all = pd.get_dummies(df_all, columns=['Sex']).rename(columns={'Sex_female': 'Sex'})
df_all['Sex'] = df_all['Sex'].astype(int)

# Deck: first character of Cabin, with 'M' standing in for a missing cabin.
df_all['Deck'] = df_all['Cabin'].apply(
    lambda cabin: cabin[0] if pd.notnull(cabin) else 'M'
)
# Deck 'T' is relabelled as 'A' before the letters are merged into groups.
idx = df_all[df_all['Deck'] == 'T'].index
df_all.loc[idx, 'Deck'] = 'A'
for letters, merged in (
    (['A', 'B', 'C'], 'ABC'),
    (['D', 'E'], 'DE'),
    (['F', 'G'], 'FG'),
):
    df_all['Deck'] = df_all['Deck'].replace(letters, merged)

# Median per-person expense of each deck group, over rows with a known deck.
known_deck = df_all['Deck'] != 'M'
md_deck = df_all[known_deck].groupby('Deck')['Personal_Expenses'].median()
def assign_deck(row, deck_medians=None):
    """Return the deck label for one passenger row, resolving the 'M' placeholder.

    A row whose 'Deck' is not 'M' keeps its label. A row whose 'Deck' is 'M'
    is given the deck whose median per-person expense is closest to the row's
    own 'Personal_Expenses'.

    Args:
        row: Mapping-like object (for example a DataFrame row) with the keys
            'Deck' and 'Personal_Expenses'.
        deck_medians: Optional pandas Series of median expenses indexed by
            deck label. Defaults to the module-level ``md_deck``, so
            ``df_all.apply(assign_deck, axis=1)`` keeps working unchanged.

    Returns:
        The deck label for the row.
    """
    if row['Deck'] != 'M':
        return row['Deck']
    medians = md_deck if deck_medians is None else deck_medians
    # Absolute distance between each deck's median and this row's expense.
    diff = (medians - row['Personal_Expenses']).abs()
    return diff.idxmin()
df_all['Deck'] = df_all.apply(assign_deck, axis=1)
df_all['People'] = df_all['Sex']
df_all['People'] = np.where(df_all['Age'] 0.5 ).astype( int )
# Creating a Submission File
submission = pd.read_csv("../input/titanic/gender_submission.csv")
submission['Survived'] = y_submit
submission.to_csv("submission.csv", index=False)
Подробнее здесь: https://stackoverflow.com/questions/79196742/how-can-i-prevent-leaks-in-my-target-encoding
Как я могу предотвратить утечки в целевом кодировании? [закрыто] ⇐ Python
Программы на Python
-
Anonymous
1731968723
Anonymous
В настоящее время я участвую в конкурсе Kaggle «Титаник — машинное обучение на случай катастроф».
Я применил целевое кодирование к 5 категориальным столбцам, как показано в программе ниже.
# Target-encode the categorical columns for the training rows and the test rows.
# Relies on df_all, KFold, np, pd and SEED being defined earlier in the notebook.

# The test slice starts at position 891, so the training rows are 0..890.
# The previous `df_all[:890]` silently dropped the row at position 890, which
# also shifted every later positional train/test split by one row.
n_train = 891
target_train = df_all.iloc[:n_train].copy()
# .copy() so the encoded columns added below are written to an independent
# frame rather than to a slice of df_all.
target_test = df_all.iloc[n_train:].copy()

target_x = target_train.drop(['Survived'], axis=1)
target_y = target_train[['PassengerId', 'Survived']]
cat_cols = ['Deck', 'People', 'Embarked', 'Fare_range', 'Age_group']

# Redo target encoding for each cross-validation fold.
kf = KFold(n_splits=5, shuffle=True, random_state=SEED-1)
for i, (tr_idx, va_idx) in enumerate(kf.split(target_x)):
    # Separate validation data from training data.
    tr_x, va_x = target_x.iloc[tr_idx].copy(), target_x.iloc[va_idx].copy()
    tr_y, va_y = target_y.iloc[tr_idx], target_y.iloc[va_idx]

    # Loop through the categorical variables and target-encode each one.
    for c in cat_cols:
        # Mean of the target per category, computed on the training part only.
        data_tmp = pd.DataFrame({c: tr_x[c], 'target': tr_y['Survived']})
        target_mean = data_tmp.groupby(c)['target'].mean()

        # Encode the validation part with the training-part means.
        cl_name = c + '_target'
        va_x.loc[:, cl_name] = va_x[c].map(target_mean)

        # Array that receives the out-of-fold encodings of the training part.
        tmp = np.repeat(np.nan, tr_x.shape[0])
        kf_encoding = KFold(n_splits=5, shuffle=True, random_state=SEED+1)
        for idx_1, idx_2 in kf_encoding.split(tr_x):
            # Category means from the inner-training rows only, so a row's own
            # target never contributes to its encoding.
            target_mean = data_tmp.iloc[idx_1].groupby(c)['target'].mean()
            # Store the encoded values for the held-out inner rows.
            tmp[idx_2] = tr_x[c].iloc[idx_2].map(target_mean)
        tr_x.loc[:, cl_name] = tmp

        # NOTE(review): nesting reconstructed from a flattened paste - confirm.
        # As written, the column is reset on every outer fold, so after the
        # loop only the encodings from the last outer fold remain in target_x.
        target_x[cl_name] = np.nan
        target_x.loc[tr_x.index, cl_name] = tr_x[cl_name]
        target_x.loc[va_x.index, cl_name] = va_x[cl_name]

# Encode the test rows with category means computed on all training rows.
for c in cat_cols:
    data_tmp = pd.DataFrame({c: target_x[c], 'target': target_y['Survived']})
    target_mean = data_tmp.groupby(c)['target'].mean()
    cl_name = c + '_target'
    target_test.loc[:, cl_name] = target_test[c].map(target_mean)

# Re-attach the labels (pd.merge joins on the shared 'PassengerId' column) and
# rebuild the combined frame with training rows first.
target_train = pd.merge(target_x, target_y).reset_index(drop=True)
df_all = pd.concat([target_train, target_test]).reset_index(drop=True)
Точность на обучающих и валидационных данных составляет около 0,8, но фактическая оценка после отправки решения на конкурс — около 0,6. Как это исправить?
Следующая программа предназначена для отладки.
#import libraries
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
!pip install optuna
import optuna
import string
import warnings
warnings.filterwarnings('ignore')
# Random seed constant.
SEED = 42

# --- Load the train and test files and stack them (train rows first) ---
df_train = pd.read_csv("../input/titanic/train.csv")
df_test = pd.read_csv("../input/titanic/test.csv")
df_all = pd.concat([df_train, df_test], axis=0).reset_index(drop=True)

# --- Missing value completion & feature engineering ---
# Age: fill gaps with the median Age of the same (Sex, Pclass) group.
df_all['Age'] = df_all.groupby(['Sex', 'Pclass'])['Age'].transform(
    lambda ages: ages.fillna(ages.median())
)

# Embarked: fill gaps with the most frequent value of the same (Sex, Pclass) group.
embarked_mode = df_all.groupby(['Sex', 'Pclass'])['Embarked'].transform(
    lambda ports: ports.mode().iloc[0]
)
df_all['Embarked'] = df_all['Embarked'].fillna(embarked_mode)

# Family: siblings/spouses + parents/children + the passenger.
df_all['Family'] = df_all['SibSp'] + df_all['Parch'] + 1

# Cabinmates: number of other rows sharing the same Ticket (0 for a unique ticket).
shares_ticket = df_all['Ticket'].duplicated(keep=False).astype(int)
ticket_size = df_all.groupby('Ticket')['Ticket'].transform('count')
df_all['Cabinmates'] = shares_ticket * ticket_size - 1
df_all.loc[df_all['Cabinmates'] == -1, 'Cabinmates'] = 0

# Fare: fill gaps with the median Fare of the same (Pclass, Cabinmates) group,
# then split the fare evenly across everyone on the ticket.
df_all['Fare'] = df_all.groupby(['Pclass', 'Cabinmates'])['Fare'].transform(
    lambda fares: fares.fillna(fares.median())
)
df_all['Personal_Expenses'] = df_all['Fare'] / (df_all['Cabinmates'] + 1)

# Sex: one-hot encode, then keep the female indicator under the name 'Sex' as 0/1.
df_all = pd.get_dummies(df_all, columns=['Sex']).rename(columns={'Sex_female': 'Sex'})
df_all['Sex'] = df_all['Sex'].astype(int)

# Deck: first character of Cabin, with 'M' standing in for a missing cabin.
df_all['Deck'] = df_all['Cabin'].apply(
    lambda cabin: cabin[0] if pd.notnull(cabin) else 'M'
)
# Deck 'T' is relabelled as 'A' before the letters are merged into groups.
idx = df_all[df_all['Deck'] == 'T'].index
df_all.loc[idx, 'Deck'] = 'A'
for letters, merged in (
    (['A', 'B', 'C'], 'ABC'),
    (['D', 'E'], 'DE'),
    (['F', 'G'], 'FG'),
):
    df_all['Deck'] = df_all['Deck'].replace(letters, merged)

# Median per-person expense of each deck group, over rows with a known deck.
known_deck = df_all['Deck'] != 'M'
md_deck = df_all[known_deck].groupby('Deck')['Personal_Expenses'].median()
def assign_deck(row, deck_medians=None):
    """Return the deck label for one passenger row, resolving the 'M' placeholder.

    A row whose 'Deck' is not 'M' keeps its label. A row whose 'Deck' is 'M'
    is given the deck whose median per-person expense is closest to the row's
    own 'Personal_Expenses'.

    Args:
        row: Mapping-like object (for example a DataFrame row) with the keys
            'Deck' and 'Personal_Expenses'.
        deck_medians: Optional pandas Series of median expenses indexed by
            deck label. Defaults to the module-level ``md_deck``, so
            ``df_all.apply(assign_deck, axis=1)`` keeps working unchanged.

    Returns:
        The deck label for the row.
    """
    if row['Deck'] != 'M':
        return row['Deck']
    medians = md_deck if deck_medians is None else deck_medians
    # Absolute distance between each deck's median and this row's expense.
    diff = (medians - row['Personal_Expenses']).abs()
    return diff.idxmin()
df_all['Deck'] = df_all.apply(assign_deck, axis=1)
df_all['People'] = df_all['Sex']
df_all['People'] = np.where(df_all['Age'] 0.5 ).astype( int )
# Creating a Submission File
submission = pd.read_csv("../input/titanic/gender_submission.csv")
submission['Survived'] = y_submit
submission.to_csv("submission.csv", index=False)
Подробнее здесь: https://stackoverflow.com/questions/79196742/how-can-i-prevent-leaks-in-my-target-encoding
Ответить
1 сообщение
• Страница 1 из 1
Перейти
- Кемерово-IT
- ↳ Javascript
- ↳ C#
- ↳ JAVA
- ↳ Elasticsearch aggregation
- ↳ Python
- ↳ Php
- ↳ Android
- ↳ Html
- ↳ Jquery
- ↳ C++
- ↳ IOS
- ↳ CSS
- ↳ Excel
- ↳ Linux
- ↳ Apache
- ↳ MySql
- Детский мир
- Для души
- ↳ Музыкальные инструменты даром
- ↳ Печатная продукция даром
- Внешняя красота и здоровье
- ↳ Одежда и обувь для взрослых даром
- ↳ Товары для здоровья
- ↳ Физкультура и спорт
- Техника - даром!
- ↳ Автомобилистам
- ↳ Компьютерная техника
- ↳ Плиты: газовые и электрические
- ↳ Холодильники
- ↳ Стиральные машины
- ↳ Телевизоры
- ↳ Телефоны, смартфоны, планшеты
- ↳ Швейные машинки
- ↳ Прочая электроника и техника
- ↳ Фототехника
- Ремонт и интерьер
- ↳ Стройматериалы, инструмент
- ↳ Мебель и предметы интерьера даром
- ↳ Сантехника
- Другие темы
- ↳ Разное даром
- ↳ Давай меняться!
- ↳ Отдам\возьму за копеечку
- ↳ Работа и подработка в Кемерове
- ↳ Давай с тобой поговорим...
Мобильная версия