Как улучшить мою модель линейной регрессии? [закрыто]

Как улучшить мою модель линейной регрессии? [закрыто] ⇐ Python

Ответить

1 сообщение • Страница 1 из 1

Anonymous

Как улучшить мою модель линейной регрессии? [закрыто]

Цитата

Сообщение Anonymous » 06 янв 2025, 21:20

Сегодня я наконец завершил реализацию линейной регрессии с нуля

Код: Выделить всё

# Load the raw delivery dataset (`path` is defined outside this snippet).
filepath = f"{path}/Food_Delivery_Times.csv"
df = pd.read_csv(filepath)
df.head()  # notebook preview of the first rows

# Inspect the schema and the number of missing values per column.
print(df.columns)
df.isnull().sum()

# Keep only rows with at least four non-null values, then remove duplicates.
df.dropna(thresh=4, inplace=True)
df.drop_duplicates(inplace=True)

# Numeric gaps: linear interpolation between neighbouring rows.
df['Courier_Experience_yrs'] = df['Courier_Experience_yrs'].interpolate()

# Categorical gaps: substitute each column's most frequent value.
columns_to_fill = ['Weather', 'Traffic_Level', 'Time_of_Day']
for col in columns_to_fill:
    df[col] = df[col].fillna(df[col].mode()[0])
df.info()

# List the distinct categories of every categorical column.
for name in ('Weather', 'Traffic_Level', 'Time_of_Day', 'Vehicle_Type'):
    print(df[name].unique())

def onehot(df, column, prefix=None):
    """Append one 0/1 indicator column per category found in ``df[column]``.

    Args:
        df: DataFrame to extend; it is modified in place.
        column: Name of the categorical column to encode.
        prefix: Optional string. When given, indicator columns are named
            ``f"{prefix}_{value}"`` instead of the bare category value, which
            avoids overwriting an existing column that shares a category name.

    Returns:
        The same ``df`` object with the indicator columns added.
    """
    # Missing values are not a category: `unique()` would report NaN and the
    # resulting all-zero column (named NaN) has zero variance, which breaks
    # any later standardisation step.
    categories = df[column].dropna().unique()
    for val in categories:
        name = val if prefix is None else f"{prefix}_{val}"
        df[name] = (df[column] == val).astype(int)
    return df

# Nominal features (no natural order between categories) are one-hot encoded.
df = onehot(df,'Weather')
df = onehot(df,'Vehicle_Type')
df = onehot(df,'Time_of_Day')

# Traffic level is ordinal, so it becomes a single ordered integer feature.
# Labels missing from the mapping become NaN in the encoded column.
traffic_mapping = {'Low':0,'Medium':1,'High':2}
df['Traffic_encoded'] = df['Traffic_Level'].map(traffic_mapping)
df

# Keep every column except the raw categorical ones that were just encoded.
categorical_col = ['Weather', 'Traffic_Level', 'Time_of_Day','Vehicle_Type']
col_new_df = [col for col in df.columns if col not in categorical_col ]
col_new_df

# Take an explicit copy: assigning columns on a plain selection of `df`
# raises SettingWithCopyWarning and may or may not write through to `df`.
new_df = df[col_new_df].copy()
new_df.info()

# Train/test split and feature scaling.
#
# The split happens BEFORE scaling so that the mean/std used for
# standardisation come from the training rows only; computing them on the
# whole table would leak test-set statistics into training.
columns_needed = [
    col for col in new_df.columns
    if col not in ('Order_ID', 'Delivery_Time_min')
]

train_df = new_df.sample(frac=0.8, random_state=200)
test_df = new_df.drop(train_df.index)

# Standardisation statistics, estimated on the training split only.
feature_mean = train_df[columns_needed].mean()
feature_std = train_df[columns_needed].std()
# A constant column has std 0; dividing by it would produce NaN/inf features.
feature_std = feature_std.replace(0, 1.0)


def _standardize(frame):
    """Return a re-indexed copy of ``frame`` with the feature columns z-scored."""
    scaled = frame.reset_index(drop=True)
    # Cast to float first so scaled values are not squeezed into int columns.
    scaled[columns_needed] = (
        scaled[columns_needed].astype('float64') - feature_mean
    ) / feature_std
    return scaled


train_df = _standardize(train_df)
test_df = _standardize(test_df)

train_df.describe()

X = train_df[columns_needed].to_numpy()
Y = train_df['Delivery_Time_min'].to_numpy()
# The target is standardised as well; its statistics are kept so predictions
# can be mapped back to the original scale.
Y_mean = train_df['Delivery_Time_min'].mean()
Y_std = train_df['Delivery_Time_min'].std()
Y = (Y-Y_mean)/Y_std

# Batch gradient descent for L2-regularised (ridge) linear regression.
m, n_features = X.shape
np.random.seed(42)                # reproducible initial weights
W = np.random.randn(n_features)
b = 0
alpha = 0.5                       # learning rate
Lambda = 0.5                      # L2 penalty strength (the bias is not penalised)
tolerance = 1e-10                 # stop once the loss improves by less than this
max_iterations = 100000
iteration = 0
previous_loss = np.inf
while iteration < max_iterations:
    f = np.dot(X, W) + b
    residual = f - Y
    loss = (np.sum(residual**2) + Lambda*np.sum(W*W))/(2*m)
    dW = (np.dot(X.T, residual) + Lambda*W)/m
    db = np.sum(residual)/m
    W -= alpha*dW
    b -= alpha*db
    iteration += 1
    if iteration % 10000 == 0:
        print(iteration ,",",loss)
    # The absolute threshold alone is rarely reached on noisy data, so also
    # stop when the loss has plateaued instead of burning every iteration.
    if loss < 10**(-3) or abs(previous_loss - loss) < tolerance:
        break
    previous_loss = loss

print("done")

#Testing
# Evaluate on the held-out rows; predictions are mapped back to the original
# target scale before computing the errors.
Y_original = test_df['Delivery_Time_min'].to_numpy()
X_test = test_df[columns_needed].to_numpy()
ymean = test_df['Delivery_Time_min'].mean()

Y_predicted = np.dot(X_test,W) + b
Y_predicted_ori = Y_predicted*Y_std + Y_mean
errors = Y_original - Y_predicted_ori

print("Original ---- Predicted ---- Error")
for actual, predicted, error in zip(Y_original, Y_predicted_ori, errors):
    print(actual,"---",predicted,"---",error)

# Vectorised sums; these names avoid shadowing the builtin `sum`.
ss_res = np.sum(errors**2)                  # residual sum of squares
ss_tot = np.sum((Y_original - ymean)**2)    # total sum of squares

rscore = 1 - (ss_res/ss_tot)       # coefficient of determination (R^2)
msme = ss_res/len(Y_original)      # mean squared error
print(rscore)
print(msme)

Выше приведена реализация для следующего набора данных:

Набор данных — прогнозирование времени доставки еды (Kaggle).
R2_score — 0,754.
MSME (среднеквадратичная ошибка) — 133,17.

Хорош ли такой результат для новичка?
И что можно сделать, чтобы его улучшить?

Подробнее здесь: https://stackoverflow.com/questions/79333949/how-to-improve-my-linear-regression-model

1736187634

Anonymous

Сегодня я наконец завершил реализацию линейной регрессии с нуля
[code]filepath = f"{path}/Food_Delivery_Times.csv"
df = pd.read_csv(filepath)
df.head()

print(df.columns)
df.isnull().sum()

df.dropna(inplace = True , thresh=4)
df.drop_duplicates(inplace= True)
df['Courier_Experience_yrs'] = df['Courier_Experience_yrs'].interpolate()
columns_to_fill = ['Weather', 'Traffic_Level', 'Time_of_Day']
for col in columns_to_fill:
mode_value = df[col].mode()[0]
df[col]= df[col].fillna(mode_value)
df.info()

print(df['Weather'].unique())
print(df['Traffic_Level'].unique())
print(df['Time_of_Day'].unique())
print(df['Vehicle_Type'].unique())

def onehot(df, column, prefix=None):
    """Append one 0/1 indicator column per category found in ``df[column]``.

    Args:
        df: DataFrame to extend; it is modified in place.
        column: Name of the categorical column to encode.
        prefix: Optional string. When given, indicator columns are named
            ``f"{prefix}_{value}"`` instead of the bare category value, which
            avoids overwriting an existing column that shares a category name.

    Returns:
        The same ``df`` object with the indicator columns added.
    """
    # Missing values are not a category: `unique()` would report NaN and the
    # resulting all-zero column (named NaN) has zero variance, which breaks
    # any later standardisation step.
    categories = df[column].dropna().unique()
    for val in categories:
        name = val if prefix is None else f"{prefix}_{val}"
        df[name] = (df[column] == val).astype(int)
    return df

# Nominal features (no natural order between categories) are one-hot encoded.
df = onehot(df,'Weather')
df = onehot(df,'Vehicle_Type')
df = onehot(df,'Time_of_Day')

# Traffic level is ordinal, so it becomes a single ordered integer feature.
# Labels missing from the mapping become NaN in the encoded column.
traffic_mapping = {'Low':0,'Medium':1,'High':2}
df['Traffic_encoded'] = df['Traffic_Level'].map(traffic_mapping)
df

# Keep every column except the raw categorical ones that were just encoded.
categorical_col = ['Weather', 'Traffic_Level', 'Time_of_Day','Vehicle_Type']
col_new_df = [col for col in df.columns if col not in categorical_col ]
col_new_df

# Take an explicit copy: assigning columns on a plain selection of `df`
# raises SettingWithCopyWarning and may or may not write through to `df`.
new_df = df[col_new_df].copy()
new_df.info()

# Train/test split and feature scaling.
#
# The split happens BEFORE scaling so that the mean/std used for
# standardisation come from the training rows only; computing them on the
# whole table would leak test-set statistics into training.
columns_needed = [
    col for col in new_df.columns
    if col not in ('Order_ID', 'Delivery_Time_min')
]

train_df = new_df.sample(frac=0.8, random_state=200)
test_df = new_df.drop(train_df.index)

# Standardisation statistics, estimated on the training split only.
feature_mean = train_df[columns_needed].mean()
feature_std = train_df[columns_needed].std()
# A constant column has std 0; dividing by it would produce NaN/inf features.
feature_std = feature_std.replace(0, 1.0)


def _standardize(frame):
    """Return a re-indexed copy of ``frame`` with the feature columns z-scored."""
    scaled = frame.reset_index(drop=True)
    # Cast to float first so scaled values are not squeezed into int columns.
    scaled[columns_needed] = (
        scaled[columns_needed].astype('float64') - feature_mean
    ) / feature_std
    return scaled


train_df = _standardize(train_df)
test_df = _standardize(test_df)

train_df.describe()

X = train_df[columns_needed].to_numpy()
Y = train_df['Delivery_Time_min'].to_numpy()
# The target is standardised as well; its statistics are kept so predictions
# can be mapped back to the original scale.
Y_mean = train_df['Delivery_Time_min'].mean()
Y_std = train_df['Delivery_Time_min'].std()
Y = (Y-Y_mean)/Y_std

# Batch gradient descent for L2-regularised (ridge) linear regression.
m, n_features = X.shape
np.random.seed(42)                # reproducible initial weights
W = np.random.randn(n_features)
b = 0
alpha = 0.5                       # learning rate
Lambda = 0.5                      # L2 penalty strength (the bias is not penalised)
tolerance = 1e-10                 # stop once the loss improves by less than this
max_iterations = 100000
iteration = 0
previous_loss = np.inf
while iteration < max_iterations:
    f = np.dot(X, W) + b
    residual = f - Y
    loss = (np.sum(residual**2) + Lambda*np.sum(W*W))/(2*m)
    dW = (np.dot(X.T, residual) + Lambda*W)/m
    db = np.sum(residual)/m
    W -= alpha*dW
    b -= alpha*db
    iteration += 1
    if iteration % 10000 == 0:
        print(iteration ,",",loss)
    # The absolute threshold alone is rarely reached on noisy data, so also
    # stop when the loss has plateaued instead of burning every iteration.
    if loss < 10**(-3) or abs(previous_loss - loss) < tolerance:
        break
    previous_loss = loss

print("done")

#Testing
# Evaluate on the held-out rows; predictions are mapped back to the original
# target scale before computing the errors.
Y_original = test_df['Delivery_Time_min'].to_numpy()
X_test = test_df[columns_needed].to_numpy()
ymean = test_df['Delivery_Time_min'].mean()

Y_predicted = np.dot(X_test,W) + b
Y_predicted_ori = Y_predicted*Y_std + Y_mean
errors = Y_original - Y_predicted_ori

print("Original ---- Predicted ---- Error")
for actual, predicted, error in zip(Y_original, Y_predicted_ori, errors):
    print(actual,"---",predicted,"---",error)

# Vectorised sums; these names avoid shadowing the builtin `sum`.
ss_res = np.sum(errors**2)                  # residual sum of squares
ss_tot = np.sum((Y_original - ymean)**2)    # total sum of squares

rscore = 1 - (ss_res/ss_tot)       # coefficient of determination (R^2)
msme = ss_res/len(Y_original)      # mean squared error
print(rscore)
print(msme)
[/code]
Выше приведена реализация для следующего набора данных:
[list]
[*]Набор данных — прогнозирование времени доставки еды (Kaggle).
[*]R2_score — 0,754.
[*]MSME (среднеквадратичная ошибка) — 133,17.
[/list]
Хорош ли такой результат для новичка?
И что можно сделать, чтобы его улучшить?

Подробнее здесь: [url]https://stackoverflow.com/questions/79333949/how-to-improve-my-linear-regression-model[/url]