Прогнозирование цен акций на основе LSTM: моделирование, оценка и независимость от тестовых данных

Прогнозирование цен акций на основе LSTM: моделирование, оценка и независимость от тестовых данных ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Прогнозирование цен акций на основе LSTM: моделирование, оценка и независимость от тестовых данных

Цитата

Сообщение Anonymous » 13 янв 2025, 07:23

У меня проблема с этим кодом. Похоже, что прогноз не работает без доступа к тестовым данным, хотя обычно прогноз должен строиться независимо от тестовых данных. Ранее я проверял модель на тестовых данных, и результат показался мне слишком хорошим. По этой причине я решил намеренно изменить тестовые данные в файле CSV, так что теперь они не соответствуют реальным значениям. Я рассчитывал, что если прогноз работает совершенно независимо от тестовых данных, то он не изменится. К сожалению, при использовании изменённого файла прогноз подстроился именно под изменённые тестовые данные. Это показывает мне, что код не может быть правильным.

Код: Выделить всё

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense
from keras.optimizers import Adam
import numpy as np

# 1. Read and prepare the CSV file
# The file has no header row; the two columns are named 'Date' and 'Price' here.
df = pd.read_csv('./Schlusspreise_2000-2009_manipuliert.csv', header=None, names=['Date', 'Price'])
df['Date'] = pd.to_datetime(df['Date'])
df['Price'] = df['Price'].astype(float)
df.set_index('Date', inplace=True)

# 2. Scale the data
# The chronological 80 % / 10 % / 10 % split boundaries are computed first so
# that the scaler can be fitted on the training rows only.
train_size = int(len(df) * 0.8)
val_size = int(len(df) * 0.1)

# Fit the scaler on the TRAINING portion only. Fitting it on the whole series
# (fit_transform on all rows) lets the min/max of the validation and test
# prices influence the scaling of the training data, i.e. information from
# the "unseen" data leaks into training. Validation/test values outside the
# training range are simply mapped outside [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(df[['Price']].iloc[:train_size])
df_scaled = scaler.transform(df[['Price']])

# 3. Split into training, validation, and test sets
train_data = df_scaled[:train_size]
val_data = df_scaled[train_size:train_size + val_size]
test_data = df_scaled[train_size + val_size:]

# Number of past time steps fed to the model for every single prediction.
LOOKBACK = 60


def make_windows(series, lookback=LOOKBACK):
    """Build supervised samples from a scaled series with a sliding window.

    Args:
        series: 2-D array whose first column holds the scaled prices.
        lookback: number of consecutive past values used as one model input.

    Returns:
        A tuple ``(x, y)``. ``x`` has shape ``(n_samples, lookback, 1)`` and
        holds the input windows; ``y`` holds the value that directly follows
        each window. Both are empty when ``series`` has no more than
        ``lookback`` rows.
    """
    x = np.array([series[i - lookback:i, 0] for i in range(lookback, len(series))])
    y = np.array(series[lookback:, 0])
    # reshape(-1, ...) also yields a well-formed empty array when no window fits.
    return x.reshape(-1, lookback, 1), y


# Prepare training data
x_train, y_train = make_windows(train_data)

# Prepare validation data
# The last LOOKBACK training values are prepended so that the very first
# validation target already has a full input window.
inputs_val = np.concatenate((train_data[-LOOKBACK:], val_data), axis=0)
x_val, y_val = make_windows(inputs_val)

# Prepare test data
# NOTE: apart from the first one, every test window contains ACTUAL test
# prices. Predictions made from x_test are therefore one-step-ahead forecasts
# that follow whatever values are stored in the test rows of the CSV; they
# are not a forecast that is independent of the test data.
inputs_test = np.concatenate((val_data[-LOOKBACK:], test_data), axis=0)
x_test, y_test = make_windows(inputs_test)

# 4. Create LSTM model
# Two stacked LSTM layers with 50 units each, every one followed by 20 %
# dropout, and a single-unit Dense layer producing the next scaled price.
window_length = x_train.shape[1]

lstm_model = Sequential()
lstm_model.add(LSTM(units=50, return_sequences=True, input_shape=(window_length, 1)))
lstm_model.add(Dropout(0.2))
lstm_model.add(LSTM(units=50))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(1))

optimizer = Adam(learning_rate=0.0015)
lstm_model.compile(loss='mean_squared_error', optimizer=optimizer)

# 5. Train the model
# The validation windows are only used to report val_loss after each epoch.
fit_options = dict(
    epochs=4,
    batch_size=32,
    verbose=2,
    validation_data=(x_val, y_val),
)
history = lstm_model.fit(x_train, y_train, **fit_options)

# 6.  Make predictions
# NOTE: x_val and x_test are sliding windows over the real validation/test
# prices, so these are one-step-ahead predictions: each one is computed from
# the actual preceding values and will change if those values change.
def _predict_prices(windows):
    """Run the model on scaled input windows and convert the output back to prices."""
    scaled_output = lstm_model.predict(windows)
    return scaler.inverse_transform(scaled_output)


def _rmse(actual, predicted):
    """Return the root mean squared error between two price arrays."""
    return np.sqrt(mean_squared_error(actual, predicted))


train_predictions = _predict_prices(x_train)
val_predictions = _predict_prices(x_val)
test_predictions = _predict_prices(x_test)

# Calculate error metrics
# The first 60 training rows have no full input window and thus no prediction.
train_actual = scaler.inverse_transform(train_data[60:])
train_rmse = _rmse(train_actual, train_predictions)

# Actual values are truncated to the number of predictions before comparing.
val_actual = scaler.inverse_transform(val_data)
val_rmse = _rmse(val_actual[:len(val_predictions)], val_predictions)

test_actual = scaler.inverse_transform(test_data)
test_rmse = _rmse(test_actual[:len(test_predictions)], test_predictions)

# Visualize train_loss and val_loss
loss_curves = history.history
n_epochs = len(loss_curves['loss'])

plt.figure(figsize=(10, 6))
for curve_key, curve_label, curve_color in (
    ('loss', 'Train Loss', 'blue'),
    ('val_loss', 'Validation Loss', 'red'),
):
    plt.plot(loss_curves[curve_key], label=curve_label, color=curve_color)
plt.title('Train vs Validation Loss over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')

# Set x-axis for each epoch: tick positions are 0-based, tick labels start at 1
plt.xticks(ticks=range(n_epochs), labels=range(1, n_epochs + 1))

plt.legend()
plt.grid()
plt.show()

# Visualize training and validation data
val_end = train_size + val_size
train_dates = df.index[:train_size]
val_dates = df.index[train_size:val_end]

plt.figure(figsize=(14, 7))
# Actual prices first, predictions on top of them.
plt.plot(train_dates, df['Price'][:train_size], label="Training Data", color="blue")
plt.plot(val_dates, df['Price'][train_size:val_end], label="Validation Data", color="purple")
# Training predictions start at row 60: earlier rows have no full input window.
plt.plot(df.index[60:train_size], train_predictions, label="Training Prediction", color="orange")
plt.plot(val_dates, val_predictions, label="Validation Prediction", color="brown")
plt.xlabel('Date')
plt.ylabel('Closing Price')

# Calculate RMSE for training and validation
train_mse = mean_squared_error(train_actual, train_predictions)
val_mse = mean_squared_error(val_actual[:len(val_predictions)], val_predictions)
train_rmse = np.sqrt(train_mse)
val_rmse = np.sqrt(val_mse)

# Title with RMSE
plt.title(f'Training and Validation Data with Predictions for Apple 01.01.2004 – 26.05.2006 \nTrain RMSE: {train_rmse:.2f}, Val RMSE: {val_rmse:.2f}')
plt.legend()
plt.grid()
plt.show()

# Visualization
# One figure with the actual prices of all three splits and the model's
# predictions for each of them.
test_start = train_size + val_size
prices = df['Price']

plt.figure(figsize=(14, 7))
plt.plot(df.index[:train_size], prices[:train_size], label="Training Data", color="blue")
plt.plot(df.index[train_size:test_start], prices[train_size:test_start], label="Validation Data", color="purple")
plt.plot(df.index[test_start:], prices[test_start:], label="Test Data", color="green")
# Training predictions start at row 60: earlier rows have no full input window.
plt.plot(df.index[60:train_size], train_predictions, label="Training Prediction", color="orange")
plt.plot(df.index[train_size:test_start], val_predictions, label="Validation Prediction", color="brown")
plt.plot(df.index[test_start:], test_predictions, label="Prediction on Unseen Test Data", color="red")
plt.xlabel('Date')
plt.ylabel('Closing Price')
plt.title(f'LSTM Prediction - AAPL 01.01.2004 - 31.12.2006 \nTrain RMSE: {train_rmse:.2f}, Val RMSE: {val_rmse:.2f}, Test RMSE: {test_rmse:.2f}')
plt.legend()
plt.grid()
plt.show()

Буду очень благодарен за любую поддержку. Не могли бы вы объяснить, в чем может быть проблема? Я пишу об этом магистерскую диссертацию.

Подробнее здесь: https://stackoverflow.com/questions/79351172/lstm-based-stock-price-prediction-modeling-evaluation-and-independence-from-t

1736742226

Anonymous

У меня проблема с этим кодом. Похоже, что прогноз не работает без доступа к тестовым данным, хотя обычно прогноз должен строиться независимо от тестовых данных. Ранее я проверял модель на тестовых данных, и результат показался мне слишком хорошим. По этой причине я решил намеренно изменить тестовые данные в файле CSV, так что теперь они не соответствуют реальным значениям. Я рассчитывал, что если прогноз работает совершенно независимо от тестовых данных, то он не изменится. К сожалению, при использовании изменённого файла прогноз подстроился именно под изменённые тестовые данные. Это показывает мне, что код не может быть правильным.
[code]import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense
from keras.optimizers import Adam
import numpy as np

# 1. Read and prepare the CSV file
# The file has no header row; the two columns are named 'Date' and 'Price' here.
df = pd.read_csv('./Schlusspreise_2000-2009_manipuliert.csv', header=None, names=['Date', 'Price'])
df['Date'] = pd.to_datetime(df['Date'])
df['Price'] = df['Price'].astype(float)
df.set_index('Date', inplace=True)

# 2. Scale the data
# The chronological 80 % / 10 % / 10 % split boundaries are computed first so
# that the scaler can be fitted on the training rows only.
train_size = int(len(df) * 0.8)
val_size = int(len(df) * 0.1)

# Fit the scaler on the TRAINING portion only. Fitting it on the whole series
# (fit_transform on all rows) lets the min/max of the validation and test
# prices influence the scaling of the training data, i.e. information from
# the "unseen" data leaks into training. Validation/test values outside the
# training range are simply mapped outside [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(df[['Price']].iloc[:train_size])
df_scaled = scaler.transform(df[['Price']])

# 3. Split into training, validation, and test sets
train_data = df_scaled[:train_size]
val_data = df_scaled[train_size:train_size + val_size]
test_data = df_scaled[train_size + val_size:]

# Number of past time steps fed to the model for every single prediction.
LOOKBACK = 60


def make_windows(series, lookback=LOOKBACK):
    """Build supervised samples from a scaled series with a sliding window.

    Args:
        series: 2-D array whose first column holds the scaled prices.
        lookback: number of consecutive past values used as one model input.

    Returns:
        A tuple ``(x, y)``. ``x`` has shape ``(n_samples, lookback, 1)`` and
        holds the input windows; ``y`` holds the value that directly follows
        each window. Both are empty when ``series`` has no more than
        ``lookback`` rows.
    """
    x = np.array([series[i - lookback:i, 0] for i in range(lookback, len(series))])
    y = np.array(series[lookback:, 0])
    # reshape(-1, ...) also yields a well-formed empty array when no window fits.
    return x.reshape(-1, lookback, 1), y


# Prepare training data
x_train, y_train = make_windows(train_data)

# Prepare validation data
# The last LOOKBACK training values are prepended so that the very first
# validation target already has a full input window.
inputs_val = np.concatenate((train_data[-LOOKBACK:], val_data), axis=0)
x_val, y_val = make_windows(inputs_val)

# Prepare test data
# NOTE: apart from the first one, every test window contains ACTUAL test
# prices. Predictions made from x_test are therefore one-step-ahead forecasts
# that follow whatever values are stored in the test rows of the CSV; they
# are not a forecast that is independent of the test data.
inputs_test = np.concatenate((val_data[-LOOKBACK:], test_data), axis=0)
x_test, y_test = make_windows(inputs_test)

# 4. Create LSTM model
# Two stacked LSTM layers with 50 units each, every one followed by 20 %
# dropout, and a single-unit Dense layer producing the next scaled price.
window_length = x_train.shape[1]

lstm_model = Sequential()
lstm_model.add(LSTM(units=50, return_sequences=True, input_shape=(window_length, 1)))
lstm_model.add(Dropout(0.2))
lstm_model.add(LSTM(units=50))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(1))

optimizer = Adam(learning_rate=0.0015)
lstm_model.compile(loss='mean_squared_error', optimizer=optimizer)

# 5. Train the model
# The validation windows are only used to report val_loss after each epoch.
fit_options = dict(
    epochs=4,
    batch_size=32,
    verbose=2,
    validation_data=(x_val, y_val),
)
history = lstm_model.fit(x_train, y_train, **fit_options)

# 6.  Make predictions
# NOTE: x_val and x_test are sliding windows over the real validation/test
# prices, so these are one-step-ahead predictions: each one is computed from
# the actual preceding values and will change if those values change.
def _predict_prices(windows):
    """Run the model on scaled input windows and convert the output back to prices."""
    scaled_output = lstm_model.predict(windows)
    return scaler.inverse_transform(scaled_output)


def _rmse(actual, predicted):
    """Return the root mean squared error between two price arrays."""
    return np.sqrt(mean_squared_error(actual, predicted))


train_predictions = _predict_prices(x_train)
val_predictions = _predict_prices(x_val)
test_predictions = _predict_prices(x_test)

# Calculate error metrics
# The first 60 training rows have no full input window and thus no prediction.
train_actual = scaler.inverse_transform(train_data[60:])
train_rmse = _rmse(train_actual, train_predictions)

# Actual values are truncated to the number of predictions before comparing.
val_actual = scaler.inverse_transform(val_data)
val_rmse = _rmse(val_actual[:len(val_predictions)], val_predictions)

test_actual = scaler.inverse_transform(test_data)
test_rmse = _rmse(test_actual[:len(test_predictions)], test_predictions)

# Visualize train_loss and val_loss
loss_curves = history.history
n_epochs = len(loss_curves['loss'])

plt.figure(figsize=(10, 6))
for curve_key, curve_label, curve_color in (
    ('loss', 'Train Loss', 'blue'),
    ('val_loss', 'Validation Loss', 'red'),
):
    plt.plot(loss_curves[curve_key], label=curve_label, color=curve_color)
plt.title('Train vs Validation Loss over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')

# Set x-axis for each epoch: tick positions are 0-based, tick labels start at 1
plt.xticks(ticks=range(n_epochs), labels=range(1, n_epochs + 1))

plt.legend()
plt.grid()
plt.show()

# Visualize training and validation data
val_end = train_size + val_size
train_dates = df.index[:train_size]
val_dates = df.index[train_size:val_end]

plt.figure(figsize=(14, 7))
# Actual prices first, predictions on top of them.
plt.plot(train_dates, df['Price'][:train_size], label="Training Data", color="blue")
plt.plot(val_dates, df['Price'][train_size:val_end], label="Validation Data", color="purple")
# Training predictions start at row 60: earlier rows have no full input window.
plt.plot(df.index[60:train_size], train_predictions, label="Training Prediction", color="orange")
plt.plot(val_dates, val_predictions, label="Validation Prediction", color="brown")
plt.xlabel('Date')
plt.ylabel('Closing Price')

# Calculate RMSE for training and validation
train_mse = mean_squared_error(train_actual, train_predictions)
val_mse = mean_squared_error(val_actual[:len(val_predictions)], val_predictions)
train_rmse = np.sqrt(train_mse)
val_rmse = np.sqrt(val_mse)

# Title with RMSE
plt.title(f'Training and Validation Data with Predictions for Apple 01.01.2004 – 26.05.2006 \nTrain RMSE: {train_rmse:.2f}, Val RMSE: {val_rmse:.2f}')
plt.legend()
plt.grid()
plt.show()

# Visualization
# One figure with the actual prices of all three splits and the model's
# predictions for each of them.
test_start = train_size + val_size
prices = df['Price']

plt.figure(figsize=(14, 7))
plt.plot(df.index[:train_size], prices[:train_size], label="Training Data", color="blue")
plt.plot(df.index[train_size:test_start], prices[train_size:test_start], label="Validation Data", color="purple")
plt.plot(df.index[test_start:], prices[test_start:], label="Test Data", color="green")
# Training predictions start at row 60: earlier rows have no full input window.
plt.plot(df.index[60:train_size], train_predictions, label="Training Prediction", color="orange")
plt.plot(df.index[train_size:test_start], val_predictions, label="Validation Prediction", color="brown")
plt.plot(df.index[test_start:], test_predictions, label="Prediction on Unseen Test Data", color="red")
plt.xlabel('Date')
plt.ylabel('Closing Price')
plt.title(f'LSTM Prediction - AAPL 01.01.2004 - 31.12.2006 \nTrain RMSE: {train_rmse:.2f}, Val RMSE: {val_rmse:.2f}, Test RMSE: {test_rmse:.2f}')
plt.legend()
plt.grid()
plt.show()
[/code]
Буду очень благодарен за любую поддержку. Не могли бы вы объяснить, в чем может быть проблема? Я пишу об этом магистерскую диссертацию. 

Подробнее здесь: [url]https://stackoverflow.com/questions/79351172/lstm-based-stock-price-prediction-modeling-evaluation-and-independence-from-t[/url]