多种方法实现股价预测——LSTM
LSTM(长短期记忆网络)是一种循环神经网络,用于缓解长序列训练中的梯度消失问题。LSTM有三个门:输入门(也称更新门)、遗忘门和输出门。输入门和遗忘门共同决定是否更新记忆单元的每个元素,输出门决定记忆单元中的哪些信息被输出。
我们利用LSTM模型对NDAQ的股价(复权收盘价 adj_close)进行预测。
- 数据准备
# 利用LSTM模型进行股价预测
# 使用多元属性
# 使用单层LSTM, 滞后期为N(窗口大小)
import math
import numpy as np
import pandas as pd
from numpy.random import seed
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from tensorflow import set_random_seed
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.optimizers import Adam
#### Basic configuration ##################
stk_path = "NDAQ.csv"
test_size = 0.2  # proportion of dataset to be used as test set
cv_size = 0.2  # proportion of dataset to be used as cross-validation set
N = 9  # for feature at day t, we use lags from t-1, t-2, ..., t-N as features.
# initial value before tuning
lstm_units = 50  # lstm param. initial value before tuning.
dropout_prob = 0.5  # lstm param. initial value before tuning.
optimizer = 'adam'  # lstm param. initial value before tuning.
epochs = 1  # lstm param. initial value before tuning.
batch_size = 1  # lstm param. initial value before tuning.
model_seed = 100
fontsize = 14
ticklabelsize = 14

# Set seeds to ensure same output results.
# NOTE(review): set_random_seed is imported from the top-level tensorflow
# package at the top of this file — confirm the installed TensorFlow exposes it.
seed(101)
set_random_seed(model_seed)

# 1. Data preparation
df = pd.read_csv(stk_path, sep=",")

# Convert Date column to datetime.
# Plain column assignment is used instead of `df.loc[:, 'Date'] = ...`:
# recent pandas versions treat the .loc form as an in-place write that keeps
# the column's original object dtype, which makes the `.dt` accessor below fail.
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

# Change all column headings to be lower case, and remove spacing
df.columns = [str(x).lower().replace(' ', '_') for x in df.columns]

# Get month of each sample
df['month'] = df['date'].dt.month

# Drop the rows whose volume is 0
df = df.drop(df[df['volume'] == 0].index, axis=0)

# Added feature 1: difference between high and low of each day
df['range_h_l'] = df['high'] - df['low']
df.drop(['high', 'low'], axis=1, inplace=True)

# Added feature 2: difference between open and close of each day
df['range_o_c'] = df['open'] - df['close']
df.drop(['open', 'close'], axis=1, inplace=True)

# Sort by datetime
df.sort_values(by='date', inplace=True, ascending=True)
引入辅助函数:
######## 辅助函数 #############
#函数1:计算MAPE指标
def get_mape(y_true, y_pred):
    """Compute the mean absolute percentage error (MAPE), in percent.

    Args:
        y_true: Sequence or array of actual values.
        y_pred: Sequence or array of predicted values, same length as y_true.

    Returns:
        mean(|(y_true - y_pred) / y_true|) * 100.

    Note:
        The error is divided by y_true element-wise, so a zero in y_true
        produces an inf/nan result rather than an exception.
    """
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
# 函数2: 给定数据集data, 进行窗口操作,生成LSTM的数据格式: X: [sample, time_step, N_features], Y:[obj]
def get_x_y(data, N, offset):
    """Split data into sliding-window features x and targets y.

    For every row index i in [offset, len(data)), the window is the N rows
    immediately before i and the target is column 0 of row i.

    Args:
        data: 2-D array indexable as data[i, 0]; rows are time steps and
            columns are features, with the target in column 0.
        N: Window size (number of lagged rows per sample).
        offset: Index of the first row to produce a target for. Must be >= N
            so that every window has N rows available before it.

    Returns:
        x: Array of the stacked windows, one per target row.
        y: Array of the targets, one per window.

    Raises:
        ValueError: If offset < N, because data[i - N:i] would then start at
            a negative index and yield wrong or ragged windows.
    """
    if offset < N:
        raise ValueError(
            "offset (%d) must be >= N (%d) so each window has N rows" % (offset, N)
        )
    target_rows = range(offset, len(data))
    x = np.array([data[i - N:i] for i in target_rows])
    y = np.array([data[i, 0] for i in target_rows])
    return x, y
#####################################
训练模型的关键函数:
###### 关键函数:训练和测试############
# 函数3: 实现LSTM模型的训练、性能评价和复原预测结果:
# Train model, do prediction, scale back to original range and do evaluation
def _invert_target_scaling(scaler, target_scaled, other_features_scaled):
    """Map a scaled target column back to its original range.

    The target is re-joined with the remaining scaled feature columns only to
    give ``scaler.inverse_transform`` a matrix of the width it expects; just
    column 0 of the inverted matrix is returned.
    Assumes the scaler was fitted with the target as column 0.
    """
    target_col = np.reshape(target_scaled, (len(target_scaled), 1))
    full_matrix = np.concatenate((target_col, other_features_scaled), axis=1)
    return scaler.inverse_transform(full_matrix)[:, 0]


def train_pred_eval_model(x_train_scaled,
                          y_train_scaled,
                          x_cv_scaled,
                          y_cv_scaled,
                          scaler,
                          lstm_units=50,
                          dropout_prob=0.5,
                          optimizer='adam',
                          epochs=1,
                          batch_size=1):
    """Train a single-layer LSTM, predict, scale back and evaluate.

    Args:
        x_train_scaled: Scaled training windows, indexed as
            (sample, time_step, feature); dimensions 1 and 2 define the
            LSTM input shape.
        y_train_scaled: Scaled training targets, one per training window.
        x_cv_scaled: Scaled windows to predict on (also used as validation
            data during fitting).
        y_cv_scaled: Scaled actual values for x_cv_scaled.
        scaler: Fitted scaler used to invert the scaling; its column 0 is
            taken to be the target.
        lstm_units: Number of hidden units in the LSTM layer.
        dropout_prob: Dropout rate applied after the LSTM layer, used to
            limit over-fitting.
        optimizer: Currently ignored. The model is always compiled with
            Adam(lr=0.0005); the parameter is kept for interface
            compatibility.
        epochs: Number of training epochs.
        batch_size: Training batch size.

    Returns:
        rmse: Root mean square error on the un-scaled predictions.
        mape: Mean absolute percentage error on the un-scaled predictions.
        pred_y_cv: Un-scaled predictions for x_cv_scaled.
        history: Object returned by model.fit, holding the training and
            validation losses.
    """
    # Create the LSTM network.
    # Single layer, hence return_sequences=False. For a stacked two-layer
    # variant, set return_sequences=True here and add a second
    # LSTM(units=lstm_units) + Dropout(dropout_prob) pair before Dense.
    model = Sequential()
    model.add(LSTM(units=lstm_units,
                   return_sequences=False,
                   input_shape=(x_train_scaled.shape[1], x_train_scaled.shape[2])))
    model.add(Dropout(dropout_prob))  # dropout rate is taken from dropout_prob
    model.add(Dense(1))

    # Compile and fit the LSTM network.
    # The `optimizer` argument is deliberately not used: an Adam optimizer
    # with an explicit learning rate is used instead.
    # Other possible loss: 'mae'.
    model.compile(loss='mean_squared_error', optimizer=Adam(lr=0.0005))
    model.summary()
    # shuffle=False keeps the samples in their chronological order.
    history = model.fit(x_train_scaled,
                        y_train_scaled,
                        epochs=epochs,
                        batch_size=batch_size,
                        validation_data=(x_cv_scaled, y_cv_scaled),
                        verbose=0,
                        shuffle=False)

    # Predict on the evaluation windows.
    pred_y_cv_scaled = model.predict(x_cv_scaled)

    # Take the first time step of every window and flatten it to
    # (sample, feature); its non-target columns pad the target so the
    # scaler can be inverted.
    first_step = x_cv_scaled[:, 0:1, :]
    first_step = np.reshape(first_step, [len(first_step), -1])
    padding_features = first_step[:, 1:]

    # Scale both predictions and actual values back to the original range.
    pred_y_cv = _invert_target_scaling(scaler, pred_y_cv_scaled, padding_features)
    y_cv = _invert_target_scaling(scaler, y_cv_scaled, padding_features)

    # Compute RMSE and MAPE on the un-scaled values.
    rmse = math.sqrt(mean_squared_error(y_cv, pred_y_cv))
    mape = get_mape(y_cv, pred_y_cv)
    return rmse, mape, pred_y_cv, history
#########################################
2. 划分训练和测试集, 数据缩放(min_max_scale),并生成LSTM的输入格式
# 2. Split into train / cv / test sets, min-max scale the data, and build
#    the LSTM input format.

# 2.1 Train / cv / test split
# Sizes of each partition; the training set takes whatever rows are left.
n_rows = len(df)
num_cv = int(cv_size * n_rows)
num_test = int(test_size * n_rows)
num_train = n_rows - num_cv - num_test
print("num_train = " + str(num_train))
print("num_cv = " + str(num_cv))
print("num_test = " + str(num_test))

# Multivariate setup: the target (adj_close) comes first, followed by the
# additional features.
feature_cols = ['adj_close', 'volume', 'range_h_l', 'range_o_c']
split_cols = ['date'] + feature_cols
train_cv_end = num_train + num_cv

# Split into train, cv, and test (chronological order, no shuffling).
df_selected = df[split_cols]
train = df_selected[:num_train]
cv = df_selected[num_train:train_cv_end]
train_cv = df_selected[:train_cv_end]
test = df_selected[train_cv_end:]

# 2.2 Hyper-parameters assumed to have been found by grid search
#     (the search itself is omitted here).
N_opt = 5  # window size
lstm_units_opt = 50  # hidden units
dropout_prob_opt = 0.15  # dropout rate
optimizer_opt = 'adam'  # optimizer
epochs_opt = 20  # number of epochs
batch_size_opt = 64  # batch size

# 2.3 Min-max scaling, then build the LSTM inputs:
#     X = [sample, time_step, feature=4], Y = [adj_close]
# train_cv is used as the training set and test as the test set, so the
# scaler is fitted on train_cv only.
scaler_final = MinMaxScaler(feature_range=(0, 1))
train_cv_matrix = np.array(train_cv[feature_cols]).reshape(-1, len(feature_cols))
train_cv_scaled_final = scaler_final.fit_transform(train_cv_matrix)

# Training windows and targets.
x_train_cv_scaled, y_train_cv_scaled = get_x_y(train_cv_scaled_final, N_opt, N_opt)

# The first test windows reach back into the cv rows, so the whole data set
# is scaled with the already-fitted scaler and windows are cut from it,
# starting at the first test row.
df_matrix = np.array(df[feature_cols]).reshape(-1, len(feature_cols))
df_scaled = scaler_final.transform(df_matrix)
x_test_scaled, y_test_scaled = get_x_y(df_scaled, N_opt, train_cv_end)
3. LSTM模型训练和结果评价
调用函数实现模型训练及绘图
# 3. Train the final model and evaluate it on the test set
final_hyperparams = {
    'lstm_units': lstm_units_opt,
    'dropout_prob': dropout_prob_opt,
    'optimizer': optimizer_opt,
    'epochs': epochs_opt,
    'batch_size': batch_size_opt,
}

# Train on train+cv, predict on test, and get the un-scaled metrics back.
rmse, mape, pred_test_y, history = train_pred_eval_model(
    x_train_cv_scaled,
    y_train_cv_scaled,
    x_test_scaled,
    y_test_scaled,
    scaler_final,
    **final_hyperparams
)

# Report RMSE
print("RMSE on test set = %0.3f" % rmse)

# Report MAPE
print("MAPE on test set = %0.3f%%" % mape)