多种方法实现股价预测——LSTM

budomo

LSTM（长短期记忆网络）是一种循环神经网络模型，用于缓解长序列训练中的梯度消失问题。LSTM有三个门：更新门（也称输入门）、遗忘门和输出门。更新门和遗忘门共同决定是否更新记忆单元的每个元素。

我们利用LSTM模型对NDAQ的股价指数进行预测。

数据准备

# 利用LSTM模型进行股价预测
   # 使用多元属性
   # 使用单层LSTM, 滞后期为N（窗口大小)
import math
import numpy as np
import pandas as pd
from numpy.random import seed
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from tensorflow import set_random_seed
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.optimizers import  Adam
#### Basic parameter configuration ##################
# Path of the CSV file holding the price history.
stk_path = "NDAQ.csv"

# Proportion of the dataset used as the test set.
test_size = 0.2
# Proportion of the dataset used as the cross-validation set.
cv_size = 0.2

# Window size: for the sample at day t, the lags t-1, t-2, ..., t-N are used
# as features. Initial value before tuning.
N = 9

# LSTM hyper-parameters (initial values before tuning).
lstm_units = 50
dropout_prob = 0.5
optimizer = 'adam'
epochs = 1
batch_size = 1

# Seed handed to the TensorFlow random generator.
model_seed = 100

# Font sizes — presumably for plots drawn elsewhere; not used in this section.
fontsize = ticklabelsize = 14
# Set seeds to ensure same output results
# NOTE(review): set_random_seed is imported from the top-level tensorflow
# namespace, which looks like the TensorFlow 1.x API — confirm the installed
# TensorFlow version still provides it.
seed(101)  # seeds numpy.random
set_random_seed(model_seed)  # seeds TensorFlow
# 1. Data preparation
df = pd.read_csv(stk_path, sep=",")

# Convert the Date column to datetime.
# Assign the column directly instead of via df.loc[:, 'Date'] = ...: on
# pandas >= 2.0 the .loc form writes the values in place and keeps the
# original object dtype, which makes the .dt accessor below fail.
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

# Change all column headings to lower case and replace spaces by underscores.
df.columns = [str(x).lower().replace(' ', '_') for x in df.columns]

# Month of each sample.
df['month'] = df['date'].dt.month

# Drop the rows whose volume is 0.
df = df.drop(df[df['volume'] == 0].index, axis=0)

# Feature 1: difference between the high and the low of each day
# (the raw high/low columns are dropped afterwards).
df['range_h_l'] = df['high'] - df['low']
df.drop(['high', 'low'], axis=1, inplace=True)

# Feature 2: difference between the open and the close of each day
# (the raw open/close columns are dropped afterwards).
df['range_o_c'] = df['open'] - df['close']
df.drop(['open', 'close'], axis=1, inplace=True)

# Sort chronologically; the later train/cv/test split slices rows by position.
df.sort_values(by='date', inplace=True, ascending=True)

引入辅助函数：

######## 辅助函数 #############
# Helper 1: compute the MAPE metric
def get_mape(y_true, y_pred):
    """Compute the mean absolute percentage error (MAPE), in percent.

    Args:
        y_true: array-like of actual values.
        y_pred: array-like of predicted values, matching ``y_true`` in shape.

    Returns:
        ``mean(|(y_true - y_pred) / y_true|) * 100``.

    Note:
        Zeros in ``y_true`` are not guarded against; the division then
        follows NumPy semantics and the result is ``inf`` or ``nan``.
    """
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
# Helper 2: slide a window over `data` to build the LSTM input format:
# X: [sample, time_step, n_features], Y: [target]
def get_x_y(data, N, offset):
    """Split data into x (feature windows) and y (targets).

    For every row index ``i`` in ``range(offset, len(data))`` the sample is
    the ``N`` rows preceding ``i`` (``data[i - N:i]``) and the target is
    column 0 of row ``i``.

    Args:
        data: 2-D array-like of shape (n_rows, n_features); column 0 is the
            target column.
        N: window size (number of lagged rows per sample), at least 1.
        offset: index of the first row to produce a target for. Must be at
            least ``N`` so that every window lies fully inside ``data``.

    Returns:
        Tuple ``(x, y)`` of numpy arrays. When at least one sample exists,
        ``x`` has shape (n_samples, N, n_features) and ``y`` has shape
        (n_samples,). Both are empty when ``offset >= len(data)``.

    Raises:
        ValueError: if ``N < 1`` or ``offset < N``. A smaller offset would
            make ``i - N`` negative, and the slice would then wrap around or
            come out empty instead of giving a full window.
    """
    if N < 1:
        raise ValueError("N must be at least 1, got %r" % (N,))
    if offset < N:
        raise ValueError(
            "offset (%r) must be >= N (%r) so every window has N rows"
            % (offset, N)
        )
    data = np.asarray(data)
    target_rows = range(offset, len(data))
    x = np.array([data[i - N:i] for i in target_rows])
    y = np.array([data[i, 0] for i in target_rows])
    return x, y
#####################################

训练模型的关键函数：

###### 关键函数：训练和测试############
# Function 3: train the LSTM model, evaluate it and restore the predictions.
# Train model, do prediction, scale back to original range and do evaluation
def _invert_target_scaling(target_scaled, filler_features, scaler):
    """Map a scaled target column back to the original range via `scaler`."""
    target_col = np.reshape(target_scaled, (len(target_scaled), 1))
    # The scaler's inverse_transform is fed the target followed by filler
    # feature columns so the matrix has the (target + features) layout used
    # when scaling; only column 0 is kept.
    padded = np.concatenate((target_col, filler_features), axis=1)
    return scaler.inverse_transform(padded)[:, 0]


def train_pred_eval_model(x_train_scaled,
                          y_train_scaled,
                          x_cv_scaled,
                          y_cv_scaled,
                          scaler,
                          lstm_units=50,
                          dropout_prob=0.5,
                          optimizer='adam',
                          epochs=1,
                          batch_size=1):
    """Train a single-layer LSTM, predict on the cv set and evaluate.

    Inputs
        x_train_scaled  : e.g. x_train_scaled.shape=(451, 9, 1). Here we are
                          using the past 9 values to predict the next value
        y_train_scaled  : e.g. y_train_scaled.shape=(451, 1)
        x_cv_scaled     : use this to do predictions
        y_cv_scaled     : actual value of the predictions (scaled)
        scaler          : scaler that is used to fit_transform train set
        lstm_units      : lstm param, number of hidden units
        dropout_prob    : lstm param, dropout rate used against over-fitting
        optimizer       : lstm param; accepted for interface compatibility
                          but NOT used — the model is always compiled with
                          Adam(lr=0.0005)
        epochs          : lstm param
        batch_size      : lstm param
    Outputs
        rmse            : root mean square error (original scale)
        mape            : mean absolute percentage error (original scale)
        pred_y_cv       : predictions, scaled back to the original range
        history         : object returned by model.fit (holds the training
                          and validation losses)
    """
    # Create the LSTM network: one LSTM layer (return_sequences=False),
    # dropout, then a single-unit dense output.
    # For a two-layer variant the first LSTM would use return_sequences=True
    # and be followed by a second LSTM(units=lstm_units) + Dropout.
    model = Sequential()
    model.add(LSTM(units=lstm_units,
                   return_sequences=False,
                   input_shape=(x_train_scaled.shape[1],
                                x_train_scaled.shape[2])))
    model.add(Dropout(dropout_prob))  # dropout with probability dropout_prob
    model.add(Dense(1))

    # Compile and fit the LSTM network.
    # The `optimizer` argument is deliberately not used here; Adam with an
    # explicit learning rate is used instead. loss may be 'mae' or
    # 'mean_squared_error'.
    model.compile(loss='mean_squared_error', optimizer=Adam(lr=0.0005))
    model.summary()
    history = model.fit(x_train_scaled,
                        y_train_scaled,
                        epochs=epochs,
                        batch_size=batch_size,
                        validation_data=(x_cv_scaled, y_cv_scaled),
                        verbose=0,
                        shuffle=False)

    # Predict on the cv inputs (still in scaled units).
    pred_y_cv_scaled = model.predict(x_cv_scaled)

    # Take the first time step of every window and flatten it to 2-D; its
    # non-target columns serve as filler for the inverse scaling below.
    X_cv_reduce = x_cv_scaled[:, 0:1, :]
    X_cv_reduce = np.reshape(X_cv_reduce, [len(X_cv_reduce), -1])
    filler = X_cv_reduce[:, 1:]

    # Scale predictions and actual values back to the original range.
    pred_y_cv = _invert_target_scaling(pred_y_cv_scaled, filler, scaler)
    y_cv = _invert_target_scaling(y_cv_scaled, filler, scaler)

    # Compute RMSE and MAPE on the original scale.
    rmse = math.sqrt(mean_squared_error(y_cv, pred_y_cv))
    mape = get_mape(y_cv, pred_y_cv)
    return rmse, mape, pred_y_cv, history
#########################################

2. 划分训练集和测试集，进行数据缩放（min_max_scale），并生成LSTM的输入格式

# 2. Split into train/cv/test sets, scale the data (min-max scaling) and
#    build the LSTM input format.
# 2.1 Split into train, cross-validation and test sets.
# Sizes of each of the datasets.
num_test = int(test_size * len(df))
num_cv = int(cv_size * len(df))
num_train = len(df) - num_cv - num_test
print("num_train = ", num_train, sep="")
print("num_cv = ", num_cv, sep="")
print("num_test = ", num_test, sep="")

# Columns kept in every split. Note: several attributes are used
# (multivariate input).
_split_cols = ['date', 'adj_close', 'volume', 'range_h_l', 'range_o_c']

# Rows are sliced by position: train first, then cv, then test.
train = df[:num_train][_split_cols]
cv = df[num_train:num_train + num_cv][_split_cols]
train_cv = df[:num_train + num_cv][_split_cols]
test = df[num_train + num_cv:][_split_cols]
# 2.2 Optimal parameters, assumed to have been found by a grid search
#     (the search itself is omitted).
N_opt = 5                  # window size
lstm_units_opt = 50        # number of hidden units
dropout_prob_opt = 0.15    # dropout rate
optimizer_opt = 'adam'     # optimizer
epochs_opt = 20            # number of training epochs
batch_size_opt = 64        # batch size
# 2.3 Min-max scale the data and build the LSTM inputs:
#     X = [sample, time_step, feature=4], Y = [adj_close].
# train_cv is treated as the training set, test as the test set.
_feature_cols = ['adj_close', 'volume', 'range_h_l', 'range_o_c']

# Fit the scaler on the train+cv rows only.
scaler_final = MinMaxScaler(feature_range=(0, 1))
_train_cv_matrix = np.array(train_cv[_feature_cols]).reshape(-1, 4)
train_cv_scaled_final = scaler_final.fit_transform(_train_cv_matrix)

# Training samples: windows over the scaled train+cv rows.
x_train_cv_scaled, y_train_cv_scaled = get_x_y(
    train_cv_scaled_final, N_opt, N_opt
)

# Test samples: the first test windows reach back into the cv rows, so the
# whole dataset is scaled (with the already-fitted scaler) and windowed
# starting at the first test row.
_df_matrix = np.array(df[_feature_cols]).reshape(-1, 4)
df_scaled = scaler_final.transform(_df_matrix)
x_test_scaled, y_test_scaled = get_x_y(
    df_scaled, N_opt, num_train + num_cv
)

3. LSTM模型训练和结果评价

调用函数实现模型训练及绘图

# 3. Train the final model and evaluate the result.
# Train on train+cv, predict on the test windows and evaluate.
rmse, mape, pred_test_y, history = train_pred_eval_model(
    x_train_cv_scaled,
    y_train_cv_scaled,
    x_test_scaled,
    y_test_scaled,
    scaler_final,
    lstm_units=lstm_units_opt,
    dropout_prob=dropout_prob_opt,
    optimizer=optimizer_opt,
    epochs=epochs_opt,
    batch_size=batch_size_opt,
)

# Report RMSE
print("RMSE on test set = %0.3f" % rmse)

# Report MAPE
print("MAPE on test set = %0.3f%%" % mape)