# Import packages
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from datetime import datetime
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import matplotlib.pyplot as plt
# Load the data
data = pd.read_csv('DataSet.csv')
datas = data.values  # raw numpy values (kept for compatibility; unused below)
# Convert the integer `date` column (YYYYMMDD) into a DatetimeIndex,
# drop the original column, and sort chronologically.
# BUG FIX: the original called convert_date_from_int() here, but that function
# is defined further down the file, so this line raised NameError at runtime.
# Parse inline instead — same result, no forward reference.
data.index = pd.to_datetime(data['date'].astype(str), format='%Y%m%d')
data.drop('date', inplace=True, axis=1)
data.sort_index(inplace=True)
# Convert the `date` column to datetime format and use it as the index
def string_to_date(string: str) -> str:
    """Reformat a YYYYMMDD date string into ISO form (YYYY-MM-DD)."""
    return datetime.strptime(string, '%Y%m%d').strftime('%Y-%m-%d')
def convert_date_from_int(time_series: pd.Series) -> pd.Series:
    """Convert a Series of integer YYYYMMDD dates into pandas datetimes.

    The original implementation applied a Python-level parse→format→re-parse
    round trip per element; a single vectorized ``pd.to_datetime`` with an
    explicit format produces the identical result and runs in C speed.
    """
    return pd.to_datetime(time_series.astype('str'), format='%Y%m%d')
# Split the dataset: column 0 is the target, the remaining columns are features.
# shuffle=False preserves the time ordering, so the last 20% forms the test set.
X_train, X_test, y_train, y_test = train_test_split(
    data.iloc[:, 1:].round(2),
    data.iloc[:, 0].round(0),
    shuffle=False,
    test_size=0.2,
)
# Wrap the splits in LightGBM's Dataset format
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)
# LightGBM hyperparameters
params = {
    'boosting_type': 'gbdt',    # gradient-boosted decision trees
    'objective': 'regression',  # regression task
    'metric': 'rmse',           # evaluation metric: root mean squared error
    'num_leaves': 40,           # maximum leaves per tree
    'learning_rate': 0.1,       # shrinkage rate
    'feature_fraction': 0.9,    # fraction of features used per iteration, guards against overfitting
}
# Train the model
# Number of boosting rounds: each round fits one new tree.
num_rounds = 100
# NOTE(review): the original comment described early_stopping_rounds, but no
# early-stopping callback/parameter is actually passed, so training always
# runs the full num_rounds; the validation set is only used for metric logging.
model = lgb.train(params, train_data, num_rounds, valid_sets=[test_data])
# Model prediction
# Predict test-set labels using the iteration that scored best on the
# validation set (model.best_iteration).
y_pre = model.predict(X_test, num_iteration=model.best_iteration)
# Model accuracy: report R^2 and MSE on the test set
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
# BUG FIX: sklearn metrics take (y_true, y_pred). The original passed
# (y_pre, y_test) — R^2 is not symmetric in its arguments, so the reversed
# order produced an incorrect score. MSE is symmetric, but the call is
# reordered here too for consistency.
R2 = r2_score(y_test, y_pre)
print(R2)
print(mean_squared_error(y_test, y_pre))
# Visualization: actual vs. predicted values over the test period.
# Renamed from `datetime` to avoid shadowing the imported datetime class.
plot_dates = data.index[-len(y_test):]
plt.plot(plot_dates, y_test, color='red', label='validation')
plt.plot(plot_dates, y_pre, color='green', label='predict')
plt.legend()
plt.show()