# 用 Python 编写的机器学习模型,此模型不包含模型训练。模型的训练会在下一篇文章发布。
# -*- coding: utf-8 -*-
# @Time : :
# @Author:zzs
# @File:机器学习模型.py
import tensorflow as tf
import seaborn
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import pandas as pd
import os
import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle
from sklearn.metrics import roc_auc_score, classification_report, f1_score, recall_score, precision_score
from keras.callbacks import Callback
from keras import backend as K
import time
from sklearn import metrics
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Dropout
import matplotlib.gridspec as gridspec
def _build_windows(features, targets, window):
    """Build sliding-window samples from a row-ordered series.

    Sample ``i`` consists of rows ``i .. i + window - 1`` of ``features``;
    its label is row ``i + window`` of ``targets`` (the row right after the
    window).

    Args:
        features: 2-D float array, one row per time step.
        targets: 2-D float array with a single column, aligned with
            ``features`` row by row.
        window: number of consecutive rows that make up one sample.

    Returns:
        A tuple ``(X, Y)`` where ``X`` has shape
        ``(n_rows - window, window, n_features)`` and ``Y`` has shape
        ``(n_rows - window, 1)``.

    Raises:
        ValueError: if there are not more than ``window`` rows, i.e. not a
            single complete sample can be built.
    """
    n_samples = features.shape[0] - window
    if n_samples <= 0:
        raise ValueError(
            "need more than {0} rows to build a sample, got {1}".format(
                window, features.shape[0]))
    X = np.zeros((n_samples, window, features.shape[1]))
    Y = np.zeros((n_samples, 1))
    # Derive the loop bound from the data instead of a hard-coded row count,
    # so the script works for a CSV of any length.
    for i in range(n_samples):
        X[i] = features[i:i + window]
        Y[i] = targets[i + window]
    return X, Y


def _load_or_train_model(model_file, X_train_norm, Y_train_norm):
    """Return the pickled regressor from ``model_file``, training it if absent.

    When ``model_file`` does not exist, an ``XGBRegressor`` is fitted on the
    given (standardised) training data and pickled to ``model_file``.

    Args:
        model_file: path of the pickle file used as the model cache.
        X_train_norm: standardised training features.
        Y_train_norm: standardised training targets.

    Returns:
        The loaded or freshly fitted model.
    """
    if os.path.exists(model_file):
        # Load the previously trained model.
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # model files produced by this script / from a trusted location.
        with open(model_file, "rb") as fh:
            return pickle.load(fh)

    # n_estimators: number of boosting rounds; gamma: minimum loss reduction
    # required to split; subsample: row sampling ratio; colsample_bytree:
    # fraction of columns sampled per tree.
    # (objective='reg:squarederror' was left disabled by the author.)
    model = xgb.XGBRegressor(n_estimators=200, learning_rate=0.1, gamma=0,
                             subsample=0.9, colsample_bytree=0.9,
                             max_depth=10, reg_lambda=1)
    model.fit(X_train_norm, Y_train_norm)
    with open(model_file, "wb") as fh:
        pickle.dump(model, fh)
    print('模型保存完毕')
    return model


def _plot_fit(actual, predicted, r2):
    """Plot actual vs. predicted AQI and annotate the figure with R² (in %)."""
    xs = range(actual.shape[0])
    plt.plot(xs, actual, 'b-', label='AQI实际图')
    plt.plot(xs, predicted, 'r-', label='AQI预测图')
    plt.legend(loc='best')
    plt.text(150, 400, '拟合R2:{0}%'.format(round(r2 * 100, 2)))
    plt.show()


def main():
    """Fit (or load) an XGBoost AQI regressor and report how well it fits.

    Reads ``./beijing.csv``, builds samples from 3 consecutive rows of the
    seven pollutant columns to predict the next row's AQI, standardises the
    data with training-set statistics, trains or loads the model, shows an
    actual-vs-predicted plot for the validation split and for the whole
    dataset, and writes the whole-dataset predictions to ``val_pre.csv``.
    """
    # Use the same file for the existence check, the load and the save.
    # The original checked "./pre_model515_ml.h5" but read/wrote
    # "xgb.pickle.dat", so the cached model was never reused.
    model_file = "xgb.pickle.dat"
    window = 3  # number of past rows fed to the model per sample

    # SimHei so the Chinese labels can be rendered; keep ASCII minus signs.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    filepath = './beijing.csv'  # input data file
    data = pd.read_csv(filepath, index_col=0)
    print(data.shape)
    print(data.head(10))  # show the first 10 rows

    # Build the data set.
    X_data = data[['AQI指数', 'PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3']]  # features
    Y_data = data[['AQI指数']]  # target
    print(X_data.shape)
    print(Y_data.shape)

    # Convert once to NumPy so the window loop does not go through
    # DataFrame.iloc on every iteration.
    X, Y = _build_windows(X_data.to_numpy(dtype=float),
                          Y_data.to_numpy(dtype=float),
                          window)
    print(X.shape)
    print(Y.shape)
    # Flatten each (window, n_features) sample into one feature vector.
    X = X.reshape(X.shape[0], -1)
    print(X.shape)
    print(Y.shape)

    # Chronological 80/20 split (no shuffling).
    split = int(X.shape[0] * 0.8)
    X_train, Y_train = X[:split], Y[:split]
    X_val, Y_val = X[split:], Y[split:]

    # Standardisation statistics come from the training split only.
    X_mean = X_train.mean(axis=0)
    X_std = X_train.std(axis=0)
    Y_mean = Y_train.mean(axis=0)
    Y_std = Y_train.std(axis=0)
    print('X_train的类型是' + str(type(X_train)))
    print('X_train的shape' + str(X_train.shape))
    print('X_mean的类型是' + str(type(X_mean)))
    print('X_mean的shape' + str(X_mean.shape))
    print('X_train的类型是' + str(type(X_train)))
    print('X_train的shape' + str(X_train.shape))

    X_train_norm = (X_train - X_mean) / X_std
    Y_train_norm = (Y_train - Y_mean) / Y_std
    X_val_norm = (X_val - X_mean) / X_std
    X_all_norm = (X - X_mean) / X_std

    model = _load_or_train_model(model_file, X_train_norm, Y_train_norm)

    # Validation split: predictions are in standardised units, so map them
    # back to AQI units before computing R².
    val_pred = model.predict(X_val_norm) * Y_std + Y_mean
    R_2_0 = metrics.r2_score(Y_val, val_pred)
    _plot_fit(Y_val, val_pred, R_2_0)

    # Whole dataset: same de-standardisation. Plot the whole-dataset curves
    # so the figure matches the R² it is annotated with (the original
    # re-plotted the validation curves here).
    val_pred2 = model.predict(X_all_norm) * Y_std + Y_mean
    R_2_1 = metrics.r2_score(Y, val_pred2)
    _plot_fit(Y, val_pred2, R_2_1)

    df_val = pd.DataFrame(val_pred2)
    df_val.to_csv('val_pre.csv')
if __name__ == '__main__':
    # Run the pipeline only when this file is executed as a script,
    # not when it is imported as a module.
    main()