import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import glob
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV
from xgboost import plot_importance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import importlib
import sys
importlib.reload(sys)
import math
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
plt.rcParams['font.sans-serif'] = 'SimHei' # use SimHei so CJK labels render
plt.rcParams['axes.unicode_minus'] = False # render minus signs correctly with a CJK font
plt.rcParams['figure.dpi'] = 200 # figure resolution
plt.rcParams['text.color'] = 'black' # text colour
plt.style.use('ggplot')
print(plt.style.available) # list of selectable plt styles
'''
['bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn-bright', 'seaborn-colorblind', 'seaborn-dark-palette', 'seaborn-dark', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'seaborn', 'Solarize_Light2', 'tableau-colorblind10', '_classic_test']
'''
#导入数据
dataset= pd.read_excel(r"D:\alldata\pythonfiles\四姑娘山\四姑娘山数据.xlsx")
dataset.drop('日期',axis=1, inplace=True)
# print(dataset)
scaler = MinMaxScaler(feature_range=(0,1))
dataset= scaler.fit_transform(dataset)
print(dataset)
X=np.array(dataset)
# X=pd.DataFrame(X)
# X.to_csv(r'D:\alldata\pythonfiles\九寨沟\nan.csv')
# isnan = np.isnan(dataset) # 判断每个元素是不是nan,返回[False,False,False,False,True]
# print(True in isnan)
#划分数据集
training=dataset[:-61]
testing=dataset[-61:]
def createXY(dataset, n_past):
    """Build sliding-window supervised samples from a 2-D array.

    For every index i >= n_past, the previous n_past rows (all feature
    columns) become one input window and the first column at row i is
    its target.

    Returns (X, Y) as numpy arrays of shapes
    (len(dataset)-n_past, n_past, n_features) and (len(dataset)-n_past,).
    """
    positions = range(n_past, len(dataset))
    windows = [dataset[p - n_past:p, :] for p in positions]
    targets = [dataset[p, 0] for p in positions]
    return np.array(windows), np.array(targets)
# Window length: each sample is built from the previous 30 time steps.
n_past=30
trainX,trainY=createXY(training,n_past)
testX,testY=createXY(testing,n_past)
print("trainX Shape-- ",trainX.shape)
print("trainY Shape-- ",trainY.shape)
print("testX Shape-- ",testX.shape)
print("testY Shape-- ",testY.shape)
# Flatten each (n_past, n_features) window into a single row so the data
# fits scikit-learn's 2-D (n_samples, n_features) input.  Using -1 here
# generalizes the original hard-coded (samples, n_past) target shape,
# which only worked when the dataset had exactly one feature column
# (it raised a ValueError otherwise); for one feature the result is
# identical.
trainX = trainX.reshape(trainX.shape[0], -1)
testX = testX.reshape(testX.shape[0], -1)
print("trainX Shape-- ",trainX.shape)
# ---- Random-forest training ----
# max_features="auto" was deprecated and then removed for regressors in
# scikit-learn 1.3; max_features=1.0 (consider all features at every
# split) is the equivalent modern spelling.
model = RandomForestRegressor(n_estimators=50, oob_score=True, n_jobs=-1,
                              random_state=50, max_features=1.0,
                              min_samples_leaf=10)
model.fit(trainX, trainY)
# Predict on the held-out windows (values are still MinMax-scaled).
pre = model.predict(testX)
print(pre)
# Map predictions back to the original scale.  inverse_transform expects
# (n_samples, n_features); the scaler was fit on a single value column
# (the earlier 2-D window flattening only works in that case), so the
# predictions must go in as a column vector.  The original
# reshape(1, len(pre)) row shape only worked via NumPy broadcasting and
# fails sklearn's input validation; the numeric result is unchanged.
pre = scaler.inverse_transform(np.asarray(pre).reshape(-1, 1)).flatten()
print(pre)
# Load the ground-truth (unscaled) values for the 31 forecast steps.
y= pd.read_excel(r"D:\alldata\pythonfiles\四姑娘山\预测结果\四姑娘山-original.xlsx")
y=np.array(y['value']).reshape(1,31).flatten()
# Error metrics between the true series and the rescaled predictions.
MSE = mean_squared_error(y, pre)
RMSE = math.sqrt(MSE)
# MAE=mean_absolute_error(y, pre)
# MAPE = metrics.mean_absolute_percentage_error(y, pre)
MAE = np.mean(np.abs(y-pre))
# MAPE is a fraction here (not multiplied by 100).
MAPE = np.mean(np.abs((y - pre) / y))
print("rmse :",RMSE)
print("mae :", MAE)
print("mape :", MAPE)
# ---- Hyperparameter tuning via randomized search ----
# NOTE(review): this section originally contained a bare heading line and
# a Jupyter "%%time" cell magic (both syntax errors in a plain .py file)
# and referenced undefined names (train_X/train_y, grid_search_model,
# test_X, a `test` dataframe with a '价格' column, RFR_model, sns,
# r2_score) pasted from a different notebook.  Rewritten to run against
# the trainX/trainY/testX/testY arrays built above.
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score

RFR = RandomForestRegressor()
# Parameter ranges to sample from.  'auto' was removed from
# RandomForestRegressor.max_features in scikit-learn 1.3; 1.0 is the
# equivalent replacement.
n_estimators = np.arange(1, 1001, 100)
min_samples_split = [2, 7, 12, 17]
min_samples_leaf = [1, 4, 7, 10]
max_depth = [4, 7, 10, 13]
max_features = [1.0, 'sqrt']
bootstrap = [True, False]
random_params_group = {'n_estimators': n_estimators,
                       'min_samples_split': min_samples_split,
                       'min_samples_leaf': min_samples_leaf,
                       'max_depth': max_depth,
                       'max_features': max_features,
                       'bootstrap': bootstrap}
# 3-fold randomized search, 100 sampled parameter combinations.
random_search_model = RandomizedSearchCV(RFR, param_distributions=random_params_group,
                                         n_iter=100, scoring='neg_mean_squared_error',
                                         n_jobs=-1, cv=3, random_state=44)
random_search_model.fit(trainX, trainY)
# Report the best parameters found (the original bare expression printed
# nothing outside a notebook).
print(random_search_model.best_params_)
# Evaluate the tuned model on the held-out windows (scaled space).
best_model = random_search_model.best_estimator_
RFR_gs_preds = best_model.predict(testX)
print('模型默认参数验证集R2:', r2_score(testY, RFR_gs_preds))
print('模型默认参数验证集RMSE:', mean_squared_error(testY, RFR_gs_preds) ** 0.5)
# Feature-importance table; the flattened window matrix has no column
# names, so columns are labelled by index.
feature_importance = (pd.DataFrame({'feature': range(trainX.shape[1]),
                                    'feature_importance': best_model.feature_importances_})
                      .sort_values('feature_importance', ascending=False)
                      .round(4))
# Horizontal bar chart of importances (plt.barh replaces the undefined
# seaborn import).
plt.figure(figsize=(10, 15))
plt.barh(feature_importance['feature'].astype(str), feature_importance['feature_importance'])
plt.show()