# -*- coding: utf-8 -*-
"""
Created on Sat Sep 3 16:56:20 2022
@author: Lenovo
"""
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error
from scipy.stats import gaussian_kde
from sklearn.feature_selection import RFECV
import seaborn as sns
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Per-site accumulators, filled inside the site loop below and written to one summary CSV
site_list=[]
year_list=[]
total_number=[]
post_dropna_number=[]
post_drop_le_abnormal_number=[]
test_number=[]
train_number=[]
N_estimators=[]
Max_depth=[]
rmse_list=[]
r2_list=[]
bias_list=[]
# RANK_1..RANK_12 hold the top-12 feature-importance strings per site
RANK_1=[]
RANK_2=[]
RANK_3=[]
RANK_4=[]
RANK_5=[]
RANK_6=[]
RANK_7=[]
RANK_8=[]
RANK_9=[]
RANK_10=[]
RANK_11=[]
RANK_12=[]
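# Workflow: pair each flux-site driver table with its NDVI series, train a
# random forest on quality-controlled LE per site, evaluate (RMSE/R2/BIAS),
# draw a density-colored scatter plot, rank feature importances, and append
# everything to one summary CSV.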
path=r'D:\Fluxnet\sites NDVI'
outpath=r'D:\Fluxnet\ndvi777'
# if not os.path.exists(outpath):
#     os.makedirs(outpath)
# print([s.split('_')[1] for s in os.listdir(r'D:\Fluxnet\加了土壤水和土壤温度的\TOTAL')])
# def Date_NDVI_list():
#     count=0
#     for i,j,k in os.walk(path):  # i, j, k: folder, subfolders, files
#         for sole in k:
#             if sole.split('.')[0] in [s.split('_')[1] for s in os.listdir(r'D:\Fluxnet\加了土壤水和土壤温度的\TOTAL')]:
#                 dfsole=pd.DataFrame(pd.read_csv(os.path.join(i,sole),header=None))  # i and sole are enough to build the path
#                 dfsole['TIMESTAMP_START']=['/'.join(d.split('_')[:3]) for d in dfsole[0]]
#                 print(dfsole['TIMESTAMP_START'])
#                 dfsole['NDVI']=dfsole[2]*0.0001  # NDVI must fall within [-1, 1]
#                 dfsole.to_csv(os.path.join(outpath,sole),index=False,sep=',')
# Date_NDVI_list()
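# Note: the 0.0001 factor above is presumably the standard MODIS VI scale
# factor (integer DN -> NDVI), which is what keeps NDVI inside [-1, 1].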
path1=r'D:\Fluxnet\加了土壤水和土壤温度的\TOTAL'
path2=r'D:\Fluxnet\ndvi777'  # keep this in sync with outpath above
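# Note: zip(os.listdir(path1),os.listdir(path2)) assumes both directories hold
# matching files in the same order; sorting both listings would make the
# site-to-NDVI pairing explicit.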
for i,j in zip(os.listdir(path1),os.listdir(path2)):
    sole_i=pd.read_csv(os.path.join(path1,i))
    sole_j=pd.read_csv(os.path.join(path2,j))
    sole_j=sole_j[['TIMESTAMP_START','NDVI']].copy()
    sole_j['TIMESTAMP_START']=['-'.join(ts.split('/')[:3])+' 00:00:00' for ts in sole_j['TIMESTAMP_START']]
    sole=pd.merge(sole_i,sole_j,how='left',on='TIMESTAMP_START')
    sole['NDVI']=sole['NDVI'].interpolate()
    print(sole[['TIMESTAMP_START','NDVI']])
    b=sole[sole['LE_F_MDS_QC']==0]
    gap=sole[sole['LE_F_MDS_QC']!=0]
    b_drop=b.drop(['TIMESTAMP_START','TIMESTAMP_END','LE_F_MDS_QC'],axis=1)
    gap_drop=gap.drop(['TIMESTAMP_START','TIMESTAMP_END'
                       ,'LE_F_MDS_QC','LE_F_MDS'],axis=1)
    print(b_drop.shape)  # raw sample count
    print(b_drop.isna().sum().sort_values(ascending=True))
    print(gap_drop.shape)
    print(gap_drop.isna().sum().sort_values(ascending=True))
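    # LE_F_MDS_QC == 0 flags measured (non-gap-filled) LE, so 'b' is the
    # training pool and 'gap' holds the rows whose LE is to be re-estimated.
    # NDVI.interpolate() above fills dates between composites linearly; leading
    # NaNs before the first composite survive and are dropped by dropna() below.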
    # a.replace(-9999,np.nan,inplace=True)
    #=========================== Drop missing values ===========================
    a=b_drop.dropna()              # training data without NaNs
    print('Training set after dropping NaNs: ',a.shape)
    gap_dropna=gap_drop.dropna()   # gap-filling data without NaNs
    print('Gap set after dropping NaNs: ', gap_dropna.shape)
    #=========================== Remove LE outliers ============================
    des=a.describe()
    upper_fence=des.loc['75%']+1.5*(des.loc['75%']-des.loc['25%'])
    lower_fence=des.loc['25%']-1.5*(des.loc['75%']-des.loc['25%'])
    c=a[(a['LE_F_MDS']<=upper_fence['LE_F_MDS'])
        &(a['LE_F_MDS']>=lower_fence['LE_F_MDS'])]
    print(c.shape)
    Drivers=c.drop(['LE_F_MDS'],axis=1)
    LE=c['LE_F_MDS']
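    # Tukey fence: LE outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] is treated as an
    # outlier; e.g. Q1 = 10 and Q3 = 50 W/m² give an accepted range of [-50, 110].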
    x_train,x_test,y_train,y_test=train_test_split(Drivers,LE
                                                   ,test_size=0.25
                                                   ,random_state=0)
    print(x_train.shape)
    print(x_test.shape)
    print(y_train.shape)
    print(y_test.shape)
    #===================== GridSearchCV hyperparameter search ==================
    # rfr=RandomForestRegressor()
    # param_grid={'n_estimators':[600,700,800,900,1000]  # earlier grids: 300,500,700,900,1100,1300,1500,1700,1900,2100 / 500,800,1100,1400,1700,2000 / 1,2,3,4,5,6
    #             ,'max_depth':[110,120,130,140,150]}    # earlier grids: 30,50,70,90,110,150 / 30,50,70,90,110
    # gs=GridSearchCV(rfr
    #                 ,param_grid=param_grid
    #                 ,scoring=make_scorer(mean_squared_error,greater_is_better=False)
    #                 ,cv=3
    #                 ,verbose=1
    #                 ,n_jobs=-1)
    # gs.fit(x_train,y_train)
    # max_depth=gs.best_params_['max_depth']
    # Max_depth+=[max_depth]
    # print(gs.best_params_)
    # n_estimators=gs.best_params_['n_estimators']
    # N_estimators+=[n_estimators]
    # print(np.sqrt(-1*gs.score(x_test,y_test)))  # RMSE of the best model on the test set
    # # Font settings
    # sns.set(style='ticks')
    # plt.rc('font', family='Times New Roman',size=20)
    # # plt.rcParams["font.weight"] = "bold"
    # # plt.rcParams["axes.labelweight"] = "bold"
    #=============================== Model fitting =============================
    rf=RandomForestRegressor(n_estimators=1100
                             ,max_depth=800
                             ,random_state=0)
    rf.fit(x_train,y_train)
    y_pred=rf.predict(x_test)   # predict once, reuse for all metrics below
    rmse=np.sqrt(mean_squared_error(y_test,y_pred))
    rmse_list.append(rmse)
    rmse_df=pd.DataFrame({'rmse':rmse_list})
    print(rmse_df)
    r2=r2_score(y_test,y_pred)
    r2_list.append(r2)
    r2_df=pd.DataFrame({'r2':r2_list})
    bias=(y_pred-y_test).mean()
    bias_list.append(bias)
    bias_df=pd.DataFrame({'bias':bias_list})
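    # RMSE = sqrt(mean((pred - obs)^2)); BIAS = mean(pred - obs), so a positive
    # BIAS means the model overestimates LE on average.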
    #====================== Gaussian KDE density scatter plot ==================
    post_gs=pd.DataFrame({'predict':y_pred,'in_situ':y_test})
    post_gs=post_gs.reset_index(drop=True)
    print('Observed: ',pd.DataFrame(y_test))
    print('Predicted: ',pd.DataFrame(y_pred))
    # plt.scatter(post_gs['in_situ'],post_gs['predict'])
    # sns.jointplot(post_gs,x=post_gs['in_situ'],y=post_gs['predict'],kind='reg');
    x=post_gs['in_situ']
    y=post_gs['predict']
    x1=np.array(range(0,int(max(y))+30))   # horizontal reference line (y = 0)
    y1=len(x1)*[0]
    y2=np.array(range(0,int(max(y))+30))   # vertical reference line (x = 0)
    x2=len(y2)*[0]
    # Point density via a Gaussian kernel density estimate
    xy=np.vstack([x,y])
    z=gaussian_kde(xy)(xy)
    # Sort by density so the densest points are drawn last (on top)
    idx=z.argsort()
    x,y,z=x[idx],y[idx],z[idx]
    fig,ax=plt.subplots(figsize=(10,8),dpi=500)
    fw=int(max(y))+30   # axis limit
    plt.xlim(0,fw)
    plt.ylim(0,fw)
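    # Note: evaluating gaussian_kde at every sample is O(n^2); for very large
    # test sets, coloring by np.histogram2d bin counts is a cheaper alternative.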
    # Plot
    scatter=ax.scatter(x,y,marker='o',c=z,s=15,label='LST',cmap='PuBuGn_r')  # solid circles colored by density; an '_r' suffix reverses a colormap
    divider=make_axes_locatable(ax)  # reserves space for an optional colorbar
    # cax = divider.append_axes("right", size="5%", pad=0.1)
    # cbar = fig.colorbar(scatter, cax=cax, label='frequency')
    # cbar = fig.colorbar(scatter, cax=cax)
    ax.plot(x1,y1,c='grey',ls='--',lw=1)
    ax.plot(x2,y2,c='grey',ls='--',lw=1)
    plt.plot([0,fw], [0,fw], 'gray', lw=2)  # 1:1 line, gray, linewidth 2
    plt.scatter(x, y, c=z, s=7, cmap='jet')
    plt.text(10, fw-10, 'Site = %s' % i.split('_',6)[1], family = 'Times New Roman',size=15)
    plt.text(10, fw-18, 'Size = %.f' % len(y), family = 'Times New Roman',size=15)  # text positions depend on the x/y range
    plt.text(10, fw-26, 'RMSE = %.3f W/m²' % rmse, family = 'Times New Roman',size=15)
    plt.text(10, fw-34, 'R² = %.3f' % r2, family = 'Times New Roman',size=15)
    plt.text(10, fw-42, 'BIAS = %.3f W/m²' % bias, family = 'Times New Roman',size=15)
    plt.xlabel('Station LE (W/m²)',family = 'Times New Roman',size=18)
    plt.ylabel('Estimated LE (W/m²)',family = 'Times New Roman',size=18)
    plt.xticks(fontproperties='Times New Roman',size=15)
    plt.yticks(fontproperties='Times New Roman',size=15)
    plt.axis([0, fw, 0, fw])
    plt.savefig(os.path.join(r'D:\Fluxnet\PIC666\GAUSSIAN——ndvi',i.split('_',6)[1])  # one figure per site; same-named files are overwritten
                , bbox_inches='tight', dpi=500)
    plt.show()
    # plt.clf()
    # plt.close()
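    # When looping over many sites, re-enabling plt.close() above frees figure
    # memory; otherwise matplotlib accumulates open figures across iterations.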
    #============================ Feature importance ===========================
    importance=rf.feature_importances_
    print(importance)
    indices=np.argsort(importance)[::-1]   # column indices, most important first
    print(indices)
    importance_list=[]
    for s in range(x_train.shape[1]):
        importance_sort=f'{Drivers.columns[indices[s]]}:{importance[indices[s]]:.2f}'
        importance_list+=[importance_sort]
    print(importance_list)
    RANK_1+=[importance_list[0]]
    RANK_2+=[importance_list[1]]
    RANK_3+=[importance_list[2]]
    RANK_4+=[importance_list[3]]
    RANK_5+=[importance_list[4]]
    RANK_6+=[importance_list[5]]
    RANK_7+=[importance_list[6]]
    RANK_8+=[importance_list[7]]
    RANK_9+=[importance_list[8]]
    RANK_10+=[importance_list[9]]
    RANK_11+=[importance_list[10]]
    RANK_12+=[importance_list[11]]
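    # Impurity-based importances sum to 1.0 across features; the fixed
    # RANK_1..RANK_12 lists assume every site table has at least 12 driver columns.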
    # Box plot of RMSE across sites
    # sns.boxplot(y=rmse_df['rmse'])
    #========================== Write results to CSV ===========================
    site_list+=[i.split('_',6)[1]]
    year_list+=[int(i.split('_',6)[5].split('-',1)[1])
                -int(i.split('_',6)[5].split('-',1)[0])+1]   # record length in years
    total_number.append(int(b.shape[0]))
    post_dropna_number.append(int(a.shape[0]))
    post_drop_le_abnormal_number.append(int(c.shape[0]))
    test_number.append(int(c.shape[0]*0.25))
    train_number.append(int(c.shape[0]*0.75))
    dic={'SITES':site_list,'YEAR':year_list,'RAW_NUMBER':total_number
         ,'POST_DROPNA_NUMBER':post_dropna_number
         ,'POST_LE_OUTLIER_NUMBER':post_drop_le_abnormal_number
         ,'TRAIN_NUMBER':train_number
         ,'TEST_NUMBER':test_number
         # ,'n_estimators':N_estimators,'max_depth':Max_depth
         ,'RMSE':rmse_list,'R2':r2_list,'BIAS':bias_list
         ,'RANK_1':RANK_1,'RANK_2':RANK_2,'RANK_3':RANK_3,'RANK_4':RANK_4
         ,'RANK_5':RANK_5,'RANK_6':RANK_6,'RANK_7':RANK_7,'RANK_8':RANK_8
         ,'RANK_9':RANK_9,'RANK_10':RANK_10,'RANK_11':RANK_11,'RANK_12':RANK_12
         }
    dic=pd.DataFrame(dic)
    print(dic)
    dic.to_csv(r'D:\Fluxnet\OUTCOME\try903.csv')  # rewritten each iteration, so partial results survive a crash
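    # gap_dropna is prepared above but never used; a minimal sketch of the
    # presumed final step, filling LE gaps with the trained model (assumption:
    # gap_dropna's driver columns match Drivers, as constructed above):
    # le_gap=pd.Series(rf.predict(gap_dropna),index=gap_dropna.index,name='LE_filled')
    # filled=pd.concat([c['LE_F_MDS'],le_gap]).sort_index()
    # filled.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME','filled_'+i))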