# -*- coding: utf-8 -*-
"""
Created on Fri Dec  2 13:46:48 2022

@author: Lenovo
"""


from sklearn.metrics import make_scorer
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score,mean_squared_error
# from sklearn.preprocessing import StandardScaler
from scipy.stats import gaussian_kde
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.feature_selection import RFECV
from scipy.interpolate import griddata
from itertools import combinations
from operator import itemgetter
dic = {}
path=r'D:\Fluxnet\try'
outpath=r'D:\Fluxnet\OUTCOME\每种变量组合放在一起之前的仓库'

site_list=[]
year_list=[]

total_number=[]
post_dropna_number=[]
post_drop_le_abnormal_number=[]
test_number=[]
train_number=[]
N_estimators=[]
Max_depth=[]

Rmse_list=[]
R2_list=[]
Bias_list=[]

Drivers_column=[]
Filling_rate_list=[]
Feature_list=[]

# path1=r'D:\Fluxnet\try'
# path2=r'D:\Fluxnet\try_ndvi'  # careful: when path1 changes, update path2 as well
# path3=r''
# path1=r'D:\Fluxnet\加了土壤水和土壤温度的\MDS_用'
# path2=r'D:\Fluxnet\ndvi777 - SHAOSHAOSHAO'  # careful: when path1 changes, update path2 as well
  
# for s,j in zip(os.listdir(path1),os.listdir(path2)):
#     print(s)
#     print(os.listdir(path2))
#     sole_s=pd.read_csv(os.path.join(path1,s))
#     sole_j=pd.read_csv(os.path.join(path2,j)) 
       
#     sole_s['TIMESTAMP_START']=sole_s['TIMESTAMP_START'].astype('str') 
#     sole_s['TIMESTAMP_START']=pd.to_datetime(sole_s['TIMESTAMP_START'])   
      
#     sole_j=sole_j[['TIMESTAMP_START','NDVI']]
#     sole_j['TIMESTAMP_START'] = pd.to_datetime(sole_j['TIMESTAMP_START'])
            
#     sole_j = sole_j.set_index('TIMESTAMP_START')
#     sole_j = sole_j.resample('1D').interpolate() # '30T' interpolates by minute (T), '1D' by day
#     sole_j = sole_j.reset_index() 
    
#     sole=pd.merge(sole_s, sole_j,how='left',on='TIMESTAMP_START') 
    
#     sole['NDVI']=sole['NDVI'].interpolate(method='pad') # one value per day
#     print(sole)
    
    #========================================================================
path1 =  r'C:\Users\Lenovo\Desktop\四大类\REALTRY'
for file in os.listdir(path1):
      
    sole = pd.read_csv(os.path.join(path1,file))
    site_list1=[]
    year_list1=[]
    test_number1=[]
    train_number1=[]
    rmse_list1=[]
    r2_list1=[]
    bias_list1=[]
  
    sole_raw = sole.copy()   # .copy() so the inplace dropna below leaves these untouched
    sole_copy = sole.copy()
    print('Raw data:',sole.shape)
    sole.dropna(subset=['LE_F_MDS_QC'],axis=0,inplace=True) # drop rows where LE_F_MDS_QC is NaN
    print('Raw data after dropping rows without QC:',sole.shape)
    
    trainset=sole[sole['LE_F_MDS_QC']==0]
    print('Observed data:',trainset.shape)
    
    #================================= split at LE_F_MDS = 20 W/m² and train daytime / nighttime separately
    
    trainset=trainset[trainset['LE_F_MDS']>=20]
    print('Daytime total: ',trainset.shape)
    
    gap=sole[sole['LE_F_MDS_QC']!=0]
    print('Data to be gap-filled:',gap.shape)
    
    gap_drople=gap.drop(['LE_F_MDS','LE_F_MDS_QC'
                         ,'TIMESTAMP_START','TIMESTAMP_END']#
                         # , 'SW_IN_F_MDS_QC', 'NETRAD'
                         ,axis=1)
    # gap_drople=gap_drop.drop(['SW_IN_F_MDS_QC', 'NETRAD'],axis=1)
    
    #=============================== keep rows with at least one / three non-NaN values
    
    gap_dropna=gap_drople
    # gap_dropna=gap_drople.dropna(axis=0,thresh=3) 
    
    print('Gap data after dropping NaNs:',gap_dropna.shape)
    
    dff=pd.DataFrame(gap_dropna.isna().sum().sort_values(ascending=False))

    print('NaN counts in the prediction set:',dff)
    
    # NaN counts in the training set differ from those in the gap-filling set
    print('NaN counts in the training set:\n',trainset.drop(['LE_F_MDS','LE_F_MDS_QC'
                         ,'TIMESTAMP_START','TIMESTAMP_END']
                         ,axis=1).isna().sum().sort_values(ascending=False))
  
    #========================== build all driver combinations
    
    def combine(list0,o):
        # all combinations of o elements drawn from list0
        return list(combinations(list0,o))
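    # For example (illustrative driver names):
    #   combine(['TA', 'SW_IN', 'VPD'], 2)
    #   -> [('TA', 'SW_IN'), ('TA', 'VPD'), ('SW_IN', 'VPD')]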
    
    #========================== iterate over driver-count levels
    tianchongliang=[]  # filling rate of the chosen combination per level
    chabuliang=[]      # number of gap rows
    rmseliang=[]       # test-set size
    site_list=[]
    ALL_rmse_list=[]
    rmse_list=[]
    r2_list=[]
    bias_list=[]
    filling_rate_list=[]
    dic_list = []
    fig  = plt.figure(figsize=(4,40),dpi=600)
    fig1 = plt.figure(figsize=(16,36),dpi=600)
    ALL_x_test = pd.Series(dtype=float)  # pooled predictions; explicit dtype avoids the empty-Series warning
    ALL_y_test = pd.Series(dtype=float)  # pooled observations
    qian=0   # index of the current model
    hou=-1   # index of the previous model
    
    for u in reversed(range(3,len(gap_drople.columns)+1)) : 
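        # u walks from the full driver set down to 3 drivers; each level keeps only the
        # combination whose filling rate over the gap rows is highest.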

        fillrate_mid_list=[]
        col_list=[]
        list666=[]
        list666.extend(combine(dff.index,u))
        #=========================== filling rate of each driver combination
        
        list_score=[]
        score=[]
        big_list=[]
        
        for i in range(0,len(list666)):
            
            sco=f'{gap_drople[list(list666[i])].dropna().shape[0] / gap_drople.shape[0]:.2f}'
            
            score+=[sco]
            
            list_score+=[{'score':sco,'list':list666[i]}]
            
        # print(list_score)
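        # Filling rate = the fraction of gap rows in which every driver of the combination
        # is present, i.e. (with placeholder driver names)
        #   gap_drople[['TA','SW_IN','VPD']].dropna().shape[0] / gap_drople.shape[0]
        # so e.g. 930 complete rows out of 1000 gap rows give the string score '0.93'.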
        #=============================plot
        
        key_list=[a['list'] for a in list_score]
    
        len_list = [ len(i) for i in key_list ]
        
        score=[np.float64(i) for i in score]
        
        plt.rc('font', family='Times New Roman',size=20)
        plt.figure(figsize=(10,8),dpi=400)
        plt.scatter(len_list,score)
        plt.xlabel('Number of drivers', {'family':'Times New Roman','weight':'normal','size':20})
        plt.ylabel('Filling rate',{'family':'Times New Roman','weight':'normal','size':20})
        #============================ driver combination with the highest filling rate
        sorted_list=sorted(list_score, key=lambda d: d['score'], reverse=True)
        # print(sorted_list)   # descending by filling rate
        biggest_score=sorted_list[0]['score']
        biggest_score_feature_list=sorted_list[0]['list']
        # print(biggest_score_feature_list)
        Feature_list.append(biggest_score_feature_list)
        filling_rate_list.append(biggest_score)
        Filling_rate_list.append(biggest_score)
        #============================== model setup ================================
        train_copy=trainset.copy()
        train_copy.drop(['LE_F_MDS_QC','TIMESTAMP_START','TIMESTAMP_END']
                   ,axis=1,inplace=True)
        
        feature=list(biggest_score_feature_list)
        train_option=train_copy[feature]
        train_option['LE_F_MDS']=train_copy['LE_F_MDS']
            
        print("train_option, raw:\n",train_option.shape)
        print(train_option.isna().sum().sort_values(ascending=True))
        #============================ drop NaNs =======================================
        train_option_dropna=train_option.dropna() # drop NaNs from the training data
        print('Training set after dropping NaNs: ',train_option_dropna.shape)
        c=train_option_dropna    
        print(c.shape)
            
        Drivers=c.drop(['LE_F_MDS'],axis=1)
            
        Drivers_column+=[' '.join(Drivers.columns.tolist())]
            
        LE=c['LE_F_MDS']
        x_train,x_test,y_train,y_test=train_test_split(Drivers,LE
                                                        ,test_size=0.20
                                                        ,random_state=(0))                            
        print(x_train.shape)
        print(x_test.shape)
        print(y_train.shape)
        print(y_test.shape)
        #================================== fit the model ====================================
            
        rf=RandomForestRegressor(n_estimators=1100
                                      ,max_depth=80
                                       ,oob_score=True
                                      ,random_state=(0))   
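        # Hyperparameters are fixed rather than tuned per site; oob_score=True also makes
        # rf.oob_prediction_ available for the out-of-bag checks commented out below.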
        rf.fit(x_train,y_train)    
        # rf.fit(Drivers,LE)     
        # pred_oob = rf.oob_prediction_ # out-of-bag predictions
        # print(len(pred_oob))
        # print(pred_oob)
        # rmse=np.sqrt(mean_squared_error(LE, pred_oob)) # out-of-bag RMSE
        # rmse_list.append(rmse)
        # Rmse_list.append(rmse)
        # rmse_df=pd.DataFrame({'rmse':rmse_list})
        # print(rmse_df)
        # print(rmse_list)
        site_list+=[file.split('_',6)[1]]
        tianchongliang+=[biggest_score]
        chabuliang+=[gap.shape[0]]
        rmseliang+=[y_test.shape[0]]
        rmse=np.sqrt(mean_squared_error(y_test,rf.predict(x_test)))
        rmse_list.append(rmse)
        Rmse_list.append(rmse)
        rmse_df=pd.DataFrame({'site':site_list,'rmse':rmse_list
                              ,'n_test':rmseliang,'n_gap':chabuliang
                              ,'filling_rate':tianchongliang})
        print(rmse_df)
        print(tianchongliang[qian])
        rmse_df.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME\RMSE', str(file.split('_',6)[1])  +'.csv'),index = False)

        print(rmse_list[qian] , rmse_list[hou])
        print(tianchongliang[qian] , tianchongliang[hou])
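        # Pool this model's test predictions only if it is the first level, lowers the RMSE
        # relative to the previous level, or raises the filling rate.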
        if qian==0 or rmse_list[qian] < rmse_list[hou] or tianchongliang[qian] > tianchongliang[hou]   :
            
            y_test6 = y_test[~y_test.index.isin(ALL_y_test.index)] # rows of y_test not yet in the pooled set (matched by index, not value)
            print(y_test6)
            x_test6 = pd.Series(rf.predict(x_test),index=y_test.index)
            x_test6 = x_test6[y_test6.index]
            print(x_test6)
            print(len(y_test6))
            print(len(x_test6))
            ALL_y_test = pd.concat([ALL_y_test, y_test6], axis=0, ignore_index=False)
            ALL_x_test = pd.concat([ALL_x_test, x_test6], axis=0,ignore_index=False)
            
    
            print('After concatenation\n',ALL_x_test)
            print('After concatenation\n',ALL_y_test)
            
            ALL_rmse=np.sqrt(mean_squared_error(ALL_y_test,ALL_x_test))
    
            ALL_rmse_list.append(ALL_rmse)
            ALL_rmse_df=pd.DataFrame({'ALL_rmse':ALL_rmse_list
                                      })
            print(ALL_rmse_df)
            ALL_rmse_df.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME\RMSE_ALL',str(file.split('_',6)[1])  +'.csv'),index = False)
            
        qian+=1
        hou+=1
            
        r2=r2_score(y_test,rf.predict(x_test))  
        r2_list.append(r2)
        R2_list.append(r2)
        r2_df=pd.DataFrame({'r2':r2_list})
        
        # bias=(pred_oob-LE).mean()
        bias=(rf.predict(x_test)-y_test).mean()
        bias_list.append(bias)
        Bias_list.append(bias)
        bias_df=pd.DataFrame({'bias':bias_list})
        
       
        # =========================== Gaussian kernel-density scatter plot ===========================
        
        # post_gs=pd.DataFrame({'predict':pred_oob,'in_situ':LE,})
        post_gs=pd.DataFrame({'predict':rf.predict(x_test),'in_situ':y_test,}) 
        post_gs['index']=[i for i in range(post_gs.shape[0])]
        post_gs=post_gs.set_index('index')
        x=post_gs['in_situ']
        y=post_gs['predict']
        xy = np.vstack([x,y]) # stack the points for density estimation
        z = gaussian_kde(xy)(xy) # Gaussian kernel density at each point
        idx = z.argsort() # sort by density so the densest points are drawn last
        x, y, z = x[idx], y[idx], z[idx]
        fw = 800
        ax = fig.add_subplot(len(gap_drople.columns)-1,1,u-2)
        scatter = ax.scatter(x,y,marker='o',c=z,s=15,label='LST',cmap='jet') # 'o' = filled circle; c= colors by density; cmap sets the colormap ('Spectral_r' is 'Spectral' reversed)
        divider = make_axes_locatable(ax) # for attaching a colorbar axis
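        # divider is prepared but no colorbar is attached; one could (hypothetically) add
        #   cax = divider.append_axes('right', size='5%', pad=0.05)
        #   fig.colorbar(scatter, cax=cax)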
        # plt.scatter(x, y, c=z, s=7, cmap='jet')
        # plt.axis([0, fw, 0, fw])  # set the axis range
        # plt.title( file.split('_',6)[1], family = 'Times New Roman',size=21)
        # plt.text( 10, 700,len(feature), family = 'Times New Roman',size=21)
        # plt.text(10, 700, 'Driver numbers = %s' % len(feature), family = 'Times New Roman',size=21)
        # plt.text(10, 600, 'Size = %.f' % len(y), family = 'Times New Roman',size=18) # text positions need adjusting to the x/y range
        # plt.text(10, 650, 'RMSE = %.3f W/m²' % rmse, family = 'Times New Roman',size=18)
        # plt.text(10, 700, 'R² = %.3f' % r2, family = 'Times New Roman',size=18)
        # plt.text(10, 750, 'BIAS = %.3f W/m²' % bias, family = 'Times New Roman',size=18)
        ax.set_xlabel('Station LE (W/m²)',family = 'Times New Roman',size=19)
        ax.set_ylabel('Estimated LE (W/m²)',family = 'Times New Roman',size=19)
        ax.plot([0,fw], [0,fw], 'gray', lw=2)  # 1:1 reference line
        ax.set_xlim(0,fw)
        ax.set_ylim(0,fw)
        # ax.xaxis.set_tick_params(labelsize=19) 
        # ax.xaxis.set_tick_params(labelsize=19) 
        # plt.xticks(fontproperties='Times New Roman',size=19)
        # plt.yticks(fontproperties='Times New Roman',size=19)
        fig.set_tight_layout(True) 
 

        #================================================================MDS
        s_ori = pd.read_csv(os.path.join(path1,file))
        MDS_GAP=s_ori
        if 'SW_IN' in MDS_GAP.columns.to_list() and 'TA' in  MDS_GAP.columns.to_list() and 'VPD' in  MDS_GAP.columns.to_list():
            
            MDS_GAP['Year']=MDS_GAP['TIMESTAMP_END']
            MDS_GAP['TIMESTAMP_END']=MDS_GAP['TIMESTAMP_END'].astype('str')
            MDS_GAP['TIMESTAMP_END']=pd.to_datetime(MDS_GAP['TIMESTAMP_END'])
            MDS_GAP['Year'] = MDS_GAP['TIMESTAMP_END'].dt.year  # kept erroring downstream: "Time stamp is not equidistant (half-)hours in rows: 35040, 35088, 52560, 52608, 70080, 70128, 87600, 87648"
            
            MDS_GAP['DoY']=MDS_GAP['TIMESTAMP_END']
            MDS_GAP['TIMESTAMP_END']=MDS_GAP['TIMESTAMP_END'].astype('str')
            MDS_GAP['TIMESTAMP_END']=pd.to_datetime(MDS_GAP['TIMESTAMP_END'])
            doy=[]
            for k in MDS_GAP['TIMESTAMP_END']:
                doy += [k.strftime("%j")]
            MDS_GAP['DoY'] = doy  # kept erroring downstream: "Time stamp is not equidistant (half-)hours in rows: 35040, 35088, 52560, 52608, 70080, 70128, 87600, 87648"
            MDS_GAP['Hour'] = MDS_GAP['TIMESTAMP_END']
            MDS_GAP['TIMESTAMP_END']=MDS_GAP['TIMESTAMP_END'].astype('str')
            MDS_GAP['TIMESTAMP_END']=pd.to_datetime(MDS_GAP['TIMESTAMP_END'])
            hour=[]
            for l in MDS_GAP['TIMESTAMP_END']:
                hour += [int(l.strftime('%H'))+float(l.strftime('%M'))/60]
            MDS_GAP['Hour'] = hour          
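            # Hour is decimal, e.g. a 09:30 timestamp becomes 9.5.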
            MDS_GAP.loc[:,'LE'] = y_test  # place the held-out test values by index
            print(MDS_GAP['LE'].dropna().sum())
            MDS_GAP['LE'].to_csv(os.path.join(r'C:\Users\Lenovo\Desktop\R\用来rmse的原始值666', str(file.split('_',6)[1]) +  str(len(gap_drople.columns)) + '.txt'),sep='\t',index = False)
            MDS_GAP['LE_F_MDS']=s_ori['LE_F_MDS']
            MDS_GAP.loc[MDS_GAP['LE']>=-9999,['LE']] = -9999  # mark the test rows as artificial gaps for MDS
            MDS_GAP['LE'].fillna(MDS_GAP['LE_F_MDS'],inplace=True)
            
            MDS_GAP['Rg']=MDS_GAP['SW_IN']        
            MDS_GAP['Tair']=MDS_GAP['TA']
            MDS_GAP['VPD']=MDS_GAP['VPD']
            # MDS_GAP['NEE']=MDS_GAP['NEE_VUT_REF']
                
            MDS_GAP=MDS_GAP[['Year','DoY','Hour','LE','Rg','Tair','VPD']]#,'Tsoil','rH',
            MDS_GAP.loc[MDS_GAP['Rg'] > 1200 , ['Rg']] = -9999 # driver limits: Rg <= 1200 W/m², Ta <= 2.5 °C, VPD <= 50 hPa
            # MDS_GAP.loc[MDS_GAP['Tair'] > 2.5 , ['Tair']] = -9999
            MDS_GAP.loc[MDS_GAP['VPD'] > 50 , ['VPD']] = -9999
            # insert the units row at position zero
            row = 0  # insertion position
            value = pd.DataFrame([['-', '-', '-','Wm-2', 'Wm-2', 'degC','hPa']],columns=MDS_GAP.columns)  # the units row  # 'degC','%',
            df_tmp1 = MDS_GAP[:row]
            df_tmp2 = MDS_GAP[row:]
            # concatenate with the units row in place (DataFrame.append was removed in pandas 2.0)
            MDS_GAP = pd.concat([df_tmp1, value, df_tmp2])
            MDS_GAP = MDS_GAP.fillna(-9999) 
            MDS_GAP.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME\MDS_TRY666', str(file.split('_',6)[1]) + str(len(gap_drople.columns)) + '.txt'),sep='\t',index = False)#+ str(gaplong)
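            # The Year/DoY/Hour layout with a units row and -9999 sentinels written above is
            # the input format expected by the REddyProc MDS gap-filling tool (an assumption
            # based on the column set used here).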
   
        
        #==============================================================MDS_ALL
        s_ori = pd.read_csv(os.path.join(path1,file))
        MDS_GAP=s_ori
        if 'SW_IN' in MDS_GAP.columns.to_list() and 'TA' in  MDS_GAP.columns.to_list() and 'VPD' in  MDS_GAP.columns.to_list():
            
            MDS_GAP['Year']=MDS_GAP['TIMESTAMP_END']
            MDS_GAP['TIMESTAMP_END']=MDS_GAP['TIMESTAMP_END'].astype('str')
            MDS_GAP['TIMESTAMP_END']=pd.to_datetime(MDS_GAP['TIMESTAMP_END'])
            MDS_GAP['Year'] = MDS_GAP['TIMESTAMP_END'].dt.year  # kept erroring downstream: "Time stamp is not equidistant (half-)hours in rows: 35040, 35088, 52560, 52608, 70080, 70128, 87600, 87648"
            
            MDS_GAP['DoY']=MDS_GAP['TIMESTAMP_END']
            MDS_GAP['TIMESTAMP_END']=MDS_GAP['TIMESTAMP_END'].astype('str')
            MDS_GAP['TIMESTAMP_END']=pd.to_datetime(MDS_GAP['TIMESTAMP_END'])
            doy=[]
            for k in MDS_GAP['TIMESTAMP_END']:
                doy += [k.strftime("%j")]
            MDS_GAP['DoY'] = doy  # kept erroring downstream: "Time stamp is not equidistant (half-)hours in rows: 35040, 35088, 52560, 52608, 70080, 70128, 87600, 87648"
            MDS_GAP['Hour'] = MDS_GAP['TIMESTAMP_END']
            MDS_GAP['TIMESTAMP_END']=MDS_GAP['TIMESTAMP_END'].astype('str')
            MDS_GAP['TIMESTAMP_END']=pd.to_datetime(MDS_GAP['TIMESTAMP_END'])
            hour=[]
            for l in MDS_GAP['TIMESTAMP_END']:
                hour += [int(l.strftime('%H'))+float(l.strftime('%M'))/60]
            MDS_GAP['Hour'] = hour          
            MDS_GAP.loc[:,'LE'] = ALL_y_test  # place the pooled test values by index
            print(MDS_GAP['LE'].dropna().sum())
            MDS_GAP['LE'].to_csv(os.path.join(r'C:\Users\Lenovo\Desktop\R\用来ALL_rmse的原始值666', str(file.split('_',6)[1]) + '.txt'),sep='\t',index = False)
            MDS_GAP['LE_F_MDS']=s_ori['LE_F_MDS']
            MDS_GAP.loc[MDS_GAP['LE']>=-9999,['LE']] = -9999  # mark the test rows as artificial gaps for MDS
            MDS_GAP['LE'].fillna(MDS_GAP['LE_F_MDS'],inplace=True)
            
            MDS_GAP['Rg']=MDS_GAP['SW_IN']        
            MDS_GAP['Tair']=MDS_GAP['TA']
            MDS_GAP['VPD']=MDS_GAP['VPD']
            # MDS_GAP['NEE']=MDS_GAP['NEE_VUT_REF']
                
            MDS_GAP=MDS_GAP[['Year','DoY','Hour','LE','Rg','Tair','VPD']]#,'Tsoil','rH',
            MDS_GAP.loc[MDS_GAP['Rg'] > 1200 , ['Rg']] = -9999 # driver limits: Rg <= 1200 W/m², Ta <= 2.5 °C, VPD <= 50 hPa
            # MDS_GAP.loc[MDS_GAP['Tair'] > 2.5 , ['Tair']] = -9999
            MDS_GAP.loc[MDS_GAP['VPD'] > 50 , ['VPD']] = -9999
            # insert the units row at position zero
            row = 0  # insertion position
            value = pd.DataFrame([['-', '-', '-','Wm-2', 'Wm-2', 'degC','hPa']],columns=MDS_GAP.columns)  # the units row  # 'degC','%',
            df_tmp1 = MDS_GAP[:row]
            df_tmp2 = MDS_GAP[row:]
            # concatenate with the units row in place (DataFrame.append was removed in pandas 2.0)
            MDS_GAP = pd.concat([df_tmp1, value, df_tmp2])
            MDS_GAP = MDS_GAP.fillna(-9999) 
            MDS_GAP.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME\ALL_MDS_TRY666', str(file.split('_',6)[1])  + '.txt'),sep='\t',index = False)#+ str(gaplong)
   
        
   
        
       #============================== copy the full record: gap-fill, save, compare, export ALL_y_test
        
        gap_dropna_copy=gap_dropna.copy()
        gap_dropna_copy=gap_dropna_copy[feature]
        gap_dropna_copy=gap_dropna_copy.dropna()
        gap_dropna_copy.loc[:, 'LE_gap_filled'] = rf.predict(gap_dropna_copy)
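        # rf.predict runs on the feature columns before the new column is assigned, and the
        # result keeps the gap rows' original index so it can be aligned back below.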

        le=sole.copy()
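        # The QC column is repurposed as a container: gap-filled flags (1/2/3) become NaN
        # and are filled with the RF predictions, while observed rows (QC==0) are marked -9999.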
        le['LE_F_MDS_QC'].replace([1,2,3], np.nan, inplace=True)
        le['LE_F_MDS_QC'].replace(0, -9999, inplace=True)
        le['LE_F_MDS_QC'].fillna(gap_dropna_copy['LE_gap_filled'], inplace=True)
        le['RMSE']=[rmse]*sole.shape[0]
        
        dic0={'TIMESTAMP_START':le['TIMESTAMP_START'].tolist()
            ,'TIMESTAMP_END':le['TIMESTAMP_END'].tolist() 
            ,'LE_Gap_filled'+ str(u): le['LE_F_MDS_QC'].tolist()
            ,'RMSE'+ str(u): le['RMSE']
            ,'Drivers'+ str(u): [' '.join(Drivers.columns.tolist())]*sole.shape[0]
            }
        df0 = pd.DataFrame(dic0)
        dic={'list_name':df0, 'rmse':df0['RMSE'+ str(u)][df0.index[0]]} 
        dic_list += [dic]
        sorted_dic=sorted(dic_list, key=lambda d: d['rmse'], reverse=False) 
        list_name=[a['list_name'] for a in sorted_dic] # each entry is a full DataFrame, ordered by rmse
        df = pd.concat(list_name,axis=1)
        df = df.loc[:,~df.columns.duplicated()]
        
        
        shunxu = [''.join(list(filter(str.isdigit,x))) for x in df.columns]
        shunxu0 = list(filter(None,shunxu))
        shunxu = list(set(shunxu0)) # set() loses order, so re-sort by the original position
        shunxu.sort(key = shunxu0.index)
        print(shunxu)  
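        # shunxu holds each column's digit suffix, e.g. 'LE_Gap_filled12' -> '12';
        # filter(None, ...) drops the empty strings produced by unsuffixed columns.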
        
    #=============================== number of drivers vs. filling rate
    
    # fig = plt.subplot(8,5,36+dalei)    
    # plt.savefig(os.path.join(r'D:\Fluxnet\PIC666\DoubleY',s.split('_',6)[1])
    #             , bbox_inches='tight', dpi=500)

    x = list(reversed(range(3,len(gap_drople.columns)+1))) # matplotlib does not accept generators as input
    y1 = rmse_list
    y2 = filling_rate_list
    
    
    fig = plt.figure(figsize=(12,8),dpi=400)
    ax = fig.add_subplot(1,1,1)

    line1=ax.plot(x, y1,color='red',linestyle='--',marker='o',linewidth=2.5)
    ax.set_ylabel('RMSE of 20% testing set', {'family':'Times New Roman','weight':'normal','size':21},color='red')
    ax.set_xlabel('Number of drivers',{'family':'Times New Roman','weight':'normal','size':21})
    ax.tick_params(labelsize=19)
    # ax1.set_title("")
    ax2 = ax.twinx()  # second y-axis sharing the same x-axis
    #ax2.set_ylim([-0.05,1.05]) # y-axis range   
    # ax2.set_yticks([0.0,0.3,0.5,0.7,0.9]) # tick positions 
    # ax2.set_yticklabels([0.0,0.3,0.5,0.7,0.9]) # tick labels
    line2=ax2.plot(x, y2,color='blue',marker='o',linewidth=2.5)
    ax2.tick_params(labelsize=19)
    ax2.set_ylabel('Filling rate', {'family':'Times New Roman','weight':'normal','size':21},color='blue')
    # a2.invert_yaxis() # invert_yaxis() flips the y-axis, invert_xaxis() the x-axis
    # plt.tick_params(labelsize=19)
    # plt.xticks(np.arange(5, 13, 1),fontproperties='Times New Roman',size=19)
    plt.savefig(os.path.join(r'D:\Fluxnet\PIC666\1129',str(file.split('_',6)[1]) +'.png')
                      , bbox_inches='tight', dpi=500)
    plt.show()
    
    
# =============================================================================
#      Dynamic gap filling
# =============================================================================
    # for latter in shunxu[1:]:
    #     a = df
    #     b=a.loc[a['LE_Gap_filled'+ str(shunxu[0])] > -9999, ['LE_Gap_filled'+ str(shunxu[0]), 'Drivers'+ str(shunxu[0]), 'RMSE'+ str(shunxu[0])]] # only rows holding a real LE value, used to fill the gaps above
    
    #     a['Drivers'+ str(shunxu[0])]=a.loc[a['LE_Gap_filled'+ str(shunxu[0])].isna(), ['Drivers'+ str(shunxu[0])]]
    #     a['Drivers'+ str(shunxu[0])].fillna( b['Drivers'+ str(shunxu[0])] ,inplace = True ) # start fresh: build the first model's Drivers (.isna() is required; `== np.nan` is always False)
    
    #     a['RMSE' + str(shunxu[0])]=a.loc[a['LE_Gap_filled'+str(shunxu[0])].isna(), ['RMSE' + str(shunxu[0])]]
    #     a['RMSE' + str(shunxu[0])].fillna( b['RMSE'+ str(shunxu[0])] ,inplace = True ) # start fresh: build the first model's RMSE
        
    #     b=a.loc[a['LE_Gap_filled'+ str(latter)] > -9999, ['LE_Gap_filled'+ str(latter), 'Drivers'+ str(latter), 'RMSE'+ str(latter)]] # only rows holding a real LE value, used to fill the gaps above
    
    #     a['Drivers'+ str(latter)]=a.loc[a['LE_Gap_filled'+ str(latter)].isna(), ['Drivers'+ str(latter)]]
    #     a['Drivers'+ str(latter)].fillna( b['Drivers'+ str(latter)] ,inplace = True ) # start fresh: build this model's Drivers
    
    #     a['RMSE' + str(latter)]=a.loc[a['LE_Gap_filled'+str(latter)].isna(), ['RMSE' + str(latter)]]
    #     a['RMSE' + str(latter)].fillna( b['RMSE'+ str(latter)] ,inplace = True ) # start fresh: build this model's RMSE
    
    #     a['LE_Gap_filled'+str(shunxu[0])].fillna(a['LE_Gap_filled'+ str(latter)], inplace=True) # LE update
    #     df2=pd.DataFrame(a.isna().sum().sort_values(ascending=False)) # tally the NaNs
    #     print(a['LE_Gap_filled'+str(shunxu[0])])
    #     a['Drivers'+str(shunxu[0])].fillna(a['Drivers'+ str(latter)],inplace=True)  # Drivers update
    
    #     a['RMSE'+str(shunxu[0])].fillna(a['RMSE'+ str(latter)],inplace=True)  # RMSE update
 
    #     # add the timestamps back onto a
    #     so=pd.read_csv(os.path.join(path1,file))
    #     so=so[['TIMESTAMP_START' ,'TIMESTAMP_END','LE_F_MDS']]
    #     print(a['TIMESTAMP_START'])
    #     a.to_csv(os.path.join(r'C:\Users\Lenovo\Desktop\R\用来rmse的原始值666', str(file.split('_',6)[1]) + '.csv'),index = False)
        
    # # print(a)
    
    # a['QC'] = np.nan 
    # a.loc[a['LE_Gap_filled'+ str(shunxu[0])] != -9999, 'QC'] = 1
    # a.loc[a['LE_Gap_filled'+ str(shunxu[0])] == -9999 , 'QC'] = 0
    # a['LE_Gap_filled'+ str(shunxu[0])].replace(np.nan,-8888,inplace=True)   # originally NaN: too many drivers missing to fill at all (in the source data QC==3 marks ERA data)
    # a['LE_Gap_filled'+ str(shunxu[0])].replace(-9999,np.nan,inplace=True)   # a NaN can also mean no driver combination covered that row, so it stays empty
    # a['LE_Gap_filled'+ str(shunxu[0])].fillna(so['LE_F_MDS'],inplace=True)  # whatever is still NaN at the end
    # a.loc[a['LE_Gap_filled'+ str(shunxu[0])] == -8888 , 'QC'] = -9999
    # a=a[[ 'TIMESTAMP_END', 'LE_Gap_filled'+ str(shunxu[0]), 'QC',  'Drivers'+ str(shunxu[0]), 'RMSE'+ str(shunxu[0])]]
    # a= pd.merge(so,a,how='outer',on='TIMESTAMP_END')
    # a['LE_Gap_filled'+ str(shunxu[0])].fillna(a['LE_F_MDS'],inplace=True)   
    # a['LE_Gap_filled'+ str(shunxu[0])].replace(-8888,np.nan,inplace=True)    
    # a=a[['TIMESTAMP_START', 'TIMESTAMP_END', 'LE_Gap_filled'+ str(shunxu[0]), 'QC',  'Drivers' + str(shunxu[0]), 'RMSE'+ str(shunxu[0])]]
    
    # bianliangmen = pd.read_csv(os.path.join(path1,file))
    # bianliangmen = bianliangmen.drop(['TIMESTAMP_START' ,'TIMESTAMP_END','LE_F_MDS'],axis=1).columns
    # for i in bianliangmen:
    #     a[str(i)]=np.nan
    # # print(a.columns)  year   
    
    # a['Drivers' + str(shunxu[0])].replace(np.nan,-9999,inplace=True)      
    # b=a.loc[a['Drivers' + str(shunxu[0])]!=-9999]

    # for i in b.columns[6:]:

    #     c=b[b['Drivers' + str(shunxu[0])].str.contains(i)]
    #     c[i].replace(np.nan,'+',inplace=True)
    #     a[i]=c[i]
        
    # b=a.count(axis=1)-6
    # b=pd.DataFrame(b)
    
    # a['n_drivers']=b
    # a['n_drivers'].replace([-1,-2,-3],np.nan,inplace=True)
    # a['Drivers' + str(shunxu[0])].replace(-9999,np.nan,inplace=True)
    # # print(a)
    # a.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME\FILLED',str(file.split('_',6)[1]) +'.csv'),index = False)

    
 
    # # 
    #     # total_number.append(int(sole.shape[0]))
    #     # post_dropna_number.append(int(train_option_dropna.shape[0]))
    #     # post_drop_le_abnormal_number.append(int(c.shape[0]))
    #     # test_number.append(int(c.shape[0]*0.25))
    #     # train_number.append(int(c.shape[0]*0.75))
    #     # # N_estimators.append(n_estimators)
    #     # # Max_depth.append(max_depth)
         
    # # =========================================================== plot the time series for this file
    # s_ori = pd.read_csv(os.path.join(path1,file))
    # ori = s_ori.loc[s_ori['LE_F_MDS_QC']==0,['TIMESTAMP_START','LE_F_MDS']]
    # filled = s_ori.loc[s_ori['LE_F_MDS_QC']!=0,['TIMESTAMP_START','LE_F_MDS']]
    # s_ori['TIMESTAMP_START'] = pd.to_datetime(s_ori['TIMESTAMP_START'])
    # s_ori['year'] = s_ori['TIMESTAMP_START'].dt.year
    # gap_filled = a.loc[a['QC'] == 1,['TIMESTAMP_START','LE_Gap_filled'+ str(shunxu[0])]]
    
    # fig1 ,ax = plt.subplots(5,1,sharex='col',figsize=(25,9),dpi=300)
    # ax0 = ax[0]
    # ax0.plot( 'LE_F_MDS', data=ori, linestyle='none',marker='o')
    # ax1 = ax[1]
    # ax1.plot(  'LE_F_MDS', data=filled, color='#ff7f0e',linestyle='none', marker='o')
    # ax2 = ax[2]
    # ax2.plot(  'LE_F_MDS', data=ori, alpha=0.6, linestyle='none', marker='o')
    # ax2.plot( 'LE_F_MDS', data=filled, alpha=0.6, linestyle='none', marker='o')
    # ax3 = ax[3]
    # # ax2.plot(  'LE_F_MDS', data=s_ori, alpha=0.6, linestyle='none', marker='o')
    # ax3.plot('LE_Gap_filled'+ str(shunxu[0]),data=gap_filled, color='#FAA460', linestyle='none', marker='o' )
    # ax4 = ax[4]
    # ax4.plot(  'LE_F_MDS', data=ori, alpha=0.6, linestyle='none', marker='o')
    # ax4.plot('LE_Gap_filled'+ str(shunxu[0]),data=gap_filled,color='#FAA460', alpha=0.6, linestyle='none', marker='o' )
    
    # ax0.set_ylabel('in-situ', fontsize=19)
    # ax1.set_ylabel('MDS', fontsize=19)
    # ax2.set_ylabel('FLUXNET2015', fontsize=19)
    # ax3.set_ylabel('RF', fontsize=19)
    # ax4.set_ylabel('Dynamic', fontsize=19)
    
    # nianfen = int(file.split('_',6)[5].split('-',2)[0])
    # nianfen1 = int(file.split('_',6)[5].split('-',2)[1])
    # ax2.set_xticks([365*48*x  for x in range(nianfen1+2-nianfen)]) 
    # ax2.set_xticklabels([x  for x in range(nianfen,nianfen1+2)],fontproperties='Times New Roman',size=19)
    # ax4.set_xlabel('Year', fontsize=19)
    # plt.savefig(os.path.join(r'D:\Fluxnet\PIC666\1128',str(file.split('_',6)[1]) +'.png')
    #                   , bbox_inches='tight', dpi=500)
    # plt.show()
    
        
        #=============================================== export
         
    # dic={'SITES':site_list,'YEAR':year_list,'N_RAW':total_number
    #           ,'N_AFTER_DROPNA':post_dropna_number
    #           ,'N_AFTER_LE_OUTLIER_REMOVAL':post_drop_le_abnormal_number
    #           ,'TRAIN_NUMBER':train_number
    #           ,'TEST_NUMBER':test_number
    #           # ,'n_estimators':N_estimators,'max_depth':Max_depth
    #           ,'RMSE':Rmse_list,'R2':R2_list,'BIAS':Bias_list
    #           ,'Drivers_column':Drivers_column
    #           ,'Filling_rate' : Filling_rate_list
    #         }
        
    # dic=pd.DataFrame(dic)
    # # print(dic)
    # dic.to_csv(r'D:\Fluxnet\OUTCOME\RMSE_ALL\RMSE_All_Day.csv')
    
    # dic_sole={
    #           'RMSE':rmse_list,'R2':r2_list,'BIAS':bias_list
    #           } 
    # dic_sole=pd.DataFrame(dic_sole)
    # dic_sole.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME\RMSE', str(file.split('_',6)[1])  +'.csv'),index = False)
     
        #=============================================== Various lengths of gap
        
        # for j,k in zip([0.05,0.075,0.125],[6,24,48]): # one day / seven days / one month; together 0.25 of the total data
        # #48,336,720
          
        #   df0=sole.copy()
        #   print(len(df0))
        #   df=df0[df0['LE_F_MDS_QC']==0]
        #   print(df['LE_F_MDS_QC'])
        #   print(len(df))
        #   print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
          
          
          
        #   # positions where a gap may start
        #   start_point=np.arange(df['LE_F_MDS_QC'].index[0],df['LE_F_MDS_QC'].index[-1]-k+1) # k is the gap length 
         
        #   # number of gaps
        #   gap_number=int(len(df)*j/k)
        #   print(gap_number)
          
        #   # randomly choose the start positions
        #   # np.random.seed(1) # a fixed seed reproduces the same randoms each run
          
        #   gap_posi=np.random.choice(start_point,gap_number*3) # oversample to leave room for rejects
          
        #   posi=sorted(gap_posi) # sort the start positions
        #   print(posi)
          
        #   count=0
        #   gap_qujian=[]
          
        #   # not every random start is usable: a gap must not overlap an earlier one, and its window must hold enough data

        #   for m,n in enumerate(posi): # m is the index, n the start position (itself an index)
             
        #       # index range of a single gap,
        #       # i.e. which positions the gap spans
        #       gap_danqujian_list =[h for h in np.arange(n,n+k)]
        #       print(gap_danqujian_list)
        #       print('==')
        #       # the gap within the full DataFrame
        #       gap_df = df0.iloc[gap_danqujian_list]
        #       # print(gap_df)
        #       # the gap must lie inside the allowed range
        #       if np.isin(gap_danqujian_list,start_point).all():
                 
        #           # gaps of different lengths must not overlap
        #           if m>0 and n-posi[m-1] <= k: 
        #               continue
    
    
        #           # the gap window must hold enough original data
        #           if len(gap_df.dropna()) / len(gap_df) < 0.5:
        #               continue
         
        #           gap_qujian.extend(gap_danqujian_list)
        #           print(gap_qujian)
        #           count += 1

 
        #       if count == gap_number: # stop once this gap type reaches gap_number, i.e. the required fraction
                  
        #           print('@@@@@@@@@@@@@@@@@@@@@')
        #           print(count)
        #           break
          
        #   # drop the indices whose LE is NaN
    
        #   test_df=df0.iloc[gap_qujian] # df.iloc[[1,2,3]] selects the rows at those positions
        #   print(test_df)
        #   print(len(test_df))
         
        #   test=test_df.loc[test_df['LE_F_MDS_QC']==0,].dropna(axis=0)
        #   print(test)
        #   print(len(test))
  
        #   train_index=np.setdiff1d(df0.index,test_df.index) # setdiff1d: values in the first array but not the second
        #   print(train_index)
        
        #   train_df=df0.iloc[train_index]
        #   train=train_df.loc[train_df['LE_F_MDS_QC']==0,].dropna(axis=0)
        #   print(train)
        #   print(len(train))
        
        
        #   pd.set_option('display.max_columns', None)
        # # print(test.head(5))
        #   print(train.shape)
        #   print(test.shape)
        
        #   a=pd.DataFrame(test.isna().sum().sort_values(ascending=False))
            
        # # des=test.describe()
        # # shangxu=des.loc['75%']+1.5*(des.loc['75%']-des.loc['25%'])
        # # xiaxu=des.loc['25%']-1.5*(des.loc['75%']-des.loc['25%'])
        # # test=test[(test['LE_F_MDS'] <=shangxu[3])
        # #           &(test['LE_F_MDS'] >=xiaxu[3])]
     
        
        #  # print(des)
        #  # des=train.describe()
        #  # shangxu=des.loc['75%']+1.5*(des.loc['75%']-des.loc['25%'])
        #  # xiaxu=des.loc['25%']-1.5*(des.loc['75%']-des.loc['25%'])
        #  # train=train[(train['LE_F_MDS'] <=shangxu[3])
        #  #             &(train['LE_F_MDS'] >=xiaxu[3])]
        #  # print(xiaxu)
             
        #   train=train.drop(['TIMESTAMP_START','TIMESTAMP_END','LE_F_MDS_QC'],axis=1)
        #   test=test.drop(['TIMESTAMP_START','TIMESTAMP_END','LE_F_MDS_QC'],axis=1)
        
        #   # train_Drivers=train.drop(['LE_F_MDS'],axis=1)
        #   train_Drivers=train[feature]
        #   print(train_Drivers.index)
         
        #   # test_Drivers=test.drop(['LE_F_MDS'],axis=1) 
        #   test_Drivers=test[feature]
        #   print(test_Drivers.index)
         
        #   train_LE=train['LE_F_MDS']
        #   print(train_LE.index)
         
        #   test_LE=test['LE_F_MDS']
        #   print(test_LE.index)
         
        #   # x_train,x_test,y_train,y_test=train_test_split(Drivers,LE
        #   #                                                ,test_size=0.25
        #   #                                                ,random_state=(0))                            
        #   print(train_Drivers.shape)
        #   print(test_Drivers.shape)
        #   print(train_LE.shape)
        #   print(test_LE.shape)
     
        #   # ============================== fit the model ====================================
         
        #   rf1=RandomForestRegressor(n_estimators=1100
        #                             ,max_depth=80
        #                             ,random_state=(0))   
        #   rf1.fit(train_Drivers,train_LE)    
         
        #   rmse1=np.sqrt(mean_squared_error(test_LE,rf1.predict(test_Drivers)))
         
  
        #   rmse_list1.append(rmse1)
        #   rmse_df=pd.DataFrame({'rmse':rmse_list1})
        #   print(rmse_df)
         
        #   r2=r2_score(test_LE,rf1.predict(test_Drivers))  
        #   r2_list1.append(r2)
        #   r2_df=pd.DataFrame({'r2':r2_list1})
         
        #   bias=(rf1.predict(test_Drivers)-test_LE).mean()
        #   bias_list1.append(bias)
        #   bias_df=pd.DataFrame({'bias':bias_list1})
         
        #   site_list1+=[s.split('_',6)[1]]
        #   year_list1+=[int(s.split('_',6)[5].split('-',1)[1])
        #               -int(s.split('_',6)[5].split('-',1)[0])+1]  
         
        #   # total_number.append(int(b.shape[0]))
        #   # post_dropna_number.append(int(a.shape[0]))
        #   # post_drop_le_abnormal_number.append(int(c.shape[0]))
        #   test_number1.append(int(test.shape[0]))
        #   train_number1.append(int(train.shape[0]))
         
   
        #   dic2={'SITES':site_list1,'YEAR':year_list1
        #         # ,'N_RAW':total_number
        #         # ,'N_AFTER_DROPNA':post_dropna_number
        #         # ,'N_AFTER_LE_OUTLIER_REMOVAL':post_drop_le_abnormal_number
        #         ,'TRAIN_NUMBER':train_number1
        #         ,'TEST_NUMBER':test_number1
        #         # ,'n_estimators':N_estimators,'max_depth':Max_depth
        #         ,'RMSE':rmse_list1,'R2':r2_list1,'BIAS':bias_list1
               
        #       }
         
        #   dic2=pd.DataFrame(dic2)
        #   print(dic2)
        #   dic2.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME\GAP_diff', str(s.split('_',6)[1]) + '.csv'),index = False)


    
    #======================================== read the eight csv files back in
    
#     dic_list=[]
    
#     for i in range(3,5):
        
#         df=pd.read_csv(os.path.join(outpath,str(s.split('_',6)[1]) + str(i) + '.csv'))
        
#         dic={'list_name':df, 'rmse':df['RMSE'][0]}
        
#         dic_list+=[dic]
        
#         print(dic_list)
#     print('=============================================')
        
#     # df3=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '3' +'.csv'))
#     # df4=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '4' +'.csv'))
#     # df5=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '5' +'.csv'))
#     # df6=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '6' +'.csv'))
#     # df7=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '7' +'.csv'))
#     # df8=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '8' +'.csv'))
#     # df9=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '9' +'.csv'))
#     # df10=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '10' +'.csv'))
#     # df11=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '11' +'.csv'))   
    
#     # dic=[{'list_name':df3, 'rmse':df3['RMSE'][0]}
#     #      ,{'list_name':df4, 'rmse':df4['RMSE'][0]}
#     #      ,{'list_name':df5, 'rmse':df5['RMSE'][0]}
#     #      ,{'list_name':df6, 'rmse':df6['RMSE'][0]}
#     #      ,{'list_name':df7, 'rmse':df7['RMSE'][0]}
#     #      ,{'list_name':df8, 'rmse':df8['RMSE'][0]}
#     #      ,{'list_name':df9, 'rmse':df9['RMSE'][0]}
#     #      ,{'list_name':df10, 'rmse':df10['RMSE'][0]}
#     #      ,{'list_name':df11, 'rmse':df11['RMSE'][0]}
#     #     ]
    
#     sorted_dic=sorted(dic_list, key=lambda dic_list: dic_list['rmse'], reverse=False)
#     print(sorted_dic)
#     list_name=[a['list_name'] for a in sorted_dic] # each entry is a full DataFrame, ordered by rmse
#     print(list_name)
#     df=pd.concat(list_name,axis=1)
        
#     print(df)
#     df.to_csv(os.path.join(outpath, str(s.split('_',6)[1]) +'6666'+'.csv'))


#     a=pd.read_csv(os.path.join(outpath, str(s.split('_',6)[1]) +'6666'+'.csv'))
#     # pd.set_option('display.max_columns', None)
#     df=pd.DataFrame(a.isna().sum().sort_values(ascending=False))
#     print(a)
#     # filling directly with fillna works, but the Drivers column has to be filled too!!!
#     # start from the combination with the lowest RMSE
#     print(df.columns)
#     # One pass per model column (suffixes '', '.1', ..., '.9'): take the rows where that
#     # model produced a real LE value and use them to fill the still-empty base columns.
#     # Note: `.isna()` is required here; comparing with `== np.nan` is always False.
#     for suf in ['', '.1', '.2', '.3', '.4', '.5', '.6', '.7', '.8', '.9']:
#         b=a.loc[a['LE_Gap_filled'+suf] > -9999, ['LE_Gap_filled'+suf, 'Drivers'+suf, 'RMSE'+suf]] # only rows holding a real LE value, used to fill the gaps above
#
#         a['Drivers'+suf]=a.loc[a['LE_Gap_filled'+suf].isna(), ['Drivers'+suf]]
#         a['Drivers'+suf].fillna( b['Drivers'+suf] ,inplace = True ) # start fresh: build this model's Drivers
#         print(a['Drivers'+suf].describe())
#
#         a['RMSE'+suf]=a.loc[a['LE_Gap_filled'+suf].isna(), ['RMSE'+suf]]
#         a['RMSE'+suf].fillna( b['RMSE'+suf] ,inplace = True ) # start fresh: build this model's RMSE
#         print(a['RMSE'+suf].describe())
#
#         if suf:  # fold this model's result into the base columns
#             a['LE_Gap_filled'].fillna(a['LE_Gap_filled'+suf], inplace=True) # LE update
#             print(pd.DataFrame(a.isna().sum().sort_values(ascending=False))) # tally the NaNs
#             a['Drivers'].fillna(a['Drivers'+suf],inplace=True)  # Drivers update
#             a['RMSE'].fillna(a['RMSE'+suf],inplace=True)  # RMSE update
#             print(a['RMSE'].describe())





#     # add the timestamps back onto a

#     so=pd.read_csv(os.path.join(path1,s))
#     so=so[['TIMESTAMP_START' ,'TIMESTAMP_END','LE_F_MDS']]

#     print(a['TIMESTAMP_START'])

#     print(a.shape)

#     a['QC'] = np.nan
#     a.loc[a['LE_Gap_filled'] > -9999, 'QC'] = 1
#     a.loc[a['LE_Gap_filled'] == -9999 , 'QC'] = 0
    
#     a['LE_Gap_filled'].replace(np.nan,-8888,inplace=True) # originally NaN: too many drivers missing to fill at all (in the source data QC==3 marks ERA data)
#     a['LE_Gap_filled'].replace(-9999,np.nan,inplace=True) # a NaN can also mean no driver combination covered that row, so it stays empty
#     a['LE_Gap_filled'].fillna(sole['LE_F_MDS'],inplace=True) # whatever is still NaN at the end
     
#     a.loc[a['LE_Gap_filled'] == -8888 , 'QC'] = -9999

    
#     print(a.dropna().shape[0]/a.shape[0])
    
#     a=a[[ 'TIMESTAMP_END', 'LE_Gap_filled', 'QC',  'Drivers', 'RMSE']]
    
#     a= pd.merge(so,a,how='outer',on='TIMESTAMP_END')

 
#     a['LE_Gap_filled'].fillna(a['LE_F_MDS'],inplace=True)   
#     a['LE_Gap_filled'].replace(-8888,np.nan,inplace=True)    
    
#     a=a[['TIMESTAMP_START', 'TIMESTAMP_END', 'LE_Gap_filled', 'QC',  'Drivers', 'RMSE']]
    
    
#     a['SW_IN_F_MDS']=np.nan
#     a['NETRAD']=np.nan
#     a['G_F_MDS']=np.nan
#     a['TA_F_MDS']=np.nan
#     a['RH']=np.nan
#     a['WD']=np.nan 
#     a['WS']=np.nan 
    
#     a['PA_F']=np.nan
#     a['VPD_F_MDS']=np.nan
#     a['NDVI']=np.nan
#     a['TS_F_MDS_1']=np.nan
#     a['SWC_F_MDS_1']=np.nan
#     a['TA_F_MDS']=np.nan
    
#     a['Drivers'].replace(np.nan,-9999,inplace=True)

    
#     b=a.loc[a['Drivers']!=-9999]
#     # print(b)
    
#     for i in b.columns[6:]:
        
#         # print(i)
        
#         c=b[b['Drivers'].str.contains(i)]

#         c[i].replace(np.nan,'+',inplace=True)
        
#         a[i]=c[i]
        
#     b=a.count(axis=1)-6
#     b=pd.DataFrame(b)
    
#     a['n_drivers']=b
    
#     a['n_drivers'].replace([-1,-2,-3],np.nan,inplace=True)
    
#     a['Drivers'].replace(-9999,np.nan,inplace=True)



#     # a.to_csv(os.path.join(path,sole+'.csv'),index = False)

    
#     a.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME\FILLED',str(s.split('_',6)[1]) +'.csv'),index = False)
             



# #  create empty columns
# # df["Empty_1"] = ""
# # df["Empty_2"] = np.nan
# # df['Empty_3'] = pd.Series() 
    














        
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 26 12:29:18 2022 (NDVI not yet added)

@author: Lenovo
"""
from sklearn.metrics import make_scorer
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score,mean_squared_error
# from sklearn.preprocessing import StandardScaler
from scipy.stats import gaussian_kde
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.feature_selection import RFECV
from scipy.interpolate import griddata
from itertools import combinations
from operator import itemgetter
dic = {}
path=r'D:\Fluxnet\try'
outpath=r'D:\Fluxnet\OUTCOME\每种变量组合放在一起之前的仓库'

site_list=[]
year_list=[]

total_number=[]
post_dropna_number=[]
post_drop_le_abnormal_number=[]
test_number=[]
train_number=[]
N_estimators=[]
Max_depth=[]

Rmse_list=[]
R2_list=[]
Bias_list=[]

Drivers_column=[]
Filling_rate_list=[]
Feature_list=[]

# path1=r'D:\Fluxnet\try'
# path2=r'D:\Fluxnet\try_ndvi'  # careful: when path1 changes, update path2 as well
# path3=r''
# path1=r'D:\Fluxnet\加了土壤水和土壤温度的\MDS_用'
# path2=r'D:\Fluxnet\ndvi777 - SHAOSHAOSHAO'  # careful: when path1 changes, update path2 as well
  
# for s,j in zip(os.listdir(path1),os.listdir(path2)):
#     print(s)
#     print(os.listdir(path2))
#     sole_s=pd.read_csv(os.path.join(path1,s))
#     sole_j=pd.read_csv(os.path.join(path2,j)) 
       
#     sole_s['TIMESTAMP_START']=sole_s['TIMESTAMP_START'].astype('str') 
#     sole_s['TIMESTAMP_START']=pd.to_datetime(sole_s['TIMESTAMP_START'])   
      
#     sole_j=sole_j[['TIMESTAMP_START','NDVI']]
#     sole_j['TIMESTAMP_START'] = pd.to_datetime(sole_j['TIMESTAMP_START'])
            
#     sole_j = sole_j.set_index('TIMESTAMP_START')
#     sole_j = sole_j.resample('1D').interpolate() # '30T' interpolates by minute (T), '1D' by day
#     sole_j = sole_j.reset_index() 
    
#     sole=pd.merge(sole_s, sole_j,how='left',on='TIMESTAMP_START') 
    
#     sole['NDVI']=sole['NDVI'].interpolate(method='pad') # one value per day
#     print(sole)
    
    #========================================================================
path1 =  r'C:\Users\Lenovo\Desktop\四大类\REALTRY'
for file in os.listdir(path1):
      
    sole = pd.read_csv(os.path.join(path1,file))
    site_list1=[]
    year_list1=[]
    test_number1=[]
    train_number1=[]
    rmse_list1=[]
    r2_list1=[]
    bias_list1=[]
  
    sole_raw = sole.copy()   # .copy() so the inplace dropna below leaves these untouched
    sole_copy = sole.copy()
    print('Raw data:',sole.shape)
    sole.dropna(subset=['LE_F_MDS_QC'],axis=0,inplace=True) # drop rows where LE_F_MDS_QC is NaN
    print('Raw data after dropping rows without QC:',sole.shape)
    
    trainset=sole[sole['LE_F_MDS_QC']==0]
    print('Observed data:',trainset.shape)
    
    #================================= split at LE_F_MDS = 20 W/m² and train daytime / nighttime separately
    
    # trainset=trainset[trainset['LE_F_MDS']>=20]
    print('Daytime total: ',trainset.shape)
    
    gap=sole[sole['LE_F_MDS_QC']!=0]
    print('Data to be gap-filled:',gap.shape)
    
    gap_drople=gap.drop(['LE_F_MDS','LE_F_MDS_QC'
                         ,'TIMESTAMP_START','TIMESTAMP_END']#,'Unnamed: 0.1', 'Unnamed: 0','NEE_VUT_REF'
                         # , 'SW_IN_F_MDS_QC', 'NETRAD'
                         ,axis=1)
    # gap_drople=gap_drop.drop(['SW_IN_F_MDS_QC', 'NETRAD'],axis=1)
    
    #=============================== keep rows with at least one / three non-NaN values
    
    # gap_dropna=gap_drople[gap_drople.isnull().T.sum()<=8] 
    gap_dropna=gap_drople.dropna(axis=0,thresh=3) # keep rows holding at least 3 non-NaN values
    
    print('Gap data after dropping NaNs:',gap_dropna.shape)
    
    dff=pd.DataFrame(gap_dropna.isna().sum().sort_values(ascending=False))

    print('NaN counts in the prediction set:',dff)
    
    # NaN counts in the training set differ from those in the gap-filling set
    print('NaN counts in the training set:\n',trainset.drop(['LE_F_MDS','LE_F_MDS_QC'
                         ,'TIMESTAMP_START','TIMESTAMP_END']#,'Unnamed: 0.1', 'Unnamed: 0','NEE_VUT_REF'
                         ,axis=1).isna().sum().sort_values(ascending=False))
      
    #========================== build all driver combinations
    
    def combine(list0,o):
        # all combinations of o elements drawn from list0
        return list(combinations(list0,o))
    
    #========================== iterate over driver-count levels
    
    rmse_list=[]
    r2_list=[]
    bias_list=[]
    filling_rate_list=[]
    dic_list = []
    fig  = plt.figure(figsize=(4,40),dpi=600)
    fig1 = plt.figure(figsize=(16,36),dpi=600)

    for u in reversed(range(3,len(gap_drople.columns)+1)) : 

        fillrate_mid_list=[]
        col_list=[]
        list666=[]
        list666.extend(combine(dff.index,u))
        #=========================== filling rate of each driver combination
        
        list_score=[]
        score=[]
        big_list=[]
        
        for i in range(0,len(list666)):
            
            sco=f'{gap_drople[list(list666[i])].dropna().shape[0] / gap_drople.shape[0]:.2f}'
            
            score+=[sco]
            
            list_score+=[{'score':sco,'list':list666[i]}]
            
        # print(list_score)
        #=============================plot
        
        key_list=[a['list'] for a in list_score]
    
        len_list = [ len(i) for i in key_list ]
        
        score=[np.float64(i) for i in score]
        
        plt.rc('font', family='Times New Roman',size=20)
        
        plt.figure(figsize=(10,8),dpi=400)
       
        plt.scatter(len_list,score)
        
        plt.xlabel('Number of drivers', {'family':'Times New Roman','weight':'normal','size':20})
        plt.ylabel('Filling rate',{'family':'Times New Roman','weight':'normal','size':20})

        #============================ driver combination with the highest filling rate
        
        sorted_list=sorted(list_score, key=lambda d: d['score'], reverse=True)
        print('********************')
        # print(sorted_list)   # descending by filling rate
        
        biggest_score=sorted_list[0]['score']
        # print(biggest_score)
        
        biggest_score_feature_list=sorted_list[0]['list']
        # print(biggest_score_feature_list)
                
        Feature_list.append(biggest_score_feature_list)

        filling_rate_list.append(biggest_score)
        Filling_rate_list.append(biggest_score)
        # print(Feature_list,Filling_rate_list)
        
        #============================== model setup ================================
        
        train_copy=trainset.copy()
        train_copy.drop(['LE_F_MDS_QC','TIMESTAMP_START','TIMESTAMP_END']#,'Unnamed: 0.1', 'Unnamed: 0','NEE_VUT_REF'
                   ,axis=1,inplace=True)
        
        feature=list(biggest_score_feature_list)
        # print(feature)
        
        train_option=train_copy[feature]
        train_option['LE_F_MDS']=train_copy['LE_F_MDS']
            
        print(train_option.shape) # raw shape
        print(train_option.isna().sum().sort_values(ascending=True))
            
            #============================ drop NaNs =======================================
        train_option_dropna=train_option.dropna() # drop NaNs from the training data
        print('Training set after dropping NaNs: ',train_option_dropna.shape)
    
            #=========================== Remove LE outliers (IQR fences; disabled) ====================================
        # des=train_option_dropna.describe()
        # print(des)
        # shangxu=des.loc['75%']+1.5*(des.loc['75%']-des.loc['25%'])  # upper IQR fence
        # xiaxu=des.loc['25%']-1.5*(des.loc['75%']-des.loc['25%'])    # lower IQR fence
        # print(shangxu)
        # print(xiaxu)
        # c=train_option_dropna[(train_option_dropna['LE_F_MDS'] <=shangxu[-1])
        #         &(train_option_dropna['LE_F_MDS'] >=xiaxu[-1])]
        c=train_option_dropna    
        print(c.shape)
            
        Drivers=c.drop(['LE_F_MDS'],axis=1)
            
        Drivers_column+=[' '.join(Drivers.columns.tolist())]
            
        LE=c['LE_F_MDS']
        x_train,x_test,y_train,y_test=train_test_split(Drivers,LE
                                                        ,test_size=0.20
                                                        ,random_state=(0))                            
        print(x_train.shape)
        print(x_test.shape)
        print(y_train.shape)
        print(y_test.shape)
        
        #======================== Grid search + OOB to find the optimal hyperparameters ========================
        
        # # def simpleGridSearch(x_train, x_test, y_train, y_test):
        # # Grid search implemented with plain for loops
        # best_score = 100
        # rmse = []
        # for n_esti in [300,800,1100,1500]:
        #     for max_dep in [30,90,110]:
        #         rf = RandomForestRegressor(n_estimators=n_esti
        #                                     ,max_depth=max_dep
        #                                     ,oob_score=True
        #                                     ,random_state=(0)) # fit once for every parameter combination
        #         rf.fit(x_train,y_train)
        #         score = np.sqrt(mean_squared_error(y_train, rf.oob_prediction_))
        #         print(score)
                
        #         rmse+=[score]
                    
        #         if score < best_score: # keep the best-performing parameters
                      
        #             n_estimators = n_esti
        #             max_depth = max_dep
                    
        #             best_score = score
        # print("Best score:{:.2f}".format(best_score))
    
        #     # return n_estimators,max_depth
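        # A minimal, self-contained version of the OOB grid search above
        # (kept commented out; the hyperparameter grids mirror the block above
        # and are illustrative, not tuned):
        # def oob_grid_search(x_train, y_train):
        #     best = (None, None, np.inf)
        #     for n_esti in [300, 800, 1100, 1500]:
        #         for max_dep in [30, 90, 110]:
        #             rf = RandomForestRegressor(n_estimators=n_esti, max_depth=max_dep,
        #                                        oob_score=True, random_state=0)
        #             rf.fit(x_train, y_train)
        #             # out-of-bag RMSE: no separate validation split needed
        #             oob_rmse = np.sqrt(mean_squared_error(y_train, rf.oob_prediction_))
        #             if oob_rmse < best[2]:
        #                 best = (n_esti, max_dep, oob_rmse)
        #     return best  # (n_estimators, max_depth, OOB RMSE)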
            
        #======================== GridSearchCV to find the optimal hyperparameters =========================
        
        # rfr=RandomForestRegressor()
       
        # param_grid={'n_estimators':[300,800,1100]#300,500,700,900,1100,1300,1500,1700,1900,2100
        #             #500,800,1100,1400,1700,2000 1,2,3,4,5,6
        #           ,'max_depth':[30,80,110]}#30,50,70,90,110,150   30,50,70,90,110
        
        # gs=GridSearchCV(rfr 
        #                 ,param_grid=param_grid
        #                 ,scoring=make_scorer(mean_squared_error,greater_is_better=False) 
                          # ,score=['r2','neg_root_mean_squared_error']      # sklearn.metrics.SCORES.keys()       
                          #,refit='neg_root_mean_squared_error'
        #                 ,cv=2
        #                 ,verbose=1
        #                 ,n_jobs=-1) 
        
        # gs.fit(x_train,y_train)

        # max_depth=gs.best_params_['max_depth']
        # Max_depth+=[max_depth]
        # print(gs.best_params_)
        
        # n_estimators=gs.best_params_['n_estimators']
        # N_estimators+=[n_estimators]
        # print(np.sqrt(-1*gs.score(x_test,y_test)))
        
        # # Set the font style
        # sns.set(style='ticks')
        # plt.rc('font', family='Times New Roman',size=20) 
        # # plt.rcParams["font.weight"] = "bold"
        # # plt.rcParams["axes.labelweight"] = "bold"
        
        # #=============================Heat map of RMSE=============================
        
        # gs_df=pd.DataFrame(gs.cv_results_)
        # gs_df['RMSE']=np.sqrt(-1*(gs_df['mean_test_score']))
        # gs_df[['MAX_DEPTH','N_ESTIMATORS']]=gs_df[['param_max_depth','param_n_estimators']]
        
        # heatmap_data=gs_df.pivot_table(index='MAX_DEPTH'
        #                                 ,columns='N_ESTIMATORS'
        #                                 ,values='RMSE')
        
        # plt.figure(figsize=(10, 10),dpi=500)
        
        # heat_map=sns.heatmap(data=heatmap_data
        #                       ,linewidths=.05 # cell border width; linecolor sets the border colour
        #                       ,fmt='.2f'
        #                       ,cmap='jet'#'PuBuGn_r'
        #                       ,cbar=True
        #                       ,cbar_kws={'label':'RMSE of Cross-validation (W/m²)'
        #                                 ,'orientation':'vertical'# vertical by default; 'horizontal' for a horizontal bar
        #                                 ,'format':'%.2f'
        #                                 ,'extend':'both'          
        #                                 }#'pad': spacing between colorbar and heatmap
        #                         )# cmap = 'PuBuGn'  'cubehelix_r' 'PuBuGn_r' 'YlGnBu_r'
        #                       #  mask=heatmap_data>10  data mask
        #                       # ,annot=True # show per-cell values (off by default)
        #                       # ,annot_kws={'size':15,'weight':'normal','color':'black'}# per-cell font settings
        #                       #  heat_map.figure.colorbar(heat_map.collections[0],extend='both').set_label('RMSE of Cross-validation (W/m²)',fontdict={'size':16})
        #                       # 'interpolation':'nearest'
        # plt.gca ().invert_yaxis ()                     
        # plt.savefig(os.path.join(r'D:\Fluxnet\PIC666\HeatMap1',s.split('_',6)[1])
        #             , bbox_inches='tight', dpi=500)
        
        # plt.show()
        # # plt.clf ()
        # # plt.close ()
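        # A minimal runnable sketch of the GridSearchCV + RMSE-heatmap idea
        # above (kept commented out; the grid values are illustrative):
        # gs = GridSearchCV(RandomForestRegressor(random_state=0),
        #                   param_grid={'n_estimators': [300, 800], 'max_depth': [30, 80]},
        #                   scoring='neg_mean_squared_error', cv=2, n_jobs=-1)
        # gs.fit(x_train, y_train)
        # cv_df = pd.DataFrame(gs.cv_results_)
        # cv_df['RMSE'] = np.sqrt(-cv_df['mean_test_score'])   # back to W/m²
        # grid = cv_df.pivot_table(index='param_max_depth',
        #                          columns='param_n_estimators', values='RMSE')
        # sns.heatmap(grid, cmap='jet', cbar_kws={'label': 'RMSE of Cross-validation (W/m²)'})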
        
        #==========================Interpolation map of RMSE=======================
       
        # y,x=np.mgrid[1500:300:4j,30:110:3j]#300:2100:10j,30:150:7j #1:10:10j,1:10:10j 500:2000:6j,30:110:5j

        # points=np.hstack((x.flatten()[:,None],y.flatten()[:,None]))
        
        # y1,x1=np.mgrid[300:1500:500j,30:110:500j]#300:2100:1000j,30:150:1000j 1:10:1000j,1:10:1000j
        
        # z1=griddata(points 
        #             ,np.array(rmse)
        #             ,(x1,y1)
        #             ,method='cubic')
        
        # plt.figure(figsize=(10,8),dpi=400)
        
        # plt.imshow(z1
        #           ,extent=[np.min(x),np.max(x),np.min(y),np.max(y)]
        #           ,cmap='jet'
        #           ,aspect='auto')
        
        # print(points)

        # plt.colorbar(extend='both', label='RMSE of OOB (W/m²)')
        
        # plt.xlabel('MAX_DEPTH') 
        # plt.ylabel('N_ESTIMATORS')
        # # a=[i for i in range(3,12)]
        # plt.savefig(os.path.join(r'D:\Fluxnet\PIC666\InterpolationMap1',s.split('_',6)[1])
        #             , bbox_inches='tight', dpi=500)
       
        # plt.show()
        # # plt.clf ()
        # # plt.close ()
            
            #================================== Model fitting ====================================
            
        rf=RandomForestRegressor(n_estimators=1100
                                      ,max_depth=80
                                       ,oob_score=True
                                      ,random_state=(0))   
        rf.fit(x_train,y_train)    
        # rf.fit(Drivers,LE)     
        # pred_oob = rf.oob_prediction_ # out-of-bag predictions
        # print(len(pred_oob))
        # print(pred_oob)
        # rmse=np.sqrt(mean_squared_error(LE, pred_oob)) # out-of-bag RMSE
        # rmse_list.append(rmse)
        # Rmse_list.append(rmse)
        # rmse_df=pd.DataFrame({'rmse':rmse_list})
        # print(rmse_df)
        # print(rmse_list)
        
        rmse=np.sqrt(mean_squared_error(y_test,rf.predict(x_test)))
        rmse_list.append(rmse)
        Rmse_list.append(rmse)
        rmse_df=pd.DataFrame({'rmse':rmse_list})
        # print(rmse_df)
        # print(rmse_list)
        
        # r2=rf.oob_score_
        r2=r2_score(y_test,rf.predict(x_test))  
        r2_list.append(r2)
        R2_list.append(r2)
        r2_df=pd.DataFrame({'r2':r2_list})
        
        # bias=(pred_oob-LE).mean()
        bias=(rf.predict(x_test)-y_test).mean()
        bias_list.append(bias)
        Bias_list.append(bias)
        bias_df=pd.DataFrame({'bias':bias_list})
        
       
        # =========================== Gaussian KDE density scatter ===========================
        
        # post_gs=pd.DataFrame({'predict':pred_oob,'in_situ':LE,})
        post_gs=pd.DataFrame({'predict':rf.predict(x_test),'in_situ':y_test,}) 
        post_gs['index']=[i for i in range(post_gs.shape[0])]
        post_gs=post_gs.set_index('index')
        x=post_gs['in_situ']
        y=post_gs['predict']
        xy = np.vstack([x,y])  # stack the points for density estimation
        z = gaussian_kde(xy)(xy)  # Gaussian kernel density at each point
        idx = z.argsort()  # sort by density so the densest points are drawn last
        x, y, z = x[idx], y[idx], z[idx]
        fw = 800
        ax = fig.add_subplot(len(gap_drople.columns)-1,1,u-2)
        scatter = ax.scatter(x,y,marker='o',c=z,s=15,label='LST',cmap='jet') # 'o' = filled circles; c sets point colours; cmap the colour map ('Spectral_r' reverses 'Spectral')
        divider = make_axes_locatable(ax) # helper for attaching a colorbar axis
        # plt.scatter(x, y, c=z, s=7, cmap='jet')
        # plt.axis([0, fw, 0, fw])  # axis limits
        # plt.title( file.split('_',6)[1], family = 'Times New Roman',size=21)
        # plt.text( 10, 700,len(feature), family = 'Times New Roman',size=21)
        # plt.text(10, 700, 'Driver numbers = %s' % len(feature), family = 'Times New Roman',size=21)
         # plt.text(10, 600, 'Size = %.f' % len(y), family = 'Times New Roman',size=18) # text positions need adjusting to the x/y ranges
         # plt.text(10, 650, 'RMSE = %.3f W/m²' % rmse, family = 'Times New Roman',size=18)
         # plt.text(10, 700, 'R² = %.3f' % r2, family = 'Times New Roman',size=18)
         # plt.text(10, 750, 'BIAS = %.3f W/m²' % bias, family = 'Times New Roman',size=18)
        ax.set_xlabel('Station LE (W/m²)',family = 'Times New Roman',size=19)
        ax.set_ylabel('Estimated LE (W/m²)',family = 'Times New Roman',size=19)
        ax.plot([0,fw], [0,fw], 'gray', lw=2)  # 1:1 reference line
        ax.set_xlim(0,fw)
        ax.set_ylim(0,fw)
        # ax.xaxis.set_tick_params(labelsize=19) 
        # ax.xaxis.set_tick_params(labelsize=19) 
        # plt.xticks(fontproperties='Times New Roman',size=19)
        # plt.yticks(fontproperties='Times New Roman',size=19)
        fig.set_tight_layout(True) 
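        # Self-contained sketch of the density-coloured scatter above (kept
        # commented out; random data stands in for y_test / rf.predict):
        # xd = np.random.rand(500) * 800
        # yd = xd + np.random.randn(500) * 50
        # dens = gaussian_kde(np.vstack([xd, yd]))(np.vstack([xd, yd]))
        # order = dens.argsort()            # draw the densest points last
        # plt.scatter(xd[order], yd[order], c=dens[order], s=15, cmap='jet')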
 
        
        s_ori = pd.read_csv(os.path.join(path1,file))
        MDS_GAP=s_ori
        if 'SW_IN_F_MDS' in MDS_GAP.columns.to_list() and 'TA_F_MDS' in  MDS_GAP.columns.to_list() and 'VPD' in  MDS_GAP.columns.to_list():
            
            MDS_GAP['Year']=MDS_GAP['TIMESTAMP_END']
            MDS_GAP['TIMESTAMP_END']=MDS_GAP['TIMESTAMP_END'].astype('str')
            MDS_GAP['TIMESTAMP_END']=pd.to_datetime(MDS_GAP['TIMESTAMP_END'])
            MDS_GAP['Year'] = MDS_GAP['TIMESTAMP_END'].dt.year  # kept raising: Time stamp is not equidistant (half-)hours in rows: 35040, 35088, 52560, 52608, 70080, 70128, 87600, 87648
            
            MDS_GAP['DoY']=MDS_GAP['TIMESTAMP_END']
            MDS_GAP['TIMESTAMP_END']=MDS_GAP['TIMESTAMP_END'].astype('str')
            MDS_GAP['TIMESTAMP_END']=pd.to_datetime(MDS_GAP['TIMESTAMP_END'])
            doy=[]
            for k in MDS_GAP['TIMESTAMP_END']:
                doy += [k.strftime("%j")]
            MDS_GAP['DoY'] = doy  # kept raising: Time stamp is not equidistant (half-)hours in rows: 35040, 35088, 52560, 52608, 70080, 70128, 87600, 87648
            MDS_GAP['Hour'] = MDS_GAP['TIMESTAMP_END']
            MDS_GAP['TIMESTAMP_END']=MDS_GAP['TIMESTAMP_END'].astype('str')
            MDS_GAP['TIMESTAMP_END']=pd.to_datetime(MDS_GAP['TIMESTAMP_END'])
            hour=[]
            for l in MDS_GAP['TIMESTAMP_END']:
                hour += [int(l.strftime('%H'))+float(l.strftime('%M'))/60]
            MDS_GAP['Hour'] = hour          
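            # Equivalent vectorised sketch (kept commented out): the three
            # conversions above can be done with pandas dt accessors, assuming
            # TIMESTAMP_END parses to half-hourly timestamps as it does here.
            # ts = pd.to_datetime(MDS_GAP['TIMESTAMP_END'].astype('str'))
            # MDS_GAP['Year'] = ts.dt.year
            # MDS_GAP['DoY']  = ts.dt.strftime('%j')
            # MDS_GAP['Hour'] = ts.dt.hour + ts.dt.minute / 60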
            MDS_GAP.loc[:,'LE']=y_test
            MDS_GAP['LE'].to_csv(os.path.join(r'C:\Users\Lenovo\Desktop\R\用来rmse的原始值666', str(file.split('_',6)[1]) + '.txt'),sep='\t',index = False)
            MDS_GAP['LE_F_MDS']=s_ori['LE_F_MDS']
            MDS_GAP.loc[MDS_GAP['LE']>=-9999,['LE']] = -9999  # mark every test row as a gap (-9999) so MDS has to re-fill it
            MDS_GAP['LE'].fillna(MDS_GAP['LE_F_MDS'],inplace=True)
            
            MDS_GAP['Rg']=MDS_GAP['SW_IN_F_MDS']        
            MDS_GAP['Tair']=MDS_GAP['TA_F_MDS']
            MDS_GAP['VPD']=MDS_GAP['VPD_F_MDS']
            # MDS_GAP['NEE']=MDS_GAP['NEE_VUT_REF']
                
            MDS_GAP=MDS_GAP[['Year','DoY','Hour','LE','Rg','Tair','VPD']]#,'Tsoil','rH',
            MDS_GAP.loc[MDS_GAP['Rg'] > 1200 , ['Rg']] = -9999 # driver limits: Rg <= 1200 W/m², Ta <= 2.5 °C, VPD <= 50 hPa
            # MDS_GAP.loc[MDS_GAP['Tair'] > 2.5 , ['Tair']] = -9999
            MDS_GAP.loc[MDS_GAP['VPD'] > 50 , ['VPD']] = -9999
            # Insert the units row at row zero
            row = 0  # insertion position
            value = pd.DataFrame([['-', '-', '-','Wm-2', 'Wm-2', 'degC','hPa']],columns=MDS_GAP.columns)  # the units row; 'degC','%' kept as leftover alternatives
            df_tmp1 = MDS_GAP[:row]
            df_tmp2 = MDS_GAP[row:]
            # splice the units row into the table (DataFrame.append is deprecated; concat is equivalent)
            MDS_GAP = pd.concat([df_tmp1, value, df_tmp2])
            MDS_GAP = MDS_GAP.fillna(-9999) 
            MDS_GAP.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME\MDS_TRY666', str(file.split('_',6)[1])  + '.txt'),sep='\t',index = False)# + str(gaplong)
   
        #============================== Copy the full record: fill, save, compare, export
        
        gap_dropna_copy=gap_dropna.copy()
        gap_dropna_copy=gap_dropna_copy[feature]
        gap_dropna_copy=gap_dropna_copy.dropna()
        gap_dropna_copy.loc[:, 'LE_gap_filled'] = rf.predict(gap_dropna_copy)

        le=sole.copy()
        le['LE_F_MDS_QC'].replace([1,2,3], np.nan, inplace=True)
        le['LE_F_MDS_QC'].replace(0, -9999, inplace=True)
        le['LE_F_MDS_QC'].fillna(gap_dropna_copy['LE_gap_filled'], inplace=True)
        le['RMSE']=[rmse]*sole.shape[0]
        
        dic0={'TIMESTAMP_START':le['TIMESTAMP_START'].tolist()
            ,'TIMESTAMP_END':le['TIMESTAMP_END'].tolist() 
            ,'LE_Gap_filled'+ str(u): le['LE_F_MDS_QC'].tolist()
            ,'RMSE'+ str(u): le['RMSE']
            ,'Drivers'+ str(u): [' '.join(Drivers.columns.tolist())]*sole.shape[0]
            }
        df0 = pd.DataFrame(dic0)
        dic={'list_name':df0, 'rmse':df0['RMSE'+ str(u)][df0.index[0]]} 
        dic_list += [dic]
        sorted_dic=sorted(dic_list, key=lambda dic_list: dic_list['rmse'], reverse=False) 
        list_name=[a['list_name'] for a in sorted_dic] # each entry is the full per-model DataFrame
        df = pd.concat(list_name,axis=1)
        df = df.loc[:,~df.columns.duplicated()]
        
        
        shunxu = [''.join(list(filter(str.isdigit,x))) for x in df.columns]  # combination-size suffix of each column
        shunxu0 = list(filter(None,shunxu))
        shunxu = list(set(shunxu0))  # set() destroys order, so re-sort by first appearance
        shunxu.sort(key = shunxu0.index)
        print(shunxu)
        
        
    
    # Dynamic gap filling: cascade the per-model fills, lowest-RMSE model first
    for latter in shunxu[1:]:
        a = df
        b=a.loc[a['LE_Gap_filled'+ str(shunxu[0])] > -9999, ['LE_Gap_filled'+ str(shunxu[0]), 'Drivers'+ str(shunxu[0]), 'RMSE'+ str(shunxu[0])]] # only rows with a filled LE value, used to fill the blanks below
    
        # NOTE: the original `a.loc[... == np.nan, ...]` selection is always empty
        # (NaN never compares equal), so it effectively cleared the whole column;
        # that clearing is made explicit here.
        a['Drivers'+ str(shunxu[0])] = np.nan
        a['Drivers'+ str(shunxu[0])].fillna( b['Drivers'+ str(shunxu[0])] ,inplace = True ) # keep the first model's Drivers only where it filled LE
    
        a['RMSE' + str(shunxu[0])] = np.nan
        a['RMSE' + str(shunxu[0])].fillna( b['RMSE'+ str(shunxu[0])] ,inplace = True ) # same for the first model's RMSE
        
        b=a.loc[a['LE_Gap_filled'+ str(latter)] > -9999, ['LE_Gap_filled'+ str(latter), 'Drivers'+ str(latter), 'RMSE'+ str(latter)]] # rows this (later) model managed to fill
    
        a['Drivers'+ str(latter)] = np.nan
        a['Drivers'+ str(latter)].fillna( b['Drivers'+ str(latter)] ,inplace = True ) # this model's Drivers, only where it filled LE
    
        a['RMSE' + str(latter)] = np.nan
        a['RMSE' + str(latter)].fillna( b['RMSE'+ str(latter)] ,inplace = True ) # this model's RMSE, only where it filled LE
    
        a['LE_Gap_filled'+str(shunxu[0])].fillna(a['LE_Gap_filled'+ str(latter)], inplace=True) # LE update: later models only fill what is still missing
        df2=pd.DataFrame(a.isna().sum().sort_values(ascending=False)) # tally the remaining NaNs
        print(a['LE_Gap_filled'+str(shunxu[0])])
        a['Drivers'+str(shunxu[0])].fillna(a['Drivers'+ str(latter)],inplace=True)  # Drivers update
    
        a['RMSE'+str(shunxu[0])].fillna(a['RMSE'+ str(latter)],inplace=True)  # RMSE update
 
        # attach timestamps to a
        so=pd.read_csv(os.path.join(path1,file))
        so=so[['TIMESTAMP_START' ,'TIMESTAMP_END','LE_F_MDS']]
        # print(a['TIMESTAMP_START'])
        # a.to_csv(os.path.join(r'C:\Users\Lenovo\Desktop\R\用来rmse的原始值666', str(file.split('_',6)[1]) + '.csv'),index = False)
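        # Toy sketch of the cascade (kept commented out; hypothetical column
        # names): the lowest-RMSE model keeps its values, later models only
        # fill what is still missing.
        # demo = pd.DataFrame({'LE_Gap_filled12': [10.0, np.nan, np.nan],
        #                      'LE_Gap_filled11': [11.0, 12.0, np.nan]})
        # demo['LE_Gap_filled12'].fillna(demo['LE_Gap_filled11'], inplace=True)
        # # -> [10.0, 12.0, NaN]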
        
    print(a)
    
    a['QC'] = np.nan 
    a.loc[a['LE_Gap_filled'+ str(shunxu[0])] != -9999, 'QC'] = 1
    a.loc[a['LE_Gap_filled'+ str(shunxu[0])] == -9999 , 'QC'] = 0
    a['LE_Gap_filled'+ str(shunxu[0])].replace(np.nan,-8888,inplace=True)   # rows that were NaN: too many drivers missing to fill at all (QC=3 in the source data marks ERA values)
    a['LE_Gap_filled'+ str(shunxu[0])].replace(-9999,np.nan,inplace=True)   # NaNs can also remain because no driver combination covered that stretch
    a['LE_Gap_filled'+ str(shunxu[0])].fillna(so['LE_F_MDS'],inplace=True)  # whatever is still NaN falls back to LE_F_MDS
    a.loc[a['LE_Gap_filled'+ str(shunxu[0])] == -8888 , 'QC'] = -9999
    a=a[[ 'TIMESTAMP_END', 'LE_Gap_filled'+ str(shunxu[0]), 'QC',  'Drivers'+ str(shunxu[0]), 'RMSE'+ str(shunxu[0])]]
    a= pd.merge(so,a,how='outer',on='TIMESTAMP_END')
    a['LE_Gap_filled'+ str(shunxu[0])].fillna(a['LE_F_MDS'],inplace=True)   
    a['LE_Gap_filled'+ str(shunxu[0])].replace(-8888,np.nan,inplace=True)    
    a=a[['TIMESTAMP_START', 'TIMESTAMP_END', 'LE_Gap_filled'+ str(shunxu[0]), 'QC',  'Drivers' + str(shunxu[0]), 'RMSE'+ str(shunxu[0])]]
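    # Toy sketch of the sentinel round-trip above (kept commented out):
    # s = pd.Series([25.0, -9999, np.nan])                        # filled / observed / unfillable
    # s.replace(np.nan, -8888, inplace=True)                      # park the true gaps
    # s.replace(-9999, np.nan, inplace=True)                      # open slots for observations
    # s.fillna(pd.Series([np.nan, 30.0, np.nan]), inplace=True)   # restore LE_F_MDS
    # s.replace(-8888, np.nan, inplace=True)                      # -> [25.0, 30.0, NaN]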
    
    bianliangmen = pd.read_csv(os.path.join(path1,file))  # "bianliangmen" = the driver variable names
    bianliangmen = bianliangmen.drop(['TIMESTAMP_START' ,'TIMESTAMP_END','LE_F_MDS'],axis=1).columns
    for i in bianliangmen:
        a[str(i)]=np.nan
    # print(a.columns)     
    
    a['Drivers' + str(shunxu[0])].replace(np.nan,-9999,inplace=True)      
    b=a.loc[a['Drivers' + str(shunxu[0])]!=-9999]

    for i in b.columns[6:]:

        c=b[b['Drivers' + str(shunxu[0])].str.contains(i)]
        c[i].replace(np.nan,'+',inplace=True)
        a[i]=c[i]
        
    b=a.count(axis=1)-6
    b=pd.DataFrame(b)
    
    a['n_drivers']=b
    a['n_drivers'].replace([-1,-2,-3],np.nan,inplace=True)
    a['Drivers' + str(shunxu[0])].replace(-9999,np.nan,inplace=True)
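    # Sketch of the driver-presence marking above (kept commented out): each
    # variable column gets '+' where the Drivers string mentions it, so the
    # per-row non-NaN count (minus the 6 metadata columns) gives n_drivers.
    # demo = pd.DataFrame({'Drivers': ['TA VPD', 'TA']})
    # for v in ['TA', 'VPD']:
    #     demo[v] = demo['Drivers'].str.contains(v).map({True: '+', False: np.nan})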
    # print(a)
    a.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME\FILLED',str(file.split('_',6)[1]) +'.csv'),index = False)

    #=============================== Number of drivers vs. filling rate
    
    # fig = plt.subplot(8,5,36+dalei)    
    # plt.savefig(os.path.join(r'D:\Fluxnet\PIC666\DoubleY',s.split('_',6)[1])
    #             , bbox_inches='tight', dpi=500)

    x = [x for x in reversed(range(3,len(gap_drople.columns)+1))] # materialise the generator; matplotlib does not support generators as input
    y1 = rmse_list
    y2 = filling_rate_list
    # fig = plt.figure(figsize=(12,8),dpi=400)
    
    ax = fig.add_subplot(len(gap_drople.columns)-1,1,len(gap_drople.columns)-1)

    line1=ax.plot(x, y1,color='red',linestyle='--',marker='o',linewidth=2.5)
    ax.set_ylabel('RMSE of 20% testing set', {'family':'Times New Roman','weight':'normal','size':21},color='red') # the split above uses test_size=0.20
    ax.set_xlabel('Number of drivers',{'family':'Times New Roman','weight':'normal','size':21})
    ax.tick_params(labelsize=19)
    # ax1.set_title("")
    ax2 = ax.twinx()  # second y-axis sharing the same x-axis
    #ax2.set_ylim([-0.05,1.05]) # y-axis range
    # ax2.set_yticks([0.0,0.3,0.5,0.7,0.9]) # tick positions
    # ax2.set_yticklabels([0.0,0.3,0.5,0.7,0.9]) # tick labels
    line2=ax2.plot(x, y2,color='blue',marker='o',linewidth=2.5)
    ax2.tick_params(labelsize=19)
    ax2.set_ylabel('Filling rate', {'family':'Times New Roman','weight':'normal','size':21},color='blue')
    # ax2.invert_yaxis() # invert_yaxis() flips the y-axis; invert_xaxis() flips the x-axis
    plt.tick_params(labelsize=19)
    plt.xticks(np.arange(5, 13, 1),fontproperties='Times New Roman',size=19)
    plt.savefig(os.path.join(r'D:\Fluxnet\PIC666\828','try')
                      , bbox_inches='tight', dpi=500)
    plt.show()
    
        # total_number.append(int(sole.shape[0]))
        # post_dropna_number.append(int(train_option_dropna.shape[0]))
        # post_drop_le_abnormal_number.append(int(c.shape[0]))
        # test_number.append(int(c.shape[0]*0.25))
        # train_number.append(int(c.shape[0]*0.75))
        # # N_estimators.append(n_estimators)
        # # Max_depth.append(max_depth)
         
    # Plot the time series: original vs. gap-filled LE
    s_ori = pd.read_csv(os.path.join(path1,file))
    ori = s_ori.loc[s_ori['LE_F_MDS_QC']==0,['TIMESTAMP_START','LE_F_MDS']]
    filled = s_ori.loc[s_ori['LE_F_MDS_QC']!=0,['TIMESTAMP_START','LE_F_MDS']]
    s_ori['TIMESTAMP_START'] = pd.to_datetime(s_ori['TIMESTAMP_START'])
    s_ori['year'] = s_ori['TIMESTAMP_START'].dt.year
    gap_filled = a.loc[a['QC'] == 1,['TIMESTAMP_START','LE_Gap_filled'+ str(shunxu[0])]]
    
    fig1 ,ax = plt.subplots(5,1,sharex='col',figsize=(25,9),dpi=300)
    ax0 = ax[0]
    ax0.plot( 'LE_F_MDS', data=ori, linestyle='none',marker='o')
    ax1 = ax[1]
    ax1.plot(  'LE_F_MDS', data=filled, color='#ff7f0e',linestyle='none', marker='o')
    ax2 = ax[2]
    ax2.plot(  'LE_F_MDS', data=ori, alpha=0.6, linestyle='none', marker='o')
    ax2.plot( 'LE_F_MDS', data=filled, alpha=0.6, linestyle='none', marker='o')
    ax3 = ax[3]
    # ax2.plot(  'LE_F_MDS', data=s_ori, alpha=0.6, linestyle='none', marker='o')
    ax3.plot('LE_Gap_filled'+ str(shunxu[0]),data=gap_filled, color='#FAA460', linestyle='none', marker='o' )
    ax4 = ax[4]
    ax4.plot(  'LE_F_MDS', data=ori, alpha=0.6, linestyle='none', marker='o')
    ax4.plot('LE_Gap_filled'+ str(shunxu[0]),data=gap_filled,color='#FAA460', alpha=0.6, linestyle='none', marker='o' )
    
    ax0.set_ylabel('in-situ', fontsize=19)
    ax1.set_ylabel('MDS', fontsize=19)
    ax2.set_ylabel('FLUXNET2015', fontsize=19)
    ax3.set_ylabel('RF', fontsize=19)
    ax4.set_ylabel('Dynamic', fontsize=19)
    
    nianfen = int(file.split('_',6)[5].split('-',2)[0])
    nianfen1 = int(file.split('_',6)[5].split('-',2)[1])
    ax2.set_xticks([365*48*x  for x in range(nianfen1+2-nianfen)]) 
    ax2.set_xticklabels([x  for x in range(nianfen-1,nianfen1+1)],fontproperties='Times New Roman',size=19)
    ax4.set_xlabel('Year', fontsize=19)
    plt.show()
    
        
        #=============================================== Export
         
        # dic={'SITES':site_list,'YEAR':year_list,'RAW_NUMBER':total_number
        #           ,'POST_DROPNA_NUMBER':post_dropna_number
        #           ,'POST_DROP_LE_OUTLIER_NUMBER':post_drop_le_abnormal_number
        #           ,'TRAIN_NUMBER':train_number
        #           ,'TEST_NUMBER':test_number
        #           # ,'n_estimators':N_estimators,'max_depth':Max_depth
        #           ,'RMSE':Rmse_list,'R2':R2_list,'BIAS':Bias_list
        #           ,'Drivers_column':Drivers_column
        #           ,'Filling_rate' : Filling_rate_list
        #         }
            
        # dic=pd.DataFrame(dic)
        # # print(dic)
        # dic.to_csv(r'D:\Fluxnet\OUTCOME\RMSE_ALL\RMSE_All_Day.csv')
        
        # dic_sole={
        #           'RMSE':rmse_list,'R2':r2_list,'BIAS':bias_list
        #          } 
        # dic_sole=pd.DataFrame(dic_sole)
        # dic_sole.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME\RMSE', str(s.split('_',6)[1])  +'.csv'),index = False)
     
        #===============================================Various length of gap
        
        # for j,k in zip([0.05,0.075,0.125],[6,24,48]): # one day, seven days, one month; together 0.25 of the total data
        # #48,336,720
          
        #   df0=sole.copy()
        #   print(len(df0))
        #   df=df0[df0['LE_F_MDS_QC']==0]
        #   print(df['LE_F_MDS_QC'])
        #   print(len(df))
        #   print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
          
          
          
        #   # index range where a gap may start
        #   start_point=np.arange(df['LE_F_MDS_QC'].index[0],df['LE_F_MDS_QC'].index[-1]-k+1) # k is the gap length 
         
        #   # number of gaps
        #   gap_number=int(len(df)*j/k)
        #   print(gap_number)
          
        #   # randomly choose the start positions
        #   # np.random.seed(1) # would make the random numbers identical across runs
          
        #   gap_posi=np.random.choice(start_point,gap_number*3) # oversample to leave room for rejections
          
        #   posi=sorted(gap_posi) # sort the start positions
        #   print(posi)
          
        #   count=0
        #   gap_qujian=[]
          
        #   # not every random start is usable: a gap must not overlap an earlier
        #   # gap, and its span must contain enough original data

        #   for m,n in enumerate(posi): # m is the loop index, n the start position (also an index)
             
        #       # the index span of a single gap,
        #       # i.e. which positions belong to this gap
        #       gap_danqujian_list =[h for h in np.arange(n,n+k)]
        #       print(gap_danqujian_list)
        #       print('==')
        #       # the gap rows within the full DataFrame
        #       gap_df = df0.iloc[gap_danqujian_list]
        #       # print(gap_df)
        #       # the gap span must lie within the allowed range
        #       if np.isin(gap_danqujian_list,start_point).all():
                 
        #           # gaps of different lengths must not overlap
        #           if m>0 and n-posi[m-1] <= k: 
        #               continue
    
    
        #           # the gap span must contain enough original data
        #           if len(gap_df.dropna()) / len(gap_df) < 0.5:
        #               continue
         
        #           gap_qujian.extend(gap_danqujian_list)
        #           print(gap_qujian)
        #           count += 1

 
        #       if count == gap_number: # stop once this gap type reaches gap_number, i.e. the target fraction
                  
        #           print('@@@@@@@@@@@@@@@@@@@@@')
        #           print(count)
        #           break
          
        #   # drop indices whose LE is NaN
    
        #   test_df=df0.iloc[gap_qujian] # .iloc[[...]] selects the listed rows
        #   print(test_df)
        #   print(len(test_df))
         
        #   test=test_df.loc[test_df['LE_F_MDS_QC']==0,].dropna(axis=0) # keep only observed, complete rows
        #   print(test)
        #   print(len(test))
  
        #   train_index=np.setdiff1d(df0.index,test_df.index) # setdiff1d: values in the first array missing from the second
        #   print(train_index)
        
        #   train_df=df0.iloc[train_index] # .iloc selects the listed rows
        #   train=train_df.loc[train_df['LE_F_MDS_QC']==0,].dropna(axis=0)
        #   print(train)
        #   print(len(train))
        
        
        #   pd.set_option('display.max_columns', None)
        # # print(test.head(5))
        #   print(train.shape)
        #   print(test.shape)
        
        #   a=pd.DataFrame(test.isna().sum().sort_values(ascending=False))
            
        # # des=test.describe()
        # # shangxu=des.loc['75%']+1.5*(des.loc['75%']-des.loc['25%'])
        # # xiaxu=des.loc['25%']-1.5*(des.loc['75%']-des.loc['25%'])
        # # test=test[(test['LE_F_MDS'] <=shangxu[3])
        # #           &(test['LE_F_MDS'] >=xiaxu[3])]
     
        
        #  # print(des)
        #  # des=train.describe()
        #  # shangxu=des.loc['75%']+1.5*(des.loc['75%']-des.loc['25%'])
        #  # xiaxu=des.loc['25%']-1.5*(des.loc['75%']-des.loc['25%'])
        #  # train=train[(train['LE_F_MDS'] <=shangxu[3])
        #  #             &(train['LE_F_MDS'] >=xiaxu[3])]
        #  # print(xiaxu)
             
        #   train=train.drop(['TIMESTAMP_START','TIMESTAMP_END','LE_F_MDS_QC'],axis=1)
        #   test=test.drop(['TIMESTAMP_START','TIMESTAMP_END','LE_F_MDS_QC'],axis=1)
        
        #   # train_Drivers=train.drop(['LE_F_MDS'],axis=1)
        #   train_Drivers=train[feature]
        #   print(train_Drivers.index)
         
        #   # test_Drivers=test.drop(['LE_F_MDS'],axis=1) 
        #   test_Drivers=test[feature]
        #   print(test_Drivers.index)
         
        #   train_LE=train['LE_F_MDS']
        #   print(train_LE.index)
         
        #   test_LE=test['LE_F_MDS']
        #   print(test_LE.index)
         
        #   # x_train,x_test,y_train,y_test=train_test_split(Drivers,LE
        #   #                                                ,test_size=0.25
        #   #                                                ,random_state=(0))                            
        #   print(train_Drivers.shape)
        #   print(test_Drivers.shape)
        #   print(train_LE.shape)
        #   print(test_LE.shape)
     
        #   # ============================== Model fitting ====================================
         
        #   rf1=RandomForestRegressor(n_estimators=1100
        #                             ,max_depth=80
        #                             ,random_state=(0))   
        #   rf1.fit(train_Drivers,train_LE)    
         
        #   rmse1=np.sqrt(mean_squared_error(test_LE,rf1.predict(test_Drivers)))
         
  
        #   rmse_list1.append(rmse1)
        #   rmse_df=pd.DataFrame({'rmse':rmse_list1})
        #   print(rmse_df)
         
        #   r2=r2_score(test_LE,rf1.predict(test_Drivers))  
        #   r2_list1.append(r2)
        #   r2_df=pd.DataFrame({'r2':r2_list1})
         
        #   bias=(rf1.predict(test_Drivers)-test_LE).mean()
        #   bias_list1.append(bias)
        #   bias_df=pd.DataFrame({'bias':bias_list1})
         
        #   site_list1+=[s.split('_',6)[1]]
        #   year_list1+=[int(s.split('_',6)[5].split('-',1)[1])
        #               -int(s.split('_',6)[5].split('-',1)[0])+1]  
         
        #   # total_number.append(int(b.shape[0]))
        #   # post_dropna_number.append(int(a.shape[0]))
        #   # post_drop_le_abnormal_number.append(int(c.shape[0]))
        #   test_number1.append(int(test.shape[0]))
        #   train_number1.append(int(train.shape[0]))
         
   
        #   dic2={'SITES':site_list1,'YEAR':year_list1
        #         # ,'原始数目':total_number
        #         # ,'去掉空值后':post_dropna_number
        #         # ,'去掉LE异常值后':post_drop_le_abnormal_number
        #         ,'TRAIN_NUMBER':train_number1
        #         ,'TEST_NUMBER':test_number1
        #         # ,'n_estimators':N_estimators,'max_depth':Max_depth
        #         ,'RMSE':rmse_list1,'R2':r2_list1,'BIAS':bias_list1
               
        #       }
         
        #   dic2=pd.DataFrame(dic2)
        #   print(dic2)
        #   dic2.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME\GAP_diff', str(s.split('_',6)[1]) + '.csv'),index = False)
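        # A compact sketch of the artificial-gap generator above (kept
        # commented out; k = gap length in rows, j = target fraction of rows;
        # make_gaps is a hypothetical helper distilled from the loop above):
        # def make_gaps(n_rows, k, j, rng=np.random.default_rng(0)):
        #     n_gaps = int(n_rows * j / k)
        #     starts, used = [], np.zeros(n_rows, bool)
        #     for s0 in rng.permutation(n_rows - k):
        #         if not used[s0:s0 + k].any():       # gaps must not overlap
        #             used[s0:s0 + k] = True
        #             starts.append(s0)
        #         if len(starts) == n_gaps:           # stop at the target count
        #             break
        #     return sorted(starts)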


    
    #======================================== Read the eight per-model CSVs (legacy path, superseded by the in-memory loop above)
    
#     dic_list=[]
    
#     for i in range(3,5):
        
#         df=pd.read_csv(os.path.join(outpath,str(s.split('_',6)[1]) + str(i) + '.csv'))
        
#         dic={'list_name':df, 'rmse':df['RMSE'][0]}
        
#         dic_list+=[dic]
        
#         print(dic_list)
#     print('=============================================')
        
#     # df3=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '3' +'.csv'))
#     # df4=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '4' +'.csv'))
#     # df5=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '5' +'.csv'))
#     # df6=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '6' +'.csv'))
#     # df7=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '7' +'.csv'))
#     # df8=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '8' +'.csv'))
#     # df9=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '9' +'.csv'))
#     # df10=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '10' +'.csv'))
#     # df11=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '11' +'.csv'))   
    
#     # dic=[{'list_name':df3, 'rmse':df3['RMSE'][0]}
#     #      ,{'list_name':df4, 'rmse':df4['RMSE'][0]}
#     #      ,{'list_name':df5, 'rmse':df5['RMSE'][0]}
#     #      ,{'list_name':df6, 'rmse':df6['RMSE'][0]}
#     #      ,{'list_name':df7, 'rmse':df7['RMSE'][0]}
#     #      ,{'list_name':df8, 'rmse':df8['RMSE'][0]}
#     #      ,{'list_name':df9, 'rmse':df9['RMSE'][0]}
#     #      ,{'list_name':df10, 'rmse':df10['RMSE'][0]}
#     #      ,{'list_name':df11, 'rmse':df11['RMSE'][0]}
#     #     ]
    
#     sorted_dic=sorted(dic_list, key=lambda dic_list: dic_list['rmse'], reverse=False)
#     print(sorted_dic)
#     list_name=[a['list_name'] for a in sorted_dic] # each entry is the full DataFrame
#     print(list_name)
#     df=pd.concat(list_name,axis=1)
        
#     print(df)
#     df.to_csv(os.path.join(outpath, str(s.split('_',6)[1]) +'6666'+'.csv'))


#     a=pd.read_csv(os.path.join(outpath, str(s.split('_',6)[1]) +'6666'+'.csv'))
#     # pd.set_option('display.max_columns', None)
#     df=pd.DataFrame(a.isna().sum().sort_values(ascending=False))
#     print(a)
#     # filling directly with fillna works, but Drivers must be filled too!!!
#     # start filling from the model with the lowest RMSE
#     print(df.columns)
    # Model 1
    # b=a.loc[a['LE_Gap_filled'] > -9999, ['LE_Gap_filled','Drivers','RMSE']] # only rows with a filled LE value, used to fill the blanks below

    # a['Drivers']=a.loc[a['LE_Gap_filled'] == np.nan, ['Drivers']]
    # a['Drivers'].fillna( b['Drivers'] ,inplace = True ) # establish the first model's Drivers
    # print(a['Drivers'].describe())

    # a['RMSE']=a.loc[a['LE_Gap_filled'] == np.nan, ['RMSE']]
    # a['RMSE'].fillna( b['RMSE'] ,inplace = True ) # establish the first model's RMSE
    # print(a['RMSE'].describe())

    # b=a.loc[a['LE_Gap_filled.1'] > -9999, ['LE_Gap_filled.1', 'Drivers.1', 'RMSE.1']] # only rows the second model filled

    # a['Drivers.1']=a.loc[a['LE_Gap_filled.1'] == np.nan, ['Drivers.1']]
    # a['Drivers.1'].fillna( b['Drivers.1'] ,inplace = True ) # establish the second model's Drivers
    # print(a['Drivers.1'].describe())

    # a['RMSE.1']=a.loc[a['LE_Gap_filled'] == np.nan, ['RMSE.1']]
    # a['RMSE.1'].fillna( b['RMSE.1'] ,inplace = True ) # establish the second model's RMSE
    # print(a['RMSE.1'].describe())

    # a['LE_Gap_filled'].fillna(a['LE_Gap_filled.1'], inplace=True) # LE update
    # df1=pd.DataFrame(a.isna().sum().sort_values(ascending=False)) # tally the NaNs
    # print(df1)

    # a['Drivers'].fillna(a['Drivers.1'],inplace=True)  # Drivers update
    # print(a['Drivers'].describe())

    # a['RMSE'].fillna(a['RMSE.1'],inplace=True)  # RMSE update
    # print(a['RMSE'].describe())


#     # Models 2 through 9: the identical fill-update pattern repeats with
#     # column suffixes .2 … .9 (b = rows where 'LE_Gap_filled.N' > -9999;
#     # rebuild 'Drivers.N'/'RMSE.N' from b; then cascade into 'LE_Gap_filled',
#     # 'Drivers' and 'RMSE' with fillna). Superseded by the dynamic loop above.





#     # attach timestamps to a

#     so=pd.read_csv(os.path.join(path1,s))
#     so=so[['TIMESTAMP_START' ,'TIMESTAMP_END','LE_F_MDS']]

#     print(a['TIMESTAMP_START'])

#     print(a.shape)

#     a['QC'] = np.nan
#     a.loc[a['LE_Gap_filled'] > -9999, 'QC'] = 1
#     a.loc[a['LE_Gap_filled'] == -9999 , 'QC'] = 0
    
#     a['LE_Gap_filled'].replace(np.nan,-8888,inplace=True) # rows that were NaN: too many drivers missing to fill at all (QC=3 in the source data marks ERA values)
#     a['LE_Gap_filled'].replace(-9999,np.nan,inplace=True) # NaNs can also remain because no driver combination covered that stretch
#     a['LE_Gap_filled'].fillna(sole['LE_F_MDS'],inplace=True)# whatever is still NaN falls back to LE_F_MDS
     
#     a.loc[a['LE_Gap_filled'] == -8888 , 'QC'] = -9999

    
#     print(a.dropna().shape[0]/a.shape[0])
    
#     a=a[[ 'TIMESTAMP_END', 'LE_Gap_filled', 'QC',  'Drivers', 'RMSE']]
    
#     a= pd.merge(so,a,how='outer',on='TIMESTAMP_END')

 
#     a['LE_Gap_filled'].fillna(a['LE_F_MDS'],inplace=True)   
#     a['LE_Gap_filled'].replace(-8888,np.nan,inplace=True)    
    
#     a=a[['TIMESTAMP_START', 'TIMESTAMP_END', 'LE_Gap_filled', 'QC',  'Drivers', 'RMSE']]
    
    
#     a['SW_IN_F_MDS']=np.nan
#     a['NETRAD']=np.nan
#     a['G_F_MDS']=np.nan
#     a['TA_F_MDS']=np.nan
#     a['RH']=np.nan
#     a['WD']=np.nan 
#     a['WS']=np.nan 
    
#     a['PA_F']=np.nan
#     a['VPD_F_MDS']=np.nan
#     a['NDVI']=np.nan
#     a['TS_F_MDS_1']=np.nan
#     a['SWC_F_MDS_1']=np.nan
#     a['TA_F_MDS']=np.nan
    
#     a['Drivers'].replace(np.nan,-9999,inplace=True)

    
#     b=a.loc[a['Drivers']!=-9999]
#     # print(b)
    
#     for i in b.columns[6:]:
        
#         # print(i)
        
#         c=b[b['Drivers'].str.contains(i)]

#         c[i].replace(np.nan,'+',inplace=True)
        
#         a[i]=c[i]
        
#     b=a.count(axis=1)-6
#     b=pd.DataFrame(b)
    
#     a['n_drivers']=b
    
#     a['n_drivers'].replace([-1,-2,-3],np.nan,inplace=True)
    
#     a['Drivers'].replace(-9999,np.nan,inplace=True)



#     # a.to_csv(os.path.join(path,sole+'.csv'),index = False)

    
#     a.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME\FILLED',str(s.split('_',6)[1]) +'.csv'),index = False)
             



# # Create empty columns
# # df["Empty_1"] = ""
# # df["Empty_2"] = np.nan
# # df['Empty_3'] = pd.Series() 
    














        