Shandong Province 2nd Data Application Innovation Competition (Rizhao Division) - Housing Provident Fund Loan Overdue Prediction - Post-Competition Summary

Task
Starting from a real-world scenario and application, participants are asked to build an accurate risk-control model that uses an individual's basic identity information together with their housing provident fund contribution and loan records to predict whether the user will repay late.

Submission notes:

  1. Results
    Submit a CSV file encoded in UTF-8 with a header row, for example:
    id,label
    1,0.556
    2,0.987

    Note: for the label field, values closer to 0 indicate no overdue payment and values closer to 1 indicate an overdue payment.
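    As a reference, a minimal sketch of producing a file in this format (assuming an already fitted binary classifier `clf` with predict_proba, a test feature frame `test_features`, and the matching `test_ids`; all three names are placeholders):

    import pandas as pd

    # `clf`, `test_features` and `test_ids` are hypothetical objects from an earlier training step
    probs = clf.predict_proba(test_features)[:, 1]                 # probability of label 1 (overdue)
    submission = pd.DataFrame({'id': test_ids, 'label': probs})
    submission.to_csv('submit.csv', index=False, encoding='utf-8')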

Data
The training set provides 40,000 contributors and the test set 15,000, each with basic personal information, contribution records and loan records. Participants can download the data, debug their algorithms locally, and submit results on the competition page.
A sample of the data (shown as a screenshot in the original post):

1. Imports

import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# export CUDA_VISIBLE_DEVICES=0
# Print which GPU ids are visible (the value of CUDA_VISIBLE_DEVICES)
print(os.environ['CUDA_VISIBLE_DEVICES'])

import warnings
import numpy as np
import pandas as pd
# import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,precision_recall_fscore_support,roc_curve,auc,roc_auc_score
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt
warnings.filterwarnings("ignore")
plt.rcParams['font.sans-serif']=['Simhei']
plt.rcParams['axes.unicode_minus']=False
import json
import matplotlib 
from scipy.stats import chi2
import scipy
import seaborn as sns

from lightgbm.sklearn import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
# from xgboost import XGBClassifier
import time
from sklearn.ensemble import GradientBoostingClassifier
# from lightgbm import LGBMClassifier
from tqdm import tqdm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier

2. Utility functions
2.1 Find the columns most correlated with the label

def relation(df, poly_num=0.15):
    """
    Uses DataFrame.corr(method=..., min_periods=1).
    method: one of {'pearson', 'kendall', 'spearman'}
        pearson:  Pearson correlation; measures linear association, so it is biased for non-linear data.
        kendall:  rank correlation suited to categorical/ordinal (unordered) and non-normal data.
        spearman: rank correlation for non-linear, non-normally distributed data.
    min_periods: minimum number of observations required.
    Returns the union of columns whose |correlation| with 'label' exceeds poly_num under any of the three methods.
    """
    all_cate_2_col=[]
    method=['pearson','kendall','spearman']
    for m in method:
        poly_corrs = df[:40000].corr(method=m)['label'].sort_values()
        po_temp = []
        for i in range(len(poly_corrs)):
            if(abs(poly_corrs[i])>poly_num and poly_corrs.index[i] not in ['label']):
                po_temp.append(poly_corrs.index[i])      
        print(str(m)+'相关性>'+str(poly_num)+'的字段为:\n'+str(po_temp)+'\n共有'+str(len(po_temp))+'个')
        
        #取并集
        all_cate_2_col=list(set(all_cate_2_col).union(set(po_temp)))
        print(len(all_cate_2_col))
#         print(all_cate_2_col)
        
    return all_cate_2_col
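A usage sketch (assuming the combined train+test frame df built in section 3, whose first 40,000 rows carry the label, and any candidate column list such as num_cols):

# Columns whose |correlation| with the label exceeds 0.1 under at least one of the three methods
strong_cols = relation(df[num_cols + ['label']], poly_num=0.1)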

2.2 Recursive feature elimination with cross-validation (RFECV) for feature selection

# Recursive feature elimination with cross-validation (RFECV)
def clf_rfecv(df,cate_2_cols,rank_num=1): 
    # If no candidate columns are passed in, fall back to every column except id and label
    if not cate_2_cols:
        cate_2_cols = [col for col in df.columns if col not in ['id', 'label']]
    X=df[:40000][cate_2_cols]
    y=df[:40000]['label']
    print(X.shape)
    print(y.shape)
    # RFECV
    clf_rfecv = LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        learning_rate=0.1, 
        n_estimators=154, 
        max_depth=6,                                          
        num_leaves=30,
        max_bin=25,
        min_data_in_leaf=71,
        bagging_fraction=0.65,
        bagging_freq= 0, 
        feature_fraction= 0.8,
        lambda_l1=0.9,
        lambda_l2=0.9
    )

    from sklearn.model_selection import StratifiedKFold
    from sklearn.feature_selection import RFECV
    rfecv = RFECV(
        estimator=clf_rfecv,          # 学习器
        step=1,                 # 移除特征个数
        cv=StratifiedKFold(5),  # 交叉验证次数
        scoring='accuracy',     # 学习器的评价标准
        verbose = 1,
        n_jobs = 12
        ).fit(X, y)
    X_RFECV = rfecv.transform(X)
    print("RFECV特征选择结果——————————————————————————————————————————————————")
    # 和传参对应,所选择的属性的个数
    print("有效特征个数: \n"+str(rfecv.n_features_))
    #     # 打印的是相应位置上属性的排名
    #     print("全部特征等级: \n"+str(rfecv.ranking_))
    #     # 属性选择的一种模糊表示,选择的是true,未选择的是false
    #     print(rfecv.support_)
    rfecv_cate_2_col=[]
    for i in range(len(cate_2_cols)):
        if(rfecv.ranking_[i]<=rank_num):
            print(cate_2_cols[i])
            rfecv_cate_2_col.append(cate_2_cols[i])
    print(len(rfecv_cate_2_col))
#     
    return rfecv_cate_2_col

2.3 Find columns with little variation in their values

# 数值类型较少的数据
def find_weak_filed(df):
    weak_filed=[]
    for i in range(len(df.columns)):
#         print('-------------'+str(df.columns[i])+'---------------')
        else_sum=0
        for j in range(1,len(df[df.columns[i]].value_counts().index)):
            else_sum=else_sum+df[df.columns[i]].value_counts().values[j]
#         print(else_sum)
        if(else_sum<=50):
            weak_filed.append(df.columns[i])

    return weak_filed

2.4 Identify single-value, binary, multi-class and continuous columns

def find_filed_class(df,n=20):
    cate_1_cols=[]
    cate_2_cols=[]
    cate_cols=[]
    num_cols1=[]
    for i in tqdm(range(len(df.columns))):
        if(len(df[df.columns[i]].value_counts().index)==1):
            cate_1_cols.append(df.columns[i])
        if(len(df[df.columns[i]].value_counts().index)==2 and df.columns[i]!='label'):
            cate_2_cols.append(df.columns[i])
        elif(2<len(df[df.columns[i]].value_counts().index)<=n and df.columns[i]!='DKLL'):
            cate_cols.append(df.columns[i])
        elif(len(df[df.columns[i]].value_counts().index)>n and df.columns[i]!='id'):
            num_cols1.append(df.columns[i])
    print(len(cate_1_cols))
    print(len(cate_2_cols))
    print(len(cate_cols))
    print(len(num_cols1))
    return cate_1_cols,cate_2_cols,cate_cols,num_cols1

2.5 Feature selection via GBDT feature importance

# How does GBDT measure feature importance?
# It sums, over all internal (non-leaf) nodes, the weighted decrease in impurity produced by each split;
# the more a feature reduces impurity in total, the more important it is.
# The impurity decrease is exactly the gain of that split, so features whose splits yield larger gains
# receive higher importance scores.
# Tree-based feature selection: GBDT can serve as the base model. Combining sklearn's
# feature_selection.SelectFromModel with a GBDT model selects features as follows:

# Feature selection with GBDT as the base model
def GBDTselectfea(df,max_num=200):
    cols = [col for col in df.columns if col not in ['id', 'label']]
    X=df[:40000][cols]
    y=df[:40000]['label']
    print(X.shape)
    print(y.shape)
    if(X.shape[1]<max_num):
        max_num=X.shape[1]
    grd = SelectFromModel(GradientBoostingClassifier(),max_features=max_num)
    grd.fit_transform(X,y)
    # print(grd.estimator_.feature_importances_)
    gbdt_fea_select = grd.get_support()   # boolean mask of the selected columns
    gbdt_select=[]
    print(gbdt_fea_select)
    for i in range(len(gbdt_fea_select)):
    #     print(gbdt_fea_select[i])
    #     print(X.columns)
        if(gbdt_fea_select[i]==True):
            print(X.columns[i])
            gbdt_select.append(X.columns[i])
    #     print(gbdt_select)
    print(len(gbdt_select))
    return gbdt_select

2.6 Build polynomial features and keep those with high correlation to the label

def polynomial_features(df, poly_num=0.15 ,change=0,degreenum=2):
    """
    poly_num:相关性    change:0原始字段不变,1输出新增字段    degreenum:阶数
    """
    num_gen_feats = ['YEAR_GRYJCE', 'MONTH_GRYJCE_DWYJCE', 'YEAR_GRYJCE_DWYJCE',
'DKYE_TO_DKFFE', 'DKFFE_SUB_DKYE', 'DKFFE_SUB_DKYE_TO_DKFFE', 'WEIHUAN_TO_YIHUAN',
'REAL_DKLL','DKFFE_SUB_DKYE_DKLL','DKFFE_SUB_DKYE_1_DKLL','DKYE_DKLL','DKYE_1_DKLL','DKFFE_DKLL',
'DKFFE_1_DKLL','GRZHDNGJYE_SUB_YEAR_GRYJCE_DWYJCE',
'GRZHDNGJYE_SUB_YEAR_GRYJCE','GRZHDNGJYE_SUB_GRZHSNJZYE','JIEXI','BUJIAO_ZHUANRU_SUB_TIQVE',
'GRYJCE_TO_GRZHYE','YEAR_GRYJCE_TO_GRZHYE','MONTH_GRYJCE_DWYJCE_TO_GRZHYE','GRZHDNGJYE_TO_GRZHYE',
'GRZHSNJZYE_TO_GRZHYE','BUJIAO_ZHUANRU_SUB_TIQVE','JIEXI_TO_GRZHDNGJYE',
'GJJJKBL','GRJCJS_TO_DKYE','GRJCJS_TO_DKFFE','GRJCJS_TO_GRZHDNGJYE','GRJCJS_TO_GRZHSNJZYE',
'GRJCJS_TO_GRZHYE','DKYE_DIV_GRYJCE_ADD_DWYJCE','GRYJCE_ADD_DWYJCE_TO_DKYE','GRZHYE_diff_GRZHDNGJYE',
'GRZHYE_diff_GRZHSNJZYE','GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
    
    poly_features = df[:40000][num_gen_feats]
    # Create the polynomial object with specified degree
    poly_transformer = PolynomialFeatures(degree=degreenum, interaction_only=False, include_bias=False)
    poly_transformer.fit(poly_features)
    poly_features = poly_transformer.transform(poly_features)
    # 新特征是否与target有相关性。
    poly_features = pd.DataFrame(poly_features ,
        columns = poly_transformer.get_feature_names(input_features = num_gen_feats)
    )
    poly_features['TARGET'] =df[:40000]['label']
    poly_corrs = poly_features.corr()['TARGET'].sort_values()

    po_temp = []
    for i in range(len(poly_corrs)):
        if(abs(poly_corrs[i])>poly_num and poly_corrs.index[i] not in num_gen_feats + ['TARGET']):
            po_temp.append(poly_corrs.index[i])

    print('相关性>'+str(poly_num)+'的字段为:\n'+str(po_temp)+'\n共有'+str(len(po_temp))+'个')
    
    if(change == 1):
        dfpo = df[num_gen_feats]
        dfpo_transformer = PolynomialFeatures(degree=degreenum, interaction_only=False, include_bias=False)
        dfpo_transformer.fit(dfpo)
        dfpo = dfpo_transformer.transform(dfpo)
        dfpo = pd.DataFrame(dfpo ,
            columns = poly_transformer.get_feature_names(input_features = num_gen_feats)
        )
        # 新的字段拼接到df上
        df = dfpo[po_temp]

    return df

2.7 Plot the distribution of continuous variables and of their log transform

def Normal_distribution(df, value_vars, change=0):
    """
    value_vars:需要查看的字段       change:0不变,1变log,2新增log
    """
    for i in tqdm(range(len(value_vars))):
        plt.figure(figsize=(16,5))
        plt.suptitle(str(value_vars[i])+'Distribution', fontsize=10)

        plt.subplot(1,2,1)
        sub_plot_1 = sns.distplot(df[value_vars[i]])
        sub_plot_1.set_title(str(value_vars[i])+" Distribuition", fontsize=10)
        sub_plot_1.set_xlabel("数值")
        sub_plot_1.set_ylabel("Probability", fontsize=10)

        plt.subplot(1,2,2)    
        sub_plot_2 = sns.distplot(np.log(df[value_vars[i]]+1))
        sub_plot_2.set_title(str(value_vars[i])+"(Log) Distribuition", fontsize=10)
        sub_plot_2.set_xlabel("数值")
        sub_plot_2.set_ylabel("Probability", fontsize=10)
    if(change==1):
        for i in tqdm(range(len(value_vars))):
            df[value_vars[i]] = np.log(df[value_vars[i]]+1)
    if(change==2):
        for i in tqdm(range(len(value_vars))):
            df[str(value_vars[i])+'_log'] = np.log(df[value_vars[i]]+1)
        return df
    return df

2.8 Detect and drop columns whose values are completely identical

#查看是否存在重复的行列
def if_field_is_same(df):
    all_df_cols = df.columns
    hight = len(df)
    del_filed=[]
#     # 删除存在重复的行
#     print('是否存在重复行: ',any(df.duplicated()))
#     if(any(df.duplicated())==True):
#         df.drop_duplicates(inplace = True)
    # 检查是否存在重复的列
    for i in tqdm(range(0,len(all_df_cols)-1,1)):
#       print("---------"+str(all_df_cols[i])+"---------")
        if(all_df_cols[i] not in del_filed):
            for j in range(i+1,len(all_df_cols),1):
                same = True
                for k in range(hight):
                    if(df[all_df_cols[i]][k]!=df[all_df_cols[j]][k]):
                        same = False
                        break
                if(same):
                    # column j is identical to column i
                    del_filed.append(all_df_cols[j])
    if(len(del_filed)==0):
        print('是否存在重复列: 否')
    else:
        del_filed = set(del_filed)
        print('存在重复列: '+str(len(del_filed))+'个\n为:'+str(del_filed))
        df=df.drop(del_filed,axis=1)
    return df
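The nested loops above are O(columns² × rows). For reference, a much faster pandas-only sketch of the same idea (note that NaN handling differs slightly: duplicated() treats equal NaN patterns as equal):

def drop_duplicate_columns(df):
    # Columns with identical values become duplicated rows after transposing
    dup_mask = df.T.duplicated()
    if dup_mask.any():
        print('Duplicate columns: ' + str(list(df.columns[dup_mask])))
    return df.loc[:, ~dup_mask]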

2.9 Scatter plots of variable value distributions

# Scatter plot of each column's values against the row index (relies on the global df)
def shuzhifenbu(cols, high=40):
    for i in range(len(cols)):
        plt.figure(figsize=(15,high))
        print(str(cols[i])+"的数值分布")
        plt.subplot(len(cols), 1, i+1)
        plt.title(cols[i])
        x = df[cols[i]]
        y = df.index
        plt.scatter(x, y , s=1)
        plt.show()
    return

2.10 Chi-square binning (ChiMerge)

# Chi-square binning
# Compute the chi-square statistic
def chi3(arr):
    '''
    Compute the chi-square statistic.
    arr: frequency (contingency) table as a 2-D numpy array.
    '''
    assert(arr.ndim==2)
    #计算每行总频数
    R_N = arr.sum(axis=1)
    #每列总频数
    C_N = arr.sum(axis=0)
    #总频数
    N = arr.sum()
    # 计算期望频数 C_i * R_j / N。
    E = np.ones(arr.shape)* C_N / N
    E = (E.T * R_N).T
    square = (arr-E)**2 / E
    #期望频数为0时,做除数没有意义,不计入卡方值
    square[E==0] = 0
    #卡方值
    v = square.sum()
    return v
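A quick numeric check of chi3 on a small contingency table; the value matches the Pearson chi-square statistic from scipy.stats.chi2_contingency with correction=False:

from scipy.stats import chi2_contingency

table = np.array([[10, 5],
                  [20, 30]])
print(chi3(table))                                   # ≈ 3.30
print(chi2_contingency(table, correction=False)[0])  # same statistic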

# Determine the chi-square bin boundaries
def chiMerge(df,col,target,max_groups=None,threshold=None): 
    '''
    Chi-square (ChiMerge) binning.
    df: pandas DataFrame
    col: name of the (numeric) column to bin
    target: name of the class label column
    max_groups: maximum number of groups
    threshold: chi-square threshold; if max_groups is not given, it defaults to the 95% confidence level.
    return: list containing the starting value of each group.
    '''
    freq_tab = pd.crosstab(df[col],df[target])
    #转成numpy数组用于计算。
    freq = freq_tab.values
    #初始分组切分点,每个变量值都是切分点。每组中只包含一个变量值.
    #分组区间是左闭右开的,如cutoffs = [1,2,3],则表示区间 [1,2) , [2,3) ,[3,3+)。
    cutoffs = freq_tab.index.values
    #如果没有指定最大分组
    if max_groups is None:     
        #如果没有指定卡方阈值,就以95%的置信度(自由度为类数目-1)设定阈值。
        if threshold is None:
            #类数目
            cls_num = freq.shape[-1]
            threshold = chi2.isf(0.05,df= cls_num - 1)
    while True:
        minvalue = None
        minidx = None
        #从第1组开始,依次取两组计算卡方值,并判断是否小于当前最小的卡方
        for i in range(len(freq) - 1):
            v = chi3(freq[i:i+2])
            if minvalue is None or (minvalue > v): #小于当前最小卡方,更新最小值
                minvalue = v
                minidx = i
        # Merging stops when only one group is left
        if minvalue is None:
            break
        # If there are still too many groups, or the smallest chi-square is below the threshold,
        # merge the two adjacent groups with the smallest chi-square and continue
        if  (max_groups is not None and  max_groups< len(freq) ) or (threshold is not None and minvalue < threshold):
            #minidx后一行合并到minidx
            tmp  = freq[minidx] + freq[minidx+1]
            freq[minidx] = tmp
            #删除minidx后一行
            freq = np.delete(freq,minidx+1,0)
            #删除对应的切分点
            cutoffs = np.delete(cutoffs,minidx+1,0)
        else: #最小卡方值不小于阈值,停止合并。
            break
    return cutoffs

# 生成分组后的新变量
def value2group(x,cutoffs):
    '''
    将变量的值转换成相应的组。
    x: 需要转换到分组的值
    cutoffs: 各组的起始值。
    return: x对应的组,如group1。从group1开始。
    '''
    #切分点从小到大排序。
    cutoffs = sorted(cutoffs)
    num_groups = len(cutoffs)
    #异常情况:小于第一组的起始值。这里直接放到第一组。
    #异常值建议在分组之前先处理妥善。
    if x < cutoffs[0]:
        return 'group1'
    for i in range(1,num_groups):
        if cutoffs[i-1] <= x < cutoffs[i]:
            return 'group{}'.format(i)
    #最后一组,也可能会包括一些非常大的异常值。
    return 'group{}'.format(num_groups)

# 实现WOE 编码
def calWOE(df ,var ,target):
    '''
    计算WOE编码
    param df:数据集pandas.dataframe
    param var:已分组的列名,无缺失值
    param target:响应变量(0,1)
    return:编码字典
    '''
    eps = 0.000001  #避免除以0
    gbi = pd.crosstab(df[var],df[target]) + eps
    gb = df[target].value_counts() + eps
    gbri = gbi/gb
    gbri['woe'] = np.log(gbri[1]/gbri[0])
    return gbri['woe'].to_dict()

# 实现IV值计算
def calIV(df,var,target):
    '''
    计算IV值
    param df:数据集pandas.dataframe
    param var:已分组的列名,无缺失值
    param target:响应变量(0,1)
    return:IV值
    '''
    eps = 0.000001  #避免除以0 
    gbi = pd.crosstab(df[var],df[target]) + eps
    gb = df[target].value_counts() + eps
    gbri = gbi/gb
    gbri['woe'] = np.log(gbri[1]/gbri[0])
    gbri['iv'] = (gbri[1] - gbri[0])*gbri['woe']
    return gbri['iv'].sum()
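These binning helpers are defined here but not called elsewhere in this post; a minimal usage sketch (assuming the raw training frame train_df with the numeric column 'GRJCJS' and the 'label' column; ChiMerge can be slow on columns with many distinct values):

cutoffs = chiMerge(train_df, 'GRJCJS', 'label', max_groups=6)                  # chi-square bin edges
train_df['GRJCJS_bin'] = train_df['GRJCJS'].apply(value2group, cutoffs=cutoffs)
woe_map = calWOE(train_df, 'GRJCJS_bin', 'label')                              # bin -> WOE value
train_df['GRJCJS_woe'] = train_df['GRJCJS_bin'].map(woe_map)
print('IV:', calIV(train_df, 'GRJCJS_bin', 'label'))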

2.11 Compute feature_importances_ with LGBMClassifier

# Keep the columns whose LightGBM feature importance is greater than 0
def important_featrue(pre_train, pre_train_label):
    svc = LGBMClassifier(
        boosting_type='dart',   #提升树的类型,常用的梯度提升方法包括gbdt、dart、goss、rf。
        learning_rate=0.23,      #0.05->0.918     0.07->0.924    0.08->0.926
        n_estimators=150,      #拟合的树的棵树,可以理解为训练的轮数。弱学习器的个数,其中gbdt原理是利用通过梯度不断拟合新的弱学习器,直到达到设定的弱学习器的数量。
        max_depth=31,           #最大树的深度。每个弱学习器也就是决策树的最大深度。其中,-1表示不限制。
        num_leaves=1053,          #树的最大叶子数,控制模型复杂性的最重要参数之一。对比在xgboost中,一般为2^(max_depth)
        subsample=0.2707,         #训练样本采样率,行
        colsample_bytree=0.95,   #训练特征采样率,列
        random_state=6,     #随机种子数
        min_data_in_leaf=124,        #     可防止在叶子树中过度拟合,最佳值取决于训练样本和的数量num_leaves
        reg_alpha= 0.2462,
        reg_lambda=0.3140,
    #     lambda_l1= 0.89,   # 0.1
    #     lambda_l2=0.69,  # 0.2
        min_split_gain=0.22,
        min_child_weight=0.84,
        metric='auc',#模型度量标准,"rmse"、"auc"、'binary_logloss'
        n_jobs=12,              #并行运行多线程核心数
        verbose=-1
    )
    
    x_train = pre_train
    y_train = pre_train_label

    #fit
    svc.fit(x_train, y_train)

    feat_labels = x_train.columns[0:]
    fold_importance_df = pd.DataFrame()
    fold_importance_df["importance"] = svc.feature_importances_
    fold_importance_df["featrue_name"] = feat_labels
    importances = fold_importance_df["importance"]
    
    useful_featrue=[]
    for i in tqdm(range(len(fold_importance_df))):
        if(fold_importance_df['importance'][i]!=0):
    #         print(fold_importance_df['featrue_name'][i])
            useful_featrue.append(fold_importance_df['featrue_name'][i])
    # print(importance_0)
    useful_featrue = pd.DataFrame(useful_featrue, columns=['featrue_name']) 
    useful_featrue.to_csv('D:/useful_featrue.csv',index=0)
    print(len(useful_featrue))
    return
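This helper is defined but never called later in the post; a usage sketch on the merged frame df from section 3 (its first 40,000 rows are labelled; note the function writes D:/useful_featrue.csv as a side effect):

feat_cols = [c for c in df.columns if c not in ['id', 'label']]
important_featrue(df[:40000][feat_cols], df[:40000]['label'])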

2.12 Find columns with missing values

# Check which columns contain missing values
"""
Input: df
Output: col_is_null, the columns containing nulls; missing, the null rate of each column with nulls
"""
def pankong(df):
    temp = []
    col_is_null = []
    j = 0
    temp=df.isnull().any()
    # print(temp)#返回每列是否有空值
    colnull=pd.DataFrame(data={'colname': temp.index,'isnulls':temp.values})
    for i in range(len(colnull['isnulls'])):
        if(colnull['isnulls'][i] == True):
            print(str(colnull['colname'][i]) + "---------" + str(colnull['isnulls'][i]))
            col_is_null.append(colnull['colname'][i])
            j=j+1
    print("共有字段:"+str(len(colnull))+"个 "+"   含有空值的:"+str(j)+"个")
    
    # Compute the null rate outside the if-branch so that `missing` is always defined
    missing = df.isnull().sum()/len(df)
    missing = missing[missing > 0]
    if(j>0):
        missing.sort_values(inplace=True)
        plt.figure(figsize=(20, 8), dpi=80)
        missing.plot.bar()
    
    return col_is_null,missing

2.13 Select and drop columns with a missing rate above 0.1

# Select the columns whose missing rate exceeds the given threshold and drop them
def select_missing_rate(df,missing,rate=0.1):
    temp = []
    for i in range(len(missing)):
        if(missing.index[i]!='label'):
            if(missing.values[i]>rate):
                temp.append(missing.index[i])
    print(temp)
    if('label' in temp):
        temp.remove('label')
    df=df.drop(temp,axis=1)
    return df

# df = select_missing_rate(df,missing,rate=0.1)

2.14 Fill or drop columns with missing values

# Fill missing values with the column mode, or drop the affected columns
def fill_kongzhi(df,fill="del"):
#     sub_label_cols=[col for col in df.columns if col not in ['id', 'label']]
#     df=df[sub_label_cols]
    temp = []
    exist_nan = []
    j = 0
    temp=df.isnull().any()
#     print(temp)#返回每列是否有空值
    colnull=pd.DataFrame(data={'colname': temp.index,'isnulls':temp.values})
    for i in range(len(colnull['isnulls'])):
        if(colnull['isnulls'][i] == True):
            print(str(colnull['colname'][i]) + "---------" + str(colnull['isnulls'][i]))
            if(colnull['colname'][i] != 'label'):
                exist_nan.append(colnull['colname'][i])
            j=j+1
    print("共有字段:"+str(len(colnull))+"个 "+"   含有空值的:"+str(j)+"个")
    
    if(fill=="mode"):
        print(len(exist_nan))
        for j in range(len(exist_nan)):
            print(str(exist_nan[j])+" --- mode: "+str(df[exist_nan[j]].mode()[0]))
            df[exist_nan[j]].fillna(df[exist_nan[j]].mode()[0], inplace = True)
    if(fill=="del"):
        df=df.drop(exist_nan,axis=1)
    return df

3 Data preparation and simple feature engineering
3.1 Load the data

train_df = pd.read_csv(r'J:\BaiduNetdiskDownload\公积金逾期预测-数据\公积金逾期预测-数据/train.csv')
test_df = pd.read_csv(r'J:\BaiduNetdiskDownload\公积金逾期预测-数据\公积金逾期预测-数据/test.csv')
submit = pd.read_csv(r'J:\BaiduNetdiskDownload\公积金逾期预测-数据\公积金逾期预测-数据/submit.csv')
train_df.shape, test_df.shape, submit.shape
train_df_copy = train_df.copy()
test_df_copy = test_df.copy()
train_df_label = train_df_copy['label']   #保存label
# train_df_copy=train_df_copy.drop(['label'],axis=1)
#合并训练集测试集

df = pd.concat([train_df_copy, test_df_copy], axis = 0).reset_index(drop = True)
# df = pd.concat((train_df_copy, test_df_copy), axis=0)
print(df.shape)

3.2 Split the raw variables into continuous, multi-class and binary variables

train = train_df_copy
# 类别变量
cate_2_cols = ['XINGBIE', 'ZHIWU', 'XUELI']
cate_cols = ['HYZK', 'ZHIYE', 'ZHICHEN', 'DWJJLX', 'DWSSHY', 'GRZHZT']
# 连续变量
num_cols = ['GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
# train[num_cols]
# train[cate_cols]
# train['XUELI'].value_counts()

3.3 Inspect the value distributions of the raw variables

shuzhifenbu(cate_cols,25)
shuzhifenbu(cate_2_cols,15)
shuzhifenbu(num_cols)

(Scatter plots of the value distributions of the categorical, binary and continuous columns appeared here.)
#From the plots, the columns whose value distributions clearly differ between the training and test sets are: DKLL, ZHIWU, DWSSHY, HYZK

3.4 Check for outliers
Outlier detection method 1: the three-sigma rule
In statistics, if a distribution is approximately normal, about 68% of the values lie within one standard deviation of the mean, about 95% within two standard deviations, and about 99.7% within three.

# 暂时不删除异常值!
# def find_outliers_by_3segama(data,fea):
#     data_std = np.std(data[fea])
#     data_mean = np.mean(data[fea])
#     outliers_cut_off = data_std * 3
#     lower_rule = data_mean - outliers_cut_off
#     upper_rule = data_mean + outliers_cut_off
#     data[fea+'_outliers'] = data[fea].apply(lambda x:str('异常值') if x > upper_rule or x < lower_rule else '正常值')
#     return data

# for fea in num_cols:
#     data_train = find_outliers_by_3segama(train,fea)
#     print(train[fea+'_outliers'].value_counts())
#     print(train.groupby(fea+'_outliers')['label'].sum())
#     print('*'*10)
    
# #删除异常值
# for fea in num_cols:
#     train = train[train[fea+'_outliers']=='正常值']
#     train = train.reset_index(drop=True) 
# print(train)
# numerical_fea = list(df.select_dtypes(exclude=['object']).columns)
# category_fea = list(filter(lambda x: x not in numerical_fea,list(df.columns)))
# print(numerical_fea)
# print(category_fea)

This step only inspects the outliers; nothing is modified.
Outlier detection method 2: box plot / IQR rule (not applied here; a sketch follows below)
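A minimal sketch of the IQR (box-plot) rule, assuming the training frame train and the num_cols list from section 3.2:

# Count values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for each continuous column
for fea in num_cols:
    q1, q3 = train[fea].quantile(0.25), train[fea].quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    n_out = ((train[fea] < lower) | (train[fea] > upper)).sum()
    print(str(fea) + ': ' + str(n_out) + ' values outside the IQR fences')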

3.5 Date of birth (CSNY)

# Convert the date of birth into an age: 'CSY' holds the birth month, 'CSNY' is replaced by the binned age group, and 'age' holds the age itself.
# CSNY is stored as a Unix timestamp, so time.gmtime(timestamp) recovers the calendar date.
# import time
def transform_csny_to_age(i):
#     print(i)
    if(len(str(i))>10):
        i=i/1000
    a = time.gmtime(int(i))
#     print("year:"+str(a[0])+" "+"month:"+str(a[1]))
    age = 2020-a[0]
#     print(age)
    return age

def transform_csny_to_month(i):
    if(len(str(i))>10):
        i=i/1000
    a = time.gmtime(int(i))
#     print("year:"+str(a[0])+" "+"month:"+str(a[1]))
    month = a[1]
    return month

# 月份
df['CSY'] = df['CSNY']
df['CSY'] = df['CSY'].transform(transform_csny_to_month)
# 年龄
# df['CSN'] = df['CSNY']
df['CSNY'] = df['CSNY'].transform(transform_csny_to_age)
sns.distplot(df['CSY'][df['CSY'] > 0])
print(df['CSY'].value_counts())
def get_age(df,col = 'age'):
     df[col+"_genFeat1"]=(df['age'] > 23).astype(int)
     df[col+"_genFeat2"]=(df['age'] > 28).astype(int)
     df[col+"_genFeat3"]=(df['age'] > 32).astype(int)
     df[col+"_genFeat4"]=(df['age'] > 36).astype(int)
     df[col+"_genFeat5"]=(df['age'] > 43).astype(int)
     df[col+"_genFeat6"]=(df['age'] > 50).astype(int)
     return df, [col + f'_genFeat{i}' for i in range(1, 7)]
      
df['age'] = df['CSNY']
df, genFeats1 = get_age(df, col = 'age')
sns.distplot(df['age'][df['age'] > 0])


#Bin the age into 7 groups, using the same cut points as the indicator features above
def transform_age(x_age):
    # print(x_age)
    if x_age<23:
        return 1
    elif 23<=x_age<28:
        return 2
    elif 28<=x_age<32:
        return 3
    elif 32<=x_age<36:
        return 4
    elif 36<=x_age<43:
        return 5
    elif 43<=x_age<50:
        return 6
    elif x_age>=50:
        return 7
df['CSNY'] = df['CSNY'].transform(transform_age)
print(df['CSNY'].value_counts())

3.6 Loan balance (DKYE) and loan disbursement amount (DKFFE)

def get_daikuanYE(df,col):
    df[col + '_genFeat1'] = (df[col] > 100000).astype(int)
    df[col + '_genFeat2'] = (df[col] > 120000).astype(int)
    df[col + '_genFeat3'] = (df[col] > 140000).astype(int)
    df[col + '_genFeat4'] = (df[col] > 180000).astype(int)
    df[col + '_genFeat5'] = (df[col] > 220000).astype(int)
    df[col + '_genFeat6'] = (df[col] > 260000).astype(int)
    df[col + '_genFeat7'] = (df[col] > 300000).astype(int)
    return df, [col + f'_genFeat{i}' for i in range(1, 8)]
    
df, genFeats2 = get_daikuanYE(df, col = 'DKYE')

def get_daikuanFFE(df,col):
    df[col + '_genFeat1'] = (df[col] > 100000).astype(int)
    df[col + '_genFeat2'] = (df[col] > 120000).astype(int)
    df[col + '_genFeat3'] = (df[col] > 140000).astype(int)
    df[col + '_genFeat4'] = (df[col] > 180000).astype(int)
    df[col + '_genFeat5'] = (df[col] > 220000).astype(int)
    df[col + '_genFeat6'] = (df[col] > 260000).astype(int)
    df[col + '_genFeat7'] = (df[col] > 300000).astype(int)
    return df, [col + f'_genFeat{i}' for i in range(1, 8)]
df, genFeats3 = get_daikuanFFE(df, col = 'DKFFE')

plt.figure(figsize = (8, 2))
plt.subplot(1,2,1)
sns.distplot(df['DKYE'][df['label'] == 1])
plt.subplot(1,2,2)
sns.distplot(df['DKFFE'][df['label'] == 1])
# Microcredit is business-oriented lending to individuals or households,
# mainly serving small shop owners, workshops and sole proprietors;
# loan amounts are generally between 1,000 and 200,000 yuan.
def transform_dkye(dkye):
    if 0<=dkye<1000:
        return 1
    elif 1000<=dkye<50000:
        return 2
    elif 50000<=dkye<100000:
        return 3
    elif 100000<=dkye<150000:
        return 4
    elif 150000<=dkye<200000:
        return 5
    elif 200000<=dkye<250000:
        return 6
    elif 250000<=dkye<300000:
        return 7
    elif dkye>=300000:
        return 8

df['DKYE_class'] = df['DKYE']
df['DKYE_class'] = df['DKYE_class'].transform(transform_dkye)


def transform_dkffe(dkye):
    if 0<=dkye<1000:
        return 1
    elif 1000<=dkye<50000:
        return 2
    elif 50000<=dkye<100000:
        return 3
    elif 100000<=dkye<150000:
        return 4
    elif 150000<=dkye<200000:
        return 5
    elif 200000<=dkye<250000:
        return 6
    elif 250000<=dkye<300000:
        return 7
    elif dkye>=300000:
        return 8
    
df['DKFFE_class'] = df['DKFFE']
df['DKFFE_class'] = df['DKFFE_class'].transform(transform_dkffe)

print(df['DKYE_class'].value_counts(),
df['DKFFE_class'].value_counts())

3.7 Personal monthly contribution (GRYJCE)

def get_GRYJCE(df,col):
    df[col + '_genFeat1'] = (df[col] > 400).astype(int)
    df[col + '_genFeat2'] = (df[col] > 600).astype(int)
    df[col + '_genFeat3'] = (df[col] > 800).astype(int)
    df[col + '_genFeat4'] = (df[col] > 1000).astype(int)
    df[col + '_genFeat5'] = (df[col] > 1200).astype(int)
    df[col + '_genFeat6'] = (df[col] > 1400).astype(int)
    df[col + '_genFeat7'] = (df[col] > 1600).astype(int)
    return df, [col + f'_genFeat{i}' for i in range(1, 8)]

df, genFeats4 = get_GRYJCE(df, col = 'GRYJCE')

plt.figure(figsize = (8, 2))
plt.subplot(1,2,1)
sns.distplot(df['GRYJCE'][df['label'] == 1])

def transform_GRYJCE(dkye):
    if dkye<=400:
        return 1
    elif 400<dkye<=600:
        return 2
    elif 600<dkye<=800:
        return 3
    elif 800<dkye<=1000:
        return 4
    elif 1000<dkye<=1200:
        return 5
    elif 1200<dkye<=1400:
        return 6
    elif 1400<dkye<=1600:
        return 7
    elif dkye>1600:
        return 8

df['GRYJCE_class'] = df['GRYJCE']
df['GRYJCE_class'] = df['GRYJCE_class'].transform(transform_GRYJCE)

print(df['GRYJCE_class'].value_counts())

3.8 Personal contribution base (GRJCJS)

def get_GRJCJS(df,col):    # renamed so it no longer shadows get_GRYJCE above
    df[col + '_genFeat1'] = (df[col] > 2000).astype(int)
    df[col + '_genFeat2'] = (df[col] > 4000).astype(int)
    df[col + '_genFeat3'] = (df[col] > 6000).astype(int)
    df[col + '_genFeat4'] = (df[col] > 8000).astype(int)
    df[col + '_genFeat5'] = (df[col] > 1200).astype(int)    # 1200 is probably meant to be 12000, matching the binning below

    return df, [col + f'_genFeat{i}' for i in range(1, 6)]

df, genFeats5 = get_GRJCJS(df, col = 'GRJCJS')

plt.figure(figsize = (8, 2))
plt.subplot(1,2,1)
sns.distplot(df['GRJCJS'][df['label'] == 1])


def transform_GRJCJS(dkye):
    # print(x_age)
    if 0<=dkye<2000:
        return 1
    elif 2000<=dkye<4000:
        return 2
    elif 4000<=dkye<6000:
        return 3
    elif 6000<=dkye<8000:
        return 4
    elif 8000<=dkye<12000:
        return 5
    elif dkye>=12000:
        return 6
df['GRJCJS_class'] = df['GRJCJS']
df['GRJCJS_class'] = df['GRJCJS_class'].transform(transform_GRJCJS)
print(df['GRJCJS_class'].value_counts())

3.9 Personal account balance (GRZHYE) and last year's carry-over balance (GRZHSNJZYE)

def get_GRZHYE(df,col):
    df[col + '_genFeat1'] = (df[col] > 2000).astype(int)
    df[col + '_genFeat2'] = (df[col] > 4000).astype(int)
    df[col + '_genFeat3'] = (df[col] > 8000).astype(int)
    df[col + '_genFeat4'] = (df[col] > 12000).astype(int)
    df[col + '_genFeat5'] = (df[col] > 20000).astype(int)

    return df, [col + f'_genFeat{i}' for i in range(1, 6)]

df, genFeats6 = get_GRZHYE(df, col = 'GRZHYE')
df, genFeats7 = get_GRZHYE(df, col = 'GRZHSNJZYE')

plt.figure(figsize = (8, 2))
plt.subplot(1,2,1)
sns.distplot(df['GRZHYE'][df['label'] == 1])
plt.subplot(1,2,2)
sns.distplot(df['GRZHSNJZYE'][df['label'] == 1])

def transform_GRZHYE(dkye):
    # print(x_age)
    if 0<=dkye<2000:
        return 1
    elif 2000<=dkye<4000:
        return 2
    elif 4000<=dkye<8000:
        return 3
    elif 8000<=dkye<12000:
        return 4
    elif 12000<=dkye<20000:
        return 5
    elif dkye>=20000:
        return 6
df['GRZHYE_class'] = df['GRZHYE']
df['GRZHYE_class'] = df['GRZHYE_class'].transform(transform_GRZHYE)
df['GRZHSNJZYE_class'] = df['GRZHSNJZYE']
df['GRZHSNJZYE_class'] = df['GRZHSNJZYE_class'].transform(transform_GRZHYE)
print(df['GRZHYE_class'].value_counts(),
df['GRZHSNJZYE_class'].value_counts())

3.10 Removing the noise in DKLL

# 消除DKLL的扰动
dkll = test_df_copy['DKLL'].value_counts()
dkll_value = pd.DataFrame(data={'colname': dkll.index,'value':dkll.values})
dkll_value[:6]
temp_dkll_value = dkll_value[:6]['colname']
print(temp_dkll_value)

# 找出df中所有值为最常出现的六种贷款利率的行作为训练集
dkll_index=[]
for i in tqdm(range(len(df))):
    for j in range(len(temp_dkll_value)):
        if (df['DKLL'][i]==temp_dkll_value[j]):
            dkll_index.append(i)              
print(len(dkll_index))
# print(dkll_index)

#Rows with index 40000-54999 (the test rows) whose DKLL is not one of the six most common rates form the prediction set; their true rate is estimated below
test_all_index = list(range(40000,55000))
test_index = [i for i in test_all_index if i not in dkll_index]
# print(test_index)
print(len(test_index))

# dkll训练集
tarin_df_dkll = []
tarin_df_dkll = df.loc[dkll_index]
print(tarin_df_dkll)
# dkll测试集
test_df_dkll= []
test_df_dkll = df.loc[test_index]
print(test_df_dkll)
pankong(tarin_df_dkll)
dkll_cols = [col for col in tarin_df_dkll.columns if col not in ['DKLL','label','id']]
X = tarin_df_dkll[dkll_cols]
Y = pd.get_dummies(tarin_df_dkll['DKLL'])
print(Y)

#决策树
# from sklearn import tree
# clf = tree.DecisionTreeClassifier(criterion='entropy')   

# 随机森林
# from sklearn.ensemble import RandomForestClassifier
# clf = RandomForestClassifier(n_estimators=200)

# # 导入KNN 分类器
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()

clf.fit(X,Y)
test_df_dkll = test_df_dkll[dkll_cols]
res = clf.predict(test_df_dkll)
print(len(res))

res_temp = []
for i in range(len(res)):
#     print(res[i])
    if(res[i][0]==1):
        res_temp.append(2.292)
    elif(res[i][1]==1):
        res_temp.append(2.521)
    elif(res[i][2]==1):
        res_temp.append(2.708)
    elif(res[i][3]==1):
        res_temp.append(2.979)
    elif(res[i][4]==1):
        res_temp.append(3.250)
    elif(res[i][5]==1):
        res_temp.append(3.575)
    else:
        res_temp.append(2.708)
#         res_temp.append(1.111111111111111111)
print(len(res_temp))   
print(res_temp)

test_df_dkll['DKLL'] = res_temp
for i in (test_index):
    df.at[i,'DKLL'] = test_df_dkll['DKLL'][i]     #使用at来改变df
# df['DKLL']

plt.figure(figsize=(15,5))
print("DKLL的数值分布")
plt.title('DKLL')
x = df['DKLL']
y = df.index
plt.scatter(x, y , s=1)
plt.show()

df['DKLL_CLASS']=df['DKLL']

4. Feature engineering

# The newly generated features could themselves be assigned to the three groups below to derive even more features
# 类别变量
cate_2_cols = ['XINGBIE', 'ZHIWU', 'XUELI']
cate_cols = ['HYZK', 'ZHIYE', 'ZHICHEN', 'DWJJLX', 'DWSSHY', 'GRZHZT']
# 连续变量
num_cols = ['GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
old_fea=[]
old_fea = [col for col in df.columns if col not in ['id', 'label']]
print(old_fea)

4.1 Build business-driven features

# Data masking (desensitization) means transforming sensitive data so that private information stays protected,
# for example personal identity details, phone numbers and bank card numbers collected by organizations and companies.

#个人月缴存额,单位月缴存额---新建相关字段
df['YEAR_GRYJCE'] = df['GRYJCE']*12    #一年的总个人缴存额
df['MONTH_GRYJCE_DWYJCE'] = df['GRYJCE'] + df['DWYJCE']    #一个月的总缴存额
df['YEAR_GRYJCE_DWYJCE'] = (df['GRYJCE'] + df['DWYJCE'])*12    #一年的总缴存额

#贷款余额,贷款发放额---新建相关字段
df['DKYE_TO_DKFFE'] = df['DKYE'] / df['DKFFE']    #已还本金占比
df['DKFFE_SUB_DKYE'] = df['DKFFE'] - df['DKYE']    #贷款未还本金  
df['DKFFE_SUB_DKYE_TO_DKFFE'] = (df['DKFFE'] - df['DKYE'])/ df['DKFFE']    #未还本金占比
df['WEIHUAN_TO_YIHUAN'] = df['DKFFE_SUB_DKYE']/df['DKYE']    #未还比已还
# df['YIHUAN_TO_WEIHUAN'] = df['DKYE']/df['DKFFE_SUB_DKYE']    #已还比未还
df['REAL_DKLL'] = df['DKLL']/100
df['DKFFE_SUB_DKYE_DKLL'] = (df['DKFFE'] - df['DKYE'])*df['REAL_DKLL']    #贷款未还本金*利率=未还利息  
df['DKFFE_SUB_DKYE_1_DKLL'] = (df['DKFFE'] - df['DKYE'])*(1+df['REAL_DKLL'])    #贷款未还本金*利率=未还本息和 
df['DKYE_DKLL'] = df['DKYE']*df['REAL_DKLL']    #贷款已还本金*利率=已还利息  
df['DKYE_1_DKLL'] = df['DKYE']*(1+df['REAL_DKLL'])    #贷款已还本金*1+利率=已还本息和 
df['DKFFE_DKLL'] = df['DKFFE']*df['REAL_DKLL']    #贷款总利息
df['DKFFE_1_DKLL'] = df['DKFFE']*(1+df['REAL_DKLL'])    #贷款总本息和
df['DKFFE_SUB_DKYE_TO_DKFFE_1_DKLL'] = df['DKFFE_SUB_DKYE_1_DKLL'] / df['DKFFE_1_DKLL']    #未还本息和/贷款总本息和
df['DKYE_TO_DKFFE_1_DKLL'] = df['DKYE_1_DKLL']/ df['DKFFE_1_DKLL']    #已还本息和/贷款总本息和
df['DKFFE_SUB_DKYE_DKLL_TO_DKFFE_DKLL'] = df['DKFFE_SUB_DKYE_DKLL']/ df['DKFFE_DKLL']    #未还利息/贷款总利息
df['DKYE_DKLL_TO_DKFFE_DKLL'] = df['DKYE_DKLL']/ df['DKFFE_DKLL']    #已还利息/贷款总利息

# 个人账户当年归集余额 = 汇缴+补缴+结息+转入-提取额
#个人账户当年归结余额,个人账户上年转结余额,个人账户余额---新建相关字段
df['GRZHDNGJYE_SUB_YEAR_GRYJCE_DWYJCE'] = df['GRZHDNGJYE'] - df['YEAR_GRYJCE_DWYJCE']    #个人账户当年归结余额 - 一年的总缴存额
df['GRZHDNGJYE_SUB_YEAR_GRYJCE'] = df['GRZHDNGJYE'] - df['YEAR_GRYJCE']    #个人账户当年归结余额 - 一年的总个人缴存额
df['GRZHDNGJYE_SUB_GRZHSNJZYE'] = df['GRZHDNGJYE'] + df['GRZHSNJZYE']    #账户余额(暂当做未脱敏的数据)
df['JIEXI'] = (df['GRYJCE'] + df['DWYJCE'])*12*0.015    #一年的结息额 (结息按1.5%)
df['BUJIAO_ZHUANRU_SUB_TIQVE']=df['GRZHDNGJYE']-df['YEAR_GRYJCE_DWYJCE']-df['JIEXI']#补缴+转入-提取额=个人账户当年归集余额-汇缴-结息
df['GRYJCE_TO_GRZHYE'] = df['GRYJCE']/df['GRZHYE']    #个人月缴存额/个人账户余额 
df['YEAR_GRYJCE_TO_GRZHYE'] = df['YEAR_GRYJCE']/df['GRZHYE']    #一年的总个人缴存额/个人账户余额 
df['MONTH_GRYJCE_DWYJCE_TO_GRZHYE'] = df['MONTH_GRYJCE_DWYJCE']/df['GRZHYE']    #一个月的总缴存额/个人账户余额 
df['GRZHDNGJYE_TO_GRZHYE'] = df['GRZHDNGJYE']/df['GRZHYE']    #个人账户当年归结余额/个人账户余额 
df['GRZHSNJZYE_TO_GRZHYE'] = df['GRZHSNJZYE']/df['GRZHYE']    #个人账户上年转结余额/个人账户余额 
df['BUJIAO_ZHUANRU_SUB_TIQVE'] = df['BUJIAO_ZHUANRU_SUB_TIQVE']/df['GRZHYE']    #(make-up payments + transfers in - withdrawals) / account balance; note this overwrites the absolute amount created above
df['JIEXI_TO_YEAR_GRYJCE_DWYJCE'] = df['JIEXI']/df['YEAR_GRYJCE_DWYJCE']    #一年的结息额/一年的总缴存额 
df['JIEXI_TO_GRZHDNGJYE'] = df['JIEXI']/df['GRZHDNGJYE']    #一年的结息额/个人账户当年归结余额

# 个人缴款基数---新建相关字段
df['GJJJKBL'] = df['GRYJCE'] / df['GRJCJS']    #公积金缴款比例
# df['GRJCJS_TO_DKFFE_SUB_DKYE'] = df['GRJCJS']/df['DKFFE_SUB_DKYE']    #个人缴款基数/贷款未还本金  
df['GRJCJS_TO_DKYE'] = df['GRJCJS']/df['DKYE']    #个人缴款基数/已还本金 
df['GRJCJS_TO_DKFFE'] = df['GRJCJS']/df['DKFFE']    #个人缴款基数/贷款发放额 
df['GRJCJS_TO_GRZHDNGJYE'] = df['GRJCJS']/df['GRZHDNGJYE']    #个人缴款基数/个人账户当年归结余额 
df['GRJCJS_TO_GRZHSNJZYE'] = df['GRJCJS']/df['GRZHSNJZYE']    #个人缴款基数/个人账户上年转结余额 
df['GRJCJS_TO_GRZHYE'] = df['GRJCJS']/df['GRZHYE']    #个人缴款基数/个人账户余额

# 暂不清楚是否是噪声的字段
df['DKYE_DIV_GRYJCE_ADD_DWYJCE'] = df['DKYE'] / ((df['GRYJCE'] + df['DWYJCE'])*12)
df['GRYJCE_ADD_DWYJCE_TO_DKYE'] = (df['GRYJCE'] + df['DWYJCE']) / df['DKYE']
df['GRZHYE_diff_GRZHDNGJYE'] = df['GRZHYE'] - df['GRZHDNGJYE']
df['GRZHYE_diff_GRZHSNJZYE'] = df['GRZHYE'] - df['GRZHSNJZYE']

#  'YIHUAN_TO_WEIHUAN','GRJCJS_TO_DKFFE_SUB_DKYE'
gen_feats = ['YEAR_GRYJCE', 'MONTH_GRYJCE_DWYJCE', 'YEAR_GRYJCE_DWYJCE',
'DKYE_TO_DKFFE', 'DKFFE_SUB_DKYE', 'DKFFE_SUB_DKYE_TO_DKFFE', 'WEIHUAN_TO_YIHUAN',
'REAL_DKLL','DKFFE_SUB_DKYE_DKLL','DKFFE_SUB_DKYE_1_DKLL','DKYE_DKLL','DKYE_1_DKLL','DKFFE_DKLL',
'DKFFE_1_DKLL','DKFFE_SUB_DKYE_TO_DKFFE_1_DKLL','DKYE_TO_DKFFE_1_DKLL',
'DKFFE_SUB_DKYE_DKLL_TO_DKFFE_DKLL','DKYE_DKLL_TO_DKFFE_DKLL','GRZHDNGJYE_SUB_YEAR_GRYJCE_DWYJCE',
'GRZHDNGJYE_SUB_YEAR_GRYJCE','GRZHDNGJYE_SUB_GRZHSNJZYE','JIEXI','BUJIAO_ZHUANRU_SUB_TIQVE',
'GRYJCE_TO_GRZHYE','YEAR_GRYJCE_TO_GRZHYE','MONTH_GRYJCE_DWYJCE_TO_GRZHYE','GRZHDNGJYE_TO_GRZHYE',
'GRZHSNJZYE_TO_GRZHYE','BUJIAO_ZHUANRU_SUB_TIQVE','JIEXI_TO_YEAR_GRYJCE_DWYJCE','JIEXI_TO_GRZHDNGJYE',
'GJJJKBL','GRJCJS_TO_DKYE','GRJCJS_TO_DKFFE','GRJCJS_TO_GRZHDNGJYE','GRJCJS_TO_GRZHSNJZYE',
'GRJCJS_TO_GRZHYE','DKYE_DIV_GRYJCE_ADD_DWYJCE','GRYJCE_ADD_DWYJCE_TO_DKYE','GRZHYE_diff_GRZHDNGJYE',
'GRZHYE_diff_GRZHSNJZYE']
#If two columns turn out to be identical, only one of them should be kept
#For columns that take both positive and negative values, an extra sign-indicator column could be added

#Round float columns to 4 decimal places
for i in range(len(df.columns)):
    if(df.columns[i] != 'label'):
        if(df[df.columns[i]].dtype=='float64'):
            df[df.columns[i]] = df[df.columns[i]].apply(lambda x:round(x,4))
            
print(df)
_,missing = pankong(df)

cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(df)

no_cate_1_cols = [col for col in df.columns if col not in cate_1_cols]
df=df[no_cate_1_cols]
print(df.shape)
# Save the data processed so far to disk and reload it
df.to_csv('D:/df_little_change.csv',index = False)
df = pd.read_csv('D:/df_little_change.csv')
print(df.shape)
print(df)

4.2 Categorical features: count, count ratio, one-hot encoding, etc.

for f in tqdm(cate_cols):
    # Re-encode the categorical values as the integers 0,1,2,...
    df[f] = df[f].map(dict(zip(df[f].unique(), range(df[f].nunique()))))

    # map() applies a function or lookup to every element of the Series.
    # df[f + '_count'] holds, for each row, how often that row's value occurs in the column.
    df[f + '_count'] = df[f].map(df[f].value_counts())
    # One-hot encode the column with get_dummies
    df = pd.concat([df,pd.get_dummies(df[f],prefix=f"{f}")],axis=1)
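
# To make the count encoding concrete, a tiny toy example (values are illustrative only):
#     s = pd.Series(['A', 'B', 'A', 'C', 'A'])
#     s.map(dict(zip(s.unique(), range(s.nunique()))))   # integer codes: 0, 1, 0, 2, 0
#     s.map(s.value_counts())                            # counts:        3, 1, 3, 1, 3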

# Pairwise combinations of the categorical columns
cate_cols_combine = [[cate_cols[i], cate_cols[j]] for i in range(len(cate_cols)) \
                     for j in range(i + 1, len(cate_cols))]

for f1, f2 in tqdm(cate_cols_combine):
    # Number of rows sharing each (f1, f2) value pair, and the share of that count
    # within each of the two single-column counts
    df['{}_{}_count'.format(f1, f2)] = df.groupby([f1, f2])['id'].transform('count')
    df['{}_in_{}_prop'.format(f1, f2)] = df['{}_{}_count'.format(f1, f2)] / df[f2 + '_count']
    df['{}_in_{}_prop'.format(f2, f1)] = df['{}_{}_count'.format(f1, f2)] / df[f1 + '_count']
cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(df)

no_cate_1_cols = [col for col in df.columns if col not in cate_1_cols]
df=df[no_cate_1_cols]
print(df.shape)

df=if_field_is_same(df)
print(df.shape)

cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(df)

4.3 Deriving features from individual discrete features

# Create Features based on anonymised prefix groups
prefix = cate_2_cols
for i, p in enumerate(prefix):
    print(i,p)
    #column_set[]是以'XINGBIE', 'ZHIWU', 'XUELI'开头的字段
    column_set = [x for x in df.columns.tolist() if x.startswith(prefix[i])]
    # Take NA count
    df[p + "_group_nan_sum"] = df[column_set].isnull().sum(axis=1) / df[column_set].shape[1]
    # Take SUM/Mean if numeric
    numeric_cols = [x for x in column_set if df[x].dtype != object]
    if numeric_cols:
        df[p + "_group_sum"] = df[column_set].sum(axis=1)
        df[p + "_group_mean"] = df[column_set].mean(axis=1)
        # Zero Count
        df[p + "_group_0_count"] = (df[column_set] == 0).astype(int).sum(axis=1) / (
                df[column_set].shape[1] - df[p + "_group_nan_sum"])
cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(df)

no_cate_1_cols = [col for col in df.columns if col not in cate_1_cols]
df=df[no_cate_1_cols]
print(df.shape)

df=if_field_is_same(df)

cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(df)

#relation_cate_2_cols用来存放选取的相关性较大的二值字段
relation_cate_2_cols = relation(df[cate_2_cols+['label']], poly_num=0.05)
# print(relation_cate_2_cols)

Check which columns have nearly constant values and therefore contribute little

weak_filed = find_weak_filed(df[cate_cols+cate_2_cols])
print(len(weak_filed))
print(weak_filed)

Feature selection

rfecv_cate_2_col=clf_rfecv(df,cate_2_cols)

Take the union of relation_cate_2_cols and rfecv_cate_2_col -> select_cate_2_col

# relation_cate_2_cols 与 rfecv_cate_2_col取并集
select_cate_2_col=list(set(relation_cate_2_cols).union(set(rfecv_cate_2_col)))
print(len(select_cate_2_col))
print(select_cate_2_col)

# 保存二值类数据到本地
df[select_cate_2_col].to_csv('D:/rizhao_select_cate_2_col.csv',index = False)
_,missing = pankong(df)
df = select_missing_rate(df,missing,rate=0.001)

4.4 Keep only the multi-class and continuous columns in df

cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(df,)
# Save the multi-class and continuous columns to disk
df[cate_cols+num_cols1+['label']].to_csv('D:/rizhao_cate_cols_num_cols1.csv',index = False)
cate_cols_num_cols1_df = pd.read_csv('D:/rizhao_cate_cols_num_cols1.csv')
print(cate_cols_num_cols1_df.shape)
print(cate_cols_num_cols1_df)

Filter the multi-class columns

cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(cate_cols_num_cols1_df,20)
df=cate_cols_num_cols1_df
# 相关系数
relation_cate_cols = relation(df[cate_cols+['label']], poly_num=0.05)

rfecv_cate_cols=clf_rfecv(df,cate_cols)

# relation_cate_cols 与 rfecv_cate_cols 取并集
select_cate_col=[]
select_cate_col=list(set(relation_cate_cols).union(set(rfecv_cate_cols)))
select_cate_col=list(set(select_cate_col).union(set(['HYZK', 'ZHIYE', 'ZHICHEN', 'DWJJLX', 'DWSSHY', 'GRZHZT'])))
print(len(select_cate_col))
print(select_cate_col)

# 保存多分类数据到本地
df[select_cate_col].to_csv('D:/rizhao_select_cate_col.csv',index = False)

# 保存上一部处理过的数据到本地
df[select_cate_col+num_cols1+['label']].to_csv('D:/rizhao_select_cate_col_num_cols1.csv',index = False)

4.5 Crossing categorical features with numeric features

select_cate_col_num_cols1 = pd.read_csv('D:/rizhao_cate_cols_num_cols1.csv')
print(select_cate_col_num_cols1.shape)
print(select_cate_col_num_cols1)
df_select_cate_col = pd.read_csv('D:/rizhao_select_cate_col.csv')
select_cate_col = df_select_cate_col.columns
print(len(select_cate_col))

_,_,cate_cols,num_cols1 = find_filed_class(select_cate_col_num_cols1,20)
num_gen_feats = ['YEAR_GRYJCE', 'MONTH_GRYJCE_DWYJCE', 'YEAR_GRYJCE_DWYJCE',
'DKYE_TO_DKFFE', 'DKFFE_SUB_DKYE', 'DKFFE_SUB_DKYE_TO_DKFFE', 'WEIHUAN_TO_YIHUAN',
'REAL_DKLL','DKFFE_SUB_DKYE_DKLL','DKFFE_SUB_DKYE_1_DKLL','DKYE_DKLL','DKYE_1_DKLL','DKFFE_DKLL',
'DKFFE_1_DKLL','GRZHDNGJYE_SUB_YEAR_GRYJCE_DWYJCE',
'GRZHDNGJYE_SUB_YEAR_GRYJCE','GRZHDNGJYE_SUB_GRZHSNJZYE','JIEXI','BUJIAO_ZHUANRU_SUB_TIQVE',
'GRYJCE_TO_GRZHYE','YEAR_GRYJCE_TO_GRZHYE','MONTH_GRYJCE_DWYJCE_TO_GRZHYE','GRZHDNGJYE_TO_GRZHYE',
'GRZHSNJZYE_TO_GRZHYE','BUJIAO_ZHUANRU_SUB_TIQVE','JIEXI_TO_GRZHDNGJYE',
'GJJJKBL','GRJCJS_TO_DKYE','GRJCJS_TO_DKFFE','GRJCJS_TO_GRZHDNGJYE','GRJCJS_TO_GRZHSNJZYE',
'GRJCJS_TO_GRZHYE','DKYE_DIV_GRYJCE_ADD_DWYJCE','GRYJCE_ADD_DWYJCE_TO_DKYE','GRZHYE_diff_GRZHDNGJYE',
'GRZHYE_diff_GRZHSNJZYE','GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
select_cate_col_num_cols1[select_cate_col]

First round of category × numeric crosses ('sum', 'mean', 'std')

select_cate_col_num_cols1['label']=df['label']
relation_cate_cols = []
rfecv_cate_col=[]
i=0
for f1 in tqdm(select_cate_col):
    temp_cate_cols=[]
    g = select_cate_col_num_cols1.groupby(f1)
#     print(g)
    for f2 in num_gen_feats:
        for stat in ['sum', 'mean', 'std']:
            # f1的每种类型对应的f2的'sum', 'mean', 'std', 'max', 'min'
            select_cate_col_num_cols1['{}_{}_{}'.format(f1, f2, stat)] = g[f2].transform(stat)
            temp_cate_cols.append('{}_{}_{}'.format(f1, f2, stat))
    # 相关系数
    relation_cate_cols.extend(relation(select_cate_col_num_cols1[temp_cate_cols+['label']], poly_num=0.1))

print(len(relation_cate_cols))

Second round of category × numeric crosses ('max', 'min', 'var', 'count')

for f1 in tqdm(select_cate_col):
    temp_cate_cols=[]
    g = select_cate_col_num_cols1.groupby(f1)
#     print(g)
    for f2 in num_gen_feats:
        for stat in ['max', 'min', 'var','count']:
            # f1的每种类型对应的f2的'sum', 'mean', 'std', 'max', 'min'
            select_cate_col_num_cols1['{}_{}_{}'.format(f1, f2, stat)] = g[f2].transform(stat)
            temp_cate_cols.append('{}_{}_{}'.format(f1, f2, stat))
    # 相关系数
    relation_cate_cols.extend(relation(select_cate_col_num_cols1[temp_cate_cols+['label']], poly_num=0.1))
    
print(len(relation_cate_cols))
select_cate_col_num_cols1[relation_cate_cols]
# 保存上一部处理过的数据到本地
select_cate_col_num_cols1[relation_cate_cols+['label']].to_csv('D:/df_relation_cate_cols.csv',index = False)

df_relation_cate_cols = pd.read_csv('D:/df_relation_cate_cols.csv')
print(df_relation_cate_cols.shape)
print(df_relation_cate_cols)
_,missing = pankong(df_relation_cate_cols)

df_relation_cate_cols = select_missing_rate(df_relation_cate_cols,missing,rate=0.001)
df_relation_cate_cols = fill_kongzhi(df_relation_cate_cols,fill="mode")

_,missing = pankong(df_relation_cate_cols)
cate_1_cols,_,_,_ = find_filed_class(df_relation_cate_cols,20)
no_cate_1_cols = [col for col in df_relation_cate_cols.columns if col not in cate_1_cols]
df_relation_cate_cols=df_relation_cate_cols[no_cate_1_cols]
print(df_relation_cate_cols.shape)
gbdt_select_temp_df2=[]
for i in tqdm(range(int(len(df_relation_cate_cols.columns)/400+1))):
    temp_col=[]
    temp_col.extend(df_relation_cate_cols.columns[i*400:i*400+400])
#     print(temp_col)
#     print(len(temp_col))
#     print(i)
    if(i<int(len(df_relation_cate_cols.columns)/400)):
        gbdt_select_temp_df2.extend(GBDTselectfea(df_relation_cate_cols[temp_col+['label']],max_num=150))
    elif(i==int(len(df_relation_cate_cols.columns)/400)):
        gbdt_select_temp_df2.extend(GBDTselectfea(df_relation_cate_cols[temp_col],max_num=150))
    print(len(gbdt_select_temp_df2))

print(len(gbdt_select_temp_df2))
print(gbdt_select_temp_df2)
gbdt_select_cate_num_mix = df_relation_cate_cols[gbdt_select_temp_df2+['label']]
gbdt_select_cate_num_mix=if_field_is_same(gbdt_select_cate_num_mix)
print(gbdt_select_cate_num_mix.shape)

# 保存上一部处理过的数据到本地
gbdt_select_cate_num_mix.to_csv('D:/gbdt_select_temp_df2.csv',index = False)

gbdt_select_cate_num_mix= pd.read_csv('D:/gbdt_select_temp_df2.csv')
print(gbdt_select_cate_num_mix.shape)
print(gbdt_select_cate_num_mix)

4.6 Crossing numeric features with numeric features

num_gen_feats = ['YEAR_GRYJCE', 'MONTH_GRYJCE_DWYJCE', 'YEAR_GRYJCE_DWYJCE',
'DKYE_TO_DKFFE', 'DKFFE_SUB_DKYE', 'DKFFE_SUB_DKYE_TO_DKFFE', 'WEIHUAN_TO_YIHUAN',
'REAL_DKLL','DKFFE_SUB_DKYE_DKLL','DKFFE_SUB_DKYE_1_DKLL','DKYE_DKLL','DKYE_1_DKLL','DKFFE_DKLL',
'DKFFE_1_DKLL','GRZHDNGJYE_SUB_YEAR_GRYJCE_DWYJCE',
'GRZHDNGJYE_SUB_YEAR_GRYJCE','GRZHDNGJYE_SUB_GRZHSNJZYE','JIEXI','BUJIAO_ZHUANRU_SUB_TIQVE',
'GRYJCE_TO_GRZHYE','YEAR_GRYJCE_TO_GRZHYE','MONTH_GRYJCE_DWYJCE_TO_GRZHYE','GRZHDNGJYE_TO_GRZHYE',
'GRZHSNJZYE_TO_GRZHYE','BUJIAO_ZHUANRU_SUB_TIQVE','JIEXI_TO_GRZHDNGJYE',
'GJJJKBL','GRJCJS_TO_DKYE','GRJCJS_TO_DKFFE','GRJCJS_TO_GRZHDNGJYE','GRJCJS_TO_GRZHSNJZYE',
'GRJCJS_TO_GRZHYE','DKYE_DIV_GRYJCE_ADD_DWYJCE','GRYJCE_ADD_DWYJCE_TO_DKYE','GRZHYE_diff_GRZHDNGJYE',
'GRZHYE_diff_GRZHSNJZYE','GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
df[num_gen_feats]
# Most of the missing values seen later originate here: grouping by a continuous column creates many single-row groups, whose 'std' is NaN
relation_num_cols = []
i=0
for f1 in tqdm(num_gen_feats):
    temp_num_cols=[]
    g = df.groupby(f1)
#     print(g)
    for f2 in num_gen_feats:
        for stat in ['sum', 'mean', 'std']:
            # f1的每种类型对应的f2的'sum', 'mean', 'std', 'max', 'min'
            df['{}_{}_{}'.format(f1, f2, stat)] = g[f2].transform(stat)
            temp_num_cols.append('{}_{}_{}'.format(f1, f2, stat))
    # 相关系数
    relation_num_cols.extend(relation(df[temp_num_cols+['label']], poly_num=0.05))
print(relation_num_cols)
print(len(relation_num_cols))

# 保存上一部处理过的数据到本地
df[relation_num_cols+['label']].to_csv('D:/df_relation_num_cols.csv',index = False)

df_relation_num_cols = pd.read_csv('D:/df_relation_num_cols.csv')
print(df_relation_num_cols.shape)
print(df_relation_num_cols)

_,missing = pankong(df_relation_num_cols)

df_relation_num_cols = select_missing_rate(df_relation_num_cols,missing,rate=0.001)
df_relation_num_cols = fill_kongzhi(df_relation_num_cols,fill="mode")

_,missing = pankong(df_relation_num_cols)

cate_1_cols,_,_,_ = find_filed_class(df_relation_num_cols,20)
no_cate_1_cols = [col for col in df_relation_num_cols.columns if col not in cate_1_cols]
df_relation_num_cols=df_relation_num_cols[no_cate_1_cols]
print(df_relation_num_cols.shape)

gbdt_select_temp_df3=[]
for i in tqdm(range(int(len(df_relation_num_cols.columns)/400+1))):
    temp_col=[]
    temp_col.extend(df_relation_num_cols.columns[i*400:i*400+400])
#     print(temp_col)
#     print(len(temp_col))
#     print(i)
    if(i<int(len(df_relation_num_cols.columns)/400)):
        gbdt_select_temp_df3.extend(GBDTselectfea(df_relation_num_cols[temp_col+['label']],max_num=150))
    elif(i==int(len(df_relation_num_cols.columns)/400)):
        gbdt_select_temp_df3.extend(GBDTselectfea(df_relation_num_cols[temp_col],max_num=150))
    print(len(gbdt_select_temp_df3))

print(len(gbdt_select_temp_df3))
print(gbdt_select_temp_df3)
gbdt_select_num_num_mix = df_relation_num_cols[gbdt_select_temp_df3+['label']]
gbdt_select_num_num_mix=if_field_is_same(gbdt_select_num_num_mix)
print(gbdt_select_num_num_mix.shape)
# 保存上一部处理过的数据到本地
gbdt_select_num_num_mix.to_csv('D:/gbdt_select_num_num_mix.csv',index = False)
gbdt_select_num_num_mix= pd.read_csv('D:/gbdt_select_num_num_mix.csv')
print(gbdt_select_num_num_mix.shape)
print(gbdt_select_num_num_mix)

4.7 Polynomial features

num_gen_feats = ['MONTH_GRYJCE_DWYJCE','DKFFE_SUB_DKYE','DKFFE_SUB_DKYE_DKLL','DKFFE_SUB_DKYE_1_DKLL',
'DKYE_DKLL','DKFFE_DKLL','DKFFE_1_DKLL','GRZHDNGJYE_SUB_GRZHSNJZYE','JIEXI','GRYJCE_TO_GRZHYE',
'YEAR_GRYJCE_TO_GRZHYE','MONTH_GRYJCE_DWYJCE_TO_GRZHYE','GRZHDNGJYE_TO_GRZHYE',
'GRZHSNJZYE_TO_GRZHYE','BUJIAO_ZHUANRU_SUB_TIQVE','GRZHYE_diff_GRZHDNGJYE','REAL_DKLL',
'GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
df[num_gen_feats]
def polynomial_features111(df, poly_num=0.15 ,change=0,degreenum=2):
    """
    poly_num:相关性    change:0原始字段不变,1输出新增字段    degreenum:阶数
    """
    num_gen_feats = ['MONTH_GRYJCE_DWYJCE','DKFFE_SUB_DKYE','DKFFE_SUB_DKYE_DKLL','DKFFE_SUB_DKYE_1_DKLL',
    'DKYE_DKLL','DKFFE_DKLL','DKFFE_1_DKLL','GRZHDNGJYE_SUB_GRZHSNJZYE','JIEXI','GRYJCE_TO_GRZHYE',
    'YEAR_GRYJCE_TO_GRZHYE','MONTH_GRYJCE_DWYJCE_TO_GRZHYE','GRZHDNGJYE_TO_GRZHYE',
    'GRZHSNJZYE_TO_GRZHYE','BUJIAO_ZHUANRU_SUB_TIQVE','GRZHYE_diff_GRZHDNGJYE','REAL_DKLL',
    'GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
    
    poly_features = df[:40000][num_gen_feats]
    # Create the polynomial object with specified degree
    poly_transformer = PolynomialFeatures(degree=degreenum, interaction_only=False, include_bias=False)
    poly_transformer.fit(poly_features)
    poly_features = poly_transformer.transform(poly_features)
    # 新特征是否与target有相关性。
    poly_features = pd.DataFrame(poly_features ,
        columns = poly_transformer.get_feature_names(input_features = num_gen_feats)
    )
    poly_features['TARGET'] =df[:40000]['label']
    poly_corrs = poly_features.corr()['TARGET'].sort_values()

    po_temp = []
    for i in range(len(poly_corrs)):
        if(abs(poly_corrs[i])>poly_num and poly_corrs.index[i] not in num_gen_feats + ['TARGET']):
            po_temp.append(poly_corrs.index[i])

    print('相关性>'+str(poly_num)+'的字段为:\n'+str(po_temp)+'\n共有'+str(len(po_temp))+'个')
    
    if(change == 1):
        dfpo = df[num_gen_feats]
        dfpo_transformer = PolynomialFeatures(degree=degreenum, interaction_only=False, include_bias=False)
        dfpo_transformer.fit(dfpo)
        dfpo = dfpo_transformer.transform(dfpo)
        dfpo = pd.DataFrame(dfpo ,
            columns = poly_transformer.get_feature_names(input_features = num_gen_feats)
        )
        # 新的字段拼接到df上
        df = dfpo[po_temp]

    return df,po_temp

# Make a new dataframe for polynomial features
df_poly,poly_field = polynomial_features111(df[num_gen_feats+['label']],poly_num=0.01 ,change=1,degreenum=2)
print(df_poly.shape)
print(len(poly_field))
gbdt_poly_df=[]
df_poly['label']=df['label']
gbdt_poly_df.extend(GBDTselectfea(df_poly[poly_field+['label']],max_num=100))
# 保存上一部处理过的数据到本地
df_poly[gbdt_poly_df].to_csv('D:/df_gbdt_poly_fea.csv',index = False)
df_gbdt_poly_fea= pd.read_csv('D:/df_gbdt_poly_fea.csv')
print(df_gbdt_poly_fea.shape)
print(df_gbdt_poly_fea)

4.8 Log transform of continuous variables
1. Look at the distribution of a numeric variable to check whether it is roughly normal; if not, apply a log transform and inspect it again.
2. If a batch of variables is to be standardized together, the variables that have already been normalized should be set aside first.
3. Why normalize: approximately normal inputs can help some models converge faster, and some models assume normality (e.g. GMM, KNN); at a minimum, avoid strongly skewed features, since heavy skew can hurt the predictions.
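A quick way to quantify whether the log transform helps is to compare the skewness before and after (a sketch using scipy.stats.skew; any of the columns listed below can be substituted for GRZHYE):

from scipy.stats import skew

col = 'GRZHYE'                       # illustrative choice
raw = df[col].dropna()
print('skew before log:', skew(raw))
print('skew after  log:', skew(np.log1p(raw.clip(lower=0))))   # log(x+1); clip guards against negative values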

value_vars = ['GRZHYE','GRJCJS', 'GRYJCE', 'YEAR_GRYJCE', 'MONTH_GRYJCE_DWYJCE', 'YEAR_GRYJCE_DWYJCE',
 'JIEXI','DKYE_DIV_GRYJCE_ADD_DWYJCE','GJJJKBL']
df = Normal_distribution(df, value_vars, 0)

4.9 Concatenate all the features selected above

# 原始字段和一些新字段
print("-----------------原始字段和一些新字段--------------------")
df_little_change = pd.read_csv('D:/df_little_change.csv')
print(df_little_change.shape)
print(df_little_change)
# 二值类数据
print("-----------------二值类数据--------------------")
rizhao_select_cate_2_col = pd.read_csv('D:/rizhao_select_cate_2_col.csv')
rizhao_select_cate_2_col['id'] = df_little_change['id']
print(rizhao_select_cate_2_col.shape)
print(rizhao_select_cate_2_col)
# 多值类数据
print("-----------------多值类数据--------------------")
rizhao_select_cate_col = pd.read_csv('D:/rizhao_select_cate_col.csv')
rizhao_select_cate_col['id'] = df_little_change['id']
print(rizhao_select_cate_col.shape)
print(rizhao_select_cate_col)
# 类别与数值交叉
print("-----------------类别与数值交叉--------------------")
gbdt_select_temp_df2= pd.read_csv('D:/gbdt_select_temp_df2.csv')
gbdt_select_temp_df2['id'] = df_little_change['id']
print(gbdt_select_temp_df2.shape)
print(gbdt_select_temp_df2)
# 数值与数值交叉
print("-----------------数值与数值交叉--------------------")
gbdt_select_num_num_mix= pd.read_csv('D:/gbdt_select_num_num_mix.csv')
gbdt_select_num_num_mix['id'] = df_little_change['id']
print(gbdt_select_num_num_mix.shape)
print(gbdt_select_num_num_mix)
# 多项式数据
print("-----------------多项式数据--------------------")
df_gbdt_poly_fea= pd.read_csv('D:/df_gbdt_poly_fea.csv')
df_gbdt_poly_fea['id'] = df_little_change['id']
print(df_gbdt_poly_fea.shape)
print(df_gbdt_poly_fea)
df = pd.merge(df_little_change,rizhao_select_cate_2_col ,on='id')
print(df.shape)
df = pd.merge(df,rizhao_select_cate_col ,on='id')
print(df.shape)
df = pd.merge(df,gbdt_select_temp_df2 ,on='id')
print(df.shape)
df = pd.merge(df,gbdt_select_num_num_mix ,on='id')
print(df.shape)
df = pd.merge(df,df_gbdt_poly_fea ,on='id')
print(df.shape)
_,missing = pankong(df)
df = fill_kongzhi(df)
_,missing = pankong(df)
print(len(df.columns))
print(len(set(df.columns)))
df=if_field_is_same(df)
df.shape
print(len(df.columns))
print(len(set(df.columns)))
col_temp=[]
for i in range(len(df.columns)):
    print(df.columns[i])
    if(df.columns[i] not in col_temp):
        if(" " in df.columns[i]):
            col_temp.append(df.columns[i].replace(" ", "_*_"))
        else:
            col_temp.append(df.columns[i])
            
print(len(col_temp))
print(col_temp)


df.columns = col_temp
for i in range(len(df.columns)):
    print(df.columns[i])
# 保存上一部处理过的数据到本地
df.to_csv('D:/df_concat.csv',index = False)

5. Model tuning
Approach 1:
Step 1: learning rate and number of iterations

import pandas as pd
import lightgbm as lgb
# from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
    
cols = [col for col in df.columns if col not in ['label','id']]
X=df[:40000][cols]
y=df[:40000]['label']
print(X.shape)
print(y.shape)

X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.2)
params = {    
          'boosting_type': 'gbdt',
          'objective': 'binary',
          'metric': 'auc',
          'nthread':12,
          'learning_rate':0.1,
          'num_leaves':32, 
          'max_depth': 5,   
          'subsample': 0.8, 
          'colsample_bytree': 0.8, 
    }
    
data_train = lgb.Dataset(X_train, y_train)
cv_results = lgb.cv(params, data_train, num_boost_round=1000, nfold=5, stratified=False, shuffle=True, metrics='auc',early_stopping_rounds=50,seed=0)
print('best n_estimators:', len(cv_results['auc-mean']))
print('best cv score:', pd.Series(cv_results['auc-mean']).max())

Step 2: tune max_depth and num_leaves

from sklearn.model_selection import GridSearchCV
params_test1={'max_depth': range(3,8,1), 'num_leaves':range(5, 100, 5)}             
gsearch1 = GridSearchCV(
    estimator=lgb.LGBMClassifier(
        boosting_type='gbdt',objective='binary',metrics='auc',
        learning_rate=0.1, 
        n_estimators=154, 
        max_depth=6, 
        bagging_fraction = 0.8,
        feature_fraction = 0.8), 
    param_grid = params_test1, 
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)

gsearch1.fit(X_train,y_train)
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

Step 3: tune min_data_in_leaf and max_bin

params_test2={'max_bin': range(5,256,10), 'min_data_in_leaf':range(1,102,10)}          
gsearch2 = GridSearchCV(
    estimator = lgb.LGBMClassifier(
        boosting_type='gbdt',objective='binary',
        metrics='auc',
        learning_rate=0.1, 
        n_estimators=154, 
        max_depth=6,     
        num_leaves=30,
        bagging_fraction = 0.8,
        feature_fraction = 0.8),
    param_grid = params_test2, 
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)

gsearch2.fit(X_train,y_train)
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_

Step 4: tune feature_fraction, bagging_fraction and bagging_freq

params_test3={'feature_fraction': [0.65,0.7,0.75,0.8,0.85,0.9,1.0],
              'bagging_fraction': [0.65,0.7,0.75,0.8,0.85,0.9,1.0],
              'bagging_freq': range(0,101,10)}
              
gsearch3 = GridSearchCV(
    estimator = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.1, 
        n_estimators=154, 
        max_depth=6,   
        num_leaves=30,
        max_bin=25,
        min_data_in_leaf=71), 
    param_grid = params_test3, 
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)

gsearch3.fit(X_train,y_train)
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_

Step 5: determine lambda_l1 and lambda_l2

# params_test4={'lambda_l1': [1e-5,1e-3,1e-1,0.0,0.1,0.3,0.5,0.7,0.9,1.0],
#               'lambda_l2': [1e-5,1e-3,1e-1,0.0,0.1,0.3,0.5,0.7,0.9,1.0]}

params_test4={'lambda_l1': [0.8,0.85,0.9,0.95],
              'lambda_l2': [0.8,0.85,0.9,0.95]}
              
gsearch4 = GridSearchCV(
    estimator = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.1, 
        n_estimators=154, 
        max_depth=6,                                         
        num_leaves=30,
        max_bin=25,
        min_data_in_leaf=71,
        bagging_fraction=0.65,
        bagging_freq=0, 
        feature_fraction= 0.8), 
    param_grid = params_test4, 
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
gsearch4.fit(X_train,y_train)
gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_

Step 6: determine min_split_gain

params_test5={'min_split_gain':[0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]}
              
gsearch5 = GridSearchCV(
    estimator = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.1, 
        n_estimators=154, 
        max_depth=6,                                          
        num_leaves=30,
        max_bin=25,
        min_data_in_leaf=71,
        bagging_fraction=0.65,
        bagging_freq= 0, 
        feature_fraction= 0.8,
        lambda_l1=0.9,
        lambda_l2=0.9), 
    param_grid = params_test5, 
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
gsearch5.fit(X_train,y_train)
gsearch5.cv_results_, gsearch5.best_params_, gsearch5.best_score_
# subsample (note: in LightGBM this is an alias of bagging_fraction, which is already fixed above)
params_test6={'subsample':[0.0,0.1,0.2,0.3,0.4]}
              
gsearch6 = GridSearchCV(
    estimator = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.1, 
        n_estimators=154, 
        max_depth=6,                                          
        num_leaves=30,
        max_bin=25,
        min_data_in_leaf=71,
        bagging_fraction=0.65,
        bagging_freq= 0, 
        feature_fraction= 0.8,
        lambda_l1=0.9,
        lambda_l2=0.9,
        min_split_gain=0), 
    param_grid = params_test6, 
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
gsearch6.fit(X_train,y_train)
gsearch6.cv_results_, gsearch6.best_params_, gsearch6.best_score_
# colsample_bytree (an alias of feature_fraction)
params_test7={'colsample_bytree':[0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]}
              
gsearch7 = GridSearchCV(
    estimator = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.1, 
        n_estimators=154, 
        max_depth=6,                                          
        num_leaves=30,
        max_bin=25,
        min_data_in_leaf=71,
        bagging_fraction=0.65,
        bagging_freq= 0, 
        feature_fraction= 0.8,
        lambda_l1=0.9,
        lambda_l2=0.9,
        min_split_gain=0,
        subsample=0), 
    param_grid = params_test7, 
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
gsearch7.fit(X_train,y_train)
gsearch7.cv_results_, gsearch7.best_params_, gsearch7.best_score_ 
# min_child_weight
params_test8={'min_child_weight':[0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]}
              
gsearch8 = GridSearchCV(
    estimator = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.1, 
        n_estimators=154, 
        max_depth=6,                                          
        num_leaves=30,
        max_bin=25,
        min_data_in_leaf=71,
        bagging_fraction=0.65,
        bagging_freq= 0, 
        feature_fraction= 0.8,
        lambda_l1=0.9,
        lambda_l2=0.9,
        min_split_gain=0,
        subsample=0,
        colsample_bytree=0), 
    param_grid = params_test8, 
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
gsearch8.fit(X_train,y_train)
gsearch8.cv_results_, gsearch8.best_params_, gsearch8.best_score_ 

I had little tuning experience and was not sure how the learning rate and number of iterations should be re-tuned once the hyperparameters above were fixed, so I simply wrote a search over both to find roughly good values.

# joint search over learning_rate and n_estimators
params_test9={'learning_rate':[0.02,0.03,0.04,0.05,0.06,0.07,0.08],
             'n_estimators':[1000,2000,5000,8000,10000,20000,30000]}
              
gsearch9 = GridSearchCV(
    estimator = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        max_depth=6,                                          
        num_leaves=30,
        max_bin=25,
        min_data_in_leaf=71,
        bagging_fraction=0.65,
        bagging_freq= 0, 
        feature_fraction= 0.8,
        lambda_l1=0.9,
        lambda_l2=0.9,
        min_split_gain=0,
        subsample=0,
        colsample_bytree=0,
        min_child_weight=0), 
    param_grid = params_test9, 
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
gsearch9.fit(X_train,y_train)
gsearch9.cv_results_, gsearch9.best_params_, gsearch9.best_score_ 

Step 7: lower the learning rate, increase the number of iterations and validate the model. Besides AUC, the loop below also tracks the competition metric via tpr_weight_funtion (defined with the evaluation code in section 6).

auc_list=[]
tpr_list=[]
for j in [0.018,0.019,0.02,0.021,0.022,0.023,0.024,0.025]:
    auc_list=[]
    tpr_list=[]
    for i in [5000,6000,7000,8000,10000,12000,15000,18000,20000,25000]:
        model=lgb.LGBMClassifier(
            learning_rate=j, 
            n_estimators=i, 
            max_depth=6,                                          
            num_leaves=30,
            max_bin=25,
            min_data_in_leaf=71,
            bagging_fraction=0.65,
            bagging_freq= 0, 
            feature_fraction= 0.8,
            lambda_l1=0.9,
            lambda_l2=0.9,
            min_split_gain=0,
            subsample=0,
            colsample_bytree=0,
            min_child_weight=0
        )
        model.fit(X_train,y_train)
        y_pre=model.predict_proba(X_test)[:, 1]
        print("---------------------------------------------------")
        print("learning_rate:"+str(j)+"  "+"n_estimators:"+str(i))
        auc=round(roc_auc_score(y_test,y_pre), 6)
        tpr=round(tpr_weight_funtion(y_test,y_pre), 6)
        if(auc in auc_list and tpr in tpr_list):
            print("---break---")
            break
        auc_list.append(auc)
        tpr_list.append(tpr)
        print("auc:",auc)
        print("tpr:",tpr)
        print("---------------------------------------------------")


# 0.06-10000-0.470364-0.941146
# 0.06-20000-0.470364-0.941146
# 0.05-20000-0.476182-0.941146

Method 2: the same staged search, done by hand with lgb.cv while keeping a running best_params dictionary.

import pandas as pd
import lightgbm as lgb
from sklearn import metrics
# from sklearn.datasets import load_breast_cancer
# from sklearn.cross_validation import train_test_split
 
# canceData=load_breast_cancer()
# X=canceData.data
# y=canceData.target
# X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.2)

cols = [col for col in df.columns if col not in ['label','id']]
X=df[:40000][cols]
y=df[:40000]['label']
print(X.shape)
print(y.shape)
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.2)

### Convert the data into LightGBM Datasets
print('Converting data')
lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,free_raw_data=False)
 
### Initial parameters (before cross-validated tuning)
print('Setting parameters')
params = {
          'boosting_type': 'gbdt',
          'objective': 'binary',
          'metric': 'auc',
          'nthread':4,
          'learning_rate':0.1
          }
 
### Cross-validated tuning
print('Cross-validated tuning')
max_auc = 0.0
best_params = {}
 
# Accuracy
print("Tuning step 1: improve accuracy")
for num_leaves in range(5,100,5):
    for max_depth in range(3,8,1):
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth
 
        cv_results = lgb.cv(
                            params,
                            lgb_train,
                            seed=1,
                            nfold=5,
                            metrics=['auc'],
                            early_stopping_rounds=10,
                            verbose_eval=True
                            )
            
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
            
        if mean_auc >= max_auc:
            max_auc = mean_auc
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth
if 'num_leaves' in best_params and 'max_depth' in best_params:
    params['num_leaves'] = best_params['num_leaves']
    params['max_depth'] = best_params['max_depth']
 
# Overfitting
print("Tuning step 2: reduce overfitting")
for max_bin in range(5,256,10):
    for min_data_in_leaf in range(1,102,10):
            params['max_bin'] = max_bin
            params['min_data_in_leaf'] = min_data_in_leaf
            
            cv_results = lgb.cv(
                                params,
                                lgb_train,
                                seed=1,
                                nfold=5,
                                metrics=['auc'],
                                early_stopping_rounds=10,
                                verbose_eval=True
                                )
                    
            mean_auc = pd.Series(cv_results['auc-mean']).max()
            boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
 
            if mean_auc >= max_auc:
                max_auc = mean_auc
                best_params['max_bin']= max_bin
                best_params['min_data_in_leaf'] = min_data_in_leaf
if 'max_bin' in best_params and 'min_data_in_leaf' in best_params:
    params['min_data_in_leaf'] = best_params['min_data_in_leaf']
    params['max_bin'] = best_params['max_bin']
 
print("调参3:降低过拟合")
for feature_fraction in [0.6,0.7,0.8,0.9,1.0]:
    for bagging_fraction in [0.6,0.7,0.8,0.9,1.0]:
        for bagging_freq in range(0,50,5):
            params['feature_fraction'] = feature_fraction
            params['bagging_fraction'] = bagging_fraction
            params['bagging_freq'] = bagging_freq
            
            cv_results = lgb.cv(
                                params,
                                lgb_train,
                                seed=1,
                                nfold=5,
                                metrics=['auc'],
                                early_stopping_rounds=10,
                                verbose_eval=True
                                )
                    
            mean_auc = pd.Series(cv_results['auc-mean']).max()
            boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
 
            if mean_auc >= max_auc:
                max_auc=mean_auc
                best_params['feature_fraction'] = feature_fraction
                best_params['bagging_fraction'] = bagging_fraction
                best_params['bagging_freq'] = bagging_freq
 
if 'feature_fraction' in best_params and 'bagging_fraction' in best_params and 'bagging_freq' in best_params:
    params['feature_fraction'] = best_params['feature_fraction']
    params['bagging_fraction'] = best_params['bagging_fraction']
    params['bagging_freq'] = best_params['bagging_freq']
 
 
print("调参4:降低过拟合")
for lambda_l1 in [1e-5,1e-3,1e-1,0.0,0.1,0.3,0.5,0.7,0.9,1.0]:
    for lambda_l2 in [1e-5,1e-3,1e-1,0.0,0.1,0.4,0.6,0.7,0.9,1.0]:
        params['lambda_l1'] = lambda_l1
        params['lambda_l2'] = lambda_l2
        cv_results = lgb.cv(
                            params,
                            lgb_train,
                            seed=1,
                            nfold=5,
                            metrics=['auc'],
                            early_stopping_rounds=10,
                            verbose_eval=True
                            )
                
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
 
        if mean_auc >= max_auc:
            max_auc=mean_auc
            best_params['lambda_l1'] = lambda_l1
            best_params['lambda_l2'] = lambda_l2
if 'lambda_l1' in best_params and 'lambda_l2' in best_params:
    params['lambda_l1'] = best_params['lambda_l1']
    params['lambda_l2'] = best_params['lambda_l2']
 
print("调参5:降低过拟合2")
for min_split_gain in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
    params['min_split_gain'] = min_split_gain
    
    cv_results = lgb.cv(
                        params,
                        lgb_train,
                        seed=1,
                        nfold=5,
                        metrics=['auc'],
                        early_stopping_rounds=10,
                        verbose_eval=True
                        )
            
    mean_auc = pd.Series(cv_results['auc-mean']).max()
    boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
 
    if mean_auc >= max_auc:
        max_auc=mean_auc
        
        best_params['min_split_gain'] = min_split_gain
if 'min_split_gain' in best_params.keys():
    params['min_split_gain'] = best_params['min_split_gain']
 
print(best_params)
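
Method 2 stops at printing best_params. For completeness, a minimal sketch (not part of the original write-up) of training a final booster with the tuned dictionary and scoring it on the held-out split; the 2000 rounds and the early-stopping patience of 50 are assumed placeholders, not values reported in the original run:

# Train with the tuned parameter dict; round count and early stopping are placeholders
final_booster = lgb.train(
    params,
    lgb_train,
    num_boost_round=2000,
    valid_sets=[lgb_eval],
    early_stopping_rounds=50
)
y_pred = final_booster.predict(X_test, num_iteration=final_booster.best_iteration)
print('holdout AUC:', metrics.roc_auc_score(y_test, y_pred))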

Method 3: Bayesian optimization

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error,  make_scorer, accuracy_score
from bayes_opt import BayesianOptimization
from sklearn.metrics import f1_score
answers = []
mean_score = 0
mean_f1_score = 0
n_folds = 5
sk = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=1024)

cols = [col for col in df.columns if col not in ['label','id']]
# X=df[:40000][cols]
# y=df[:40000]['label']

# StratifiedKFold is used here only to carve out a stratified subset: after the loop,
# X and y hold the training portion of the last fold (about 80% of the labelled rows)
for tr, te in sk.split(df[:40000][cols], df[:40000]['label']):
    X = df[:40000][cols].iloc[tr]
    y = df[:40000]['label'].iloc[tr]

print(X.shape)
print(y.shape)

# Define the objective function for the parameters being optimized
def rf_cv(n_estimators,learning_rate):
    val = cross_val_score(
        LGBMClassifier(
            learning_rate=min(learning_rate,0.15),
            n_estimators=int(n_estimators), 

#             boosting_type='dart',                         # boosting type: gbdt, dart, goss or rf
#             learning_rate=min(learning_rate,0.4),         # 0.05->0.918   0.07->0.924   0.08->0.926
#             n_estimators=int(n_estimators),               # number of boosting rounds, i.e. number of weak learners
#             max_depth=int(max_depth),                     # maximum tree depth; -1 means unlimited
#             num_leaves=int(num_leaves),                   # maximum leaves per tree, the main complexity knob (roughly 2^max_depth in xgboost terms)
#             subsample = min(subsample,0.9),               # row (sample) subsampling ratio
#             colsample_bytree = min(colsample_bytree,0.9), # column (feature) subsampling ratio
#             random_state=int(random_state),               # random seed
#             min_data_in_leaf=int(min_data_in_leaf),       # guards against overfitting in the leaves; best value depends on sample size and num_leaves
#             reg_alpha= min(reg_alpha,0.999),
#             reg_lambda= min(reg_lambda,0.999),
#             lambda_l1= 0.1,   # 0.1
#             lambda_l2=0.2,  # 0.2
#             min_split_gain=min(min_split_gain,0.9),
#             min_child_weight=min(min_child_weight,0.9),
#             metric='auc',                                 # evaluation metric: "rmse", "auc", 'binary_logloss'
            n_jobs=6,              # number of threads to run in parallel
            verbose=-1
        ),
        X,y,scoring="accuracy",cv=5
    ).mean()
    return val

# Bayesian optimization over the search space below
rf_bo = BayesianOptimization(rf_cv,
     { 
         "n_estimators":(1000,20000),
         "learning_rate":(0.001,0.1)
#          "colsample_bytree":(0.85,0.97),
#          "min_data_in_leaf":(100,2000)
#          "subsample":(0.7,0.9),
#          "max_depth":(25,40),
#          "num_leaves":(31,35)
#          "reg_alpha":(0.2,0.5),
#          "reg_lambda":(0.3,0.5),
#          "lambda_l1":(0.6,0.95),
#          "lambda_l2":(0.5,0.8),
#          "random_state":(0,1024),
#          "min_split_gain":(0.2,0.6),
#          "min_child_weight":(0.6,0.9)
     })
# Run the optimization
num_iter = 100
init_points = 5
rf_bo.maximize(init_points=init_points, n_iter=num_iter)  # run the search; the best result is in rf_bo.max below

rf_bo.max
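
rf_bo.max holds the best target value together with the corresponding parameters as floats. A small sketch (the name tuned_clf is introduced here, not in the original code) of feeding them back into a classifier:

# rf_bo.max looks like {'target': best_cv_score, 'params': {'learning_rate': ..., 'n_estimators': ...}}
best_params_bo = rf_bo.max['params']
tuned_clf = LGBMClassifier(
    learning_rate=min(best_params_bo['learning_rate'], 0.15),  # same clipping as inside rf_cv
    n_estimators=int(best_params_bo['n_estimators']),          # cast the searched float back to int
    n_jobs=6,
    verbose=-1
)
tuned_clf.fit(X, y)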

After the steps above the data had 55000 rows and 1971 columns. The tuned model did not perform well, in fact worse than an earlier run that used the raw data with Bayesian optimization.
So another round of feature selection was done at this point.

# GBDT-based feature selection, done in chunks of 400 columns
gbdt_col=[]
for i in tqdm(range(int(len(df.columns)/400+1))):
    temp_col=[]
    temp_col.extend(df.columns[i*400:i*400+400])
    if(i<int(len(df.columns)/400)):
        gbdt_col.extend(GBDTselectfea(df[temp_col+['label']],max_num=300))
    elif(i==int(len(df.columns)/400)):
        gbdt_col.extend(GBDTselectfea(df[temp_col],max_num=250))
print(len(gbdt_col))
print(gbdt_col)
# Recursive feature elimination (RFECV), done in chunks of 200 columns
rfecv_col=[]
for i in tqdm(range(int(len(df.columns)/200+1))):
    temp_col=[]
    temp_col.extend(df.columns[i*200:i*200+200])
    rfecv_col.extend(clf_rfecv(df[:40000][temp_col+['label']],temp_col+['label'],5))
print(len(rfecv_col))
print(rfecv_col)
gbdt_col.extend(['id','label'])
gbdt_rfecv_col=list(set(rfecv_col).union(set(gbdt_col)))
print(len(gbdt_rfecv_col))
df=df[gbdt_rfecv_col]
print(df.shape)
print(df)
# Save the selected feature set to disk
df.to_csv('D:/df_rfecv.csv',index = False)
# Load the saved data (to resume from this point later)
df= pd.read_csv('D:/df_rfecv.csv')
print(df.shape)
print(df)

6. Training
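
The block below refers to train_df, test_df, cols and submit, which are not constructed in this post. A minimal sketch of how they could be derived from the merged df (the first 40000 rows are the labelled training users, the remaining 15000 the test users), assuming the original row order has been preserved:

# Hypothetical setup for the training loop below
train_df = df[:40000].reset_index(drop=True)
test_df = df[40000:].reset_index(drop=True)
cols = [col for col in df.columns if col not in ['label', 'id']]
submit = pd.DataFrame()   # will receive the id / label columns for the submission file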

oof = np.zeros(train_df.shape[0])
# feat_imp_df = pd.DataFrame({'feat': cols, 'imp': 0})
test_df['prob'] = 0
clf = LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    learning_rate=0.015,
    n_estimators=6500,
#     metrics='auc',
    max_depth=6,                                          
    num_leaves=30,
    max_bin=25,
    min_data_in_leaf=71,
    bagging_fraction=0.65,
    bagging_freq= 0, 
    feature_fraction= 0.8,
    lambda_l1=0.9,
    lambda_l2=0.9,
    min_split_gain=0,
    metric=None,
    n_jobs=6,              # number of threads to run in parallel
    verbose=-1
)

val_aucs = []
seeds = [1023, 2048, 2098]
for seed in seeds:
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    for i, (trn_idx, val_idx) in enumerate(skf.split(train_df, train_df['label'])):
        print('--------------------- {} fold ---------------------'.format(i))
        t = time.time()
        trn_x, trn_y = train_df[cols].iloc[trn_idx].reset_index(drop=True), train_df['label'].values[trn_idx]
        val_x, val_y = train_df[cols].iloc[val_idx].reset_index(drop=True), train_df['label'].values[val_idx]
        clf.fit(
            trn_x, trn_y,
            eval_set=[(val_x, val_y)],
    #         categorical_feature=cate_cols,
            eval_metric='auc',
            early_stopping_rounds=200,
            verbose=200
        )
    #     feat_imp_df['imp'] += clf.feature_importances_ / skf.n_splits
        oof[val_idx] = clf.predict_proba(val_x)[:, 1]
        test_df['prob'] += clf.predict_proba(test_df[cols])[:, 1] / skf.n_splits / len(seeds)

    cv_auc = roc_auc_score(train_df['label'], oof)
    val_aucs.append(cv_auc)
    print('\ncv_auc: ', cv_auc)
print(val_aucs, np.mean(val_aucs))

Evaluation metric: TPR. The competition score is a weighted true positive rate: the TPR at false positive rates of 0.001, 0.005 and 0.01, combined with weights 0.4, 0.3 and 0.3.

def tpr_weight_funtion(y_true,y_predict):
    d = pd.DataFrame()
    d['prob'] = list(y_predict)
    d['y'] = list(y_true)
    d = d.sort_values(['prob'], ascending=[0])   # sort by predicted probability, descending
    y = d.y
    PosAll = pd.Series(y).value_counts()[1]      # total positives
    NegAll = pd.Series(y).value_counts()[0]      # total negatives
    pCumsum = d['y'].cumsum()                    # positives captured at each cutoff
    nCumsum = np.arange(len(y)) - pCumsum + 1    # negatives captured at each cutoff
    pCumsumPer = pCumsum / PosAll                # TPR at each cutoff
    nCumsumPer = nCumsum / NegAll                # FPR at each cutoff
    TR1 = pCumsumPer[abs(nCumsumPer-0.001).idxmin()]   # TPR where FPR is closest to 0.001
    TR2 = pCumsumPer[abs(nCumsumPer-0.005).idxmin()]   # TPR where FPR is closest to 0.005
    TR3 = pCumsumPer[abs(nCumsumPer-0.01).idxmin()]    # TPR where FPR is closest to 0.01

    return 0.4 * TR1 + 0.3 * TR2 + 0.3 * TR3
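
As a hypothetical cross-check (not part of the original pipeline), the same weighted-TPR score can be approximated from sklearn's ROC curve; results may differ slightly because this version interpolates the curve instead of picking the nearest cutoff:

from sklearn.metrics import roc_curve

def tpr_weight_sklearn(y_true, y_predict):
    # TPR interpolated at the three fixed FPR levels, combined with the competition weights
    fpr, tpr, _ = roc_curve(y_true, y_predict)
    tr1, tr2, tr3 = np.interp([0.001, 0.005, 0.01], fpr, tpr)
    return 0.4 * tr1 + 0.3 * tr2 + 0.3 * tr3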

tpr = round(tpr_weight_funtion(train_df['label'], oof), 6)
tpr, round(np.mean(val_aucs), 5)
# print(test_df)
submit['id'] = test_df['id']
submit['label'] = test_df['prob']

submit.to_csv('D:/submit12.csv', index = False)
submit.head()

(image placeholder)
This was my second time entering a competition. Looking back at my first attempt, when I understood almost nothing, I learned a lot more about the relevant techniques this time. Going forward I want to keep strengthening the fundamentals, follow more competitions, and combine them with my own research direction. Many of the ideas above are my own; I am not certain of their correctness or underlying principles, nor whether combining all of these methods actually gives the best result. Comments and corrections are very welcome.
