Shandong Province 2nd Data Application Innovation Competition (Rizhao Division) - Housing Provident Fund Loan Overdue Prediction - Post-Competition Summary

Task
Starting from a real-world scenario and application, participants are asked to build an accurate risk-control model that uses an individual's basic identity information together with their housing provident fund contribution and loan records to predict whether the user will repay late.

Submission notes:

  1. Results
    Submit a CSV file encoded in UTF-8 with a header row, for example:
    id,label
    1,0.556
    2,0.987

    Note: for the label field, values closer to 0 indicate no overdue payment and values closer to 1 indicate an overdue payment.
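    As a reference, a minimal sketch of producing a file in this format (assuming an already fitted binary classifier `clf` with predict_proba, a test feature frame `test_features`, and the matching `test_ids`; all three names are placeholders):

    import pandas as pd

    # `clf`, `test_features` and `test_ids` are hypothetical objects from an earlier training step
    probs = clf.predict_proba(test_features)[:, 1]                 # probability of label 1 (overdue)
    submission = pd.DataFrame({'id': test_ids, 'label': probs})
    submission.to_csv('submit.csv', index=False, encoding='utf-8')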

Data
The training set provides 40,000 contributors and the test set 15,000, each with basic personal information, contribution records and loan records. Participants can download the data, debug their algorithms locally, and submit results on the competition page.
A sample of the data (shown as a screenshot in the original post):

1. Imports

import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# export CUDA_VISIBLE_DEVICES=0
# Print which GPU ids are visible (the value of CUDA_VISIBLE_DEVICES)
print(os.environ['CUDA_VISIBLE_DEVICES'])

import warnings
import numpy as np
import pandas as pd
# import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,precision_recall_fscore_support,roc_curve,auc,roc_auc_score
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt
warnings.filterwarnings("ignore")
plt.rcParams['font.sans-serif']=['Simhei']
plt.rcParams['axes.unicode_minus']=False
import json
import matplotlib 
from scipy.stats import chi2
import scipy
import seaborn as sns

from lightgbm.sklearn import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
# from xgboost import XGBClassifier
import time
from sklearn.ensemble import GradientBoostingClassifier
# from lightgbm import LGBMClassifier
from tqdm import tqdm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier

2. Utility functions
2.1 Find the columns most correlated with the label

def relation(df, poly_num=0.15):
    """
    Uses DataFrame.corr(method=..., min_periods=1).
    method: one of {'pearson', 'kendall', 'spearman'}
        pearson:  Pearson correlation; measures linear association, so it is biased for non-linear data.
        kendall:  rank correlation suited to categorical/ordinal (unordered) and non-normal data.
        spearman: rank correlation for non-linear, non-normally distributed data.
    min_periods: minimum number of observations required.
    Returns the union of columns whose |correlation| with 'label' exceeds poly_num under any of the three methods.
    """
    all_cate_2_col=[]
    method=['pearson','kendall','spearman']
    for m in method:
        poly_corrs = df[:40000].corr(method=m)['label'].sort_values()
        po_temp = []
        for i in range(len(poly_corrs)):
            if(abs(poly_corrs[i])>poly_num and poly_corrs.index[i] not in ['label']):
                po_temp.append(poly_corrs.index[i])      
        print(str(m)+'相关性>'+str(poly_num)+'的字段为:\n'+str(po_temp)+'\n共有'+str(len(po_temp))+'个')
        
        #取并集
        all_cate_2_col=list(set(all_cate_2_col).union(set(po_temp)))
        print(len(all_cate_2_col))
#         print(all_cate_2_col)
        
    return all_cate_2_col
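A usage sketch (assuming the combined train+test frame df built in section 3, whose first 40,000 rows carry the label, and any candidate column list such as num_cols):

# Columns whose |correlation| with the label exceeds 0.1 under at least one of the three methods
strong_cols = relation(df[num_cols + ['label']], poly_num=0.1)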

2.2 Recursive feature elimination with cross-validation (RFECV) for feature selection

# Recursive feature elimination with cross-validation (RFECV)
def clf_rfecv(df,cate_2_cols,rank_num=1): 
    # If no candidate columns are passed in, fall back to every column except id and label
    if not cate_2_cols:
        cate_2_cols = [col for col in df.columns if col not in ['id', 'label']]
    X=df[:40000][cate_2_cols]
    y=df[:40000]['label']
    print(X.shape)
    print(y.shape)
    # RFECV
    clf_rfecv = LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        learning_rate=0.1, 
        n_estimators=154, 
        max_depth=6,                                          
        num_leaves=30,
        max_bin=25,
        min_data_in_leaf=71,
        bagging_fraction=0.65,
        bagging_freq= 0, 
        feature_fraction= 0.8,
        lambda_l1=0.9,
        lambda_l2=0.9
    )

    from sklearn.model_selection import StratifiedKFold
    from sklearn.feature_selection import RFECV
    rfecv = RFECV(
        estimator=clf_rfecv,          # 学习器
        step=1,                 # 移除特征个数
        cv=StratifiedKFold(5),  # 交叉验证次数
        scoring='accuracy',     # 学习器的评价标准
        verbose = 1,
        n_jobs = 12
        ).fit(X, y)
    X_RFECV = rfecv.transform(X)
    print("RFECV特征选择结果——————————————————————————————————————————————————")
    # 和传参对应,所选择的属性的个数
    print("有效特征个数: \n"+str(rfecv.n_features_))
    #     # 打印的是相应位置上属性的排名
    #     print("全部特征等级: \n"+str(rfecv.ranking_))
    #     # 属性选择的一种模糊表示,选择的是true,未选择的是false
    #     print(rfecv.support_)
    rfecv_cate_2_col=[]
    for i in range(len(cate_2_cols)):
        if(rfecv.ranking_[i]<=rank_num):
            print(cate_2_cols[i])
            rfecv_cate_2_col.append(cate_2_cols[i])
    print(len(rfecv_cate_2_col))
#     
    return rfecv_cate_2_col

2.3 Find columns with little variation in their values

# 数值类型较少的数据
def find_weak_filed(df):
    weak_filed=[]
    for i in range(len(df.columns)):
#         print('-------------'+str(df.columns[i])+'---------------')
        else_sum=0
        for j in range(1,len(df[df.columns[i]].value_counts().index)):
            else_sum=else_sum+df[df.columns[i]].value_counts().values[j]
#         print(else_sum)
        if(else_sum<=50):
            weak_filed.append(df.columns[i])

    return weak_filed

2.4 Identify single-value, binary, multi-class and continuous columns

def find_filed_class(df,n=20):
    cate_1_cols=[]
    cate_2_cols=[]
    cate_cols=[]
    num_cols1=[]
    for i in tqdm(range(len(df.columns))):
        if(len(df[df.columns[i]].value_counts().index)==1):
            cate_1_cols.append(df.columns[i])
        if(len(df[df.columns[i]].value_counts().index)==2 and df.columns[i]!='label'):
            cate_2_cols.append(df.columns[i])
        elif(2<len(df[df.columns[i]].value_counts().index)<=n and df.columns[i]!='DKLL'):
            cate_cols.append(df.columns[i])
        elif(len(df[df.columns[i]].value_counts().index)>n and df.columns[i]!='id'):
            num_cols1.append(df.columns[i])
    print(len(cate_1_cols))
    print(len(cate_2_cols))
    print(len(cate_cols))
    print(len(num_cols1))
    return cate_1_cols,cate_2_cols,cate_cols,num_cols1

2.5 Feature selection via GBDT feature importance

# How does GBDT measure feature importance?
# It sums, over all internal (non-leaf) nodes, the weighted decrease in impurity produced by each split;
# the more a feature reduces impurity in total, the more important it is.
# The impurity decrease is exactly the gain of that split, so features whose splits yield larger gains
# receive higher importance scores.
# Tree-based feature selection: GBDT can serve as the base model. Combining sklearn's
# feature_selection.SelectFromModel with a GBDT model selects features as follows:

# Feature selection with GBDT as the base model
def GBDTselectfea(df,max_num=200):
    cols = [col for col in df.columns if col not in ['id', 'label']]
    X=df[:40000][cols]
    y=df[:40000]['label']
    print(X.shape)
    print(y.shape)
    if(X.shape[1]<max_num):
        max_num=X.shape[1]
    grd = SelectFromModel(GradientBoostingClassifier(),max_features=max_num)
    grd.fit_transform(X,y)
    # print(grd.estimator_.feature_importances_)
    gbdt_fea_select = grd.get_support()   # boolean mask of the selected columns
    gbdt_select=[]
    print(gbdt_fea_select)
    for i in range(len(gbdt_fea_select)):
    #     print(gbdt_fea_select[i])
    #     print(X.columns)
        if(gbdt_fea_select[i]==True):
            print(X.columns[i])
            gbdt_select.append(X.columns[i])
    #     print(gbdt_select)
    print(len(gbdt_select))
    return gbdt_select

2.6 Build polynomial features and keep those with high correlation to the label

def polynomial_features(df, poly_num=0.15 ,change=0,degreenum=2):
    """
    poly_num:相关性    change:0原始字段不变,1输出新增字段    degreenum:阶数
    """
    num_gen_feats = ['YEAR_GRYJCE', 'MONTH_GRYJCE_DWYJCE', 'YEAR_GRYJCE_DWYJCE',
'DKYE_TO_DKFFE', 'DKFFE_SUB_DKYE', 'DKFFE_SUB_DKYE_TO_DKFFE', 'WEIHUAN_TO_YIHUAN',
'REAL_DKLL','DKFFE_SUB_DKYE_DKLL','DKFFE_SUB_DKYE_1_DKLL','DKYE_DKLL','DKYE_1_DKLL','DKFFE_DKLL',
'DKFFE_1_DKLL','GRZHDNGJYE_SUB_YEAR_GRYJCE_DWYJCE',
'GRZHDNGJYE_SUB_YEAR_GRYJCE','GRZHDNGJYE_SUB_GRZHSNJZYE','JIEXI','BUJIAO_ZHUANRU_SUB_TIQVE',
'GRYJCE_TO_GRZHYE','YEAR_GRYJCE_TO_GRZHYE','MONTH_GRYJCE_DWYJCE_TO_GRZHYE','GRZHDNGJYE_TO_GRZHYE',
'GRZHSNJZYE_TO_GRZHYE','BUJIAO_ZHUANRU_SUB_TIQVE','JIEXI_TO_GRZHDNGJYE',
'GJJJKBL','GRJCJS_TO_DKYE','GRJCJS_TO_DKFFE','GRJCJS_TO_GRZHDNGJYE','GRJCJS_TO_GRZHSNJZYE',
'GRJCJS_TO_GRZHYE','DKYE_DIV_GRYJCE_ADD_DWYJCE','GRYJCE_ADD_DWYJCE_TO_DKYE','GRZHYE_diff_GRZHDNGJYE',
'GRZHYE_diff_GRZHSNJZYE','GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
    
    poly_features = df[:40000][num_gen_feats]
    # Create the polynomial object with specified degree
    poly_transformer = PolynomialFeatures(degree=degreenum, interaction_only=False, include_bias=False)
    poly_transformer.fit(poly_features)
    poly_features = poly_transformer.transform(poly_features)
    # 新特征是否与target有相关性。
    poly_features = pd.DataFrame(poly_features ,
        columns = poly_transformer.get_feature_names(input_features = num_gen_feats)
    )
    poly_features['TARGET'] =df[:40000]['label']
    poly_corrs = poly_features.corr()['TARGET'].sort_values()

    po_temp = []
    for i in range(len(poly_corrs)):
        if(abs(poly_corrs[i])>poly_num and poly_corrs.index[i] not in num_gen_feats + ['TARGET']):
            po_temp.append(poly_corrs.index[i])

    print('相关性>'+str(poly_num)+'的字段为:\n'+str(po_temp)+'\n共有'+str(len(po_temp))+'个')
    
    if(change == 1):
        dfpo = df[num_gen_feats]
        dfpo_transformer = PolynomialFeatures(degree=degreenum, interaction_only=False, include_bias=False)
        dfpo_transformer.fit(dfpo)
        dfpo = dfpo_transformer.transform(dfpo)
        dfpo = pd.DataFrame(dfpo ,
            columns = poly_transformer.get_feature_names(input_features = num_gen_feats)
        )
        # 新的字段拼接到df上
        df = dfpo[po_temp]

    return df

2.7 Plot the distribution of continuous variables and of their log transform

def Normal_distribution(df, value_vars, change=0):
    """
    value_vars:需要查看的字段       change:0不变,1变log,2新增log
    """
    for i in tqdm(range(len(value_vars))):
        plt.figure(figsize=(16,5))
        plt.suptitle(str(value_vars[i])+'Distribution', fontsize=10)

        plt.subplot(1,2,1)
        sub_plot_1 = sns.distplot(df[value_vars[i]])
        sub_plot_1.set_title(str(value_vars[i])+" Distribuition", fontsize=10)
        sub_plot_1.set_xlabel("数值")
        sub_plot_1.set_ylabel("Probability", fontsize=10)

        plt.subplot(1,2,2)    
        sub_plot_2 = sns.distplot(np.log(df[value_vars[i]]+1))
        sub_plot_2.set_title(str(value_vars[i])+"(Log) Distribuition", fontsize=10)
        sub_plot_2.set_xlabel("数值")
        sub_plot_2.set_ylabel("Probability", fontsize=10)
    if(change==1):
        for i in tqdm(range(len(value_vars))):
            df[value_vars[i]] = np.log(df[value_vars[i]]+1)
    if(change==2):
        for i in tqdm(range(len(value_vars))):
            df[str(value_vars[i])+'_log'] = np.log(df[value_vars[i]]+1)
        return df
    return df

2.8 Detect and drop columns whose values are completely identical

#查看是否存在重复的行列
def if_field_is_same(df):
    all_df_cols = df.columns
    hight = len(df)
    del_filed=[]
#     # 删除存在重复的行
#     print('是否存在重复行: ',any(df.duplicated()))
#     if(any(df.duplicated())==True):
#         df.drop_duplicates(inplace = True)
    # 检查是否存在重复的列
    for i in tqdm(range(0,len(all_df_cols)-1,1)):
#       print("---------"+str(all_df_cols[i])+"---------")
        if(all_df_cols[i] not in del_filed):
            for j in range(i+1,len(all_df_cols),1):
                same = True
                for k in range(hight):
                    if(df[all_df_cols[i]][k]!=df[all_df_cols[j]][k]):
                        same = False
                        break
                if(same):
                    # column j is identical to column i
                    del_filed.append(all_df_cols[j])
    if(len(del_filed)==0):
        print('是否存在重复列: 否')
    else:
        del_filed = set(del_filed)
        print('存在重复列: '+str(len(del_filed))+'个\n为:'+str(del_filed))
        df=df.drop(del_filed,axis=1)
    return df
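The nested loops above are O(columns² × rows). For reference, a much faster pandas-only sketch of the same idea (note that NaN handling differs slightly: duplicated() treats equal NaN patterns as equal):

def drop_duplicate_columns(df):
    # Columns with identical values become duplicated rows after transposing
    dup_mask = df.T.duplicated()
    if dup_mask.any():
        print('Duplicate columns: ' + str(list(df.columns[dup_mask])))
    return df.loc[:, ~dup_mask]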

2.9 Scatter plots of variable value distributions

# Scatter plot of each column's values against the row index (relies on the global df)
def shuzhifenbu(cols, high=40):
    for i in range(len(cols)):
        plt.figure(figsize=(15,high))
        print(str(cols[i])+"的数值分布")
        plt.subplot(len(cols), 1, i+1)
        plt.title(cols[i])
        x = df[cols[i]]
        y = df.index
        plt.scatter(x, y , s=1)
        plt.show()
    return

2.10 Chi-square binning (ChiMerge)

# Chi-square binning
# Compute the chi-square statistic
def chi3(arr):
    '''
    Compute the chi-square statistic.
    arr: frequency (contingency) table as a 2-D numpy array.
    '''
    assert(arr.ndim==2)
    #计算每行总频数
    R_N = arr.sum(axis=1)
    #每列总频数
    C_N = arr.sum(axis=0)
    #总频数
    N = arr.sum()
    # 计算期望频数 C_i * R_j / N。
    E = np.ones(arr.shape)* C_N / N
    E = (E.T * R_N).T
    square = (arr-E)**2 / E
    #期望频数为0时,做除数没有意义,不计入卡方值
    square[E==0] = 0
    #卡方值
    v = square.sum()
    return v
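A quick numeric check of chi3 on a small contingency table; the value matches the Pearson chi-square statistic from scipy.stats.chi2_contingency with correction=False:

from scipy.stats import chi2_contingency

table = np.array([[10, 5],
                  [20, 30]])
print(chi3(table))                                   # ≈ 3.30
print(chi2_contingency(table, correction=False)[0])  # same statistic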

# Determine the chi-square bin boundaries
def chiMerge(df,col,target,max_groups=None,threshold=None): 
    '''
    Chi-square (ChiMerge) binning.
    df: pandas DataFrame
    col: name of the (numeric) column to bin
    target: name of the class label column
    max_groups: maximum number of groups
    threshold: chi-square threshold; if max_groups is not given, it defaults to the 95% confidence level.
    return: list containing the starting value of each group.
    '''
    freq_tab = pd.crosstab(df[col],df[target])
    #转成numpy数组用于计算。
    freq = freq_tab.values
    #初始分组切分点,每个变量值都是切分点。每组中只包含一个变量值.
    #分组区间是左闭右开的,如cutoffs = [1,2,3],则表示区间 [1,2) , [2,3) ,[3,3+)。
    cutoffs = freq_tab.index.values
    #如果没有指定最大分组
    if max_groups is None:     
        #如果没有指定卡方阈值,就以95%的置信度(自由度为类数目-1)设定阈值。
        if threshold is None:
            #类数目
            cls_num = freq.shape[-1]
            threshold = chi2.isf(0.05,df= cls_num - 1)
    while True:
        minvalue = None
        minidx = None
        #从第1组开始,依次取两组计算卡方值,并判断是否小于当前最小的卡方
        for i in range(len(freq) - 1):
            v = chi3(freq[i:i+2])
            if minvalue is None or (minvalue > v): #小于当前最小卡方,更新最小值
                minvalue = v
                minidx = i
        # Merging stops when only one group is left
        if minvalue is None:
            break
        # If there are still too many groups, or the smallest chi-square is below the threshold,
        # merge the two adjacent groups with the smallest chi-square and continue
        if  (max_groups is not None and  max_groups< len(freq) ) or (threshold is not None and minvalue < threshold):
            #minidx后一行合并到minidx
            tmp  = freq[minidx] + freq[minidx+1]
            freq[minidx] = tmp
            #删除minidx后一行
            freq = np.delete(freq,minidx+1,0)
            #删除对应的切分点
            cutoffs = np.delete(cutoffs,minidx+1,0)
        else: #最小卡方值不小于阈值,停止合并。
            break
    return cutoffs

# 生成分组后的新变量
def value2group(x,cutoffs):
    '''
    将变量的值转换成相应的组。
    x: 需要转换到分组的值
    cutoffs: 各组的起始值。
    return: x对应的组,如group1。从group1开始。
    '''
    #切分点从小到大排序。
    cutoffs = sorted(cutoffs)
    num_groups = len(cutoffs)
    #异常情况:小于第一组的起始值。这里直接放到第一组。
    #异常值建议在分组之前先处理妥善。
    if x < cutoffs[0]:
        return 'group1'
    for i in range(1,num_groups):
        if cutoffs[i-1] <= x < cutoffs[i]:
            return 'group{}'.format(i)
    #最后一组,也可能会包括一些非常大的异常值。
    return 'group{}'.format(num_groups)

# 实现WOE 编码
def calWOE(df ,var ,target):
    '''
    计算WOE编码
    param df:数据集pandas.dataframe
    param var:已分组的列名,无缺失值
    param target:响应变量(0,1)
    return:编码字典
    '''
    eps = 0.000001  #避免除以0
    gbi = pd.crosstab(df[var],df[target]) + eps
    gb = df[target].value_counts() + eps
    gbri = gbi/gb
    gbri['woe'] = np.log(gbri[1]/gbri[0])
    return gbri['woe'].to_dict()

# 实现IV值计算
def calIV(df,var,target):
    '''
    计算IV值
    param df:数据集pandas.dataframe
    param var:已分组的列名,无缺失值
    param target:响应变量(0,1)
    return:IV值
    '''
    eps = 0.000001  #避免除以0 
    gbi = pd.crosstab(df[var],df[target]) + eps
    gb = df[target].value_counts() + eps
    gbri = gbi/gb
    gbri['woe'] = np.log(gbri[1]/gbri[0])
    gbri['iv'] = (gbri[1] - gbri[0])*gbri['woe']
    return gbri['iv'].sum()
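These binning helpers are defined here but not called elsewhere in this post; a minimal usage sketch (assuming the raw training frame train_df with the numeric column 'GRJCJS' and the 'label' column; ChiMerge can be slow on columns with many distinct values):

cutoffs = chiMerge(train_df, 'GRJCJS', 'label', max_groups=6)                  # chi-square bin edges
train_df['GRJCJS_bin'] = train_df['GRJCJS'].apply(value2group, cutoffs=cutoffs)
woe_map = calWOE(train_df, 'GRJCJS_bin', 'label')                              # bin -> WOE value
train_df['GRJCJS_woe'] = train_df['GRJCJS_bin'].map(woe_map)
print('IV:', calIV(train_df, 'GRJCJS_bin', 'label'))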

2.11 Compute feature_importances_ with LGBMClassifier

# Keep the columns whose LightGBM feature importance is greater than 0
def important_featrue(pre_train, pre_train_label):
    svc = LGBMClassifier(
        boosting_type='dart',   #提升树的类型,常用的梯度提升方法包括gbdt、dart、goss、rf。
        learning_rate=0.23,      #0.05->0.918     0.07->0.924    0.08->0.926
        n_estimators=150,      #拟合的树的棵树,可以理解为训练的轮数。弱学习器的个数,其中gbdt原理是利用通过梯度不断拟合新的弱学习器,直到达到设定的弱学习器的数量。
        max_depth=31,           #最大树的深度。每个弱学习器也就是决策树的最大深度。其中,-1表示不限制。
        num_leaves=1053,          #树的最大叶子数,控制模型复杂性的最重要参数之一。对比在xgboost中,一般为2^(max_depth)
        subsample=0.2707,         #训练样本采样率,行
        colsample_bytree=0.95,   #训练特征采样率,列
        random_state=6,     #随机种子数
        min_data_in_leaf=124,        #     可防止在叶子树中过度拟合,最佳值取决于训练样本和的数量num_leaves
        reg_alpha= 0.2462,
        reg_lambda=0.3140,
    #     lambda_l1= 0.89,   # 0.1
    #     lambda_l2=0.69,  # 0.2
        min_split_gain=0.22,
        min_child_weight=0.84,
        metric='auc',#模型度量标准,"rmse"、"auc"、'binary_logloss'
        n_jobs=12,              #并行运行多线程核心数
        verbose=-1
    )
    
    x_train = pre_train
    y_train = pre_train_label

    #fit
    svc.fit(x_train, y_train)

    feat_labels = x_train.columns[0:]
    fold_importance_df = pd.DataFrame()
    fold_importance_df["importance"] = svc.feature_importances_
    fold_importance_df["featrue_name"] = feat_labels
    importances = fold_importance_df["importance"]
    
    useful_featrue=[]
    for i in tqdm(range(len(fold_importance_df))):
        if(fold_importance_df['importance'][i]!=0):
    #         print(fold_importance_df['featrue_name'][i])
            useful_featrue.append(fold_importance_df['featrue_name'][i])
    # print(importance_0)
    useful_featrue = pd.DataFrame(useful_featrue, columns=['featrue_name']) 
    useful_featrue.to_csv('D:/useful_featrue.csv',index=0)
    print(len(useful_featrue))
    return
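This helper is defined but never called later in the post; a usage sketch on the merged frame df from section 3 (its first 40,000 rows are labelled; note the function writes D:/useful_featrue.csv as a side effect):

feat_cols = [c for c in df.columns if c not in ['id', 'label']]
important_featrue(df[:40000][feat_cols], df[:40000]['label'])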

2.12 Find columns with missing values

# Check which columns contain missing values
"""
Input: df
Output: col_is_null, the columns containing nulls; missing, the null rate of each column with nulls
"""
def pankong(df):
    temp = []
    col_is_null = []
    j = 0
    temp=df.isnull().any()
    # print(temp)#返回每列是否有空值
    colnull=pd.DataFrame(data={'colname': temp.index,'isnulls':temp.values})
    for i in range(len(colnull['isnulls'])):
        if(colnull['isnulls'][i] == True):
            print(str(colnull['colname'][i]) + "---------" + str(colnull['isnulls'][i]))
            col_is_null.append(colnull['colname'][i])
            j=j+1
    print("共有字段:"+str(len(colnull))+"个 "+"   含有空值的:"+str(j)+"个")
    
    # Compute the null rate outside the if-branch so that `missing` is always defined
    missing = df.isnull().sum()/len(df)
    missing = missing[missing > 0]
    if(j>0):
        missing.sort_values(inplace=True)
        plt.figure(figsize=(20, 8), dpi=80)
        missing.plot.bar()
    
    return col_is_null,missing

2.13 Select and drop columns with a missing rate above 0.1

# Select the columns whose missing rate exceeds the given threshold and drop them
def select_missing_rate(df,missing,rate=0.1):
    temp = []
    for i in range(len(missing)):
        if(missing.index[i]!='label'):
            if(missing.values[i]>rate):
                temp.append(missing.index[i])
    print(temp)
    if('label' in temp):
        temp.remove('label')
    df=df.drop(temp,axis=1)
    return df

# df = select_missing_rate(df,missing,rate=0.1)

2.14 Fill or drop columns with missing values

# Fill missing values with the column mode, or drop the affected columns
def fill_kongzhi(df,fill="del"):
#     sub_label_cols=[col for col in df.columns if col not in ['id', 'label']]
#     df=df[sub_label_cols]
    temp = []
    exist_nan = []
    j = 0
    temp=df.isnull().any()
#     print(temp)#返回每列是否有空值
    colnull=pd.DataFrame(data={'colname': temp.index,'isnulls':temp.values})
    for i in range(len(colnull['isnulls'])):
        if(colnull['isnulls'][i] == True):
            print(str(colnull['colname'][i]) + "---------" + str(colnull['isnulls'][i]))
            if(colnull['colname'][i] != 'label'):
                exist_nan.append(colnull['colname'][i])
            j=j+1
    print("共有字段:"+str(len(colnull))+"个 "+"   含有空值的:"+str(j)+"个")
    
    if(fill=="mode"):
        print(len(exist_nan))
        for j in range(len(exist_nan)):
            print(str(exist_nan[j])+" --- mode: "+str(df[exist_nan[j]].mode()[0]))
            df[exist_nan[j]].fillna(df[exist_nan[j]].mode()[0], inplace = True)
    if(fill=="del"):
        df=df.drop(exist_nan,axis=1)
    return df

3 Data preparation and simple feature engineering
3.1 Load the data

train_df = pd.read_csv(r'J:\BaiduNetdiskDownload\公积金逾期预测-数据\公积金逾期预测-数据/train.csv')
test_df = pd.read_csv(r'J:\BaiduNetdiskDownload\公积金逾期预测-数据\公积金逾期预测-数据/test.csv')
submit = pd.read_csv(r'J:\BaiduNetdiskDownload\公积金逾期预测-数据\公积金逾期预测-数据/submit.csv')
train_df.shape, test_df.shape, submit.shape
train_df_copy = train_df.copy()
test_df_copy = test_df.copy()
train_df_label = train_df_copy['label']   #保存label
# train_df_copy=train_df_copy.drop(['label'],axis=1)
#合并训练集测试集

df = pd.concat([train_df_copy, test_df_copy], axis = 0).reset_index(drop = True)
# df = pd.concat((train_df_copy, test_df_copy), axis=0)
print(df.shape)

3.2 Split the raw variables into continuous, multi-class and binary variables

train = train_df_copy
# 类别变量
cate_2_cols = ['XINGBIE', 'ZHIWU', 'XUELI']
cate_cols = ['HYZK', 'ZHIYE', 'ZHICHEN', 'DWJJLX', 'DWSSHY', 'GRZHZT']
# 连续变量
num_cols = ['GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
# train[num_cols]
# train[cate_cols]
# train['XUELI'].value_counts()

3.3 Inspect the value distributions of the raw variables

shuzhifenbu(cate_cols,25)
shuzhifenbu(cate_2_cols,15)
shuzhifenbu(num_cols)

(Scatter plots of the value distributions of the categorical, binary and continuous columns appeared here.)
#From the plots, the columns whose value distributions clearly differ between the training and test sets are: DKLL, ZHIWU, DWSSHY, HYZK

3.4 Check for outliers
Outlier detection method 1: the three-sigma rule
In statistics, if a distribution is approximately normal, about 68% of the values lie within one standard deviation of the mean, about 95% within two standard deviations, and about 99.7% within three.

# 暂时不删除异常值!
# def find_outliers_by_3segama(data,fea):
#     data_std = np.std(data[fea])
#     data_mean = np.mean(data[fea])
#     outliers_cut_off = data_std * 3
#     lower_rule = data_mean - outliers_cut_off
#     upper_rule = data_mean + outliers_cut_off
#     data[fea+'_outliers'] = data[fea].apply(lambda x:str('异常值') if x > upper_rule or x < lower_rule else '正常值')
#     return data

# for fea in num_cols:
#     data_train = find_outliers_by_3segama(train,fea)
#     print(train[fea+'_outliers'].value_counts())
#     print(train.groupby(fea+'_outliers')['label'].sum())
#     print('*'*10)
    
# #删除异常值
# for fea in num_cols:
#     train = train[train[fea+'_outliers']=='正常值']
#     train = train.reset_index(drop=True) 
# print(train)
# numerical_fea = list(df.select_dtypes(exclude=['object']).columns)
# category_fea = list(filter(lambda x: x not in numerical_fea,list(df.columns)))
# print(numerical_fea)
# print(category_fea)

This step only inspects the outliers; nothing is modified.
Outlier detection method 2: box plot / IQR rule (not applied here; a sketch follows below)
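A minimal sketch of the IQR (box-plot) rule, assuming the training frame train and the num_cols list from section 3.2:

# Count values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for each continuous column
for fea in num_cols:
    q1, q3 = train[fea].quantile(0.25), train[fea].quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    n_out = ((train[fea] < lower) | (train[fea] > upper)).sum()
    print(str(fea) + ': ' + str(n_out) + ' values outside the IQR fences')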

3.5 Date of birth (CSNY)

# Convert the date of birth into an age: 'CSY' holds the birth month, 'CSNY' is replaced by the binned age group, and 'age' holds the age itself.
# CSNY is stored as a Unix timestamp, so time.gmtime(timestamp) recovers the calendar date.
# import time
def transform_csny_to_age(i):
#     print(i)
    if(len(str(i))>10):
        i=i/1000
    a = time.gmtime(int(i))
#     print("year:"+str(a[0])+" "+"month:"+str(a[1]))
    age = 2020-a[0]
#     print(age)
    return age

def transform_csny_to_month(i):
    if(len(str(i))>10):
        i=i/1000
    a = time.gmtime(int(i))
#     print("year:"+str(a[0])+" "+"month:"+str(a[1]))
    month = a[1]
    return month

# 月份
df['CSY'] = df['CSNY']
df['CSY'] = df['CSY'].transform(transform_csny_to_month)
# 年龄
# df['CSN'] = df['CSNY']
df['CSNY'] = df['CSNY'].transform(transform_csny_to_age)
sns.distplot(df['CSY'][df['CSY'] > 0])
print(df['CSY'].value_counts())
def get_age(df,col = 'age'):
     df[col+"_genFeat1"]=(df['age'] > 23).astype(int)
     df[col+"_genFeat2"]=(df['age'] > 28).astype(int)
     df[col+"_genFeat3"]=(df['age'] > 32).astype(int)
     df[col+"_genFeat4"]=(df['age'] > 36).astype(int)
     df[col+"_genFeat5"]=(df['age'] > 43).astype(int)
     df[col+"_genFeat6"]=(df['age'] > 50).astype(int)
     return df, [col + f'_genFeat{i}' for i in range(1, 7)]
      
df['age'] = df['CSNY']
df, genFeats1 = get_age(df, col = 'age')
sns.distplot(df['age'][df['age'] > 0])


#Bin the age into 7 groups, using the same cut points as the indicator features above
def transform_age(x_age):
    # print(x_age)
    if x_age<23:
        return 1
    elif 23<=x_age<28:
        return 2
    elif 28<=x_age<32:
        return 3
    elif 32<=x_age<36:
        return 4
    elif 36<=x_age<43:
        return 5
    elif 43<=x_age<50:
        return 6
    elif x_age>=50:
        return 7
df['CSNY'] = df['CSNY'].transform(transform_age)
print(df['CSNY'].value_counts())

3.6 Loan balance (DKYE) and loan disbursement amount (DKFFE)

def get_daikuanYE(df,col):
    df[col + '_genFeat1'] = (df[col] > 100000).astype(int)
    df[col + '_genFeat2'] = (df[col] > 120000).astype(int)
    df[col + '_genFeat3'] = (df[col] > 140000).astype(int)
    df[col + '_genFeat4'] = (df[col] > 180000).astype(int)
    df[col + '_genFeat5'] = (df[col] > 220000).astype(int)
    df[col + '_genFeat6'] = (df[col] > 260000).astype(int)
    df[col + '_genFeat7'] = (df[col] > 300000).astype(int)
    return df, [col + f'_genFeat{i}' for i in range(1, 8)]
    
df, genFeats2 = get_daikuanYE(df, col = 'DKYE')

def get_daikuanFFE(df,col):
    df[col + '_genFeat1'] = (df[col] > 100000).astype(int)
    df[col + '_genFeat2'] = (df[col] > 120000).astype(int)
    df[col + '_genFeat3'] = (df[col] > 140000).astype(int)
    df[col + '_genFeat4'] = (df[col] > 180000).astype(int)
    df[col + '_genFeat5'] = (df[col] > 220000).astype(int)
    df[col + '_genFeat6'] = (df[col] > 260000).astype(int)
    df[col + '_genFeat7'] = (df[col] > 300000).astype(int)
    return df, [col + f'_genFeat{i}' for i in range(1, 8)]
df, genFeats3 = get_daikuanFFE(df, col = 'DKFFE')

plt.figure(figsize = (8, 2))
plt.subplot(1,2,1)
sns.distplot(df['DKYE'][df['label'] == 1])
plt.subplot(1,2,2)
sns.distplot(df['DKFFE'][df['label'] == 1])
# Microcredit is business-oriented lending to individuals or households,
# mainly serving small shop owners, workshops and sole proprietors;
# loan amounts are generally between 1,000 and 200,000 yuan.
def transform_dkye(dkye):
    if 0<=dkye<1000:
        return 1
    elif 1000<=dkye<50000:
        return 2
    elif 50000<=dkye<100000:
        return 3
    elif 100000<=dkye<150000:
        return 4
    elif 150000<=dkye<200000:
        return 5
    elif 200000<=dkye<250000:
        return 6
    elif 250000<=dkye<300000:
        return 7
    elif dkye>=300000:
        return 8

df['DKYE_class'] = df['DKYE']
df['DKYE_class'] = df['DKYE_class'].transform(transform_dkye)


def transform_dkffe(dkye):
    if 0<=dkye<1000:
        return 1
    elif 1000<=dkye<50000:
        return 2
    elif 50000<=dkye<100000:
        return 3
    elif 100000<=dkye<150000:
        return 4
    elif 150000<=dkye<200000:
        return 5
    elif 200000<=dkye<250000:
        return 6
    elif 250000<=dkye<300000:
        return 7
    elif dkye>=300000:
        return 8
    
df['DKFFE_class'] = df['DKFFE']
df['DKFFE_class'] = df['DKFFE_class'].transform(transform_dkffe)

print(df['DKYE_class'].value_counts(),
df['DKFFE_class'].value_counts())

3.7 Personal monthly contribution (GRYJCE)

def get_GRYJCE(df,col):
    df[col + '_genFeat1'] = (df[col] > 400).astype(int)
    df[col + '_genFeat2'] = (df[col] > 600).astype(int)
    df[col + '_genFeat3'] = (df[col] > 800).astype(int)
    df[col + '_genFeat4'] = (df[col] > 1000).astype(int)
    df[col + '_genFeat5'] = (df[col] > 1200).astype(int)
    df[col + '_genFeat6'] = (df[col] > 1400).astype(int)
    df[col + '_genFeat7'] = (df[col] > 1600).astype(int)
    return df, [col + f'_genFeat{i}' for i in range(1, 8)]

df, genFeats4 = get_GRYJCE(df, col = 'GRYJCE')

plt.figure(figsize = (8, 2))
plt.subplot(1,2,1)
sns.distplot(df['GRYJCE'][df['label'] == 1])

def transform_GRYJCE(dkye):
    if dkye<=400:
        return 1
    elif 400<dkye<=600:
        return 2
    elif 600<dkye<=800:
        return 3
    elif 800<dkye<=1000:
        return 4
    elif 1000<dkye<=1200:
        return 5
    elif 1200<dkye<=1400:
        return 6
    elif 1400<dkye<=1600:
        return 7
    elif dkye>1600:
        return 8

df['GRYJCE_class'] = df['GRYJCE']
df['GRYJCE_class'] = df['GRYJCE_class'].transform(transform_GRYJCE)

print(df['GRYJCE_class'].value_counts())

3.8 Personal contribution base (GRJCJS)

def get_GRJCJS(df,col):    # renamed so it no longer shadows get_GRYJCE above
    df[col + '_genFeat1'] = (df[col] > 2000).astype(int)
    df[col + '_genFeat2'] = (df[col] > 4000).astype(int)
    df[col + '_genFeat3'] = (df[col] > 6000).astype(int)
    df[col + '_genFeat4'] = (df[col] > 8000).astype(int)
    df[col + '_genFeat5'] = (df[col] > 1200).astype(int)    # 1200 is probably meant to be 12000, matching the binning below

    return df, [col + f'_genFeat{i}' for i in range(1, 6)]

df, genFeats5 = get_GRJCJS(df, col = 'GRJCJS')

plt.figure(figsize = (8, 2))
plt.subplot(1,2,1)
sns.distplot(df['GRJCJS'][df['label'] == 1])


def transform_GRJCJS(dkye):
    # print(x_age)
    if 0<=dkye<2000:
        return 1
    elif 2000<=dkye<4000:
        return 2
    elif 4000<=dkye<6000:
        return 3
    elif 6000<=dkye<8000:
        return 4
    elif 8000<=dkye<12000:
        return 5
    elif dkye>=12000:
        return 6
df['GRJCJS_class'] = df['GRJCJS']
df['GRJCJS_class'] = df['GRJCJS_class'].transform(transform_GRJCJS)
print(df['GRJCJS_class'].value_counts())

3.9 Personal account balance (GRZHYE) and last year's carry-over balance (GRZHSNJZYE)

def get_GRZHYE(df,col):
    df[col + '_genFeat1'] = (df[col] > 2000).astype(int)
    df[col + '_genFeat2'] = (df[col] > 4000).astype(int)
    df[col + '_genFeat3'] = (df[col] > 8000).astype(int)
    df[col + '_genFeat4'] = (df[col] > 12000).astype(int)
    df[col + '_genFeat5'] = (df[col] > 20000).astype(int)

    return df, [col + f'_genFeat{i}' for i in range(1, 6)]

df, genFeats6 = get_GRZHYE(df, col = 'GRZHYE')
df, genFeats7 = get_GRZHYE(df, col = 'GRZHSNJZYE')

plt.figure(figsize = (8, 2))
plt.subplot(1,2,1)
sns.distplot(df['GRZHYE'][df['label'] == 1])
plt.subplot(1,2,2)
sns.distplot(df['GRZHSNJZYE'][df['label'] == 1])

def transform_GRZHYE(dkye):
    # print(x_age)
    if 0<=dkye<2000:
        return 1
    elif 2000<=dkye<4000:
        return 2
    elif 4000<=dkye<8000:
        return 3
    elif 8000<=dkye<12000:
        return 4
    elif 12000<=dkye<20000:
        return 5
    elif dkye>=20000:
        return 6
df['GRZHYE_class'] = df['GRZHYE']
df['GRZHYE_class'] = df['GRZHYE_class'].transform(transform_GRZHYE)
df['GRZHSNJZYE_class'] = df['GRZHSNJZYE']
df['GRZHSNJZYE_class'] = df['GRZHSNJZYE_class'].transform(transform_GRZHYE)
print(df['GRZHYE_class'].value_counts(),
df['GRZHSNJZYE_class'].value_counts())

3.10 Removing the noise in DKLL

# 消除DKLL的扰动
dkll = test_df_copy['DKLL'].value_counts()
dkll_value = pd.DataFrame(data={'colname': dkll.index,'value':dkll.values})
dkll_value[:6]
temp_dkll_value = dkll_value[:6]['colname']
print(temp_dkll_value)

# 找出df中所有值为最常出现的六种贷款利率的行作为训练集
dkll_index=[]
for i in tqdm(range(len(df))):
    for j in range(len(temp_dkll_value)):
        if (df['DKLL'][i]==temp_dkll_value[j]):
            dkll_index.append(i)              
print(len(dkll_index))
# print(dkll_index)

#Rows with index 40000-54999 (the test rows) whose DKLL is not one of the six most common rates form the prediction set; their true rate is estimated below
test_all_index = list(range(40000,55000))
test_index = [i for i in test_all_index if i not in dkll_index]
# print(test_index)
print(len(test_index))

# dkll训练集
tarin_df_dkll = []
tarin_df_dkll = df.loc[dkll_index]
print(tarin_df_dkll)
# dkll测试集
test_df_dkll= []
test_df_dkll = df.loc[test_index]
print(test_df_dkll)
pankong(tarin_df_dkll)
dkll_cols = [col for col in tarin_df_dkll.columns if col not in ['DKLL','label','id']]
X = tarin_df_dkll[dkll_cols]
Y = pd.get_dummies(tarin_df_dkll['DKLL'])
print(Y)

#决策树
# from sklearn import tree
# clf = tree.DecisionTreeClassifier(criterion='entropy')   

# 随机森林
# from sklearn.ensemble import RandomForestClassifier
# clf = RandomForestClassifier(n_estimators=200)

# # 导入KNN 分类器
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()

clf.fit(X,Y)
test_df_dkll = test_df_dkll[dkll_cols]
res = clf.predict(test_df_dkll)
print(len(res))

res_temp = []
for i in range(len(res)):
#     print(res[i])
    if(res[i][0]==1):
        res_temp.append(2.292)
    elif(res[i][1]==1):
        res_temp.append(2.521)
    elif(res[i][2]==1):
        res_temp.append(2.708)
    elif(res[i][3]==1):
        res_temp.append(2.979)
    elif(res[i][4]==1):
        res_temp.append(3.250)
    elif(res[i][5]==1):
        res_temp.append(3.575)
    else:
        res_temp.append(2.708)
#         res_temp.append(1.111111111111111111)
print(len(res_temp))   
print(res_temp)

test_df_dkll['DKLL'] = res_temp
for i in (test_index):
    df.at[i,'DKLL'] = test_df_dkll['DKLL'][i]     #使用at来改变df
# df['DKLL']

plt.figure(figsize=(15,5))
print("DKLL的数值分布")
plt.title('DKLL')
x = df['DKLL']
y = df.index
plt.scatter(x, y , s=1)
plt.show()

df['DKLL_CLASS']=df['DKLL']

4. Feature engineering

# The newly generated features could themselves be assigned to the three groups below to derive even more features
# 类别变量
cate_2_cols = ['XINGBIE', 'ZHIWU', 'XUELI']
cate_cols = ['HYZK', 'ZHIYE', 'ZHICHEN', 'DWJJLX', 'DWSSHY', 'GRZHZT']
# 连续变量
num_cols = ['GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
old_fea=[]
old_fea = [col for col in df.columns if col not in ['id', 'label']]
print(old_fea)

4.1 Build business-driven features

# Data masking (desensitization) means transforming sensitive data so that private information stays protected,
# for example personal identity details, phone numbers and bank card numbers collected by organizations and companies.

#个人月缴存额,单位月缴存额---新建相关字段
df['YEAR_GRYJCE'] = df['GRYJCE']*12    #一年的总个人缴存额
df['MONTH_GRYJCE_DWYJCE'] = df['GRYJCE'] + df['DWYJCE']    #一个月的总缴存额
df['YEAR_GRYJCE_DWYJCE'] = (df['GRYJCE'] + df['DWYJCE'])*12    #一年的总缴存额

#贷款余额,贷款发放额---新建相关字段
df['DKYE_TO_DKFFE'] = df['DKYE'] / df['DKFFE']    #已还本金占比
df['DKFFE_SUB_DKYE'] = df['DKFFE'] - df['DKYE']    #贷款未还本金  
df['DKFFE_SUB_DKYE_TO_DKFFE'] = (df['DKFFE'] - df['DKYE'])/ df['DKFFE']    #未还本金占比
df['WEIHUAN_TO_YIHUAN'] = df['DKFFE_SUB_DKYE']/df['DKYE']    #未还比已还
# df['YIHUAN_TO_WEIHUAN'] = df['DKYE']/df['DKFFE_SUB_DKYE']    #已还比未还
df['REAL_DKLL'] = df['DKLL']/100
df['DKFFE_SUB_DKYE_DKLL'] = (df['DKFFE'] - df['DKYE'])*df['REAL_DKLL']    #贷款未还本金*利率=未还利息  
df['DKFFE_SUB_DKYE_1_DKLL'] = (df['DKFFE'] - df['DKYE'])*(1+df['REAL_DKLL'])    #贷款未还本金*利率=未还本息和 
df['DKYE_DKLL'] = df['DKYE']*df['REAL_DKLL']    #贷款已还本金*利率=已还利息  
df['DKYE_1_DKLL'] = df['DKYE']*(1+df['REAL_DKLL'])    #贷款已还本金*1+利率=已还本息和 
df['DKFFE_DKLL'] = df['DKFFE']*df['REAL_DKLL']    #贷款总利息
df['DKFFE_1_DKLL'] = df['DKFFE']*(1+df['REAL_DKLL'])    #贷款总本息和
df['DKFFE_SUB_DKYE_TO_DKFFE_1_DKLL'] = df['DKFFE_SUB_DKYE_1_DKLL'] / df['DKFFE_1_DKLL']    #未还本息和/贷款总本息和
df['DKYE_TO_DKFFE_1_DKLL'] = df['DKYE_1_DKLL']/ df['DKFFE_1_DKLL']    #已还本息和/贷款总本息和
df['DKFFE_SUB_DKYE_DKLL_TO_DKFFE_DKLL'] = df['DKFFE_SUB_DKYE_DKLL']/ df['DKFFE_DKLL']    #未还利息/贷款总利息
df['DKYE_DKLL_TO_DKFFE_DKLL'] = df['DKYE_DKLL']/ df['DKFFE_DKLL']    #已还利息/贷款总利息

# 个人账户当年归集余额 = 汇缴+补缴+结息+转入-提取额
#个人账户当年归结余额,个人账户上年转结余额,个人账户余额---新建相关字段
df['GRZHDNGJYE_SUB_YEAR_GRYJCE_DWYJCE'] = df['GRZHDNGJYE'] - df['YEAR_GRYJCE_DWYJCE']    #个人账户当年归结余额 - 一年的总缴存额
df['GRZHDNGJYE_SUB_YEAR_GRYJCE'] = df['GRZHDNGJYE'] - df['YEAR_GRYJCE']    #个人账户当年归结余额 - 一年的总个人缴存额
df['GRZHDNGJYE_SUB_GRZHSNJZYE'] = df['GRZHDNGJYE'] + df['GRZHSNJZYE']    #账户余额(暂当做未脱敏的数据)
df['JIEXI'] = (df['GRYJCE'] + df['DWYJCE'])*12*0.015    #一年的结息额 (结息按1.5%)
df['BUJIAO_ZHUANRU_SUB_TIQVE']=df['GRZHDNGJYE']-df['YEAR_GRYJCE_DWYJCE']-df['JIEXI']#补缴+转入-提取额=个人账户当年归集余额-汇缴-结息
df['GRYJCE_TO_GRZHYE'] = df['GRYJCE']/df['GRZHYE']    #个人月缴存额/个人账户余额 
df['YEAR_GRYJCE_TO_GRZHYE'] = df['YEAR_GRYJCE']/df['GRZHYE']    #一年的总个人缴存额/个人账户余额 
df['MONTH_GRYJCE_DWYJCE_TO_GRZHYE'] = df['MONTH_GRYJCE_DWYJCE']/df['GRZHYE']    #一个月的总缴存额/个人账户余额 
df['GRZHDNGJYE_TO_GRZHYE'] = df['GRZHDNGJYE']/df['GRZHYE']    #个人账户当年归结余额/个人账户余额 
df['GRZHSNJZYE_TO_GRZHYE'] = df['GRZHSNJZYE']/df['GRZHYE']    #个人账户上年转结余额/个人账户余额 
df['BUJIAO_ZHUANRU_SUB_TIQVE'] = df['BUJIAO_ZHUANRU_SUB_TIQVE']/df['GRZHYE']    #(make-up payments + transfers in - withdrawals) / account balance; note this overwrites the absolute amount created above
df['JIEXI_TO_YEAR_GRYJCE_DWYJCE'] = df['JIEXI']/df['YEAR_GRYJCE_DWYJCE']    #一年的结息额/一年的总缴存额 
df['JIEXI_TO_GRZHDNGJYE'] = df['JIEXI']/df['GRZHDNGJYE']    #一年的结息额/个人账户当年归结余额

# 个人缴款基数---新建相关字段
df['GJJJKBL'] = df['GRYJCE'] / df['GRJCJS']    #公积金缴款比例
# df['GRJCJS_TO_DKFFE_SUB_DKYE'] = df['GRJCJS']/df['DKFFE_SUB_DKYE']    #个人缴款基数/贷款未还本金  
df['GRJCJS_TO_DKYE'] = df['GRJCJS']/df['DKYE']    #个人缴款基数/已还本金 
df['GRJCJS_TO_DKFFE'] = df['GRJCJS']/df['DKFFE']    #个人缴款基数/贷款发放额 
df['GRJCJS_TO_GRZHDNGJYE'] = df['GRJCJS']/df['GRZHDNGJYE']    #个人缴款基数/个人账户当年归结余额 
df['GRJCJS_TO_GRZHSNJZYE'] = df['GRJCJS']/df['GRZHSNJZYE']    #个人缴款基数/个人账户上年转结余额 
df['GRJCJS_TO_GRZHYE'] = df['GRJCJS']/df['GRZHYE']    #个人缴款基数/个人账户余额

# 暂不清楚是否是噪声的字段
df['DKYE_DIV_GRYJCE_ADD_DWYJCE'] = df['DKYE'] / ((df['GRYJCE'] + df['DWYJCE'])*12)
df['GRYJCE_ADD_DWYJCE_TO_DKYE'] = (df['GRYJCE'] + df['DWYJCE']) / df['DKYE']
df['GRZHYE_diff_GRZHDNGJYE'] = df['GRZHYE'] - df['GRZHDNGJYE']
df['GRZHYE_diff_GRZHSNJZYE'] = df['GRZHYE'] - df['GRZHSNJZYE']

#  'YIHUAN_TO_WEIHUAN','GRJCJS_TO_DKFFE_SUB_DKYE'
gen_feats = ['YEAR_GRYJCE', 'MONTH_GRYJCE_DWYJCE', 'YEAR_GRYJCE_DWYJCE',
'DKYE_TO_DKFFE', 'DKFFE_SUB_DKYE', 'DKFFE_SUB_DKYE_TO_DKFFE', 'WEIHUAN_TO_YIHUAN',
'REAL_DKLL','DKFFE_SUB_DKYE_DKLL','DKFFE_SUB_DKYE_1_DKLL','DKYE_DKLL','DKYE_1_DKLL','DKFFE_DKLL',
'DKFFE_1_DKLL','DKFFE_SUB_DKYE_TO_DKFFE_1_DKLL','DKYE_TO_DKFFE_1_DKLL',
'DKFFE_SUB_DKYE_DKLL_TO_DKFFE_DKLL','DKYE_DKLL_TO_DKFFE_DKLL','GRZHDNGJYE_SUB_YEAR_GRYJCE_DWYJCE',
'GRZHDNGJYE_SUB_YEAR_GRYJCE','GRZHDNGJYE_SUB_GRZHSNJZYE','JIEXI','BUJIAO_ZHUANRU_SUB_TIQVE',
'GRYJCE_TO_GRZHYE','YEAR_GRYJCE_TO_GRZHYE','MONTH_GRYJCE_DWYJCE_TO_GRZHYE','GRZHDNGJYE_TO_GRZHYE',
'GRZHSNJZYE_TO_GRZHYE','BUJIAO_ZHUANRU_SUB_TIQVE','JIEXI_TO_YEAR_GRYJCE_DWYJCE','JIEXI_TO_GRZHDNGJYE',
'GJJJKBL','GRJCJS_TO_DKYE','GRJCJS_TO_DKFFE','GRJCJS_TO_GRZHDNGJYE','GRJCJS_TO_GRZHSNJZYE',
'GRJCJS_TO_GRZHYE','DKYE_DIV_GRYJCE_ADD_DWYJCE','GRYJCE_ADD_DWYJCE_TO_DKYE','GRZHYE_diff_GRZHDNGJYE',
'GRZHYE_diff_GRZHSNJZYE']
#If two columns turn out to be identical, only one of them should be kept
#For columns that take both positive and negative values, an extra sign-indicator column could be added

#Round float columns to 4 decimal places
for i in range(len(df.columns)):
    if(df.columns[i] != 'label'):
        if(df[df.columns[i]].dtype=='float64'):
            df[df.columns[i]] = df[df.columns[i]].apply(lambda x:round(x,4))
            
print(df)
_,missing = pankong(df)

cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(df)

no_cate_1_cols = [col for col in df.columns if col not in cate_1_cols]
df=df[no_cate_1_cols]
print(df.shape)
# Save the data processed so far to disk and reload it
df.to_csv('D:/df_little_change.csv',index = False)
df = pd.read_csv('D:/df_little_change.csv')
print(df.shape)
print(df)

4.2 Categorical features: count, count ratio, one-hot encoding, etc.

for f in tqdm(cate_cols):
    # Re-encode the categorical values as the integers 0,1,2,...
    df[f] = df[f].map(dict(zip(df[f].unique(), range(df[f].nunique()))))

    # map() applies a function or lookup to every element of the Series.
    # df[f + '_count'] holds, for each row, how often that row's value occurs in the column.
    df[f + '_count'] = df[f].map(df[f].value_counts())
    # One-hot encode the column with get_dummies
    df = pd.concat([df,pd.get_dummies(df[f],prefix=f"{f}")],axis=1)
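
# To make the count encoding concrete, a tiny toy example (values are illustrative only):
#     s = pd.Series(['A', 'B', 'A', 'C', 'A'])
#     s.map(dict(zip(s.unique(), range(s.nunique()))))   # integer codes: 0, 1, 0, 2, 0
#     s.map(s.value_counts())                            # counts:        3, 1, 3, 1, 3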

# Pairwise combinations of the categorical columns
cate_cols_combine = [[cate_cols[i], cate_cols[j]] for i in range(len(cate_cols)) \
                     for j in range(i + 1, len(cate_cols))]

for f1, f2 in tqdm(cate_cols_combine):
    # Number of rows sharing each (f1, f2) value pair, and the share of that count
    # within each of the two single-column counts
    df['{}_{}_count'.format(f1, f2)] = df.groupby([f1, f2])['id'].transform('count')
    df['{}_in_{}_prop'.format(f1, f2)] = df['{}_{}_count'.format(f1, f2)] / df[f2 + '_count']
    df['{}_in_{}_prop'.format(f2, f1)] = df['{}_{}_count'.format(f1, f2)] / df[f1 + '_count']
cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(df)

no_cate_1_cols = [col for col in df.columns if col not in cate_1_cols]
df=df[no_cate_1_cols]
print(df.shape)

df=if_field_is_same(df)
print(df.shape)

cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(df)

4.3 Deriving features from individual discrete features

# Create Features based on anonymised prefix groups
prefix = cate_2_cols
for i, p in enumerate(prefix):
    print(i,p)
    #column_set[]是以'XINGBIE', 'ZHIWU', 'XUELI'开头的字段
    column_set = [x for x in df.columns.tolist() if x.startswith(prefix[i])]
    # Take NA count
    df[p + "_group_nan_sum"] = df[column_set].isnull().sum(axis=1) / df[column_set].shape[1]
    # Take SUM/Mean if numeric
    numeric_cols = [x for x in column_set if df[x].dtype != object]
    if numeric_cols:
        df[p + "_group_sum"] = df[column_set].sum(axis=1)
        df[p + "_group_mean"] = df[column_set].mean(axis=1)
        # Zero Count
        df[p + "_group_0_count"] = (df[column_set] == 0).astype(int).sum(axis=1) / (
                df[column_set].shape[1] - df[p + "_group_nan_sum"])
cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(df)

no_cate_1_cols = [col for col in df.columns if col not in cate_1_cols]
df=df[no_cate_1_cols]
print(df.shape)

df=if_field_is_same(df)

cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(df)

#relation_cate_2_cols用来存放选取的相关性较大的二值字段
relation_cate_2_cols = relation(df[cate_2_cols+['label']], poly_num=0.05)
# print(relation_cate_2_cols)

Check which columns have nearly constant values and therefore contribute little

weak_filed = find_weak_filed(df[cate_cols+cate_2_cols])
print(len(weak_filed))
print(weak_filed)

Feature selection

rfecv_cate_2_col=clf_rfecv(df,cate_2_cols)

Take the union of relation_cate_2_cols and rfecv_cate_2_col -> select_cate_2_col

# relation_cate_2_cols 与 rfecv_cate_2_col取并集
select_cate_2_col=list(set(relation_cate_2_cols).union(set(rfecv_cate_2_col)))
print(len(select_cate_2_col))
print(select_cate_2_col)

# 保存二值类数据到本地
df[select_cate_2_col].to_csv('D:/rizhao_select_cate_2_col.csv',index = False)
_,missing = pankong(df)
df = select_missing_rate(df,missing,rate=0.001)

4.4 Keep only the multi-class and continuous columns in df

cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(df,)
# Save the multi-class and continuous columns to disk
df[cate_cols+num_cols1+['label']].to_csv('D:/rizhao_cate_cols_num_cols1.csv',index = False)
cate_cols_num_cols1_df = pd.read_csv('D:/rizhao_cate_cols_num_cols1.csv')
print(cate_cols_num_cols1_df.shape)
print(cate_cols_num_cols1_df)

Filter the multi-class columns

cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(cate_cols_num_cols1_df,20)
df=cate_cols_num_cols1_df
# 相关系数
relation_cate_cols = relation(df[cate_cols+['label']], poly_num=0.05)

rfecv_cate_cols=clf_rfecv(df,cate_cols)

# relation_cate_cols 与 rfecv_cate_cols 取并集
select_cate_col=[]
select_cate_col=list(set(relation_cate_cols).union(set(rfecv_cate_cols)))
select_cate_col=list(set(select_cate_col).union(set(['HYZK', 'ZHIYE', 'ZHICHEN', 'DWJJLX', 'DWSSHY', 'GRZHZT'])))
print(len(select_cate_col))
print(select_cate_col)

# 保存多分类数据到本地
df[select_cate_col].to_csv('D:/rizhao_select_cate_col.csv',index = False)

# 保存上一部处理过的数据到本地
df[select_cate_col+num_cols1+['label']].to_csv('D:/rizhao_select_cate_col_num_cols1.csv',index = False)

4.5 Crossing categorical features with numeric features

select_cate_col_num_cols1 = pd.read_csv('D:/rizhao_cate_cols_num_cols1.csv')
print(select_cate_col_num_cols1.shape)
print(select_cate_col_num_cols1)
df_select_cate_col = pd.read_csv('D:/rizhao_select_cate_col.csv')
select_cate_col = df_select_cate_col.columns
print(len(select_cate_col))

_,_,cate_cols,num_cols1 = find_filed_class(select_cate_col_num_cols1,20)
num_gen_feats = ['YEAR_GRYJCE', 'MONTH_GRYJCE_DWYJCE', 'YEAR_GRYJCE_DWYJCE',
'DKYE_TO_DKFFE', 'DKFFE_SUB_DKYE', 'DKFFE_SUB_DKYE_TO_DKFFE', 'WEIHUAN_TO_YIHUAN',
'REAL_DKLL','DKFFE_SUB_DKYE_DKLL','DKFFE_SUB_DKYE_1_DKLL','DKYE_DKLL','DKYE_1_DKLL','DKFFE_DKLL',
'DKFFE_1_DKLL','GRZHDNGJYE_SUB_YEAR_GRYJCE_DWYJCE',
'GRZHDNGJYE_SUB_YEAR_GRYJCE','GRZHDNGJYE_SUB_GRZHSNJZYE','JIEXI','BUJIAO_ZHUANRU_SUB_TIQVE',
'GRYJCE_TO_GRZHYE','YEAR_GRYJCE_TO_GRZHYE','MONTH_GRYJCE_DWYJCE_TO_GRZHYE','GRZHDNGJYE_TO_GRZHYE',
'GRZHSNJZYE_TO_GRZHYE','BUJIAO_ZHUANRU_SUB_TIQVE','JIEXI_TO_GRZHDNGJYE',
'GJJJKBL','GRJCJS_TO_DKYE','GRJCJS_TO_DKFFE','GRJCJS_TO_GRZHDNGJYE','GRJCJS_TO_GRZHSNJZYE',
'GRJCJS_TO_GRZHYE','DKYE_DIV_GRYJCE_ADD_DWYJCE','GRYJCE_ADD_DWYJCE_TO_DKYE','GRZHYE_diff_GRZHDNGJYE',
'GRZHYE_diff_GRZHSNJZYE','GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
select_cate_col_num_cols1[select_cate_col]

First round of category × numeric crosses ('sum', 'mean', 'std')

select_cate_col_num_cols1['label']=df['label']
relation_cate_cols = []
rfecv_cate_col=[]
i=0
for f1 in tqdm(select_cate_col):
    temp_cate_cols=[]
    g = select_cate_col_num_cols1.groupby(f1)
#     print(g)
    for f2 in num_gen_feats:
        for stat in ['sum', 'mean', 'std']:
            # f1的每种类型对应的f2的'sum', 'mean', 'std', 'max', 'min'
            select_cate_col_num_cols1['{}_{}_{}'.format(f1, f2, stat)] = g[f2].transform(stat)
            temp_cate_cols.append('{}_{}_{}'.format(f1, f2, stat))
    # 相关系数
    relation_cate_cols.extend(relation(select_cate_col_num_cols1[temp_cate_cols+['label']], poly_num=0.1))

print(len(relation_cate_cols))

Second round of category × numeric crosses ('max', 'min', 'var', 'count')

for f1 in tqdm(select_cate_col):
    temp_cate_cols=[]
    g = select_cate_col_num_cols1.groupby(f1)
#     print(g)
    for f2 in num_gen_feats:
        for stat in ['max', 'min', 'var','count']:
            # f1的每种类型对应的f2的'sum', 'mean', 'std', 'max', 'min'
            select_cate_col_num_cols1['{}_{}_{}'.format(f1, f2, stat)] = g[f2].transform(stat)
            temp_cate_cols.append('{}_{}_{}'.format(f1, f2, stat))
    # 相关系数
    relation_cate_cols.extend(relation(select_cate_col_num_cols1[temp_cate_cols+['label']], poly_num=0.1))
    
print(len(relation_cate_cols))
select_cate_col_num_cols1[relation_cate_cols]
# 保存上一部处理过的数据到本地
select_cate_col_num_cols1[relation_cate_cols+['label']].to_csv('D:/df_relation_cate_cols.csv',index = False)

df_relation_cate_cols = pd.read_csv('D:/df_relation_cate_cols.csv')
print(df_relation_cate_cols.shape)
print(df_relation_cate_cols)
_,missing = pankong(df_relation_cate_cols)

df_relation_cate_cols = select_missing_rate(df_relation_cate_cols,missing,rate=0.001)
df_relation_cate_cols = fill_kongzhi(df_relation_cate_cols,fill="mode")

_,missing = pankong(df_relation_cate_cols)
cate_1_cols,_,_,_ = find_filed_class(df_relation_cate_cols,20)
no_cate_1_cols = [col for col in df_relation_cate_cols.columns if col not in cate_1_cols]
df_relation_cate_cols=df_relation_cate_cols[no_cate_1_cols]
print(df_relation_cate_cols.shape)
gbdt_select_temp_df2=[]
for i in tqdm(range(int(len(df_relation_cate_cols.columns)/400+1))):
    temp_col=[]
    temp_col.extend(df_relation_cate_cols.columns[i*400:i*400+400])
#     print(temp_col)
#     print(len(temp_col))
#     print(i)
    if(i<int(len(df_relation_cate_cols.columns)/400)):
        gbdt_select_temp_df2.extend(GBDTselectfea(df_relation_cate_cols[temp_col+['label']],max_num=150))
    elif(i==int(len(df_relation_cate_cols.columns)/400)):
        gbdt_select_temp_df2.extend(GBDTselectfea(df_relation_cate_cols[temp_col],max_num=150))
    print(len(gbdt_select_temp_df2))

print(len(gbdt_select_temp_df2))
print(gbdt_select_temp_df2)
gbdt_select_cate_num_mix = df_relation_cate_cols[gbdt_select_temp_df2+['label']]
gbdt_select_cate_num_mix=if_field_is_same(gbdt_select_cate_num_mix)
print(gbdt_select_cate_num_mix.shape)

# 保存上一部处理过的数据到本地
gbdt_select_cate_num_mix.to_csv('D:/gbdt_select_temp_df2.csv',index = False)

gbdt_select_cate_num_mix= pd.read_csv('D:/gbdt_select_temp_df2.csv')
print(gbdt_select_cate_num_mix.shape)
print(gbdt_select_cate_num_mix)

4.6 Crossing numeric features with numeric features

num_gen_feats = ['YEAR_GRYJCE', 'MONTH_GRYJCE_DWYJCE', 'YEAR_GRYJCE_DWYJCE',
'DKYE_TO_DKFFE', 'DKFFE_SUB_DKYE', 'DKFFE_SUB_DKYE_TO_DKFFE', 'WEIHUAN_TO_YIHUAN',
'REAL_DKLL','DKFFE_SUB_DKYE_DKLL','DKFFE_SUB_DKYE_1_DKLL','DKYE_DKLL','DKYE_1_DKLL','DKFFE_DKLL',
'DKFFE_1_DKLL','GRZHDNGJYE_SUB_YEAR_GRYJCE_DWYJCE',
'GRZHDNGJYE_SUB_YEAR_GRYJCE','GRZHDNGJYE_SUB_GRZHSNJZYE','JIEXI','BUJIAO_ZHUANRU_SUB_TIQVE',
'GRYJCE_TO_GRZHYE','YEAR_GRYJCE_TO_GRZHYE','MONTH_GRYJCE_DWYJCE_TO_GRZHYE','GRZHDNGJYE_TO_GRZHYE',
'GRZHSNJZYE_TO_GRZHYE','BUJIAO_ZHUANRU_SUB_TIQVE','JIEXI_TO_GRZHDNGJYE',
'GJJJKBL','GRJCJS_TO_DKYE','GRJCJS_TO_DKFFE','GRJCJS_TO_GRZHDNGJYE','GRJCJS_TO_GRZHSNJZYE',
'GRJCJS_TO_GRZHYE','DKYE_DIV_GRYJCE_ADD_DWYJCE','GRYJCE_ADD_DWYJCE_TO_DKYE','GRZHYE_diff_GRZHDNGJYE',
'GRZHYE_diff_GRZHSNJZYE','GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
df[num_gen_feats]
# Most of the missing values seen later originate here: grouping by a continuous column creates many single-row groups, whose 'std' is NaN
relation_num_cols = []
i=0
for f1 in tqdm(num_gen_feats):
    temp_num_cols=[]
    g = df.groupby(f1)
#     print(g)
    for f2 in num_gen_feats:
        for stat in ['sum', 'mean', 'std']:
            # f1的每种类型对应的f2的'sum', 'mean', 'std', 'max', 'min'
            df['{}_{}_{}'.format(f1, f2, stat)] = g[f2].transform(stat)
            temp_num_cols.append('{}_{}_{}'.format(f1, f2, stat))
    # 相关系数
    relation_num_cols.extend(relation(df[temp_num_cols+['label']], poly_num=0.05))
print(relation_num_cols)
print(len(relation_num_cols))

# 保存上一部处理过的数据到本地
df[relation_num_cols+['label']].to_csv('D:/df_relation_num_cols.csv',index = False)

df_relation_num_cols = pd.read_csv('D:/df_relation_num_cols.csv')
print(df_relation_num_cols.shape)
print(df_relation_num_cols)

_,missing = pankong(df_relation_num_cols)

df_relation_num_cols = select_missing_rate(df_relation_num_cols,missing,rate=0.001)
df_relation_num_cols = fill_kongzhi(df_relation_num_cols,fill="mode")

_,missing = pankong(df_relation_num_cols)

cate_1_cols,_,_,_ = find_filed_class(df_relation_num_cols,20)
no_cate_1_cols = [col for col in df_relation_num_cols.columns if col not in cate_1_cols]
df_relation_num_cols=df_relation_num_cols[no_cate_1_cols]
print(df_relation_num_cols.shape)

gbdt_select_temp_df3=[]
for i in tqdm(range(int(len(df_relation_num_cols.columns)/400+1))):
    temp_col=[]
    temp_col.extend(df_relation_num_cols.columns[i*400:i*400+400])
#     print(temp_col)
#     print(len(temp_col))
#     print(i)
    if(i<int(len(df_relation_num_cols.columns)/400)):
        gbdt_select_temp_df3.extend(GBDTselectfea(df_relation_num_cols[temp_col+['label']],max_num=150))
    elif(i==int(len(df_relation_num_cols.columns)/400)):
        gbdt_select_temp_df3.extend(GBDTselectfea(df_relation_num_cols[temp_col],max_num=150))
    print(len(gbdt_select_temp_df3))

print(len(gbdt_select_temp_df3))
print(gbdt_select_temp_df3)
gbdt_select_num_num_mix = df_relation_num_cols[gbdt_select_temp_df3+['label']]
gbdt_select_num_num_mix=if_field_is_same(gbdt_select_num_num_mix)
print(gbdt_select_num_num_mix.shape)
# 保存上一部处理过的数据到本地
gbdt_select_num_num_mix.to_csv('D:/gbdt_select_num_num_mix.csv',index = False)
gbdt_select_num_num_mix= pd.read_csv('D:/gbdt_select_num_num_mix.csv')
print(gbdt_select_num_num_mix.shape)
print(gbdt_select_num_num_mix)

4.7 Polynomial features

num_gen_feats = ['MONTH_GRYJCE_DWYJCE','DKFFE_SUB_DKYE','DKFFE_SUB_DKYE_DKLL','DKFFE_SUB_DKYE_1_DKLL',
'DKYE_DKLL','DKFFE_DKLL','DKFFE_1_DKLL','GRZHDNGJYE_SUB_GRZHSNJZYE','JIEXI','GRYJCE_TO_GRZHYE',
'YEAR_GRYJCE_TO_GRZHYE','MONTH_GRYJCE_DWYJCE_TO_GRZHYE','GRZHDNGJYE_TO_GRZHYE',
'GRZHSNJZYE_TO_GRZHYE','BUJIAO_ZHUANRU_SUB_TIQVE','GRZHYE_diff_GRZHDNGJYE','REAL_DKLL',
'GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
df[num_gen_feats]
def polynomial_features111(df, poly_num=0.15 ,change=0,degreenum=2):
    """
    poly_num:相关性    change:0原始字段不变,1输出新增字段    degreenum:阶数
    """
    num_gen_feats = ['MONTH_GRYJCE_DWYJCE','DKFFE_SUB_DKYE','DKFFE_SUB_DKYE_DKLL','DKFFE_SUB_DKYE_1_DKLL',
    'DKYE_DKLL','DKFFE_DKLL','DKFFE_1_DKLL','GRZHDNGJYE_SUB_GRZHSNJZYE','JIEXI','GRYJCE_TO_GRZHYE',
    'YEAR_GRYJCE_TO_GRZHYE','MONTH_GRYJCE_DWYJCE_TO_GRZHYE','GRZHDNGJYE_TO_GRZHYE',
    'GRZHSNJZYE_TO_GRZHYE','BUJIAO_ZHUANRU_SUB_TIQVE','GRZHYE_diff_GRZHDNGJYE','REAL_DKLL',
    'GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
    
    poly_features = df[:40000][num_gen_feats]
    # Create the polynomial object with specified degree
    poly_transformer = PolynomialFeatures(degree=degreenum, interaction_only=False, include_bias=False)
    poly_transformer.fit(poly_features)
    poly_features = poly_transformer.transform(poly_features)
    # 新特征是否与target有相关性。
    poly_features = pd.DataFrame(poly_features ,
        columns = poly_transformer.get_feature_names(input_features = num_gen_feats)
    )
    poly_features['TARGET'] =df[:40000]['label']
    poly_corrs = poly_features.corr()['TARGET'].sort_values()

    po_temp = []
    for i in range(len(poly_corrs)):
        if(abs(poly_corrs[i])>poly_num and poly_corrs.index[i] not in num_gen_feats + ['TARGET']):
            po_temp.append(poly_corrs.index[i])

    print('相关性>'+str(poly_num)+'的字段为:\n'+str(po_temp)+'\n共有'+str(len(po_temp))+'个')
    
    if(change == 1):
        dfpo = df[num_gen_feats]
        dfpo_transformer = PolynomialFeatures(degree=degreenum, interaction_only=False, include_bias=False)
        dfpo_transformer.fit(dfpo)
        dfpo = dfpo_transformer.transform(dfpo)
        dfpo = pd.DataFrame(dfpo ,
            columns = poly_transformer.get_feature_names(input_features = num_gen_feats)
        )
        # 新的字段拼接到df上
        df = dfpo[po_temp]

    return df,po_temp

# Make a new dataframe for polynomial features
df_poly,poly_field = polynomial_features111(df[num_gen_feats+['label']],poly_num=0.01 ,change=1,degreenum=2)
print(df_poly.shape)
print(len(poly_field))
gbdt_poly_df=[]
df_poly['label']=df['label']
gbdt_poly_df.extend(GBDTselectfea(df_poly[poly_field+['label']],max_num=100))
# 保存上一部处理过的数据到本地
df_poly[gbdt_poly_df].to_csv('D:/df_gbdt_poly_fea.csv',index = False)
df_gbdt_poly_fea= pd.read_csv('D:/df_gbdt_poly_fea.csv')
print(df_gbdt_poly_fea.shape)
print(df_gbdt_poly_fea)

4.8 Log transform of continuous variables
1. Look at the distribution of a numeric variable to check whether it is roughly normal; if not, apply a log transform and inspect it again.
2. If a batch of variables is to be standardized together, the variables that have already been normalized should be set aside first.
3. Why normalize: approximately normal inputs can help some models converge faster, and some models assume normality (e.g. GMM, KNN); at a minimum, avoid strongly skewed features, since heavy skew can hurt the predictions.
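A quick way to quantify whether the log transform helps is to compare the skewness before and after (a sketch using scipy.stats.skew; any of the columns listed below can be substituted for GRZHYE):

from scipy.stats import skew

col = 'GRZHYE'                       # illustrative choice
raw = df[col].dropna()
print('skew before log:', skew(raw))
print('skew after  log:', skew(np.log1p(raw.clip(lower=0))))   # log(x+1); clip guards against negative values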

value_vars = ['GRZHYE','GRJCJS', 'GRYJCE', 'YEAR_GRYJCE', 'MONTH_GRYJCE_DWYJCE', 'YEAR_GRYJCE_DWYJCE',
 'JIEXI','DKYE_DIV_GRYJCE_ADD_DWYJCE','GJJJKBL']
df = Normal_distribution(df, value_vars, 0)

4.9 Concatenate all the features selected above

# 原始字段和一些新字段
print("-----------------原始字段和一些新字段--------------------")
df_little_change = pd.read_csv('D:/df_little_change.csv')
print(df_little_change.shape)
print(df_little_change)
# 二值类数据
print("-----------------二值类数据--------------------")
rizhao_select_cate_2_col = pd.read_csv('D:/rizhao_select_cate_2_col.csv')
rizhao_select_cate_2_col['id'] = df_little_change['id']
print(rizhao_select_cate_2_col.shape)
print(rizhao_select_cate_2_col)
# 多值类数据
print("-----------------多值类数据--------------------")
rizhao_select_cate_col = pd.read_csv('D:/rizhao_select_cate_col.csv')
rizhao_select_cate_col['id'] = df_little_change['id']
print(rizhao_select_cate_col.shape)
print(rizhao_select_cate_col)
# 类别与数值交叉
print("-----------------类别与数值交叉--------------------")
gbdt_select_temp_df2= pd.read_csv('D:/gbdt_select_temp_df2.csv')
gbdt_select_temp_df2['id'] = df_little_change['id']
print(gbdt_select_temp_df2.shape)
print(gbdt_select_temp_df2)
# 数值与数值交叉
print("-----------------数值与数值交叉--------------------")
gbdt_select_num_num_mix= pd.read_csv('D:/gbdt_select_num_num_mix.csv')
gbdt_select_num_num_mix['id'] = df_little_change['id']
print(gbdt_select_num_num_mix.shape)
print(gbdt_select_num_num_mix)
# 多项式数据
print("-----------------多项式数据--------------------")
df_gbdt_poly_fea= pd.read_csv('D:/df_gbdt_poly_fea.csv')
df_gbdt_poly_fea['id'] = df_little_change['id']
print(df_gbdt_poly_fea.shape)
print(df_gbdt_poly_fea)
df = pd.merge(df_little_change,rizhao_select_cate_2_col ,on='id')
print(df.shape)
df = pd.merge(df,rizhao_select_cate_col ,on='id')
print(df.shape)
df = pd.merge(df,gbdt_select_temp_df2 ,on='id')
print(df.shape)
df = pd.merge(df,gbdt_select_num_num_mix ,on='id')
print(df.shape)
df = pd.merge(df,df_gbdt_poly_fea ,on='id')
print(df.shape)
_,missing = pankong(df)
df = fill_kongzhi(df)
_,missing = pankong(df)
print(len(df.columns))
print(len(set(df.columns)))
df=if_field_is_same(df)
df.shape
print(len(df.columns))
print(len(set(df.columns)))
col_temp=[]
for i in range(len(df.columns)):
    print(df.columns[i])
    if(df.columns[i] not in col_temp):
        if(" " in df.columns[i]):
            col_temp.append(df.columns[i].replace(" ", "_*_"))
        else:
            col_temp.append(df.columns[i])
            
print(len(col_temp))
print(col_temp)


df.columns = col_temp
for i in range(len(df.columns)):
    print(df.columns[i])
# 保存上一部处理过的数据到本地
df.to_csv('D:/df_concat.csv',index = False)

5. Model tuning
Approach 1:
Step 1: learning rate and number of iterations

import pandas as pd
import lightgbm as lgb
# from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
    
cols = [col for col in df.columns if col not in ['label','id']]
X=df[:40000][cols]
y=df[:40000]['label']
print(X.shape)
print(y.shape)

X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.2)
params = {    
          'boosting_type': 'gbdt',
          'objective': 'binary',
          'metric': 'auc',
          'nthread':12,
          'learning_rate':0.1,
          'num_leaves':32, 
          'max_depth': 5,   
          'subsample': 0.8, 
          'colsample_bytree': 0.8, 
    }
    
data_train = lgb.Dataset(X_train, y_train)
cv_results = lgb.cv(params, data_train, num_boost_round=1000, nfold=5, stratified=False, shuffle=True, metrics='auc',early_stopping_rounds=50,seed=0)
print('best n_estimators:', len(cv_results['auc-mean']))
print('best cv score:', pd.Series(cv_results['auc-mean']).max())

Step 2: tune max_depth and num_leaves

from sklearn.model_selection import GridSearchCV
params_test1={'max_depth': range(3,8,1), 'num_leaves':range(5, 100, 5)}             
gsearch1 = GridSearchCV(
    estimator=lgb.LGBMClassifier(
        boosting_type='gbdt',objective='binary',metrics='auc',
        learning_rate=0.1, 
        n_estimators=154, 
        max_depth=6, 
        bagging_fraction = 0.8,
        feature_fraction = 0.8), 
    param_grid = params_test1, 
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)

gsearch1.fit(X_train,y_train)
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

Step 3: tune min_data_in_leaf and max_bin

params_test2={'max_bin': range(5,256,10), 'min_data_in_leaf':range(1,102,10)}          
gsearch2 = GridSearchCV(
    estimator = lgb.LGBMClassifier(
        boosting_type='gbdt',objective='binary',
        metrics='auc',
        learning_rate=0.1, 
        n_estimators=154, 
        max_depth=6,     
        num_leaves=30,
        bagging_fraction = 0.8,
        feature_fraction = 0.8),
    param_grid = params_test2, 
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)

gsearch2.fit(X_train,y_train)
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_

Step 4: tune feature_fraction, bagging_fraction and bagging_freq

params_test3={'feature_fraction': [0.65,0.7,0.75,0.8,0.85,0.9,1.0],
              'bagging_fraction': [0.65,0.7,0.75,0.8,0.85,0.9,1.0],
              'bagging_freq': range(0,101,10)}
              
gsearch3 = GridSearchCV(
    estimator = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.1, 
        n_estimators=154, 
        max_depth=6,   
        num_leaves=30,
        max_bin=25,
        min_data_in_leaf=71), 
    param_grid = params_test3, 
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)

gsearch3.fit(X_train,y_train)
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_

Step 5: determine lambda_l1 and lambda_l2

# params_test4={'lambda_l1': [1e-5,1e-3,1e-1,0.0,0.1,0.3,0.5,0.7,0.9,1.0],
#               'lambda_l2': [1e-5,1e-3,1e-1,0.0,0.1,0.3,0.5,0.7,0.9,1.0]}

params_test4={'lambda_l1': [0.8,0.85,0.9,0.95],
              'lambda_l2': [0.8,0.85,0.9,0.95]}
              
gsearch4 = GridSearchCV(
    estimator = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.1, 
        n_estimators=154, 
        max_depth=6,                                         
        num_leaves=30,
        max_bin=25,
        min_data_in_leaf=71,
        bagging_fraction=0.65,
        bagging_freq=0, 
        feature_fraction= 0.8), 
    param_grid = params_test4, 
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
gsearch4.fit(X_train,y_train)
gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_

Step 6: determine min_split_gain

params_test5={'min_split_gain':[0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]}
              
gsearch5 = GridSearchCV(
    estimator = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.1, 
        n_estimators=154, 
        max_depth=6,                                          
        num_leaves=30,
        max_bin=25,
        min_data_in_leaf=71,
        bagging_fraction=0.65,
        bagging_freq= 0, 
        feature_fraction= 0.8,
        lambda_l1=0.9,
        lambda_l2=0.9), 
    param_grid = params_test5, 
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
gsearch5.fit(X_train,y_train)
gsearch5.cv_results_, gsearch5.best_params_, gsearch5.best_score_
# subsample (note: in LightGBM this is an alias of bagging_fraction, which is already fixed above)
params_test6={'subsample':[0.0,0.1,0.2,0.3,0.4]}
              
gsearch6 = GridSearchCV(
    estimator = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.1, 
        n_estimators=154, 
        max_depth=6,                                          
        num_leaves=30,
        max_bin=25,
        min_data_in_leaf=71,
        bagging_fraction=0.65,
        bagging_freq= 0, 
        feature_fraction= 0.8,
        lambda_l1=0.9,
        lambda_l2=0.9,
        min_split_gain=0), 
    param_grid = params_test6, 
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
gsearch6.fit(X_train,y_train)
gsearch6.cv_results_, gsearch6.best_params_, gsearch6.best_score_
# colsample_bytree (an alias of feature_fraction)
params_test7={'colsample_bytree':[0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]}
              
gsearch7 = GridSearchCV(
    estimator = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.1, 
        n_estimators=154, 
        max_depth=6,                                          
        num_leaves=30,
        max_bin=25,
        min_data_in_leaf=71,
        bagging_fraction=0.65,
        bagging_freq= 0, 
        feature_fraction= 0.8,
        lambda_l1=0.9,
        lambda_l2=0.9,
        min_split_gain=0,
        subsample=0), 
    param_grid = params_test7, 
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
gsearch7.fit(X_train,y_train)
gsearch7.cv_results_, gsearch7.best_params_, gsearch7.best_score_ 
# min_child_weight
params_test8={'min_child_weight':[0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]}
              
gsearch8 = GridSearchCV(
    estimator = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.1, 
        n_estimators=154, 
        max_depth=6,                                          
        num_leaves=30,
        max_bin=25,
        min_data_in_leaf=71,
        bagging_fraction=0.65,
        bagging_freq= 0, 
        feature_fraction= 0.8,
        lambda_l1=0.9,
        lambda_l2=0.9,
        min_split_gain=0,
        subsample=0,
        colsample_bytree=0), 
    param_grid = params_test8, 
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
gsearch8.fit(X_train,y_train)
gsearch8.cv_results_, gsearch8.best_params_, gsearch8.best_score_ 

I had little tuning experience and was not sure how the learning rate and number of iterations should be re-tuned once the hyperparameters above were fixed, so I simply wrote a search over both to find roughly good values.

# joint search over learning_rate and n_estimators
params_test9={'learning_rate':[0.02,0.03,0.04,0.05,0.06,0.07,0.08],
             'n_estimators':[1000,2000,5000,8000,10000,20000,30000]}
              
gsearch9 = GridSearchCV(
    estimator = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        max_depth=6,                                          
        num_leaves=30,
        max_bin=25,
        min_data_in_leaf=71,
        bagging_fraction=0.65,
        bagging_freq= 0, 
        feature_fraction= 0.8,
        lambda_l1=0.9,
        lambda_l2=0.9,
        min_split_gain=0,
        subsample=0,
        colsample_bytree=0,
        min_child_weight=0), 
    param_grid = params_test9, 
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
gsearch9.fit(X_train,y_train)
gsearch9.cv_results_, gsearch9.best_params_, gsearch9.best_score_ 

Step 7: lower the learning rate, increase the number of iterations and validate the model. Besides AUC, the loop below also tracks the competition metric via tpr_weight_funtion (defined with the evaluation code in section 6).

auc_list=[]
tpr_list=[]
for j in [0.018,0.019,0.02,0.021,0.022,0.023,0.024,0.025]:
    auc_list=[]
    tpr_list=[]
    for i in [5000,6000,7000,8000,10000,12000,15000,18000,20000,25000]:
        model=lgb.LGBMClassifier(
            learning_rate=j, 
            n_estimators=i, 
            max_depth=6,                                          
            num_leaves=30,
            max_bin=25,
            min_data_in_leaf=71,
            bagging_fraction=0.65,
            bagging_freq= 0, 
            feature_fraction= 0.8,
            lambda_l1=0.9,
            lambda_l2=0.9,
            min_split_gain=0,
            subsample=0,
            colsample_bytree=0,
            min_child_weight=0
        )
        model.fit(X_train,y_train)
        y_pre=model.predict_proba(X_test)[:, 1]
        print("---------------------------------------------------")
        print("learning_rate:"+str(j)+"  "+"n_estimators:"+str(i))
        auc=round(roc_auc_score(y_test,y_pre), 6)
        tpr=round(tpr_weight_funtion(y_test,y_pre), 6)
        if(auc in auc_list and tpr in tpr_list):
            print("---break---")
            break
        auc_list.append(auc)
        tpr_list.append(tpr)
        print("auc:",auc)
        print("tpr:",tpr)
        print("---------------------------------------------------")


# 0.06-10000-0.470364-0.941146
# 0.06-20000-0.470364-0.941146
# 0.05-20000-0.476182-0.941146

Method 2: the same staged search, done by hand with lgb.cv while keeping a running best_params dictionary.

import pandas as pd
import lightgbm as lgb
from sklearn import metrics
# from sklearn.datasets import load_breast_cancer
# from sklearn.cross_validation import train_test_split
 
# canceData=load_breast_cancer()
# X=canceData.data
# y=canceData.target
# X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.2)

cols = [col for col in df.columns if col not in ['label','id']]
X=df[:40000][cols]
y=df[:40000]['label']
print(X.shape)
print(y.shape)
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.2)

### Convert the data into LightGBM Datasets
print('Converting data')
lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,free_raw_data=False)
 
### Initial parameters (before cross-validated tuning)
print('Setting parameters')
params = {
          'boosting_type': 'gbdt',
          'objective': 'binary',
          'metric': 'auc',
          'nthread':4,
          'learning_rate':0.1
          }
 
### Cross-validated tuning
print('Cross-validated tuning')
max_auc = 0.0
best_params = {}
 
# Accuracy
print("Tuning step 1: improve accuracy")
for num_leaves in range(5,100,5):
    for max_depth in range(3,8,1):
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth
 
        cv_results = lgb.cv(
                            params,
                            lgb_train,
                            seed=1,
                            nfold=5,
                            metrics=['auc'],
                            early_stopping_rounds=10,
                            verbose_eval=True
                            )
            
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
            
        if mean_auc >= max_auc:
            max_auc = mean_auc
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth
if 'num_leaves' in best_params and 'max_depth' in best_params:
    params['num_leaves'] = best_params['num_leaves']
    params['max_depth'] = best_params['max_depth']
 
# Overfitting
print("Tuning step 2: reduce overfitting")
for max_bin in range(5,256,10):
    for min_data_in_leaf in range(1,102,10):
            params['max_bin'] = max_bin
            params['min_data_in_leaf'] = min_data_in_leaf
            
            cv_results = lgb.cv(
                                params,
                                lgb_train,
                                seed=1,
                                nfold=5,
                                metrics=['auc'],
                                early_stopping_rounds=10,
                                verbose_eval=True
                                )
                    
            mean_auc = pd.Series(cv_results['auc-mean']).max()
            boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
 
            if mean_auc >= max_auc:
                max_auc = mean_auc
                best_params['max_bin']= max_bin
                best_params['min_data_in_leaf'] = min_data_in_leaf
if 'max_bin' in best_params and 'min_data_in_leaf' in best_params:
    params['min_data_in_leaf'] = best_params['min_data_in_leaf']
    params['max_bin'] = best_params['max_bin']
 
print("调参3:降低过拟合")
for feature_fraction in [0.6,0.7,0.8,0.9,1.0]:
    for bagging_fraction in [0.6,0.7,0.8,0.9,1.0]:
        for bagging_freq in range(0,50,5):
            params['feature_fraction'] = feature_fraction
            params['bagging_fraction'] = bagging_fraction
            params['bagging_freq'] = bagging_freq
            
            cv_results = lgb.cv(
                                params,
                                lgb_train,
                                seed=1,
                                nfold=5,
                                metrics=['auc'],
                                early_stopping_rounds=10,
                                verbose_eval=True
                                )
                    
            mean_auc = pd.Series(cv_results['auc-mean']).max()
            boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
 
            if mean_auc >= max_auc:
                max_auc=mean_auc
                best_params['feature_fraction'] = feature_fraction
                best_params['bagging_fraction'] = bagging_fraction
                best_params['bagging_freq'] = bagging_freq
 
if 'feature_fraction' in best_params and 'bagging_fraction' in best_params and 'bagging_freq' in best_params:
    params['feature_fraction'] = best_params['feature_fraction']
    params['bagging_fraction'] = best_params['bagging_fraction']
    params['bagging_freq'] = best_params['bagging_freq']
 
 
print("调参4:降低过拟合")
for lambda_l1 in [1e-5,1e-3,1e-1,0.0,0.1,0.3,0.5,0.7,0.9,1.0]:
    for lambda_l2 in [1e-5,1e-3,1e-1,0.0,0.1,0.4,0.6,0.7,0.9,1.0]:
        params['lambda_l1'] = lambda_l1
        params['lambda_l2'] = lambda_l2
        cv_results = lgb.cv(
                            params,
                            lgb_train,
                            seed=1,
                            nfold=5,
                            metrics=['auc'],
                            early_stopping_rounds=10,
                            verbose_eval=True
                            )
                
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
 
        if mean_auc >= max_auc:
            max_auc=mean_auc
            best_params['lambda_l1'] = lambda_l1
            best_params['lambda_l2'] = lambda_l2
if 'lambda_l1' in best_params and 'lambda_l2' in best_params:
    params['lambda_l1'] = best_params['lambda_l1']
    params['lambda_l2'] = best_params['lambda_l2']
 
print("调参5:降低过拟合2")
for min_split_gain in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
    params['min_split_gain'] = min_split_gain
    
    cv_results = lgb.cv(
                        params,
                        lgb_train,
                        seed=1,
                        nfold=5,
                        metrics=['auc'],
                        early_stopping_rounds=10,
                        verbose_eval=True
                        )
            
    mean_auc = pd.Series(cv_results['auc-mean']).max()
    boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
 
    if mean_auc >= max_auc:
        max_auc=mean_auc
        
        best_params['min_split_gain'] = min_split_gain
if 'min_split_gain' in best_params.keys():
    params['min_split_gain'] = best_params['min_split_gain']
 
print(best_params)
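
Method 2 stops at printing best_params. For completeness, a minimal sketch (not part of the original write-up) of training a final booster with the tuned dictionary and scoring it on the held-out split; the 2000 rounds and the early-stopping patience of 50 are assumed placeholders, not values reported in the original run:

# Train with the tuned parameter dict; round count and early stopping are placeholders
final_booster = lgb.train(
    params,
    lgb_train,
    num_boost_round=2000,
    valid_sets=[lgb_eval],
    early_stopping_rounds=50
)
y_pred = final_booster.predict(X_test, num_iteration=final_booster.best_iteration)
print('holdout AUC:', metrics.roc_auc_score(y_test, y_pred))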

Method 3: Bayesian optimization

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error,  make_scorer, accuracy_score
from bayes_opt import BayesianOptimization
from sklearn.metrics import f1_score
answers = []
mean_score = 0
mean_f1_score = 0
n_folds = 5
sk = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=1024)

cols = [col for col in df.columns if col not in ['label','id']]
# X=df[:40000][cols]
# y=df[:40000]['label']

# StratifiedKFold is used here only to carve out a stratified subset: after the loop,
# X and y hold the training portion of the last fold (about 80% of the labelled rows)
for tr, te in sk.split(df[:40000][cols], df[:40000]['label']):
    X = df[:40000][cols].iloc[tr]
    y = df[:40000]['label'].iloc[tr]

print(X.shape)
print(y.shape)

# Define the objective function for the parameters being optimized
def rf_cv(n_estimators,learning_rate):
    val = cross_val_score(
        LGBMClassifier(
            learning_rate=min(learning_rate,0.15),
            n_estimators=int(n_estimators), 

#             boosting_type='dart',                         # boosting type: gbdt, dart, goss or rf
#             learning_rate=min(learning_rate,0.4),         # 0.05->0.918   0.07->0.924   0.08->0.926
#             n_estimators=int(n_estimators),               # number of boosting rounds, i.e. number of weak learners
#             max_depth=int(max_depth),                     # maximum tree depth; -1 means unlimited
#             num_leaves=int(num_leaves),                   # maximum leaves per tree, the main complexity knob (roughly 2^max_depth in xgboost terms)
#             subsample = min(subsample,0.9),               # row (sample) subsampling ratio
#             colsample_bytree = min(colsample_bytree,0.9), # column (feature) subsampling ratio
#             random_state=int(random_state),               # random seed
#             min_data_in_leaf=int(min_data_in_leaf),       # guards against overfitting in the leaves; best value depends on sample size and num_leaves
#             reg_alpha= min(reg_alpha,0.999),
#             reg_lambda= min(reg_lambda,0.999),
#             lambda_l1= 0.1,   # 0.1
#             lambda_l2=0.2,  # 0.2
#             min_split_gain=min(min_split_gain,0.9),
#             min_child_weight=min(min_child_weight,0.9),
#             metric='auc',                                 # evaluation metric: "rmse", "auc", 'binary_logloss'
            n_jobs=6,              # number of threads to run in parallel
            verbose=-1
        ),
        X,y,scoring="accuracy",cv=5
    ).mean()
    return val

# Bayesian optimization over the search space below
rf_bo = BayesianOptimization(rf_cv,
     { 
         "n_estimators":(1000,20000),
         "learning_rate":(0.001,0.1)
#          "colsample_bytree":(0.85,0.97),
#          "min_data_in_leaf":(100,2000)
#          "subsample":(0.7,0.9),
#          "max_depth":(25,40),
#          "num_leaves":(31,35)
#          "reg_alpha":(0.2,0.5),
#          "reg_lambda":(0.3,0.5),
#          "lambda_l1":(0.6,0.95),
#          "lambda_l2":(0.5,0.8),
#          "random_state":(0,1024),
#          "min_split_gain":(0.2,0.6),
#          "min_child_weight":(0.6,0.9)
     })
# Run the optimization
num_iter = 100
init_points = 5
rf_bo.maximize(init_points=init_points, n_iter=num_iter)  # run the search; the best result is in rf_bo.max below

rf_bo.max
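
rf_bo.max holds the best target value together with the corresponding parameters as floats. A small sketch (the name tuned_clf is introduced here, not in the original code) of feeding them back into a classifier:

# rf_bo.max looks like {'target': best_cv_score, 'params': {'learning_rate': ..., 'n_estimators': ...}}
best_params_bo = rf_bo.max['params']
tuned_clf = LGBMClassifier(
    learning_rate=min(best_params_bo['learning_rate'], 0.15),  # same clipping as inside rf_cv
    n_estimators=int(best_params_bo['n_estimators']),          # cast the searched float back to int
    n_jobs=6,
    verbose=-1
)
tuned_clf.fit(X, y)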

After the steps above the data had 55000 rows and 1971 columns. The tuned model did not perform well, in fact worse than an earlier run that used the raw data with Bayesian optimization.
So another round of feature selection was done at this point.

# GBDT-based feature selection, done in chunks of 400 columns
gbdt_col=[]
for i in tqdm(range(int(len(df.columns)/400+1))):
    temp_col=[]
    temp_col.extend(df.columns[i*400:i*400+400])
    if(i<int(len(df.columns)/400)):
        gbdt_col.extend(GBDTselectfea(df[temp_col+['label']],max_num=300))
    elif(i==int(len(df.columns)/400)):
        gbdt_col.extend(GBDTselectfea(df[temp_col],max_num=250))
print(len(gbdt_col))
print(gbdt_col)
# Recursive feature elimination (RFECV), done in chunks of 200 columns
rfecv_col=[]
for i in tqdm(range(int(len(df.columns)/200+1))):
    temp_col=[]
    temp_col.extend(df.columns[i*200:i*200+200])
    rfecv_col.extend(clf_rfecv(df[:40000][temp_col+['label']],temp_col+['label'],5))
print(len(rfecv_col))
print(rfecv_col)
gbdt_col.extend(['id','label'])
gbdt_rfecv_col=list(set(rfecv_col).union(set(gbdt_col)))
print(len(gbdt_rfecv_col))
df=df[gbdt_rfecv_col]
print(df.shape)
print(df)
# Save the selected feature set to disk
df.to_csv('D:/df_rfecv.csv',index = False)
# Load the saved data (to resume from this point later)
df= pd.read_csv('D:/df_rfecv.csv')
print(df.shape)
print(df)

6. Training
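
The block below refers to train_df, test_df, cols and submit, which are not constructed in this post. A minimal sketch of how they could be derived from the merged df (the first 40000 rows are the labelled training users, the remaining 15000 the test users), assuming the original row order has been preserved:

# Hypothetical setup for the training loop below
train_df = df[:40000].reset_index(drop=True)
test_df = df[40000:].reset_index(drop=True)
cols = [col for col in df.columns if col not in ['label', 'id']]
submit = pd.DataFrame()   # will receive the id / label columns for the submission file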

oof = np.zeros(train_df.shape[0])
# feat_imp_df = pd.DataFrame({'feat': cols, 'imp': 0})
test_df['prob'] = 0
clf = LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    learning_rate=0.015,
    n_estimators=6500,
#     metrics='auc',
    max_depth=6,                                          
    num_leaves=30,
    max_bin=25,
    min_data_in_leaf=71,
    bagging_fraction=0.65,
    bagging_freq= 0, 
    feature_fraction= 0.8,
    lambda_l1=0.9,
    lambda_l2=0.9,
    min_split_gain=0,
    metric=None,
    n_jobs=6,              # number of threads to run in parallel
    verbose=-1
)

val_aucs = []
seeds = [1023, 2048, 2098]
for seed in seeds:
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    for i, (trn_idx, val_idx) in enumerate(skf.split(train_df, train_df['label'])):
        print('--------------------- {} fold ---------------------'.format(i))
        t = time.time()
        trn_x, trn_y = train_df[cols].iloc[trn_idx].reset_index(drop=True), train_df['label'].values[trn_idx]
        val_x, val_y = train_df[cols].iloc[val_idx].reset_index(drop=True), train_df['label'].values[val_idx]
        clf.fit(
            trn_x, trn_y,
            eval_set=[(val_x, val_y)],
    #         categorical_feature=cate_cols,
            eval_metric='auc',
            early_stopping_rounds=200,
            verbose=200
        )
    #     feat_imp_df['imp'] += clf.feature_importances_ / skf.n_splits
        oof[val_idx] = clf.predict_proba(val_x)[:, 1]
        test_df['prob'] += clf.predict_proba(test_df[cols])[:, 1] / skf.n_splits / len(seeds)

    cv_auc = roc_auc_score(train_df['label'], oof)
    val_aucs.append(cv_auc)
    print('\ncv_auc: ', cv_auc)
print(val_aucs, np.mean(val_aucs))

Evaluation metric: TPR. The competition score is a weighted true positive rate: the TPR at false positive rates of 0.001, 0.005 and 0.01, combined with weights 0.4, 0.3 and 0.3.

def tpr_weight_funtion(y_true,y_predict):
    d = pd.DataFrame()
    d['prob'] = list(y_predict)
    d['y'] = list(y_true)
    d = d.sort_values(['prob'], ascending=[0])   # sort by predicted probability, descending
    y = d.y
    PosAll = pd.Series(y).value_counts()[1]      # total positives
    NegAll = pd.Series(y).value_counts()[0]      # total negatives
    pCumsum = d['y'].cumsum()                    # positives captured at each cutoff
    nCumsum = np.arange(len(y)) - pCumsum + 1    # negatives captured at each cutoff
    pCumsumPer = pCumsum / PosAll                # TPR at each cutoff
    nCumsumPer = nCumsum / NegAll                # FPR at each cutoff
    TR1 = pCumsumPer[abs(nCumsumPer-0.001).idxmin()]   # TPR where FPR is closest to 0.001
    TR2 = pCumsumPer[abs(nCumsumPer-0.005).idxmin()]   # TPR where FPR is closest to 0.005
    TR3 = pCumsumPer[abs(nCumsumPer-0.01).idxmin()]    # TPR where FPR is closest to 0.01

    return 0.4 * TR1 + 0.3 * TR2 + 0.3 * TR3
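
As a hypothetical cross-check (not part of the original pipeline), the same weighted-TPR score can be approximated from sklearn's ROC curve; results may differ slightly because this version interpolates the curve instead of picking the nearest cutoff:

from sklearn.metrics import roc_curve

def tpr_weight_sklearn(y_true, y_predict):
    # TPR interpolated at the three fixed FPR levels, combined with the competition weights
    fpr, tpr, _ = roc_curve(y_true, y_predict)
    tr1, tr2, tr3 = np.interp([0.001, 0.005, 0.01], fpr, tpr)
    return 0.4 * tr1 + 0.3 * tr2 + 0.3 * tr3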

tpr = round(tpr_weight_funtion(train_df['label'], oof), 6)
tpr, round(np.mean(val_aucs), 5)
# print(test_df)
submit['id'] = test_df['id']
submit['label'] = test_df['prob']

submit.to_csv('D:/submit12.csv', index = False)
submit.head()

(image placeholder)
This was my second time entering a competition. Looking back at my first attempt, when I understood almost nothing, I learned a lot more about the relevant techniques this time. Going forward I want to keep strengthening the fundamentals, follow more competitions, and combine them with my own research direction. Many of the ideas above are my own; I am not certain of their correctness or underlying principles, nor whether combining all of these methods actually gives the best result. Comments and corrections are very welcome.
