任务
参赛者需从真实场景和实际应用出发,利用个人的基本身份信息、住房公积金缴存和贷款等数据信息,建立准确的风险控制模型,预测用户是否会逾期还款。
提交说明:
- 结果
提交csv格式,编码为UTF-8,第一行为表头,如下例:
id,label
1,0.556
2,0.987
…
注:对于label字段,其中越接近0代表无逾期,越接近1代表逾期。
数据
训练集提供40000名、测试集提供15000名缴存人的基本信息、缴存信息和贷款信息。选手可以下载数据,在本地进行算法调试,在比赛页面提交结果。
数据样本如下:
1.导包
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# export CUDA_VISIBLE_DEVICES=0
# 打印 TF 可用的 GPU
print(os.environ['CUDA_VISIBLE_DEVICES'])
import warnings
import numpy as np
import pandas as pd
# import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,precision_recall_fscore_support,roc_curve,auc,roc_auc_score
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt
warnings.filterwarnings("ignore")
plt.rcParams['font.sans-serif']=['Simhei']
plt.rcParams['axes.unicode_minus']=False
import json
import matplotlib
from scipy.stats import chi2
import scipy
import seaborn as sns
from lightgbm.sklearn import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
# from xgboost import XGBClassifier
import time
from sklearn.ensemble import GradientBoostingClassifier
# from lightgbm import LGBMClassifier
from tqdm import tqdm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
2.工具函数
2.1统计相关系数较大的字段
def relation(df, poly_num=0.15):
    """Return the columns whose correlation with ``label`` exceeds ``poly_num``.

    The correlation of every column with ``label`` is computed three times
    with ``DataFrame.corr`` (``pearson``, ``kendall`` and ``spearman``) on the
    first 40000 rows of ``df``. A column is kept when the absolute value of
    its coefficient is above ``poly_num`` for at least one of the methods.

    Args:
        df: DataFrame that contains a ``label`` column.
        poly_num: absolute-correlation threshold.

    Returns:
        list: union of the selected column names (order is not guaranteed).
    """
    # The first 40000 rows are the slice the file uses as the labelled part.
    labelled = df[:40000]
    selected = set()
    for m in ('pearson', 'kendall', 'spearman'):
        poly_corrs = labelled.corr(method=m)['label'].sort_values()
        # Iterate (name, value) pairs instead of poly_corrs[i]: an integer key
        # on a label-indexed Series is a deprecated positional lookup.
        po_temp = [
            name for name, value in poly_corrs.items()
            if abs(value) > poly_num and name != 'label'
        ]
        print(str(m)+'相关性>'+str(poly_num)+'的字段为:\n'+str(po_temp)+'\n共有'+str(len(po_temp))+'个')
        # Union with what the previous methods selected.
        selected |= set(po_temp)
    all_cate_2_col = list(selected)
    print(len(all_cate_2_col))
    return all_cate_2_col
2.2循环递归消除法RFECV,进行特征选择
# Recursive feature elimination with cross-validation (RFECV).
def clf_rfecv(df, cate_2_cols, rank_num=1):
    """Select features with RFECV around a LightGBM classifier.

    Args:
        df: DataFrame with ``id``/``label`` columns; only the first 40000
            rows are used for fitting.
        cate_2_cols: ignored - it is immediately replaced by every column of
            ``df`` except ``id`` and ``label``.
        rank_num: keep the features whose RFECV ranking is <= this value.

    Returns:
        list: names of the retained features.
    """
    from sklearn.feature_selection import RFECV
    from sklearn.model_selection import StratifiedKFold

    cate_2_cols = [name for name in df.columns if name not in ['id', 'label']]
    head = df[:40000]
    X = head[cate_2_cols]
    y = head['label']
    print(X.shape)
    print(y.shape)

    # Base learner whose importances drive the elimination.
    lgbm_params = dict(
        boosting_type='gbdt',
        objective='binary',
        learning_rate=0.1,
        n_estimators=154,
        max_depth=6,
        num_leaves=30,
        max_bin=25,
        min_data_in_leaf=71,
        bagging_fraction=0.65,
        bagging_freq=0,
        feature_fraction=0.8,
        lambda_l1=0.9,
        lambda_l2=0.9,
    )
    estimator = LGBMClassifier(**lgbm_params)

    selector = RFECV(
        estimator=estimator,       # learner
        step=1,                    # features removed per round
        cv=StratifiedKFold(5),     # cross-validation splitter
        scoring='accuracy',        # evaluation metric
        verbose=1,
        n_jobs=12,
    )
    rfecv = selector.fit(X, y)
    rfecv.transform(X)

    print("RFECV特征选择结果——————————————————————————————————————————————————")
    # Number of features RFECV decided to keep.
    print("有效特征个数: \n"+str(rfecv.n_features_))

    rfecv_cate_2_col = []
    for name, rank in zip(cate_2_cols, rfecv.ranking_):
        if rank <= rank_num:
            print(name)
            rfecv_cate_2_col.append(name)
    print(len(rfecv_cate_2_col))
    return rfecv_cate_2_col
2.3找到数值变化较少的字段
# Columns that hardly vary.
def find_weak_filed(df):
    """Return the columns in which almost every row carries the same value.

    For each column the occurrences of all values except the most frequent
    one are summed; the column is reported when that sum is at most 50.

    Args:
        df: DataFrame to inspect.

    Returns:
        list: names of the weakly varying columns, in column order.
    """
    weak_filed = []
    for name in df.columns:
        # value_counts() is sorted by frequency, so position 0 is the dominant
        # value. Compute it once per column instead of once per inner step.
        counts = df[name].value_counts()
        else_sum = counts.iloc[1:].sum()
        if else_sum <= 50:
            weak_filed.append(name)
    return weak_filed
2.4 统计单值,二值,多分类,连续型字段
def find_filed_class(df,n=20):
    """Split the columns of ``df`` by their number of distinct values.

    Args:
        df: DataFrame to inspect.
        n: largest number of distinct values for a "multi-class" column.

    Returns:
        tuple of four lists:
            cate_1_cols - columns with exactly one distinct value;
            cate_2_cols - columns with two distinct values (``label`` excluded);
            cate_cols   - columns with 3..n distinct values (``DKLL`` excluded);
            num_cols1   - columns with more than n distinct values
                          (``id`` excluded).
    """
    cate_1_cols = []
    cate_2_cols = []
    cate_cols = []
    num_cols1 = []
    for i in tqdm(range(len(df.columns))):
        name = df.columns[i]
        # Count the distinct values once per column; the original recomputed
        # value_counts() for every comparison below.
        distinct = len(df[name].value_counts().index)
        if distinct == 1:
            cate_1_cols.append(name)
        # Separate chain on purpose: a single-valued column is only reported
        # in cate_1_cols unless n < 1 lets the last branch match as well.
        if distinct == 2 and name != 'label':
            cate_2_cols.append(name)
        elif 2 < distinct <= n and name != 'DKLL':
            cate_cols.append(name)
        elif distinct > n and name != 'id':
            num_cols1.append(name)
    print(len(cate_1_cols))
    print(len(cate_2_cols))
    print(len(cate_cols))
    print(len(num_cols1))
    return cate_1_cols,cate_2_cols,cate_cols,num_cols1
2.5GBDT衡量特征的重要性,进行特征选择
# How does GBDT measure feature importance?
# It sums, over all non-leaf nodes, the weighted impurity reduction achieved
# by the split; the larger the reduction (i.e. the gain of the split), the
# more important the feature used by that node.
# Tree models such as GBDT can therefore serve as the base model of
# sklearn.feature_selection.SelectFromModel, as done below.
def GBDTselectfea(df, max_num=200):
    """Select features with ``SelectFromModel`` over a GBDT classifier.

    Args:
        df: DataFrame with ``id``/``label`` columns; the first 40000 rows
            are used for fitting.
        max_num: upper bound on the number of selected features (capped at
            the number of available features).

    Returns:
        list: names of the selected features.
    """
    feature_names = [name for name in df.columns if name not in ['id', 'label']]
    head = df[:40000]
    X = head[feature_names]
    y = head['label']
    print(X.shape)
    print(y.shape)

    max_num = min(max_num, X.shape[1])

    grd = SelectFromModel(GradientBoostingClassifier(), max_features=max_num)
    grd.fit_transform(X, y)

    # Boolean mask, one entry per input column.
    gbdt_fea_select = grd.get_support()
    print(gbdt_fea_select)

    gbdt_select = []
    for name, keep in zip(X.columns, gbdt_fea_select):
        if keep == True:
            print(name)
            gbdt_select.append(name)
    print(len(gbdt_select))
    return gbdt_select
2.6构造多项式特征并计算相关系数较大的字段
def polynomial_features(df, poly_num=0.15 ,change=0,degreenum=2):
    """Build polynomial features and report those correlated with ``label``.

    The polynomial expansion is fitted on the first 40000 rows of ``df``;
    the correlation of every generated column with ``label`` is computed and
    the generated (non-original) columns whose absolute correlation exceeds
    ``poly_num`` are printed.

    Args:
        df: DataFrame containing ``label`` and every base feature below.
        poly_num: absolute-correlation threshold.
        change: 0 returns ``df`` unchanged; 1 returns a DataFrame holding
            only the selected generated columns, computed for all rows.
        degreenum: degree of the polynomial expansion.

    Returns:
        DataFrame: see ``change``.
    """
    num_gen_feats = ['YEAR_GRYJCE', 'MONTH_GRYJCE_DWYJCE', 'YEAR_GRYJCE_DWYJCE',
                     'DKYE_TO_DKFFE', 'DKFFE_SUB_DKYE', 'DKFFE_SUB_DKYE_TO_DKFFE', 'WEIHUAN_TO_YIHUAN',
                     'REAL_DKLL','DKFFE_SUB_DKYE_DKLL','DKFFE_SUB_DKYE_1_DKLL','DKYE_DKLL','DKYE_1_DKLL','DKFFE_DKLL',
                     'DKFFE_1_DKLL','GRZHDNGJYE_SUB_YEAR_GRYJCE_DWYJCE',
                     'GRZHDNGJYE_SUB_YEAR_GRYJCE','GRZHDNGJYE_SUB_GRZHSNJZYE','JIEXI','BUJIAO_ZHUANRU_SUB_TIQVE',
                     'GRYJCE_TO_GRZHYE','YEAR_GRYJCE_TO_GRZHYE','MONTH_GRYJCE_DWYJCE_TO_GRZHYE','GRZHDNGJYE_TO_GRZHYE',
                     'GRZHSNJZYE_TO_GRZHYE','BUJIAO_ZHUANRU_SUB_TIQVE','JIEXI_TO_GRZHDNGJYE',
                     'GJJJKBL','GRJCJS_TO_DKYE','GRJCJS_TO_DKFFE','GRJCJS_TO_GRZHDNGJYE','GRJCJS_TO_GRZHSNJZYE',
                     'GRJCJS_TO_GRZHYE','DKYE_DIV_GRYJCE_ADD_DWYJCE','GRYJCE_ADD_DWYJCE_TO_DKYE','GRZHYE_diff_GRZHDNGJYE',
                     'GRZHYE_diff_GRZHSNJZYE','GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
    # 'BUJIAO_ZHUANRU_SUB_TIQVE' is listed twice above; drop repeats (keeping
    # order) so the expansion does not produce duplicated column names.
    num_gen_feats = list(dict.fromkeys(num_gen_feats))

    labelled = df[:40000]
    poly_transformer = PolynomialFeatures(degree=degreenum, interaction_only=False, include_bias=False)
    poly_transformer.fit(labelled[num_gen_feats])

    # get_feature_names() was removed from recent scikit-learn releases.
    if hasattr(poly_transformer, 'get_feature_names_out'):
        feature_names = list(poly_transformer.get_feature_names_out(num_gen_feats))
    else:
        feature_names = list(poly_transformer.get_feature_names(input_features=num_gen_feats))

    poly_features = pd.DataFrame(
        poly_transformer.transform(labelled[num_gen_feats]),
        columns=feature_names,
    )
    # Assign by position: the expanded frame has a fresh RangeIndex.
    poly_features['TARGET'] = labelled['label'].values

    # Is any generated feature correlated with the target?
    poly_corrs = poly_features.corr()['TARGET'].sort_values()
    # The degree-1 outputs are exactly the base features; exclude them (and
    # the target itself) without relying on module-level lists.
    excluded = set(num_gen_feats) | {'TARGET'}
    po_temp = [
        name for name, value in poly_corrs.items()
        if abs(value) > poly_num and name not in excluded
    ]
    print('相关性>'+str(poly_num)+'的字段为:\n'+str(po_temp)+'\n共有'+str(len(po_temp))+'个')

    if change == 1:
        # Expand the same base features for every row so that the generated
        # names line up with the transformed columns.
        dfpo = pd.DataFrame(
            poly_transformer.transform(df[num_gen_feats]),
            columns=feature_names,
            index=df.index,
        )
        df = dfpo[po_temp]
    return df
2.7展示连续数据的分布和其log后的分布
def Normal_distribution(df, value_vars, change=0):
    """Plot each column next to its ``log(x + 1)`` version.

    Args:
        df: DataFrame holding the columns.
        value_vars: names of the columns to inspect.
        change: 0 leaves ``df`` untouched, 1 replaces each column by its
            log-transform, 2 adds the transform as ``<name>_log``.

    Returns:
        DataFrame: ``df`` (modified in place when ``change`` is 1 or 2).
    """
    for name in tqdm(value_vars):
        label = str(name)
        plt.figure(figsize=(16,5))
        plt.suptitle(label+'Distribution', fontsize=10)

        # Left panel: raw values.
        plt.subplot(1,2,1)
        raw_axes = sns.distplot(df[name])
        raw_axes.set_title(label+" Distribuition", fontsize=10)
        raw_axes.set_xlabel("数值")
        raw_axes.set_ylabel("Probability", fontsize=10)

        # Right panel: log(x + 1).
        plt.subplot(1,2,2)
        log_axes = sns.distplot(np.log(df[name]+1))
        log_axes.set_title(label+"(Log) Distribuition", fontsize=10)
        log_axes.set_xlabel("数值")
        log_axes.set_ylabel("Probability", fontsize=10)

    if change == 1:
        for name in tqdm(value_vars):
            df[name] = np.log(df[name]+1)
    if change == 2:
        for name in tqdm(value_vars):
            df[str(name)+'_log'] = np.log(df[name]+1)
    return df
2.8检测并删除数值完全一样的字段
# Detect (and drop) columns that duplicate an earlier column.
def if_field_is_same(df):
    """Drop every column whose values equal those of an earlier column.

    Two columns count as identical when no row compares unequal with
    ``!=``; consequently columns containing NaN are never reported (NaN is
    unequal to itself). The first column of each identical group is kept.

    Args:
        df: DataFrame to deduplicate column-wise.

    Returns:
        DataFrame: ``df`` without the duplicated columns.
    """
    all_df_cols = df.columns
    hight = len(df)
    del_filed = []
    for i in tqdm(range(0, len(all_df_cols) - 1, 1)):
        kept = all_df_cols[i]
        # With no rows there is nothing to compare; already-dropped columns
        # need no second pass.
        if hight == 0 or kept in del_filed:
            continue
        left = df[kept]
        for j in range(i + 1, len(all_df_cols), 1):
            candidate = all_df_cols[j]
            # One vectorised comparison per column pair replaces the
            # row-by-row df[col][k] loop, which also required a RangeIndex.
            if not (left != df[candidate]).any():
                del_filed.append(candidate)
    if len(del_filed) == 0:
        print('是否存在重复列: 否')
    else:
        del_filed = set(del_filed)
        print('存在重复列: '+str(len(del_filed))+'个\n为:'+str(del_filed))
        df = df.drop(list(del_filed), axis=1)
    return df
2.9绘制变量分布的散点图
# Value distribution of the given variables.
def shuzhifenbu(cols, high=40):
    """Scatter-plot each column of the global ``df`` against the row index.

    Args:
        cols: names of the columns to plot.
        high: figure height passed to ``plt.figure``.
    """
    total = len(cols)
    for position, name in enumerate(cols, start=1):
        plt.figure(figsize=(15, high))
        print(str(name)+"的数值分布")
        plt.subplot(total, 1, position)
        plt.title(name)
        # x axis: column value, y axis: row index.
        plt.scatter(df[name], df.index, s=1)
    plt.show()
    return
2.10卡方分箱
# Chi-square binning (ChiMerge).
# Chi-square statistic of a frequency table.
def chi3(arr):
    '''
    Compute the chi-square statistic of a frequency table.

    arr: two-dimensional numpy array of observed counts.
    return: sum over all cells of (observed - expected)**2 / expected, where
            cells whose expected count is 0 contribute nothing.
    '''
    assert(arr.ndim==2)
    # Row totals, column totals and grand total.
    R_N = arr.sum(axis=1)
    C_N = arr.sum(axis=0)
    N = arr.sum()
    # 0/0 cells are expected here and zeroed below, so silence the warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        # Expected count of cell (i, j) is C_j / N * R_i.
        E = (C_N / N)[np.newaxis, :] * R_N[:, np.newaxis]
        square = (arr - E) ** 2 / E
    # A zero expected count makes the ratio meaningless; leave it out.
    square[E == 0] = 0
    return square.sum()


# Determine the chi-square cut points.
def chiMerge(df,col,target,max_groups=None,threshold=None):
    '''
    Chi-square binning.

    df: pandas DataFrame.
    col: name of the (numeric) variable to bin.
    target: name of the class-label column.
    max_groups: maximum number of groups.
    threshold: chi-square threshold. When neither max_groups nor threshold
               is given, the 95% confidence value with (number of classes - 1)
               degrees of freedom is used.
    return: array with the start value of every group. Groups are
            left-closed/right-open: cutoffs [1, 2, 3] mean [1,2), [2,3), [3,+inf).
    '''
    freq_tab = pd.crosstab(df[col],df[target])
    # Work on a private copy: rows are merged in place below and the array
    # backing the crosstab may be read-only.
    freq = freq_tab.values.copy()
    # Initially every distinct value is its own group / cut point.
    cutoffs = freq_tab.index.values

    if max_groups is None and threshold is None:
        cls_num = freq.shape[-1]
        threshold = chi2.isf(0.05,df= cls_num - 1)

    # With a single group left there is no adjacent pair to merge (the
    # original compared None with the threshold at this point and crashed).
    while len(freq) > 1:
        pair_chi = [chi3(freq[i:i+2]) for i in range(len(freq) - 1)]
        # First adjacent pair with the smallest chi-square value.
        minidx = min(range(len(pair_chi)), key=pair_chi.__getitem__)
        minvalue = pair_chi[minidx]

        too_many_groups = max_groups is not None and max_groups < len(freq)
        below_threshold = threshold is not None and minvalue < threshold
        if not (too_many_groups or below_threshold):
            break

        # Merge row minidx+1 into row minidx and drop its cut point.
        freq[minidx] = freq[minidx] + freq[minidx+1]
        freq = np.delete(freq,minidx+1,0)
        cutoffs = np.delete(cutoffs,minidx+1,0)
    return cutoffs
# Map a raw value to its group.
def value2group(x,cutoffs):
    '''
    Convert a value into the name of the group it falls in.

    x: value to convert.
    cutoffs: start value of every group (any order).
    return: group name, numbered from 'group1'.
    '''
    starts = sorted(cutoffs)
    # Values below the first start are put into the first group; outliers
    # are best handled before binning.
    if x < starts[0]:
        return 'group1'
    for number, (low, high) in enumerate(zip(starts, starts[1:]), start=1):
        if low <= x < high:
            return 'group{}'.format(number)
    # Last group; it also receives arbitrarily large values.
    return 'group{}'.format(len(starts))
# WOE encoding.
def calWOE(df ,var ,target):
    '''
    Compute the WOE (weight of evidence) encoding.

    param df: pandas DataFrame
    param var: name of the already-grouped column, without missing values
    param target: name of the response column (values 0 and 1)
    return: dict mapping each group to its WOE value
    '''
    eps = 0.000001  # keeps every ratio finite
    group_counts = pd.crosstab(df[var],df[target]) + eps
    class_totals = df[target].value_counts() + eps
    # Share of each class that falls into each group.
    shares = group_counts/class_totals
    shares['woe'] = np.log(shares[1]/shares[0])
    return shares['woe'].to_dict()
# Information value.
def calIV(df,var,target):
    '''
    Compute the IV (information value) of a grouped variable.

    param df: pandas DataFrame
    param var: name of the already-grouped column, without missing values
    param target: name of the response column (values 0 and 1)
    return: IV value
    '''
    eps = 0.000001  # keeps every ratio finite
    group_counts = pd.crosstab(df[var],df[target]) + eps
    class_totals = df[target].value_counts() + eps
    shares = group_counts/class_totals
    shares['woe'] = np.log(shares[1]/shares[0])
    # IV contribution of a group: (share of 1s - share of 0s) * WOE.
    shares['iv'] = (shares[1] - shares[0])*shares['woe']
    return shares['iv'].sum()
2.11 使用LGBMClassifier计算feature_importances_
# Keep the features whose LightGBM importance is non-zero.
def important_featrue(pre_train, pre_train_label):
    """Fit a LightGBM classifier and save the features it actually used.

    The names of the features with a non-zero ``feature_importances_`` entry
    are written to 'D:/useful_featrue.csv' and their count is printed.

    Args:
        pre_train: training feature DataFrame.
        pre_train_label: training labels.
    """
    params = dict(
        boosting_type='dart',     # boosting flavour (gbdt, dart, goss, rf)
        learning_rate=0.23,
        n_estimators=150,         # number of boosted trees
        max_depth=31,             # maximum depth of each tree
        num_leaves=1053,          # maximum leaves per tree
        subsample=0.2707,         # row sampling rate
        colsample_bytree=0.95,    # column sampling rate
        random_state=6,
        min_data_in_leaf=124,     # guards against over-fitting in leaves
        reg_alpha=0.2462,
        reg_lambda=0.3140,
        min_split_gain=0.22,
        min_child_weight=0.84,
        metric='auc',
        n_jobs=12,
        verbose=-1,
    )
    svc = LGBMClassifier(**params)
    x_train = pre_train
    y_train = pre_train_label
    svc.fit(x_train, y_train)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["importance"] = svc.feature_importances_
    fold_importance_df["featrue_name"] = x_train.columns[0:]

    scores = fold_importance_df['importance']
    names = fold_importance_df['featrue_name']
    useful_featrue = []
    for row in tqdm(range(len(fold_importance_df))):
        if scores[row] != 0:
            useful_featrue.append(names[row])

    useful_featrue = pd.DataFrame(useful_featrue, columns=['featrue_name'])
    useful_featrue.to_csv('D:/useful_featrue.csv',index=0)
    print(len(useful_featrue))
    return
2.12 找到空字段
# Check which columns contain missing values.
def pankong(df):
    """Report the columns of ``df`` that contain nulls.

    Prints every column that has at least one null plus a summary line, and
    draws a bar chart of the missing ratios when any column is affected.

    Args:
        df: DataFrame to inspect.

    Returns:
        tuple: ``(col_is_null, missing)`` where ``col_is_null`` lists the
        affected columns and ``missing`` is a Series of their missing ratios
        sorted ascending (empty when nothing is missing).
    """
    has_null = df.isnull().any()
    col_is_null = []
    for name, flag in has_null.items():
        if flag == True:
            print(str(name) + "---------" + str(flag))
            col_is_null.append(name)
    print("共有字段:"+str(len(has_null))+"个 "+" 含有空值的:"+str(len(col_is_null))+"个")

    # Always build the ratio Series so the return value is defined even when
    # no column has nulls (the original left `missing` unbound in that case).
    missing = df.isnull().sum()/len(df)
    missing = missing[missing > 0].sort_values()
    if col_is_null:
        plt.figure(figsize=(20, 8), dpi=80)
        missing.plot.bar()
    return col_is_null,missing
2.13选出缺失率>0.1的字段,并删除
# Drop the columns whose missing ratio is above the given rate.
def select_missing_rate(df,missing,rate=0.1):
    """Drop the columns of ``df`` that are missing too often.

    Args:
        df: DataFrame to filter.
        missing: Series mapping column name to missing ratio.
        rate: columns with a ratio strictly above this value are dropped;
            ``label`` is never dropped.

    Returns:
        DataFrame: ``df`` without the dropped columns.
    """
    to_drop = [
        name
        for name, ratio in zip(missing.index, missing.values)
        if name != 'label' and ratio > rate
    ]
    print(to_drop)
    return df.drop(to_drop, axis=1)
# df = select_missing_rate(df,missing,rate=0.1)
2.14 用众数填充或删除含空值的字段
# Fill (with the mode) or drop the columns that contain nulls.
def fill_kongzhi(df,fill="del"):
    """Handle the columns of ``df`` that contain missing values.

    Every column with nulls is printed; ``label`` is reported but never
    filled or dropped.

    Args:
        df: DataFrame to clean (modified in place when ``fill == "mode"``).
        fill: ``"mode"`` fills the nulls of each affected column with that
            column's most frequent value; ``"del"`` drops the affected
            columns; any other value leaves the data untouched.

    Returns:
        DataFrame: the cleaned frame.
    """
    has_null = df.isnull().any()
    exist_nan = []
    null_columns = 0
    for name, flag in has_null.items():
        if flag == True:
            print(str(name) + "---------" + str(flag))
            if name != 'label':
                exist_nan.append(name)
            null_columns += 1
    print("共有字段:"+str(len(has_null))+"个 "+" 含有空值的:"+str(null_columns)+"个")

    if fill == "mode":
        print(len(exist_nan))
        for name in exist_nan:
            modes = df[name].mode()
            print(str(name)+"---众数为---"+str(modes))
            # Fill with the mode that is announced above (the original
            # filled with the median) and assign the result back instead of
            # the chained inplace fillna, which may act on a temporary copy.
            if not modes.empty:
                df[name] = df[name].fillna(modes.iloc[0])
    if fill == "del":
        df = df.drop(exist_nan, axis=1)
    return df
3 数据准备和简单的特征工程
3.1载入数据
# Folder that holds the competition files. A raw string keeps the Windows
# backslashes literal ('\B' is not a valid escape sequence).
DATA_DIR = r'J:\BaiduNetdiskDownload\公积金逾期预测-数据\公积金逾期预测-数据'
train_df = pd.read_csv(DATA_DIR + '/train.csv')
test_df = pd.read_csv(DATA_DIR + '/test.csv')
submit = pd.read_csv(DATA_DIR + '/submit.csv')
train_df.shape, test_df.shape, submit.shape
train_df_copy = train_df.copy()
test_df_copy = test_df.copy()
train_df_label = train_df_copy['label']  # keep the labels
# train_df_copy=train_df_copy.drop(['label'],axis=1)
# Stack train and test rows; the index is rebuilt so that train rows come
# first and test rows follow.
df = pd.concat([train_df_copy, test_df_copy], axis = 0).reset_index(drop = True)
# df = pd.concat((train_df_copy, test_df_copy), axis=0)
print(df.shape)
3.2将原始变量区分连续变量,多类别变量,两类别变量
# Alias of the training copy used by the exploratory cells.
train = train_df_copy
# Categorical variables
cate_2_cols = ['XINGBIE', 'ZHIWU', 'XUELI']
cate_cols = ['HYZK', 'ZHIYE', 'ZHICHEN', 'DWJJLX', 'DWSSHY', 'GRZHZT']
# Continuous variables
num_cols = ['GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
# train[num_cols]
# train[cate_cols]
# train['XUELI'].value_counts()
3.3查看原始变量的数值分布
# Scatter each raw variable against the row index of the combined frame.
shuzhifenbu(cate_cols,25)
shuzhifenbu(cate_2_cols,15)
shuzhifenbu(num_cols)
# Observation: the fields whose value distribution differs visibly between
# the training rows and the test rows are DKLL, ZHIWU, DWSSHY and HYZK.
3.4查看异常值
检测异常的方法一:均方差
在统计学中,如果一个数据分布近似正态,那么大约 68% 的数据值会在均值的一个标准差范围内,大约 95% 会在两个标准差范围内,大约 99.7% 会在三个标准差范围内。
# 暂时不删除异常值!
# def find_outliers_by_3segama(data,fea):
# data_std = np.std(data[fea])
# data_mean = np.mean(data[fea])
# outliers_cut_off = data_std * 3
# lower_rule = data_mean - outliers_cut_off
# upper_rule = data_mean + outliers_cut_off
# data[fea+'_outliers'] = data[fea].apply(lambda x:str('异常值') if x > upper_rule or x < lower_rule else '正常值')
# return data
# for fea in num_cols:
# data_train = find_outliers_by_3segama(train,fea)
# print(train[fea+'_outliers'].value_counts())
# print(train.groupby(fea+'_outliers')['label'].sum())
# print('*'*10)
# #删除异常值
# for fea in num_cols:
# train = train[train[fea+'_outliers']=='正常值']
# train = train.reset_index(drop=True)
# print(train)
# numerical_fea = list(df.select_dtypes(exclude=['object']).columns)
# category_fea = list(filter(lambda x: x not in numerical_fea,list(df.columns)))
# print(numerical_fea)
# print(category_fea)
此段只观测了异常值,未进行修改
检测异常的方法二:箱型图(未做)
3.5 出生年月
# Turn the birth timestamp into an age. Downstream, 'CSY' holds the birth
# month, 'CSNY' the binned age and 'age' the age itself.
def transform_csny_to_age(i, ref_year=2020):
    """Convert a Unix birth timestamp into an age in years.

    Args:
        i: Unix timestamp in seconds, or in milliseconds (detected by
            magnitude and scaled down).
        ref_year: year the age is measured against.

    Returns:
        int: ``ref_year`` minus the UTC birth year.
    """
    # Ten or more integer digits means milliseconds. Testing the magnitude
    # (rather than len(str(i))) also works for floats and negative values.
    if abs(i) >= 10 ** 10:
        i = i / 1000
    birth = time.gmtime(int(i))
    return ref_year - birth[0]
def transform_csny_to_month(i):
    """Convert a Unix birth timestamp into the UTC birth month (1-12).

    Args:
        i: Unix timestamp in seconds, or in milliseconds (detected by
            magnitude and scaled down).

    Returns:
        int: month of the year.
    """
    # Ten or more integer digits means milliseconds. Testing the magnitude
    # (rather than len(str(i))) also works for floats and negative values.
    if abs(i) >= 10 ** 10:
        i = i / 1000
    birth = time.gmtime(int(i))
    return birth[1]
# Birth month: copy the raw timestamp column, then convert it.
df['CSY'] = df['CSNY']
df['CSY'] = df['CSY'].transform(transform_csny_to_month)
# Age: CSNY is overwritten in place with the age in years.
# df['CSN'] = df['CSNY']
df['CSNY'] = df['CSNY'].transform(transform_csny_to_age)
# Distribution and frequency table of the birth month.
sns.distplot(df['CSY'][df['CSY'] > 0])
print(df['CSY'].value_counts())
def get_age(df, col='age', thresholds=(23, 28, 32, 36, 43, 50)):
    """Add one 0/1 indicator column per age threshold.

    Args:
        df: DataFrame to extend in place.
        col: name of the column to threshold (the original always read
            ``df['age']`` regardless of this argument).
        thresholds: cut-offs; indicator ``k`` is 1 where the value is
            strictly greater than the k-th cut-off.

    Returns:
        tuple: ``(df, names)`` with the names of the added columns
        (``<col>_genFeat1`` ...).
    """
    names = []
    for number, cutoff in enumerate(thresholds, start=1):
        name = col + f'_genFeat{number}'
        df[name] = (df[col] > cutoff).astype(int)
        names.append(name)
    return df, names
# At this point CSNY holds the age in years; keep a copy as 'age' and add
# the threshold indicator columns for it.
df['age'] = df['CSNY']
df, genFeats1 = get_age(df, col = 'age')
sns.distplot(df['age'][df['age'] > 0])
# Bin the age into seven ordinal classes.
# print(set(train_test_data_copy["年龄"]))
def transform_age(x_age):
    """Map an age to its bin number.

    Bins: <23 -> 1, [23,28) -> 2, [28,32) -> 3, [32,36) -> 4, [36,43) -> 5,
    [43,50) -> 6, >=50 -> 7. A value that matches no bin (e.g. NaN) yields
    ``None``.
    """
    edges = (23, 28, 32, 36, 43, 50)
    if x_age < edges[0]:
        return 1
    for code, (low, high) in enumerate(zip(edges, edges[1:]), start=2):
        if low <= x_age < high:
            return code
    if x_age >= edges[-1]:
        return 7
    return None
# Replace the age stored in CSNY by its bin number and show the bin sizes.
df['CSNY'] = df['CSNY'].transform(transform_age)
print(df['CSNY'].value_counts())
3.6 贷款余额、贷款发放额
def get_daikuanYE(df,col):
    """Add seven 0/1 indicators telling whether ``df[col]`` exceeds a cut-off.

    Cut-offs: 100000, 120000, 140000, 180000, 220000, 260000, 300000
    (strictly greater than).

    Returns:
        tuple: ``(df, names)`` with the names of the added columns.
    """
    cutoffs = (100000, 120000, 140000, 180000, 220000, 260000, 300000)
    names = []
    for number, cutoff in enumerate(cutoffs, start=1):
        name = col + '_genFeat' + str(number)
        df[name] = (df[col] > cutoff).astype(int)
        names.append(name)
    return df, names
# Threshold indicators for the loan balance column DKYE.
df, genFeats2 = get_daikuanYE(df, col = 'DKYE')
def get_daikuanFFE(df,col):
    """Add seven 0/1 indicators telling whether ``df[col]`` exceeds a cut-off.

    Cut-offs: 100000, 120000, 140000, 180000, 220000, 260000, 300000
    (strictly greater than).

    Returns:
        tuple: ``(df, names)`` with the names of the added columns.
    """
    names = [col + '_genFeat' + str(number) for number in range(1, 8)]
    limits = [100000, 120000, 140000, 180000, 220000, 260000, 300000]
    for name, limit in zip(names, limits):
        df[name] = (df[col] > limit).astype(int)
    return df, names
# Threshold indicators for the loan amount column DKFFE.
df, genFeats3 = get_daikuanFFE(df, col = 'DKFFE')
# Distribution of DKYE (left) and DKFFE (right) among the rows with label 1.
plt.figure(figsize = (8, 2))
plt.subplot(1,2,1)
sns.distplot(df['DKYE'][df['label'] == 1])
plt.subplot(1,2,2)
sns.distplot(df['DKFFE'][df['label'] == 1])
# Background: microcredit is a business loan centred on an individual or a
# family, mainly serving self-employed traders, small workshops and small
# business owners; the amount is usually between 1,000 and 200,000 yuan.
def transform_dkye(dkye):
    """Map a loan balance to an ordinal class.

    Classes: [0,1000) -> 1, [1000,50000) -> 2, then 50000-wide bands up to
    [250000,300000) -> 7, and >=300000 -> 8. A value that matches no band
    (negative, NaN) yields ``None``.
    """
    edges = (0, 1000, 50000, 100000, 150000, 200000, 250000, 300000)
    for code, (low, high) in enumerate(zip(edges, edges[1:]), start=1):
        if low <= dkye < high:
            return code
    if dkye >= edges[-1]:
        return 8
    return None
# Ordinal class of the loan balance, stored next to the raw column.
df['DKYE_class'] = df['DKYE']
df['DKYE_class'] = df['DKYE_class'].transform(transform_dkye)
def transform_dkffe(dkye):
    """Map a loan amount to an ordinal class.

    Classes: [0,1000) -> 1, [1000,50000) -> 2, then 50000-wide bands up to
    [250000,300000) -> 7, and >=300000 -> 8. A value that matches no band
    (negative, NaN) yields ``None``.
    """
    if dkye >= 300000:
        return 8
    # (lower bound, upper bound, class) for every half-open band.
    bands = (
        (0, 1000, 1),
        (1000, 50000, 2),
        (50000, 100000, 3),
        (100000, 150000, 4),
        (150000, 200000, 5),
        (200000, 250000, 6),
        (250000, 300000, 7),
    )
    for low, high, code in bands:
        if low <= dkye < high:
            return code
    return None
# Ordinal class of the loan amount, then the class sizes of both loan fields.
df['DKFFE_class'] = df['DKFFE']
df['DKFFE_class'] = df['DKFFE_class'].transform(transform_dkffe)
print(df['DKYE_class'].value_counts(),
df['DKFFE_class'].value_counts())
3.7个人月缴存额
def get_GRYJCE(df,col):
    """Add seven 0/1 indicators telling whether ``df[col]`` exceeds a cut-off.

    Cut-offs: 400, 600, 800, 1000, 1200, 1400, 1600 (strictly greater than).

    Returns:
        tuple: ``(df, names)`` with the names of the added columns.
    """
    names = []
    for number, cutoff in enumerate(range(400, 1601, 200), start=1):
        name = col + '_genFeat' + str(number)
        df[name] = (df[col] > cutoff).astype(int)
        names.append(name)
    return df, names
# Threshold indicators for the personal monthly deposit column GRYJCE.
df, genFeats4 = get_GRYJCE(df, col = 'GRYJCE')
# Distribution of GRYJCE among the rows with label 1.
plt.figure(figsize = (8, 2))
plt.subplot(1,2,1)
sns.distplot(df['GRYJCE'][df['label'] == 1])
def transform_GRYJCE(dkye):
    """Map a monthly deposit to an ordinal class.

    Classes: <=400 -> 1, (400,600] -> 2, then 200-wide bands up to
    (1400,1600] -> 7, and >1600 -> 8. NaN yields ``None``.
    """
    if dkye <= 400:
        return 1
    lower = 400
    for code, upper in enumerate((600, 800, 1000, 1200, 1400, 1600), start=2):
        if lower < dkye <= upper:
            return code
        lower = upper
    if dkye > 1600:
        return 8
    return None
# Ordinal class of the personal monthly deposit and its class sizes.
df['GRYJCE_class'] = df['GRYJCE']
df['GRYJCE_class'] = df['GRYJCE_class'].transform(transform_GRYJCE)
print(df['GRYJCE_class'].value_counts())
3.8个人缴款基数
def get_GRYJCE(df,col):
    """Add five 0/1 indicators telling whether ``df[col]`` exceeds a cut-off.

    Cut-offs: 2000, 4000, 6000, 8000, 12000 (strictly greater than). This
    definition replaces the earlier function of the same name and is the one
    applied to the contribution base column.

    Returns:
        tuple: ``(df, names)`` with the names of the added columns.
    """
    # The last cut-off was written as 1200, which made indicator 5 weaker
    # than indicator 1; 12000 continues the increasing sequence.
    cutoffs = (2000, 4000, 6000, 8000, 12000)
    names = []
    for number, cutoff in enumerate(cutoffs, start=1):
        name = col + f'_genFeat{number}'
        df[name] = (df[col] > cutoff).astype(int)
        names.append(name)
    return df, names
# Threshold indicators for the contribution base column GRJCJS (uses the
# get_GRYJCE definition that is current at this point of the notebook).
df, genFeats5 = get_GRYJCE(df, col = 'GRJCJS')
# Distribution of GRJCJS among the rows with label 1.
plt.figure(figsize = (8, 2))
plt.subplot(1,2,1)
sns.distplot(df['GRJCJS'][df['label'] == 1])
def transform_GRJCJS(dkye):
    """Map a contribution base to an ordinal class.

    Classes: [0,2000) -> 1, [2000,4000) -> 2, [4000,6000) -> 3,
    [6000,8000) -> 4, [8000,12000) -> 5, >=12000 -> 6. A value that matches
    no band (negative, NaN) yields ``None``.
    """
    edges = (0, 2000, 4000, 6000, 8000, 12000)
    for code, (low, high) in enumerate(zip(edges, edges[1:]), start=1):
        if low <= dkye < high:
            return code
    if dkye >= edges[-1]:
        return 6
    return None
# Ordinal class of the contribution base and its class sizes.
df['GRJCJS_class'] = df['GRJCJS']
df['GRJCJS_class'] = df['GRJCJS_class'].transform(transform_GRJCJS)
print(df['GRJCJS_class'].value_counts())
3.9 个人账户余额、上年归结余额
def get_GRZHYE(df,col):
    """Add five 0/1 indicators telling whether ``df[col]`` exceeds a cut-off.

    Cut-offs: 2000, 4000, 8000, 12000, 20000 (strictly greater than).

    Returns:
        tuple: ``(df, names)`` with the names of the added columns.
    """
    cutoffs = (2000, 4000, 8000, 12000, 20000)
    names = []
    for number, cutoff in enumerate(cutoffs, start=1):
        name = col + '_genFeat' + str(number)
        df[name] = (df[col] > cutoff).astype(int)
        names.append(name)
    return df, names
# Threshold indicators for the account balance (GRZHYE) and for the balance
# carried over from the previous year (GRZHSNJZYE).
df, genFeats6 = get_GRZHYE(df, col = 'GRZHYE')
df, genFeats7 = get_GRZHYE(df, col = 'GRZHSNJZYE')
# Distribution of both balances among the rows with label 1.
plt.figure(figsize = (8, 2))
plt.subplot(1,2,1)
sns.distplot(df['GRZHYE'][df['label'] == 1])
plt.subplot(1,2,2)
sns.distplot(df['GRZHSNJZYE'][df['label'] == 1])
def transform_GRZHYE(dkye):
    """Map an account balance to an ordinal class.

    Classes: [0,2000) -> 1, [2000,4000) -> 2, [4000,8000) -> 3,
    [8000,12000) -> 4, [12000,20000) -> 5, >=20000 -> 6. A value that
    matches no band (negative, NaN) yields ``None``.
    """
    edges = (0, 2000, 4000, 8000, 12000, 20000)
    for code, (low, high) in enumerate(zip(edges, edges[1:]), start=1):
        if low <= dkye < high:
            return code
    if dkye >= edges[-1]:
        return 6
    return None
# Ordinal classes of the two balance columns (same banding for both) and
# their class sizes.
df['GRZHYE_class'] = df['GRZHYE']
df['GRZHYE_class'] = df['GRZHYE_class'].transform(transform_GRZHYE)
df['GRZHSNJZYE_class'] = df['GRZHSNJZYE']
df['GRZHSNJZYE_class'] = df['GRZHSNJZYE_class'].transform(transform_GRZHYE)
print(df['GRZHYE_class'].value_counts(),
df['GRZHSNJZYE_class'].value_counts())
3.10 消除DKLL的扰动
# Remove the DKLL (loan interest rate) disturbance: test rows whose rate is
# not one of the six most common test-set rates get a rate predicted by a
# KNN classifier trained on the rows that do carry a common rate.
dkll = test_df_copy['DKLL'].value_counts()
dkll_value = pd.DataFrame(data={'colname': dkll.index,'value':dkll.values})
dkll_value[:6]
temp_dkll_value = dkll_value[:6]['colname']
print(temp_dkll_value)

# Training rows: every row of df whose rate is one of the six common values.
# A single vectorised isin() replaces the row-by-row scalar comparisons.
dkll_index = df.index[df['DKLL'].isin(temp_dkll_value)].tolist()
print(len(dkll_index))

# Rows to impute: test rows (index 40000-54999) that are not training rows.
# Set membership keeps this linear instead of scanning the list every time.
test_all_index = list(range(40000,55000))
common_rate_rows = set(dkll_index)
test_index = [i for i in test_all_index if i not in common_rate_rows]
print(len(test_index))

# DKLL training set
tarin_df_dkll = df.loc[dkll_index]
print(tarin_df_dkll)
# DKLL prediction set
test_df_dkll = df.loc[test_index]
print(test_df_dkll)
pankong(tarin_df_dkll)

dkll_cols = [col for col in tarin_df_dkll.columns if col not in ['DKLL','label','id']]
X = tarin_df_dkll[dkll_cols]
# One indicator column per distinct rate, in ascending rate order.
Y = pd.get_dummies(tarin_df_dkll['DKLL'])
print(Y)

# Alternatives tried by the author: decision tree, random forest.
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
clf.fit(X,Y)

# Copy so that adding the predicted column does not write into a slice of df.
test_df_dkll = test_df_dkll[dkll_cols].copy()
if len(test_df_dkll) > 0:
    res = clf.predict(test_df_dkll)
else:
    # Nothing to impute; keep the downstream shapes consistent.
    res = np.empty((0, Y.shape[1]))
print(len(res))

# Translate each predicted indicator row back into a rate: the first column
# flagged with 1 wins, and rows without any flag fall back to 2.708. The
# rates are read from Y's columns instead of being hard-coded.
rate_of_column = np.asarray(Y.columns, dtype=float)
flagged = np.asarray(res) == 1
res_temp = np.where(
    flagged.any(axis=1),
    rate_of_column[flagged.argmax(axis=1)],
    2.708,
).tolist()
print(len(res_temp))
print(res_temp)

test_df_dkll['DKLL'] = res_temp
# Write the predicted rates back into the combined frame.
df.loc[test_index, 'DKLL'] = test_df_dkll['DKLL'].values

plt.figure(figsize=(15,5))
print("DKLL的数值分布")
plt.title('DKLL')
x = df['DKLL']
y = df.index
plt.scatter(x, y , s=1)
plt.show()
df['DKLL_CLASS']=df['DKLL']
4. 特征工程
# Idea: the generated features could also be sorted into the three groups
# below in order to derive even more features from them.
# Categorical variables
cate_2_cols = ['XINGBIE', 'ZHIWU', 'XUELI']
cate_cols = ['HYZK', 'ZHIYE', 'ZHICHEN', 'DWJJLX', 'DWSSHY', 'GRZHZT']
# Continuous variables
num_cols = ['GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
# Snapshot of every feature column that exists before the business features
# are added.
old_fea=[]
old_fea = [col for col in df.columns if col not in ['id', 'label']]
print(old_fea)
4.1 构造业务特征
# Background: data masking transforms sensitive data (e.g. personal identity
# information, phone numbers, bank-card details collected by institutions)
# in order to protect private information.
# NOTE(review): the ratio features below divide by raw columns; rows where a
# denominator is 0 will presumably yield inf/NaN - confirm how they are
# handled downstream.
# Monthly personal / employer deposit - derived fields
df['YEAR_GRYJCE'] = df['GRYJCE']*12 # personal deposit over twelve months
df['MONTH_GRYJCE_DWYJCE'] = df['GRYJCE'] + df['DWYJCE'] # total deposit per month
df['YEAR_GRYJCE_DWYJCE'] = (df['GRYJCE'] + df['DWYJCE'])*12 # total deposit over twelve months
# Loan balance (DKYE) / loan amount (DKFFE) - derived fields
# NOTE(review): the author's labels below treat DKYE as the repaid part and
# DKFFE - DKYE as the unpaid part; if DKYE is the outstanding balance the
# meaning is the opposite - confirm against the data dictionary.
df['DKYE_TO_DKFFE'] = df['DKYE'] / df['DKFFE'] # author: share of principal repaid
df['DKFFE_SUB_DKYE'] = df['DKFFE'] - df['DKYE'] # author: unpaid principal
df['DKFFE_SUB_DKYE_TO_DKFFE'] = (df['DKFFE'] - df['DKYE'])/ df['DKFFE'] # author: share of principal unpaid
df['WEIHUAN_TO_YIHUAN'] = df['DKFFE_SUB_DKYE']/df['DKYE'] # author: unpaid over repaid
# df['YIHUAN_TO_WEIHUAN'] = df['DKYE']/df['DKFFE_SUB_DKYE'] #已还比未还
df['REAL_DKLL'] = df['DKLL']/100 # rate as a fraction instead of a percentage
df['DKFFE_SUB_DKYE_DKLL'] = (df['DKFFE'] - df['DKYE'])*df['REAL_DKLL'] # (amount - balance) * rate; author: unpaid interest
df['DKFFE_SUB_DKYE_1_DKLL'] = (df['DKFFE'] - df['DKYE'])*(1+df['REAL_DKLL']) # (amount - balance) * (1 + rate); author: unpaid principal plus interest
df['DKYE_DKLL'] = df['DKYE']*df['REAL_DKLL'] # balance * rate; author: interest already paid
df['DKYE_1_DKLL'] = df['DKYE']*(1+df['REAL_DKLL']) # balance * (1 + rate); author: principal plus interest already paid
df['DKFFE_DKLL'] = df['DKFFE']*df['REAL_DKLL'] # amount * rate; author: total interest
df['DKFFE_1_DKLL'] = df['DKFFE']*(1+df['REAL_DKLL']) # amount * (1 + rate); author: total principal plus interest
df['DKFFE_SUB_DKYE_TO_DKFFE_1_DKLL'] = df['DKFFE_SUB_DKYE_1_DKLL'] / df['DKFFE_1_DKLL'] # ratio of the two "(1 + rate)" quantities above
df['DKYE_TO_DKFFE_1_DKLL'] = df['DKYE_1_DKLL']/ df['DKFFE_1_DKLL'] # balance-based over amount-based principal plus interest
df['DKFFE_SUB_DKYE_DKLL_TO_DKFFE_DKLL'] = df['DKFFE_SUB_DKYE_DKLL']/ df['DKFFE_DKLL'] # (amount - balance) interest over total interest
df['DKYE_DKLL_TO_DKFFE_DKLL'] = df['DKYE_DKLL']/ df['DKFFE_DKLL'] # balance interest over total interest
# Author's identity: current-year collected balance = regular deposits +
# back payments + interest + transfers in - withdrawals.
# Current-year collected balance, previous-year carried balance, account
# balance - derived fields
df['GRZHDNGJYE_SUB_YEAR_GRYJCE_DWYJCE'] = df['GRZHDNGJYE'] - df['YEAR_GRYJCE_DWYJCE'] # current-year balance - yearly total deposit
df['GRZHDNGJYE_SUB_YEAR_GRYJCE'] = df['GRZHDNGJYE'] - df['YEAR_GRYJCE'] # current-year balance - yearly personal deposit
# NOTE(review): the name says SUB but the code adds the two balances (the
# author calls the result "account balance").
df['GRZHDNGJYE_SUB_GRZHSNJZYE'] = df['GRZHDNGJYE'] + df['GRZHSNJZYE'] # current-year balance + carried balance
df['JIEXI'] = (df['GRYJCE'] + df['DWYJCE'])*12*0.015 # yearly interest, assuming a 1.5% rate
df['BUJIAO_ZHUANRU_SUB_TIQVE']=df['GRZHDNGJYE']-df['YEAR_GRYJCE_DWYJCE']-df['JIEXI']# back payments + transfers in - withdrawals = current-year balance - deposits - interest
df['GRYJCE_TO_GRZHYE'] = df['GRYJCE']/df['GRZHYE'] # monthly personal deposit / account balance
df['YEAR_GRYJCE_TO_GRZHYE'] = df['YEAR_GRYJCE']/df['GRZHYE'] # yearly personal deposit / account balance
df['MONTH_GRYJCE_DWYJCE_TO_GRZHYE'] = df['MONTH_GRYJCE_DWYJCE']/df['GRZHYE'] # monthly total deposit / account balance
df['GRZHDNGJYE_TO_GRZHYE'] = df['GRZHDNGJYE']/df['GRZHYE'] # current-year balance / account balance
df['GRZHSNJZYE_TO_GRZHYE'] = df['GRZHSNJZYE']/df['GRZHYE'] # carried balance / account balance
# NOTE(review): this overwrites the difference computed a few lines above
# with its ratio to the account balance, under the same column name.
df['BUJIAO_ZHUANRU_SUB_TIQVE'] = df['BUJIAO_ZHUANRU_SUB_TIQVE']/df['GRZHYE'] # (back payments + transfers in - withdrawals) / account balance
df['JIEXI_TO_YEAR_GRYJCE_DWYJCE'] = df['JIEXI']/df['YEAR_GRYJCE_DWYJCE'] # yearly interest / yearly total deposit
df['JIEXI_TO_GRZHDNGJYE'] = df['JIEXI']/df['GRZHDNGJYE'] # yearly interest / current-year balance
# Contribution base (GRJCJS) - derived fields
df['GJJJKBL'] = df['GRYJCE'] / df['GRJCJS'] # contribution ratio: monthly personal deposit / base
# df['GRJCJS_TO_DKFFE_SUB_DKYE'] = df['GRJCJS']/df['DKFFE_SUB_DKYE'] #个人缴款基数/贷款未还本金
df['GRJCJS_TO_DKYE'] = df['GRJCJS']/df['DKYE'] # base / loan balance
df['GRJCJS_TO_DKFFE'] = df['GRJCJS']/df['DKFFE'] # base / loan amount
df['GRJCJS_TO_GRZHDNGJYE'] = df['GRJCJS']/df['GRZHDNGJYE'] # base / current-year balance
df['GRJCJS_TO_GRZHSNJZYE'] = df['GRJCJS']/df['GRZHSNJZYE'] # base / carried balance
df['GRJCJS_TO_GRZHYE'] = df['GRJCJS']/df['GRZHYE'] # base / account balance
# Fields the author is not yet sure about (possibly noise)
df['DKYE_DIV_GRYJCE_ADD_DWYJCE'] = df['DKYE'] / ((df['GRYJCE'] + df['DWYJCE'])*12)
df['GRYJCE_ADD_DWYJCE_TO_DKYE'] = (df['GRYJCE'] + df['DWYJCE']) / df['DKYE']
df['GRZHYE_diff_GRZHDNGJYE'] = df['GRZHYE'] - df['GRZHDNGJYE']
df['GRZHYE_diff_GRZHSNJZYE'] = df['GRZHYE'] - df['GRZHSNJZYE']
# 'YIHUAN_TO_WEIHUAN','GRJCJS_TO_DKFFE_SUB_DKYE'
# Names of the generated business features.
# NOTE(review): 'BUJIAO_ZHUANRU_SUB_TIQVE' appears twice in this list.
gen_feats = ['YEAR_GRYJCE', 'MONTH_GRYJCE_DWYJCE', 'YEAR_GRYJCE_DWYJCE',
'DKYE_TO_DKFFE', 'DKFFE_SUB_DKYE', 'DKFFE_SUB_DKYE_TO_DKFFE', 'WEIHUAN_TO_YIHUAN',
'REAL_DKLL','DKFFE_SUB_DKYE_DKLL','DKFFE_SUB_DKYE_1_DKLL','DKYE_DKLL','DKYE_1_DKLL','DKFFE_DKLL',
'DKFFE_1_DKLL','DKFFE_SUB_DKYE_TO_DKFFE_1_DKLL','DKYE_TO_DKFFE_1_DKLL',
'DKFFE_SUB_DKYE_DKLL_TO_DKFFE_DKLL','DKYE_DKLL_TO_DKFFE_DKLL','GRZHDNGJYE_SUB_YEAR_GRYJCE_DWYJCE',
'GRZHDNGJYE_SUB_YEAR_GRYJCE','GRZHDNGJYE_SUB_GRZHSNJZYE','JIEXI','BUJIAO_ZHUANRU_SUB_TIQVE',
'GRYJCE_TO_GRZHYE','YEAR_GRYJCE_TO_GRZHYE','MONTH_GRYJCE_DWYJCE_TO_GRZHYE','GRZHDNGJYE_TO_GRZHYE',
'GRZHSNJZYE_TO_GRZHYE','BUJIAO_ZHUANRU_SUB_TIQVE','JIEXI_TO_YEAR_GRYJCE_DWYJCE','JIEXI_TO_GRZHDNGJYE',
'GJJJKBL','GRJCJS_TO_DKYE','GRJCJS_TO_DKFFE','GRJCJS_TO_GRZHDNGJYE','GRJCJS_TO_GRZHSNJZYE',
'GRJCJS_TO_GRZHYE','DKYE_DIV_GRYJCE_ADD_DWYJCE','GRYJCE_ADD_DWYJCE_TO_DKYE','GRZHYE_diff_GRZHDNGJYE',
'GRZHYE_diff_GRZHSNJZYE']
# TODO notes kept from the original author:
#   - if two columns are identical, keep only one of them
#   - for columns holding both positive and negative values, add a sign-indicator column
# Round every float64 column except the target to 4 decimal places.
for col in df.columns:
    # The original test compared the column name with the list ['label',],
    # which is never equal to a string, so 'label' was never excluded.
    if col != 'label' and df[col].dtype == 'float64':
        df[col] = df[col].round(4)
print(df)
# Inspect missing values (pankong is defined elsewhere in this file).
_,missing = pankong(df)
# Classify the columns and drop everything listed in cate_1_cols
# (presumably single-valued columns -- confirm against find_filed_class).
cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(df)
no_cate_1_cols = [col for col in df.columns if col not in cate_1_cols]
df=df[no_cate_1_cols]
print(df.shape)
# Save the data processed so far, then reload it from disk.
df.to_csv('D:/df_little_change.csv',index = False)
df = pd.read_csv('D:/df_little_change.csv')
print(df.shape)
print(df)
4.2类别特征count、count ratio、onehot编码等
# Encode every categorical column three ways: integer codes, frequency
# counts and one-hot indicator columns.
for f in tqdm(cate_cols):
    # Integer code per distinct value, numbered in order of first appearance.
    code_of = dict(zip(df[f].unique(), range(df[f].nunique())))
    df[f] = df[f].map(code_of)
    # Number of rows that share this row's category.
    df[f + '_count'] = df[f].map(df[f].value_counts())
    # One-hot columns, prefixed with the source column name.
    one_hot = pd.get_dummies(df[f], prefix=f"{f}")
    df = pd.concat([df, one_hot], axis=1)

# Every unordered pair of categorical columns.
cate_cols_combine = []
for i in range(len(cate_cols)):
    for j in range(i + 1, len(cate_cols)):
        cate_cols_combine.append([cate_cols[i], cate_cols[j]])

for f1, f2 in tqdm(cate_cols_combine):
    # Rows per (f1, f2) combination, then that count relative to each
    # single-column count.
    pair_count = '{}_{}_count'.format(f1, f2)
    df[pair_count] = df.groupby([f1, f2])['id'].transform('count')
    df['{}_in_{}_prop'.format(f1, f2)] = df[pair_count] / df[f2 + '_count']
    df['{}_in_{}_prop'.format(f2, f1)] = df[pair_count] / df[f1 + '_count']

# Re-classify the columns, drop those in cate_1_cols, then de-duplicate
# identical fields and classify once more on the reduced frame.
cate_1_cols, cate_2_cols, cate_cols, num_cols1 = find_filed_class(df)
no_cate_1_cols = [col for col in df.columns if col not in cate_1_cols]
df = df[no_cate_1_cols]
print(df.shape)
df = if_field_is_same(df)
print(df.shape)
cate_1_cols, cate_2_cols, cate_cols, num_cols1 = find_filed_class(df)
4.3 离散型单特征衍生
# Create Features based on anonymised prefix groups
# Each name in cate_2_cols is used as a prefix; aggregate features are built
# over all columns whose name starts with that prefix.
# NOTE(review): indentation was lost in this export, so the extent of the
# `if numeric_cols:` body (in particular whether the zero-count feature
# belongs to it) cannot be told from here -- confirm against the notebook.
prefix = cate_2_cols
for i, p in enumerate(prefix):
print(i,p)
# column_set holds the columns whose name starts with the current prefix (e.g. 'XINGBIE', 'ZHIWU', 'XUELI')
column_set = [x for x in df.columns.tolist() if x.startswith(prefix[i])]
# Take NA count
# (share of null cells across the group's columns, per row)
df[p + "_group_nan_sum"] = df[column_set].isnull().sum(axis=1) / df[column_set].shape[1]
# Take SUM/Mean if numeric
# NOTE(review): the aggregates below use column_set, not numeric_cols.
numeric_cols = [x for x in column_set if df[x].dtype != object]
if numeric_cols:
df[p + "_group_sum"] = df[column_set].sum(axis=1)
df[p + "_group_mean"] = df[column_set].mean(axis=1)
# Zero Count
# NOTE(review): the denominator subtracts a ratio (the NaN share) from a column count.
df[p + "_group_0_count"] = (df[column_set] == 0).astype(int).sum(axis=1) / (
df[column_set].shape[1] - df[p + "_group_nan_sum"])
# Re-classify, drop the columns in cate_1_cols, de-duplicate identical fields.
cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(df)
no_cate_1_cols = [col for col in df.columns if col not in cate_1_cols]
df=df[no_cate_1_cols]
print(df.shape)
df=if_field_is_same(df)
cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(df)
# relation_cate_2_cols stores the binary fields selected for their relatively high correlation with the label
relation_cate_2_cols = relation(df[cate_2_cols+['label']], poly_num=0.05)
# print(relation_cate_2_cols)
检查哪些字段的取值几乎完全相同(这类字段区分度低,对模型作用不大)
# Collect the "weak" fields among the multi-class and binary columns
# (find_weak_filed is defined elsewhere in this file).
weak_filed = find_weak_filed(df[cate_cols+cate_2_cols])
print(len(weak_filed))
print(weak_filed)
特征选择
# Feature selection on the binary columns via clf_rfecv (defined elsewhere in this file).
rfecv_cate_2_col=clf_rfecv(df,cate_2_cols)
relation_cate_2_cols 与 rfecv_cate_2_col取并集–>select_cate_2_col
# Union of relation_cate_2_cols and rfecv_cate_2_col
select_cate_2_col=list(set(relation_cate_2_cols).union(set(rfecv_cate_2_col)))
print(len(select_cate_2_col))
print(select_cate_2_col)
# Save the selected binary columns to disk
df[select_cate_2_col].to_csv('D:/rizhao_select_cate_2_col.csv',index = False)
# Re-check missing values and filter columns with select_missing_rate
# (defined elsewhere; rate=0.001 is passed as its threshold).
_,missing = pankong(df)
df = select_missing_rate(df,missing,rate=0.001)
4.4 df内只留下多类别数据和连续数据
cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(df,)
# Save the multi-class columns, the numeric columns and the label to disk,
# then reload them as cate_cols_num_cols1_df.
df[cate_cols+num_cols1+['label']].to_csv('D:/rizhao_cate_cols_num_cols1.csv',index = False)
cate_cols_num_cols1_df = pd.read_csv('D:/rizhao_cate_cols_num_cols1.csv')
print(cate_cols_num_cols1_df.shape)
print(cate_cols_num_cols1_df)
过滤多分类字段
# Re-classify the reloaded frame; 20 is passed as the second argument of
# find_filed_class (presumably the cardinality cut-off -- confirm).
cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(cate_cols_num_cols1_df,20)
# From here on df refers to the reduced frame.
df=cate_cols_num_cols1_df
# Correlation screening
relation_cate_cols = relation(df[cate_cols+['label']], poly_num=0.05)
rfecv_cate_cols=clf_rfecv(df,cate_cols)
# Union of relation_cate_cols and rfecv_cate_cols
select_cate_col=[]
select_cate_col=list(set(relation_cate_cols).union(set(rfecv_cate_cols)))
# Always keep these six categorical columns, whatever the two selectors returned.
select_cate_col=list(set(select_cate_col).union(set(['HYZK', 'ZHIYE', 'ZHICHEN', 'DWJJLX', 'DWSSHY', 'GRZHZT'])))
print(len(select_cate_col))
print(select_cate_col)
# Save the selected multi-class columns to disk
df[select_cate_col].to_csv('D:/rizhao_select_cate_col.csv',index = False)
# Save the data produced by the previous step to disk
df[select_cate_col+num_cols1+['label']].to_csv('D:/rizhao_select_cate_col_num_cols1.csv',index = False)
4.5 类别特征与数值特征交叉
# Load the inputs for the categorical x numeric crossing.
# NOTE(review): this reads 'rizhao_cate_cols_num_cols1.csv', not the
# 'rizhao_select_cate_col_num_cols1.csv' file written just before this
# section -- confirm which one is intended.
select_cate_col_num_cols1 = pd.read_csv('D:/rizhao_cate_cols_num_cols1.csv')
print(select_cate_col_num_cols1.shape)
print(select_cate_col_num_cols1)
df_select_cate_col = pd.read_csv('D:/rizhao_select_cate_col.csv')
select_cate_col = df_select_cate_col.columns
print(len(select_cate_col))
_, _, cate_cols, num_cols1 = find_filed_class(select_cate_col_num_cols1, 20)
# Numeric features to cross with the categorical columns. The original list
# named 'BUJIAO_ZHUANRU_SUB_TIQVE' twice, which made every crossed column for
# it be computed twice and collected under a duplicated name; it is listed
# once here.
num_gen_feats = ['YEAR_GRYJCE', 'MONTH_GRYJCE_DWYJCE', 'YEAR_GRYJCE_DWYJCE',
                 'DKYE_TO_DKFFE', 'DKFFE_SUB_DKYE', 'DKFFE_SUB_DKYE_TO_DKFFE', 'WEIHUAN_TO_YIHUAN',
                 'REAL_DKLL', 'DKFFE_SUB_DKYE_DKLL', 'DKFFE_SUB_DKYE_1_DKLL', 'DKYE_DKLL', 'DKYE_1_DKLL', 'DKFFE_DKLL',
                 'DKFFE_1_DKLL', 'GRZHDNGJYE_SUB_YEAR_GRYJCE_DWYJCE',
                 'GRZHDNGJYE_SUB_YEAR_GRYJCE', 'GRZHDNGJYE_SUB_GRZHSNJZYE', 'JIEXI', 'BUJIAO_ZHUANRU_SUB_TIQVE',
                 'GRYJCE_TO_GRZHYE', 'YEAR_GRYJCE_TO_GRZHYE', 'MONTH_GRYJCE_DWYJCE_TO_GRZHYE', 'GRZHDNGJYE_TO_GRZHYE',
                 'GRZHSNJZYE_TO_GRZHYE', 'JIEXI_TO_GRZHDNGJYE',
                 'GJJJKBL', 'GRJCJS_TO_DKYE', 'GRJCJS_TO_DKFFE', 'GRJCJS_TO_GRZHDNGJYE', 'GRJCJS_TO_GRZHSNJZYE',
                 'GRJCJS_TO_GRZHYE', 'DKYE_DIV_GRYJCE_ADD_DWYJCE', 'GRYJCE_ADD_DWYJCE_TO_DKYE', 'GRZHYE_diff_GRZHDNGJYE',
                 'GRZHYE_diff_GRZHSNJZYE', 'GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE', 'DKFFE', 'DKYE', 'DKLL']
# Notebook display of the selected categorical columns.
select_cate_col_num_cols1[select_cate_col]
多类别与数值第一次交叉
# First categorical x numeric crossing: for every selected categorical column
# f1 and every numeric feature f2, broadcast the per-category sum / mean / std
# of f2 back onto the rows, then screen the new columns by correlation.
# NOTE(review): indentation was lost in this export; temp_cate_cols is reset per
# f1, so the relation() call is read as running once per f1 -- confirm.
select_cate_col_num_cols1['label']=df['label']
relation_cate_cols = []
rfecv_cate_col=[]
i=0
for f1 in tqdm(select_cate_col):
temp_cate_cols=[]
g = select_cate_col_num_cols1.groupby(f1)
# print(g)
for f2 in num_gen_feats:
for stat in ['sum', 'mean', 'std']:
# statistic of f2 within each category of f1 (this pass: 'sum', 'mean', 'std')
select_cate_col_num_cols1['{}_{}_{}'.format(f1, f2, stat)] = g[f2].transform(stat)
temp_cate_cols.append('{}_{}_{}'.format(f1, f2, stat))
# Correlation screening
relation_cate_cols.extend(relation(select_cate_col_num_cols1[temp_cate_cols+['label']], poly_num=0.1))
print(len(relation_cate_cols))
多类别与数值第二次交叉
# Second categorical x numeric crossing: same scheme as the first pass, with
# the statistics 'max', 'min', 'var' and 'count'. Results are appended to the
# existing relation_cate_cols list.
for f1 in tqdm(select_cate_col):
temp_cate_cols=[]
g = select_cate_col_num_cols1.groupby(f1)
# print(g)
for f2 in num_gen_feats:
for stat in ['max', 'min', 'var','count']:
# statistic of f2 within each category of f1 (this pass: 'max', 'min', 'var', 'count')
select_cate_col_num_cols1['{}_{}_{}'.format(f1, f2, stat)] = g[f2].transform(stat)
temp_cate_cols.append('{}_{}_{}'.format(f1, f2, stat))
# Correlation screening
relation_cate_cols.extend(relation(select_cate_col_num_cols1[temp_cate_cols+['label']], poly_num=0.1))
print(len(relation_cate_cols))
# Notebook display of the retained crossed columns.
select_cate_col_num_cols1[relation_cate_cols]
# Save the data produced by the previous step to disk, then reload it.
select_cate_col_num_cols1[relation_cate_cols+['label']].to_csv('D:/df_relation_cate_cols.csv',index = False)
df_relation_cate_cols = pd.read_csv('D:/df_relation_cate_cols.csv')
print(df_relation_cate_cols.shape)
print(df_relation_cate_cols)
# Filter by missing rate, fill the remaining nulls (fill="mode"), re-check.
_,missing = pankong(df_relation_cate_cols)
df_relation_cate_cols = select_missing_rate(df_relation_cate_cols,missing,rate=0.001)
df_relation_cate_cols = fill_kongzhi(df_relation_cate_cols,fill="mode")
_,missing = pankong(df_relation_cate_cols)
# Drop the columns that find_filed_class reports in cate_1_cols.
cate_1_cols,_,_,_ = find_filed_class(df_relation_cate_cols,20)
no_cate_1_cols = [col for col in df_relation_cate_cols.columns if col not in cate_1_cols]
df_relation_cate_cols=df_relation_cate_cols[no_cate_1_cols]
print(df_relation_cate_cols.shape)
# GBDT-based selection over the categorical x numeric crossed features, run in
# chunks of at most 400 candidate columns (at most 150 kept per chunk).
# The chunks are cut from the feature columns only and 'label' is appended to
# every chunk. The original cut chunks from all columns, relied on 'label'
# being inside the last chunk, and produced an empty last chunk whenever the
# column count was an exact multiple of 400.
gbdt_select_temp_df2 = []
candidate_cols = [c for c in df_relation_cate_cols.columns if c != 'label']
for start in tqdm(range(0, len(candidate_cols), 400)):
    temp_col = candidate_cols[start:start + 400]
    gbdt_select_temp_df2.extend(
        GBDTselectfea(df_relation_cate_cols[temp_col + ['label']], max_num=150))
    print(len(gbdt_select_temp_df2))
print(len(gbdt_select_temp_df2))
print(gbdt_select_temp_df2)
gbdt_select_cate_num_mix = df_relation_cate_cols[gbdt_select_temp_df2 + ['label']]
gbdt_select_cate_num_mix = if_field_is_same(gbdt_select_cate_num_mix)
print(gbdt_select_cate_num_mix.shape)
# Save the data produced by the previous step to disk, then reload it.
gbdt_select_cate_num_mix.to_csv('D:/gbdt_select_temp_df2.csv', index=False)
gbdt_select_cate_num_mix = pd.read_csv('D:/gbdt_select_temp_df2.csv')
print(gbdt_select_cate_num_mix.shape)
print(gbdt_select_cate_num_mix)
4.6 数值特征与数值特征交叉
# Numeric features used for the numeric x numeric crossing. The original list
# named 'BUJIAO_ZHUANRU_SUB_TIQVE' twice, which repeated the whole crossing
# pass for that column and collected duplicated column names; it is listed
# once here.
num_gen_feats = ['YEAR_GRYJCE', 'MONTH_GRYJCE_DWYJCE', 'YEAR_GRYJCE_DWYJCE',
                 'DKYE_TO_DKFFE', 'DKFFE_SUB_DKYE', 'DKFFE_SUB_DKYE_TO_DKFFE', 'WEIHUAN_TO_YIHUAN',
                 'REAL_DKLL', 'DKFFE_SUB_DKYE_DKLL', 'DKFFE_SUB_DKYE_1_DKLL', 'DKYE_DKLL', 'DKYE_1_DKLL', 'DKFFE_DKLL',
                 'DKFFE_1_DKLL', 'GRZHDNGJYE_SUB_YEAR_GRYJCE_DWYJCE',
                 'GRZHDNGJYE_SUB_YEAR_GRYJCE', 'GRZHDNGJYE_SUB_GRZHSNJZYE', 'JIEXI', 'BUJIAO_ZHUANRU_SUB_TIQVE',
                 'GRYJCE_TO_GRZHYE', 'YEAR_GRYJCE_TO_GRZHYE', 'MONTH_GRYJCE_DWYJCE_TO_GRZHYE', 'GRZHDNGJYE_TO_GRZHYE',
                 'GRZHSNJZYE_TO_GRZHYE', 'JIEXI_TO_GRZHDNGJYE',
                 'GJJJKBL', 'GRJCJS_TO_DKYE', 'GRJCJS_TO_DKFFE', 'GRJCJS_TO_GRZHDNGJYE', 'GRJCJS_TO_GRZHSNJZYE',
                 'GRJCJS_TO_GRZHYE', 'DKYE_DIV_GRYJCE_ADD_DWYJCE', 'GRYJCE_ADD_DWYJCE_TO_DKYE', 'GRZHYE_diff_GRZHDNGJYE',
                 'GRZHYE_diff_GRZHSNJZYE', 'GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE', 'DKFFE', 'DKYE', 'DKLL']
# Notebook display of the selected numeric columns.
df[num_gen_feats]
# Numeric x numeric crossing: group by each numeric feature f1 and broadcast the
# per-group sum / mean / std of every numeric feature f2 back onto the rows.
# Original author's note: most of the nulls that show up during the computation come from this step!
# NOTE(review): indentation was lost in this export; temp_num_cols is reset per
# f1, so the relation() call is read as running once per f1 -- confirm.
relation_num_cols = []
i=0
for f1 in tqdm(num_gen_feats):
temp_num_cols=[]
g = df.groupby(f1)
# print(g)
for f2 in num_gen_feats:
for stat in ['sum', 'mean', 'std']:
# statistic of f2 within each group of f1 (this pass: 'sum', 'mean', 'std')
df['{}_{}_{}'.format(f1, f2, stat)] = g[f2].transform(stat)
temp_num_cols.append('{}_{}_{}'.format(f1, f2, stat))
# Correlation screening
relation_num_cols.extend(relation(df[temp_num_cols+['label']], poly_num=0.05))
print(relation_num_cols)
print(len(relation_num_cols))
# Save the data produced by the previous step to disk, then reload it.
df[relation_num_cols+['label']].to_csv('D:/df_relation_num_cols.csv',index = False)
df_relation_num_cols = pd.read_csv('D:/df_relation_num_cols.csv')
print(df_relation_num_cols.shape)
print(df_relation_num_cols)
# Filter by missing rate, fill the remaining nulls (fill="mode"), re-check.
_,missing = pankong(df_relation_num_cols)
df_relation_num_cols = select_missing_rate(df_relation_num_cols,missing,rate=0.001)
df_relation_num_cols = fill_kongzhi(df_relation_num_cols,fill="mode")
_,missing = pankong(df_relation_num_cols)
# Drop the columns that find_filed_class reports in cate_1_cols.
cate_1_cols,_,_,_ = find_filed_class(df_relation_num_cols,20)
no_cate_1_cols = [col for col in df_relation_num_cols.columns if col not in cate_1_cols]
df_relation_num_cols=df_relation_num_cols[no_cate_1_cols]
print(df_relation_num_cols.shape)
# GBDT-based selection over the numeric x numeric crossed features, run in
# chunks of at most 400 candidate columns (at most 150 kept per chunk).
# The chunks are cut from the feature columns only and 'label' is appended to
# every chunk. The original cut chunks from all columns, relied on 'label'
# being inside the last chunk, and produced an empty last chunk whenever the
# column count was an exact multiple of 400.
gbdt_select_temp_df3 = []
candidate_cols = [c for c in df_relation_num_cols.columns if c != 'label']
for start in tqdm(range(0, len(candidate_cols), 400)):
    temp_col = candidate_cols[start:start + 400]
    gbdt_select_temp_df3.extend(
        GBDTselectfea(df_relation_num_cols[temp_col + ['label']], max_num=150))
    print(len(gbdt_select_temp_df3))
print(len(gbdt_select_temp_df3))
print(gbdt_select_temp_df3)
gbdt_select_num_num_mix = df_relation_num_cols[gbdt_select_temp_df3 + ['label']]
gbdt_select_num_num_mix = if_field_is_same(gbdt_select_num_num_mix)
print(gbdt_select_num_num_mix.shape)
# Save the data produced by the previous step to disk, then reload it.
gbdt_select_num_num_mix.to_csv('D:/gbdt_select_num_num_mix.csv', index=False)
gbdt_select_num_num_mix = pd.read_csv('D:/gbdt_select_num_num_mix.csv')
print(gbdt_select_num_num_mix.shape)
print(gbdt_select_num_num_mix)
4.7 多项式特征
# Numeric columns that are expanded into polynomial features in this section.
num_gen_feats = ['MONTH_GRYJCE_DWYJCE','DKFFE_SUB_DKYE','DKFFE_SUB_DKYE_DKLL','DKFFE_SUB_DKYE_1_DKLL',
'DKYE_DKLL','DKFFE_DKLL','DKFFE_1_DKLL','GRZHDNGJYE_SUB_GRZHSNJZYE','JIEXI','GRYJCE_TO_GRZHYE',
'YEAR_GRYJCE_TO_GRZHYE','MONTH_GRYJCE_DWYJCE_TO_GRZHYE','GRZHDNGJYE_TO_GRZHYE',
'GRZHSNJZYE_TO_GRZHYE','BUJIAO_ZHUANRU_SUB_TIQVE','GRZHYE_diff_GRZHDNGJYE','REAL_DKLL',
'GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
# Notebook display of those columns.
df[num_gen_feats]
def polynomial_features111(df, poly_num=0.15, change=0, degreenum=2, n_train=40000):
    """Generate polynomial features and keep those correlated with the label.

    A ``PolynomialFeatures`` expansion of a fixed list of numeric columns is
    built on the first ``n_train`` rows; every generated (non-original) column
    whose absolute Pearson correlation with ``label`` exceeds ``poly_num`` is
    selected.

    Args:
        df: frame holding the numeric columns listed below plus ``label``.
        poly_num: absolute-correlation threshold.
        change: 0 returns ``df`` untouched; 1 returns a frame holding only the
            selected polynomial columns, computed for all rows of ``df``.
        degreenum: polynomial degree.
        n_train: number of leading rows used for the correlation screening
            (the original code hard-coded 40000).

    Returns:
        ``(frame, po_temp)`` where ``po_temp`` lists the selected column
        names in ascending order of correlation.
    """
    num_gen_feats = ['MONTH_GRYJCE_DWYJCE', 'DKFFE_SUB_DKYE', 'DKFFE_SUB_DKYE_DKLL', 'DKFFE_SUB_DKYE_1_DKLL',
                     'DKYE_DKLL', 'DKFFE_DKLL', 'DKFFE_1_DKLL', 'GRZHDNGJYE_SUB_GRZHSNJZYE', 'JIEXI', 'GRYJCE_TO_GRZHYE',
                     'YEAR_GRYJCE_TO_GRZHYE', 'MONTH_GRYJCE_DWYJCE_TO_GRZHYE', 'GRZHDNGJYE_TO_GRZHYE',
                     'GRZHSNJZYE_TO_GRZHYE', 'BUJIAO_ZHUANRU_SUB_TIQVE', 'GRZHYE_diff_GRZHDNGJYE', 'REAL_DKLL',
                     'GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE', 'DKFFE', 'DKYE', 'DKLL']

    def _feature_names(transformer):
        # get_feature_names() was removed from recent scikit-learn releases;
        # prefer its replacement and fall back for old installations.
        if hasattr(transformer, 'get_feature_names_out'):
            return list(transformer.get_feature_names_out(num_gen_feats))
        return transformer.get_feature_names(input_features=num_gen_feats)

    train_part = df[:n_train]
    # Create the polynomial object with the specified degree.
    poly_transformer = PolynomialFeatures(degree=degreenum, interaction_only=False, include_bias=False)
    poly_values = poly_transformer.fit_transform(train_part[num_gen_feats])
    poly_names = _feature_names(poly_transformer)

    # Check which new features correlate with the target.
    poly_features = pd.DataFrame(poly_values, columns=poly_names)
    # Attach the label positionally: poly_features carries a fresh 0..n-1 index,
    # so an index-aligned assignment would misalign on a non-default index.
    poly_features['TARGET'] = train_part['label'].values
    poly_corrs = poly_features.corr()['TARGET'].sort_values()

    excluded = set(num_gen_feats) | {'TARGET'}
    po_temp = [name for name, corr in poly_corrs.items()
               if abs(corr) > poly_num and name not in excluded]
    print('相关性>'+str(poly_num)+'的字段为:\n'+str(po_temp)+'\n共有'+str(len(po_temp))+'个')

    if change == 1:
        # The expansion is stateless, so the transformer fitted above is reused
        # for all rows; the caller's index is kept so later index-aligned
        # assignments line up.
        dfpo = pd.DataFrame(poly_transformer.transform(df[num_gen_feats]),
                            columns=poly_names, index=df.index)
        df = dfpo[po_temp]
    return df, po_temp
# Make a new dataframe for polynomial features
# (threshold 0.01, degree 2, change=1 so only the selected new columns come back)
df_poly,poly_field = polynomial_features111(df[num_gen_feats+['label']],poly_num=0.01 ,change=1,degreenum=2)
print(df_poly.shape)
print(len(poly_field))
# GBDT-based selection among the polynomial columns (max_num=100 is passed).
gbdt_poly_df=[]
df_poly['label']=df['label']
gbdt_poly_df.extend(GBDTselectfea(df_poly[poly_field+['label']],max_num=100))
# Save the data produced by the previous step to disk, then reload it.
df_poly[gbdt_poly_df].to_csv('D:/df_gbdt_poly_fea.csv',index = False)
df_gbdt_poly_fea= pd.read_csv('D:/df_gbdt_poly_fea.csv')
print(df_gbdt_poly_fea.shape)
print(df_gbdt_poly_fea)
4.8 连续型变量分析log
1. 查看某个数值型变量的分布,判断其是否符合正态分布;对不符合正态分布的变量,可以取 log 后再观察是否符合正态分布。 2. 如果想对一批数据统一做标准化,必须先把之前已经正态化的数据剔除出来。 3. 正态化的原因:在一些情况下,把非正态数据正态化可以让模型更快收敛;一些模型要求数据服从正态分布(例如 GMM、KNN)。一般保证数据不要过于偏态即可,过于偏态可能会影响模型预测结果。
# Continuous columns handed to Normal_distribution (defined elsewhere in this
# file; presumably the log transform described in this section -- confirm).
value_vars = ['GRZHYE','GRJCJS', 'GRYJCE', 'YEAR_GRYJCE', 'MONTH_GRYJCE_DWYJCE', 'YEAR_GRYJCE_DWYJCE',
'JIEXI','DKYE_DIV_GRYJCE_ADD_DWYJCE','GJJJKBL']
df = Normal_distribution(df, value_vars, 0)
4.9 拼接上面筛选的所有特征
# Reload every intermediate feature file; each auxiliary frame gets the 'id'
# column of df_little_change assigned by index so it can be merged on 'id'.
# Original fields plus some derived fields
print("-----------------原始字段和一些新字段--------------------")
df_little_change = pd.read_csv('D:/df_little_change.csv')
print(df_little_change.shape)
print(df_little_change)
# Binary categorical data
print("-----------------二值类数据--------------------")
rizhao_select_cate_2_col = pd.read_csv('D:/rizhao_select_cate_2_col.csv')
rizhao_select_cate_2_col['id'] = df_little_change['id']
print(rizhao_select_cate_2_col.shape)
print(rizhao_select_cate_2_col)
# Multi-class categorical data
print("-----------------多值类数据--------------------")
rizhao_select_cate_col = pd.read_csv('D:/rizhao_select_cate_col.csv')
rizhao_select_cate_col['id'] = df_little_change['id']
print(rizhao_select_cate_col.shape)
print(rizhao_select_cate_col)
# Categorical x numeric crossing
print("-----------------类别与数值交叉--------------------")
gbdt_select_temp_df2= pd.read_csv('D:/gbdt_select_temp_df2.csv')
gbdt_select_temp_df2['id'] = df_little_change['id']
print(gbdt_select_temp_df2.shape)
print(gbdt_select_temp_df2)
# Numeric x numeric crossing
print("-----------------数值与数值交叉--------------------")
gbdt_select_num_num_mix= pd.read_csv('D:/gbdt_select_num_num_mix.csv')
gbdt_select_num_num_mix['id'] = df_little_change['id']
print(gbdt_select_num_num_mix.shape)
print(gbdt_select_num_num_mix)
# Polynomial features
print("-----------------多项式数据--------------------")
df_gbdt_poly_fea= pd.read_csv('D:/df_gbdt_poly_fea.csv')
df_gbdt_poly_fea['id'] = df_little_change['id']
print(df_gbdt_poly_fea.shape)
print(df_gbdt_poly_fea)
# Merge all feature frames on 'id' (pd.merge default: inner join).
# NOTE(review): some of these frames were saved together with a 'label' column
# and may share feature names with df_little_change; pandas suffixes
# overlapping names with _x / _y, and such columns would not be excluded by a
# later filter on the exact names 'label' and 'id'. Confirm that
# if_field_is_same removes them.
df = pd.merge(df_little_change,rizhao_select_cate_2_col ,on='id')
print(df.shape)
df = pd.merge(df,rizhao_select_cate_col ,on='id')
print(df.shape)
df = pd.merge(df,gbdt_select_temp_df2 ,on='id')
print(df.shape)
df = pd.merge(df,gbdt_select_num_num_mix ,on='id')
print(df.shape)
df = pd.merge(df,df_gbdt_poly_fea ,on='id')
print(df.shape)
# Check missing values, fill them, and check again.
_,missing = pankong(df)
df = fill_kongzhi(df)
_,missing = pankong(df)
# Compare the column count with the number of distinct column names.
print(len(df.columns))
print(len(set(df.columns)))
# Remove duplicated fields and compare again.
df=if_field_is_same(df)
df.shape
print(len(df.columns))
print(len(set(df.columns)))
# Replace every space in a column name with "_*_".
# The original loop skipped any name that was already present in col_temp, so
# col_temp could end up shorter than df.columns and the assignment below would
# then fail with a length mismatch. Every column is renamed here, one-to-one.
col_temp = []
for name in df.columns:
    print(name)
    col_temp.append(name.replace(" ", "_*_"))
print(len(col_temp))
print(col_temp)
df.columns = col_temp
for name in df.columns:
    print(name)
# Save the data produced by the previous step to disk.
df.to_csv('D:/df_concat.csv', index=False)
5. 模型调参
方法一:
第一步:学习率和迭代次数
# Step 1: with learning_rate fixed at 0.1, use lgb.cv with early stopping to
# find a suitable number of boosting rounds.
import pandas as pd
import lightgbm as lgb
# from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
# Every column except the target and the identifier is a feature.
cols = [col for col in df.columns if col not in ['label','id']]
# The first 40000 rows are used as the labelled training set.
X=df[:40000][cols]
y=df[:40000]['label']
print(X.shape)
print(y.shape)
# 80/20 hold-out split with a fixed seed.
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.2)
params = {
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': 'auc',
'nthread':12,
'learning_rate':0.1,
'num_leaves':32,
'max_depth': 5,
'subsample': 0.8,
'colsample_bytree': 0.8,
}
data_train = lgb.Dataset(X_train, y_train)
# 5-fold CV, up to 1000 rounds, stopping after 50 rounds without AUC improvement.
# NOTE(review): stratified=False on a binary target -- confirm this is intended.
cv_results = lgb.cv(params, data_train, num_boost_round=1000, nfold=5, stratified=False, shuffle=True, metrics='auc',early_stopping_rounds=50,seed=0)
print('best n_estimators:', len(cv_results['auc-mean']))
print('best cv score:', pd.Series(cv_results['auc-mean']).max())
第二步:确定max_depth和num_leaves
# Step 2: grid-search max_depth and num_leaves with 5-fold CV scored by ROC AUC;
# the remaining hyper-parameters are fixed at the values listed in the estimator.
from sklearn.model_selection import GridSearchCV
params_test1={'max_depth': range(3,8,1), 'num_leaves':range(5, 100, 5)}
gsearch1 = GridSearchCV(
estimator=lgb.LGBMClassifier(
boosting_type='gbdt',objective='binary',metrics='auc',
learning_rate=0.1,
n_estimators=154,
max_depth=6,
bagging_fraction = 0.8,
feature_fraction = 0.8),
param_grid = params_test1,
scoring='roc_auc',
cv=5,
n_jobs=-1
)
gsearch1.fit(X_train,y_train)
# Notebook display of the full results, the best parameters and the best score.
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_
第三步:确定min_data_in_leaf和max_bin
# Step 3: grid-search max_bin and min_data_in_leaf (5-fold CV, ROC AUC) with
# max_depth=6 and num_leaves=30 fixed.
params_test2={'max_bin': range(5,256,10), 'min_data_in_leaf':range(1,102,10)}
gsearch2 = GridSearchCV(
estimator = lgb.LGBMClassifier(
boosting_type='gbdt',objective='binary',
metrics='auc',
learning_rate=0.1,
n_estimators=154,
max_depth=6,
num_leaves=30,
bagging_fraction = 0.8,
feature_fraction = 0.8),
param_grid = params_test2,
scoring='roc_auc',
cv=5,
n_jobs=-1
)
gsearch2.fit(X_train,y_train)
# Notebook display of the full results, the best parameters and the best score.
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_
第四步:确定feature_fraction、bagging_fraction、bagging_freq
# Step 4: grid-search feature_fraction, bagging_fraction and bagging_freq
# (5-fold CV, ROC AUC) with max_bin=25 and min_data_in_leaf=71 fixed.
params_test3={'feature_fraction': [0.65,0.7,0.75,0.8,0.85,0.9,1.0],
'bagging_fraction': [0.65,0.7,0.75,0.8,0.85,0.9,1.0],
'bagging_freq': range(0,101,10)}
gsearch3 = GridSearchCV(
estimator = lgb.LGBMClassifier(
boosting_type='gbdt',
objective='binary',
metrics='auc',
learning_rate=0.1,
n_estimators=154,
max_depth=6,
num_leaves=30,
max_bin=25,
min_data_in_leaf=71),
param_grid = params_test3,
scoring='roc_auc',
cv=5,
n_jobs=-1
)
gsearch3.fit(X_train,y_train)
# Notebook display of the full results, the best parameters and the best score.
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_
第五步:确定lambda_l1和lambda_l2
# Step 5: grid-search the L1 / L2 regularisation strengths (5-fold CV, ROC AUC).
# The wider grid below was tried first and is kept for reference.
# params_test4={'lambda_l1': [1e-5,1e-3,1e-1,0.0,0.1,0.3,0.5,0.7,0.9,1.0],
# 'lambda_l2': [1e-5,1e-3,1e-1,0.0,0.1,0.3,0.5,0.7,0.9,1.0]}
params_test4={'lambda_l1': [0.8,0.85,0.9,0.95],
'lambda_l2': [0.8,0.85,0.9,0.95]}
gsearch4 = GridSearchCV(
estimator = lgb.LGBMClassifier(
boosting_type='gbdt',
objective='binary',
metrics='auc',
learning_rate=0.1,
n_estimators=154,
max_depth=6,
num_leaves=30,
max_bin=25,
min_data_in_leaf=71,
bagging_fraction=0.65,
# NOTE(review): per the LightGBM parameter docs bagging_freq=0 disables bagging,
# so bagging_fraction would have no effect here -- confirm.
bagging_freq=0,
feature_fraction= 0.8),
param_grid = params_test4,
scoring='roc_auc',
cv=5,
n_jobs=-1
)
gsearch4.fit(X_train,y_train)
# Notebook display of the full results, the best parameters and the best score.
gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_
第六步:确定 min_split_gain
# Step 6: grid-search min_split_gain (5-fold CV, ROC AUC) with
# lambda_l1 = lambda_l2 = 0.9 fixed.
params_test5={'min_split_gain':[0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]}
gsearch5 = GridSearchCV(
estimator = lgb.LGBMClassifier(
boosting_type='gbdt',
objective='binary',
metrics='auc',
learning_rate=0.1,
n_estimators=154,
max_depth=6,
num_leaves=30,
max_bin=25,
min_data_in_leaf=71,
bagging_fraction=0.65,
bagging_freq= 0,
feature_fraction= 0.8,
lambda_l1=0.9,
lambda_l2=0.9),
param_grid = params_test5,
scoring='roc_auc',
cv=5,
n_jobs=-1
)
gsearch5.fit(X_train,y_train)
# Notebook display of the full results, the best parameters and the best score.
gsearch5.cv_results_, gsearch5.best_params_, gsearch5.best_score_
# subsample
# Grid-search subsample (5-fold CV, ROC AUC).
# NOTE(review): the LightGBM parameter docs list subsample as an alias of
# bagging_fraction, which is also fixed at 0.65 below (with bagging_freq=0);
# this search may therefore have no effect, and 0.0 is outside the documented
# range of that parameter -- confirm.
params_test6={'subsample':[0.0,0.1,0.2,0.3,0.4]}
gsearch6 = GridSearchCV(
estimator = lgb.LGBMClassifier(
boosting_type='gbdt',
objective='binary',
metrics='auc',
learning_rate=0.1,
n_estimators=154,
max_depth=6,
num_leaves=30,
max_bin=25,
min_data_in_leaf=71,
bagging_fraction=0.65,
bagging_freq= 0,
feature_fraction= 0.8,
lambda_l1=0.9,
lambda_l2=0.9,
min_split_gain=0),
param_grid = params_test6,
scoring='roc_auc',
cv=5,
n_jobs=-1
)
gsearch6.fit(X_train,y_train)
# Notebook display of the full results, the best parameters and the best score.
gsearch6.cv_results_, gsearch6.best_params_, gsearch6.best_score_
# colsample_bytree
# Grid-search colsample_bytree (5-fold CV, ROC AUC).
# NOTE(review): the LightGBM parameter docs list colsample_bytree as an alias of
# feature_fraction, which is also fixed at 0.8 below; this search may therefore
# have no effect -- confirm.
params_test7={'colsample_bytree':[0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]}
gsearch7 = GridSearchCV(
estimator = lgb.LGBMClassifier(
boosting_type='gbdt',
objective='binary',
metrics='auc',
learning_rate=0.1,
n_estimators=154,
max_depth=6,
num_leaves=30,
max_bin=25,
min_data_in_leaf=71,
bagging_fraction=0.65,
bagging_freq= 0,
feature_fraction= 0.8,
lambda_l1=0.9,
lambda_l2=0.9,
min_split_gain=0,
subsample=0),
param_grid = params_test7,
scoring='roc_auc',
cv=5,
n_jobs=-1
)
gsearch7.fit(X_train,y_train)
# Notebook display of the full results, the best parameters and the best score.
gsearch7.cv_results_, gsearch7.best_params_, gsearch7.best_score_
# min_child_weight
# Grid-search min_child_weight (5-fold CV, ROC AUC); all other hyper-parameters
# are fixed at the values listed in the estimator.
params_test8={'min_child_weight':[0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]}
gsearch8 = GridSearchCV(
estimator = lgb.LGBMClassifier(
boosting_type='gbdt',
objective='binary',
metrics='auc',
learning_rate=0.1,
n_estimators=154,
max_depth=6,
num_leaves=30,
max_bin=25,
min_data_in_leaf=71,
bagging_fraction=0.65,
bagging_freq= 0,
feature_fraction= 0.8,
lambda_l1=0.9,
lambda_l2=0.9,
min_split_gain=0,
subsample=0,
colsample_bytree=0),
param_grid = params_test8,
scoring='roc_auc',
cv=5,
n_jobs=-1
)
gsearch8.fit(X_train,y_train)
# Notebook display of the full results, the best parameters and the best score.
gsearch8.cv_results_, gsearch8.best_params_, gsearch8.best_score_
对调参没有经验,所以不知道以上超参设置好之后,学习率和迭代次数应该怎么调整,索性写了一个循环,找到大概较好的值。
# Grid-search learning_rate together with n_estimators (5-fold CV, ROC AUC).
# (The original heading comment here read "subsample_freq", which does not
# match the grid below.)
params_test9={'learning_rate':[0.02,0.03,0.04,0.05,0.06,0.07,0.08],
'n_estimators':[1000,2000,5000,8000,10000,20000,30000]}
gsearch9 = GridSearchCV(
estimator = lgb.LGBMClassifier(
boosting_type='gbdt',
objective='binary',
metrics='auc',
max_depth=6,
num_leaves=30,
max_bin=25,
min_data_in_leaf=71,
bagging_fraction=0.65,
bagging_freq= 0,
feature_fraction= 0.8,
lambda_l1=0.9,
lambda_l2=0.9,
min_split_gain=0,
subsample=0,
colsample_bytree=0,
min_child_weight=0),
param_grid = params_test9,
scoring='roc_auc',
cv=5,
n_jobs=-1
)
gsearch9.fit(X_train,y_train)
# Notebook display of the full results, the best parameters and the best score.
gsearch9.cv_results_, gsearch9.best_params_, gsearch9.best_score_
第七步:降低学习率,增加迭代次数,验证模型
# Step 7: sweep small learning rates against large round counts and report the
# hold-out AUC and the custom TPR metric (tpr_weight_funtion is defined
# elsewhere in this file).
#
# Fixes relative to the original cell:
#   * 0.21 / 0.22 / 0.24 were typos inside an otherwise 0.018..0.025 sweep.
#   * the score is no longer stored in a variable called `auc`, which shadowed
#     the `auc` function imported from sklearn.metrics.
#   * the plateau test compares (auc, tpr) pairs instead of checking the two
#     values independently against separate lists.
auc_list = []
tpr_list = []
for j in [0.018, 0.019, 0.02, 0.021, 0.022, 0.023, 0.024, 0.025]:
    # Results already seen for this learning rate.
    auc_list = []
    tpr_list = []
    seen_scores = set()
    for i in [5000, 6000, 7000, 8000, 10000, 12000, 15000, 18000, 20000, 25000]:
        model = lgb.LGBMClassifier(
            learning_rate=j,
            n_estimators=i,
            max_depth=6,
            num_leaves=30,
            max_bin=25,
            min_data_in_leaf=71,
            bagging_fraction=0.65,
            bagging_freq=0,
            feature_fraction=0.8,
            lambda_l1=0.9,
            lambda_l2=0.9,
            min_split_gain=0,
            # NOTE(review): subsample / colsample_bytree are documented LightGBM
            # aliases of bagging_fraction / feature_fraction, which are also
            # set above -- confirm which value actually takes effect.
            subsample=0,
            colsample_bytree=0,
            min_child_weight=0
        )
        model.fit(X_train, y_train)
        y_pre = model.predict_proba(X_test)[:, 1]
        print("---------------------------------------------------")
        print("learning_rate:"+str(j)+"  "+"n_estimators:"+str(i))
        auc_score = round(roc_auc_score(y_test, y_pre), 6)
        tpr = round(tpr_weight_funtion(y_test, y_pre), 6)
        # More rounds no longer change the result: move on to the next rate.
        if (auc_score, tpr) in seen_scores:
            print("---break---")
            break
        seen_scores.add((auc_score, tpr))
        auc_list.append(auc_score)
        tpr_list.append(tpr)
        print("auc:", auc_score)
        print("tpr:", tpr)
        print("---------------------------------------------------")
# Results recorded by the original author:
# 0.06-10000-0.470364-0.941146
# 0.06-20000-0.470364-0.941146
# 0.05-20000-0.476182-0.941146
方法二:
# Method 2: manual sequential tuning with lgb.cv. This cell prepares the data
# and the base parameters.
import pandas as pd
import lightgbm as lgb
from sklearn import metrics
# from sklearn.datasets import load_breast_cancer
# from sklearn.cross_validation import train_test_split
# canceData=load_breast_cancer()
# X=canceData.data
# y=canceData.target
# X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.2)
# Every column except the target and the identifier is a feature.
cols = [col for col in df.columns if col not in ['label','id']]
X=df[:40000][cols]
y=df[:40000]['label']
print(X.shape)
print(y.shape)
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.2)
### Data conversion
print('数据转换')
lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,free_raw_data=False)
### Initial parameters -- cross-validation parameters not included
print('设置参数')
params = {
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': 'auc',
'nthread':4,
'learning_rate':0.1
}
### Cross-validation (hyper-parameter tuning)
# Sequential (coordinate-wise) tuning: each stage sweeps a small group of
# parameters with 5-fold lgb.cv and keeps the combination with the best mean
# AUC seen so far across all stages.
#
# Fixes relative to the original cell:
#   * `'a' and 'b' in best_params.keys()` only tested the last key; every key
#     of the stage is now tested.
#   * when no candidate of a stage reached max_auc, `params` silently kept the
#     last values tried in the loop; those keys are now removed again so the
#     configuration that produced max_auc stays in effect.
print('交叉验证')
max_auc = float('0')
best_params = {}


def _cv_best_auc(cv_params):
    """Run 5-fold lgb.cv on lgb_train; return (best mean AUC, index of the best round)."""
    cv_results = lgb.cv(
        cv_params,
        lgb_train,
        seed=1,
        nfold=5,
        metrics=['auc'],
        early_stopping_rounds=10,
        verbose_eval=True
    )
    auc_curve = pd.Series(cv_results['auc-mean'])
    return auc_curve.max(), auc_curve.idxmax()


def _commit_stage(keys):
    """Copy the stage winners into params, or drop the stage keys if nothing won."""
    if all(key in best_params for key in keys):
        for key in keys:
            params[key] = best_params[key]
    else:
        for key in keys:
            params.pop(key, None)


# Accuracy
print("调参1:提高准确率")
for num_leaves in range(5, 100, 5):
    for max_depth in range(3, 8, 1):
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth
        mean_auc, boost_rounds = _cv_best_auc(params)
        if mean_auc >= max_auc:
            max_auc = mean_auc
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth
_commit_stage(['num_leaves', 'max_depth'])

# Over-fitting
print("调参2:降低过拟合")
for max_bin in range(5, 256, 10):
    for min_data_in_leaf in range(1, 102, 10):
        params['max_bin'] = max_bin
        params['min_data_in_leaf'] = min_data_in_leaf
        mean_auc, boost_rounds = _cv_best_auc(params)
        if mean_auc >= max_auc:
            max_auc = mean_auc
            best_params['max_bin'] = max_bin
            best_params['min_data_in_leaf'] = min_data_in_leaf
_commit_stage(['max_bin', 'min_data_in_leaf'])

print("调参3:降低过拟合")
for feature_fraction in [0.6, 0.7, 0.8, 0.9, 1.0]:
    for bagging_fraction in [0.6, 0.7, 0.8, 0.9, 1.0]:
        for bagging_freq in range(0, 50, 5):
            params['feature_fraction'] = feature_fraction
            params['bagging_fraction'] = bagging_fraction
            params['bagging_freq'] = bagging_freq
            mean_auc, boost_rounds = _cv_best_auc(params)
            if mean_auc >= max_auc:
                max_auc = mean_auc
                best_params['feature_fraction'] = feature_fraction
                best_params['bagging_fraction'] = bagging_fraction
                best_params['bagging_freq'] = bagging_freq
_commit_stage(['feature_fraction', 'bagging_fraction', 'bagging_freq'])

print("调参4:降低过拟合")
for lambda_l1 in [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]:
    for lambda_l2 in [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.4, 0.6, 0.7, 0.9, 1.0]:
        params['lambda_l1'] = lambda_l1
        params['lambda_l2'] = lambda_l2
        mean_auc, boost_rounds = _cv_best_auc(params)
        if mean_auc >= max_auc:
            max_auc = mean_auc
            best_params['lambda_l1'] = lambda_l1
            best_params['lambda_l2'] = lambda_l2
_commit_stage(['lambda_l1', 'lambda_l2'])

print("调参5:降低过拟合2")
for min_split_gain in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
    params['min_split_gain'] = min_split_gain
    mean_auc, boost_rounds = _cv_best_auc(params)
    if mean_auc >= max_auc:
        max_auc = mean_auc
        best_params['min_split_gain'] = min_split_gain
_commit_stage(['min_split_gain'])
print(best_params)
方法三:贝叶斯
# Method 3: Bayesian optimisation. This cell prepares the data used by the
# objective function.
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, make_scorer, accuracy_score
from bayes_opt import BayesianOptimization
from sklearn.metrics import f1_score
answers = []
mean_score = 0
mean_f1_score = 0
n_folds = 5
sk = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=1024)
cols = [col for col in df.columns if col not in ['label','id']]
# X=df[:40000][cols]
# y=df[:40000]['label']
# NOTE(review): indentation was lost in this export. If this loop only holds the
# two assignments below, X and y end up as the training part of the LAST fold
# (4/5 of the labelled rows) -- confirm that this is intended.
for tr, te in sk.split(df[:40000][cols], df[:40000]['label']):
X = df[:40000][cols].iloc[tr]
y = df[:40000]['label'].iloc[tr]
print(X.shape)
print(y.shape)
# Objective function handed to the Bayesian optimiser
def rf_cv(n_estimators, learning_rate):
    """Return the mean 5-fold CV accuracy of an LGBMClassifier on the globals X, y.

    ``n_estimators`` is truncated to an int and ``learning_rate`` is capped at
    0.15 before the model is built. Further hyper-parameters (boosting_type,
    max_depth, num_leaves, subsample, colsample_bytree, min_data_in_leaf,
    regularisation terms, min_split_gain, min_child_weight, metric) were tried
    by the original author and are currently not passed to the model.
    """
    estimator = LGBMClassifier(
        learning_rate=min(learning_rate, 0.15),
        n_estimators=int(n_estimators),
        n_jobs=6,  # number of parallel threads
        verbose=-1
    )
    fold_scores = cross_val_score(estimator, X, y, scoring="accuracy", cv=5)
    return fold_scores.mean()
# Bayesian optimisation over n_estimators and learning_rate; the commented-out
# entries are search ranges the original author tried for other parameters.
rf_bo = BayesianOptimization(rf_cv,
{
"n_estimators":(1000,20000),
"learning_rate":(0.001,0.1)
# "colsample_bytree":(0.85,0.97),
# "min_data_in_leaf":(100,2000)
# "subsample":(0.7,0.9),
# "max_depth":(25,40),
# "num_leaves":(31,35)
# "reg_alpha":(0.2,0.5),
# "reg_lambda":(0.3,0.5),
# "lambda_l1":(0.6,0.95),
# "lambda_l2":(0.5,0.8),
# "random_state":(0,1024),
# "min_split_gain":(0.2,0.6),
# "min_child_weight":(0.6,0.9)
})
# Start the optimisation: 5 initial points followed by 100 iterations.
num_iter = 100
init_points = 5
rf_bo.maximize(init_points=init_points,n_iter=num_iter)# run the search
# Notebook display of the best result found.
rf_bo.max
以上步骤后数据为55000行、1971列,调参之后的训练结果不是很好,甚至不如之前用原始数据+贝叶斯优化的结果。
因此,在这里准备再次进行特征选择
# GBDT-based feature selection, run on slices of 400 columns at a time.
# Every full slice gets 'label' appended and is selected with max_num=300;
# the final (partial) slice is passed as-is with max_num=250.
# NOTE(review): the final slice is passed without 'label' appended, so this
# presumably relies on 'label' being among the last columns of df — confirm.
# NOTE(review): placement of the two prints after the loop is inferred; the
# export lost its indentation.
gbdt_col = []
n_full_chunks = len(df.columns) // 400
for i in tqdm(range(n_full_chunks + 1)):
    temp_col = list(df.columns[i * 400:(i + 1) * 400])
    if i < n_full_chunks:
        gbdt_col.extend(GBDTselectfea(df[temp_col + ['label']], max_num=300))
    else:
        gbdt_col.extend(GBDTselectfea(df[temp_col], max_num=250))
print(len(gbdt_col))
print(gbdt_col)
# Recursive feature elimination (clf_rfecv), run on slices of 200 columns of
# the first 40000 rows, with 'label' appended to every slice.
# NOTE(review): if a slice already contains 'label' the selection will carry
# a duplicate 'label' column — confirm clf_rfecv copes with that.
# NOTE(review): placement of the two prints after the loop is inferred; the
# export lost its indentation.
rfecv_col = []
for i in tqdm(range(len(df.columns) // 200 + 1)):
    temp_col = list(df.columns[i * 200:(i + 1) * 200])
    chunk_with_label = temp_col + ['label']
    rfecv_col.extend(clf_rfecv(df[:40000][chunk_with_label], chunk_with_label, 5))
print(len(rfecv_col))
print(rfecv_col)
# Merge the GBDT and RFECV selections and make sure 'id' and 'label' survive.
# list.extend mutates in place and returns None, so its result must not be
# assigned back (doing so turned gbdt_col into None and broke the union).
gbdt_col.extend(['id', 'label'])
# dict.fromkeys drops duplicates while keeping first-seen order, so the
# resulting column order is reproducible between runs (a set union is not).
gbdt_rfecv_col = list(dict.fromkeys([*rfecv_col, *gbdt_col]))
print(len(gbdt_rfecv_col))
# Restrict df to the selected columns.
df = df[gbdt_rfecv_col]
print(df.shape)
print(df)
# Load the feature-selected table from disk.
# NOTE(review): this load appears before the save below; presumably the two
# notebook cells were run in the opposite order — confirm.
df = pd.read_csv('D:/df_rfecv.csv')
print(df.shape, df, sep='\n')
# Save the data produced by the previous step to local disk.
df.to_csv('D:/df_rfecv.csv', index=False)
6 训练
# Out-of-fold predictions for the training rows (overwritten for every seed)
# and the running average of test-set probabilities.
oof = np.zeros(train_df.shape[0])
test_df['prob'] = 0

# LightGBM settings shared by all folds and seeds.
# NOTE(review): bagging_freq=0 normally disables bagging in LightGBM, which
# would make bagging_fraction a no-op — confirm against the LightGBM docs.
lgb_params = dict(
    boosting_type='gbdt',
    objective='binary',
    learning_rate=0.015,
    n_estimators=6500,
    max_depth=6,
    num_leaves=30,
    max_bin=25,
    min_data_in_leaf=71,
    bagging_fraction=0.65,
    bagging_freq=0,
    feature_fraction=0.8,
    lambda_l1=0.9,
    lambda_l2=0.9,
    min_split_gain=0,
    metric=None,
    n_jobs=6,  # number of worker threads
    verbose=-1,
)
clf = LGBMClassifier(**lgb_params)

val_aucs = []
seeds = [1023, 2048, 2098]
for seed in seeds:
    # A fresh stratified 5-fold split for every seed.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    for i, (trn_idx, val_idx) in enumerate(skf.split(train_df, train_df['label'])):
        print('--------------------- {} fold ---------------------'.format(i))
        t = time.time()
        trn_x = train_df[cols].iloc[trn_idx].reset_index(drop=True)
        trn_y = train_df['label'].values[trn_idx]
        val_x = train_df[cols].iloc[val_idx].reset_index(drop=True)
        val_y = train_df['label'].values[val_idx]
        # Fit with the held-out fold as the evaluation set, monitoring AUC
        # with early_stopping_rounds=200.
        # (categorical_feature and feature-importance accumulation are disabled.)
        clf.fit(
            trn_x,
            trn_y,
            eval_set=[(val_x, val_y)],
            eval_metric='auc',
            early_stopping_rounds=200,
            verbose=200,
        )
        oof[val_idx] = clf.predict_proba(val_x)[:, 1]
        # Each fold of each seed contributes an equal share to the test average.
        fold_test_prob = clf.predict_proba(test_df[cols])[:, 1]
        test_df['prob'] += fold_test_prob / skf.n_splits / len(seeds)
    # AUC of the complete out-of-fold vector for this seed.
    cv_auc = roc_auc_score(train_df['label'], oof)
    val_aucs.append(cv_auc)
    print('\ncv_auc: ', cv_auc)
print(val_aucs, np.mean(val_aucs))
评价指标:TPR
def tpr_weight_funtion(y_true, y_predict):
    """Weighted true-positive rate at three low false-positive-rate points.

    Samples are ranked by predicted score, highest first. Walking down the
    ranking, the cumulative share of positives (TPR) and of negatives (FPR)
    is tracked; the TPR is read at the first row whose FPR is closest to
    0.001, 0.005 and 0.01 respectively.

    Args:
        y_true: Iterable of ground-truth labels; both 0 and 1 must occur.
        y_predict: Iterable of predicted scores, same length as ``y_true``.

    Returns:
        ``0.4 * TPR@0.001 + 0.3 * TPR@0.005 + 0.3 * TPR@0.01``.

    Raises:
        KeyError: From the class-count lookup when label 1 or label 0 does
            not occur in ``y_true``.
    """
    ranked = pd.DataFrame({'prob': list(y_predict), 'y': list(y_true)})
    ranked = ranked.sort_values(['prob'], ascending=False)
    labels = ranked['y']

    class_counts = labels.value_counts()
    n_pos = class_counts[1]
    n_neg = class_counts[0]

    # Running counts of positives / negatives seen so far in the ranking.
    tp_running = labels.cumsum()
    fp_running = np.arange(len(labels)) - tp_running + 1
    tpr_curve = tp_running / n_pos
    fpr_curve = fp_running / n_neg

    def tpr_at(fpr_target):
        # Index label of the first row whose FPR is nearest to the target.
        nearest = (fpr_curve - fpr_target).abs().idxmin()
        return tpr_curve[nearest]

    return 0.4 * tpr_at(0.001) + 0.3 * tpr_at(0.005) + 0.3 * tpr_at(0.01)
# Weighted low-FPR TPR of the out-of-fold predictions, rounded to 6 decimals.
tpr = round(tpr_weight_funtion(train_df['label'], oof), 6)
# Notebook display of the two validation scores.
tpr, round(np.mean(val_aucs), 5)
# print(test_df)
# Build the submission table: one averaged probability per test id.
submit['id'] = test_df['id']
submit['label'] = test_df['prob']
# The path used to be wrapped in .format(tpr, auc), but the template has no
# placeholders, so the scores never reached the file name. The no-op call is
# dropped; the output path is unchanged.
submit.to_csv('D:/submit12.csv', index=False)
submit.head()
这是本人第二次参赛。回想第一次参赛时啥也不懂,这次对相关技术了解得更多。以后要继续努力,加强基础知识的学习,同时也要经常关注各类比赛,将比赛与个人的研究方向相结合。文中很多内容都是个人自己的想法,并不确定其正确性和原理,各种方法组合到一起是否能达到最优,本人也不是很清楚。希望各位大神多多指教。