建模-特征筛选

第二次任务:对已完成预处理的变量,使用 IV 值和随机森林的特征重要性(feature importance)进行特征筛选。

目录:
1、导入数据
2、IV值计算
3、importance计算
4、特征筛选

1、导入数据

#导入需要的包
import numpy as np
import pandas as pd
import LR as lr
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# Load the pre-processed task data.
data = pd.read_csv(r'/data/1/home/mabufa/data/task/data_task02.csv')

# Separate the target column from the candidate feature columns.
label = data['status']
data_var = data.drop(columns=['status'])

2、IV值计算

####定义IV值计算公式
def calc_iv(df, feature, target, pr=False):
    '''
    Compute the Information Value (IV) of one feature against a binary target.

    Every distinct value of ``feature`` is treated as its own bin; missing
    values are collected in a bin labelled 'NULL'. The input frame is not
    modified.

    input:
        df: pandas.DataFrame holding both ``feature`` and ``target`` columns
        feature: name of the independent variable column
        target: name of the good/bad column (0 counts as good, 1 as bad)
        pr: True to print the per-bin table and the total IV
    output:
        iv: float, sum of the per-bin IV contributions
        data: pandas.DataFrame with one row per bin and the columns
            Variable, Value, All, Good, Bad, Share, Bad Rate,
            Distribution Good, Distribution Bad, WoE, IV
    '''
    # Work on a filled copy so the caller's column is left untouched.
    values = df[feature].fillna('NULL')
    flags = pd.DataFrame({
        'All': np.ones(len(values), dtype=np.int64),
        'Good': (df[target] == 0).to_numpy().astype(np.int64),
        'Bad': (df[target] == 1).to_numpy().astype(np.int64),
    })

    # A single pass over the rows; sort=False avoids comparing mixed-type
    # keys here (ordering is applied once by sort_values below).
    counts = flags.groupby(values.to_numpy(), sort=False).sum()
    counts.index.name = 'Value'
    data = counts.reset_index()
    data.insert(0, 'Variable', feature)

    total_all = data['All'].sum()
    total_bad = data['Bad'].sum()

    data['Share'] = data['All'] / total_all  # share of rows falling in each bin
    data['Bad Rate'] = data['Bad'] / data['All']
    # "Good" in the distribution is everything that is not bad (All - Bad).
    data['Distribution Good'] = (data['All'] - data['Bad']) / (total_all - total_bad)
    data['Distribution Bad'] = data['Bad'] / total_bad

    # Bins with no good or no bad rows yield log(0) or log(inf); silence the
    # numpy warning because those values are zeroed on the next line.
    with np.errstate(divide='ignore', invalid='ignore'):
        data['WoE'] = np.log(data['Distribution Good'] / data['Distribution Bad'])

    # Infinite WoE is replaced by 0, so such bins contribute nothing to the IV.
    data['WoE'] = data['WoE'].replace([np.inf, -np.inf], 0)

    data['IV'] = data['WoE'] * (data['Distribution Good'] - data['Distribution Bad'])

    data = data.sort_values(by=['Variable', 'Value'], ascending=[True, True])
    data = data.reset_index(drop=True)

    if pr:
        print(data)
        print('IV = ',data['IV'].sum())

    iv = data['IV'].sum()
    return iv, data
# Compute the IV of every candidate feature.
IV_dict = {}
f_col = data_var.columns

for x in f_col:
    IV_1, df = calc_iv(data, x, 'status')
    IV_dict[x] = IV_1

# Rank the features by IV, highest first, to make later selection easier.
IV_dict_sorted = sorted(IV_dict.items(), key=lambda item: item[1], reverse=True)
IV_name = [name for name, _ in IV_dict_sorted]
IV_values = [score for _, score in IV_dict_sorted]

# Bar chart of the ranked IV values.
plt.figure(figsize=(20, 6))
plt.title('feature IV')
plt.bar(range(len(IV_values)), IV_values)

(图:各特征 IV 值降序柱状图)
3、importance计算

# Random forest: coarse 5-fold grid search over the number of trees.
param = dict(n_estimators=list(range(10, 1001, 50)))
g = GridSearchCV(RandomForestClassifier(random_state=2019), param, cv=5)
g.fit(data_var, label)
g.best_estimator_

# Tuning: repeat the search on a narrower n_estimators range with a finer step.
param = dict(n_estimators=list(range(770, 870, 10)))
forest_grid = GridSearchCV(RandomForestClassifier(random_state=2019), param, cv=5)
forest_grid.fit(data_var, label)
rnd_clf = forest_grid.best_estimator_
rnd_clf
# Map each feature name to its importance in the tuned forest.
importances = rnd_clf.feature_importances_
# argsort is ascending; reversing it lists positions from most to least important.
indices = np.argsort(importances)[::-1]

f_importance = {}
for f in range(data_var.shape[1]):
    pos = indices[f]
    f_importance[f_col[pos]] = importances[pos]

# Rank the features by importance, highest first, to make later selection easier.
im_dict_sorted = sorted(f_importance.items(), key=lambda item: item[1], reverse=True)
im_name = [name for name, _ in im_dict_sorted]
im_values = [score for _, score in im_dict_sorted]

# Bar chart of the ranked importances.
plt.figure(figsize=(20, 6))
plt.title('feature importances')
plt.bar(range(len(im_values)), im_values)

(图:各特征 importance 降序柱状图)
4、特征筛选

# Turn the ranked IV and importance lists into frames and join them on the feature name.
df_iv = pd.DataFrame(IV_dict_sorted, columns=['vars', 'iv'])
df_im = pd.DataFrame(im_dict_sorted, columns=['vars', 'importances'])
df_iv_im = pd.merge(df_iv, df_im[['vars', 'importances']], on=['vars'], how='left')

# Feature selection: keep features with IV above 0.1 and importance above 0.015
# (the importance threshold is a simple hand-picked cut-off).
keep = (df_iv_im['iv'] > 0.1) & (df_iv_im['importances'] > 0.015)
df_iv_im = df_iv_im[keep]
  • 3
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值