task2-特征工程

task2-特征工程

1.利用随机森林挑选特征

from sklearn.ensemble import RandomForestClassifier
# Fit a random forest with default settings on the full data set, then rank
# every column of X by the fitted model's `feature_importances_`.
rfc = RandomForestClassifier().fit(X, y)
rfc_impc = pd.Series(rfc.feature_importances_, index=X.columns)
rfc_impc = rfc_impc.sort_values(ascending=False)
# Names of the 20 highest-ranked features (shown as the cell output).
fea_gini = rfc_impc[:20].index
fea_gini

选出特征如下:

{'abs',
 'apply_score',
 'consfin_avg_limit',
 'consfin_credit_limit',
 'first_transaction_time',
 'historical_trans_amount',
 'history_fail_fee',
 'latest_query_day',
 'loans_overdue_count',
 'loans_score',
 'max_cumulative_consume_later_1_month',
 'pawns_auctions_trusts_consume_last_6_month',
 'repayment_capability',
 'trans_activity_day',
 'trans_amount_3_month',
 'trans_amount_increase_rate_lately',
 'trans_day_last_12_month',
 'trans_fail_top_count_enum_last_12_month',
 'trans_fail_top_count_enum_last_1_month',
 'trans_fail_top_count_enum_last_6_month'}

2.利用iv值挑选特征

import math
import numpy as np
from scipy import stats
from sklearn.utils.multiclass import type_of_target

def woe(X, y, event=1):
    """Compute the information value (IV) of every column of ``X``.

    Each column is taken as a NumPy array. Columns that ``type_of_target``
    classifies as ``'continuous'`` are first binned with ``discrete``; the
    (possibly binned) column is then handed to ``woe_single_x`` and the IV it
    returns is recorded under the column name.

    Args:
        X: DataFrame of features; iterated column by column.
        y: Target labels, passed through unchanged to ``woe_single_x``.
        event: Label value treated as the positive ("event") class.

    Returns:
        The DataFrame produced by
        ``pd.DataFrame.from_dict(iv_dict, orient='index')``: one row per
        feature name, with a single column (labelled ``0``) holding the IV.
    """
    iv_dict = {}
    for feature in X.columns:
        x = X[feature].values
        # 1) Discretise continuous features so they can be handled as bins.
        if type_of_target(x) == 'continuous':
            x = discrete(x)
        # 2) Only the IV is reported; the per-bin WOE mapping is not needed
        #    by the return value, so it is discarded here.
        _, iv = woe_single_x(x, y, feature, event)
        iv_dict[feature] = iv

    return pd.DataFrame.from_dict(iv_dict, orient='index')
        
def discrete(x, n_bins=5):
    """Discretise a numeric array into equal-frequency (percentile) bins.

    The percentile range 0..100 is cut into ``n_bins`` equal steps. Every
    value lying between two consecutive percentile cut points (both ends
    inclusive) is labelled with the 1-based number of that bin. Bins are
    assigned in ascending order, so a value sitting exactly on a cut point
    ends up with the label of the higher bin.

    Args:
        x: Array-like of numeric values.
        n_bins: Number of percentile bins; the default of 5 reproduces the
            original 20%-wide bins.

    Returns:
        A float array with the same shape as ``x`` holding labels
        ``1..n_bins``. Entries matched by no bin keep the initial value 0.

    Raises:
        ValueError: If ``n_bins`` is smaller than 1.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be a positive integer")

    x = np.asarray(x)
    res = np.zeros(x.shape)
    for i in range(n_bins):
        # Written as 100 * k / n_bins so the last cut point is exactly 100.
        point1 = stats.scoreatpercentile(x, 100.0 * i / n_bins)
        point2 = stats.scoreatpercentile(x, 100.0 * (i + 1) / n_bins)
        # The comparison already selects the bin members, so no separate
        # membership test against the selected values is needed.
        res[(x >= point1) & (x <= point2)] = i + 1
    return res

def woe_single_x(x, y, feature, event=1):
    """Compute per-bin weight of evidence (WOE) and the total IV of a feature.

    For every distinct value (bin) of ``x`` the share of all events and the
    share of all non-events that fall into the bin are computed; the bin's
    WOE is ``log(event_share / non_event_share)`` and the IV is the sum of
    ``(event_share - non_event_share) * WOE`` over all bins.

    Rows of ``x`` and ``y`` are matched purely by position, so a pandas
    ``y`` with a non-default index (for example after filtering or a
    train/test split) is handled correctly.

    Args:
        x: Array-like of already discretised feature values.
        y: Array-like of target labels, the same length as ``x``.
        feature: Feature name. Not used in the computation; kept so existing
            callers keep working.
        event: Label value that marks the positive ("event") class.

    Returns:
        A tuple ``(woe_dict, iv)`` where ``woe_dict`` maps each distinct
        value of ``x`` to its WOE and ``iv`` is the information value.

    Raises:
        ValueError: If ``x`` and ``y`` differ in shape, or if ``y`` does not
            contain both event and non-event samples (the shares would have
            a zero denominator).
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if x.shape != y.shape:
        raise ValueError("x and y must have the same shape")

    event_mask = (y == event)
    event_total = int(np.count_nonzero(event_mask))
    non_event_total = y.size - event_total
    if event_total == 0 or non_event_total == 0:
        raise ValueError(
            "y must contain both event and non-event samples to compute WOE/IV"
        )

    iv = 0.0
    woe_dict = {}
    for x1 in set(x):    # iterate over the distinct bins
        in_bin = (x == x1)
        event_count = int(np.count_nonzero(in_bin & event_mask))
        non_event_count = int(np.count_nonzero(in_bin)) - event_count
        rate_event = event_count / event_total
        rate_non_event = non_event_count / non_event_total

        # Floor zero shares so the logarithm stays finite. The two checks are
        # independent so that a bin matching no rows (e.g. a NaN value, which
        # never compares equal) cannot leave a zero denominator behind.
        if rate_event == 0:
            rate_event = 0.0001
        if rate_non_event == 0:
            rate_non_event = 0.0001
        woei = math.log(rate_event / rate_non_event)
        woe_dict[x1] = woei
        iv += (rate_event - rate_non_event) * woei
    return woe_dict, iv

# Compute the IV of every feature, move the feature names out of the index
# into a 'feature' column, and name the value column 'iv'.
ceui_iv = (
    woe(X, y, event=1)
    .reset_index()
    .rename(columns={0: 'iv', 'index': 'feature'})
)
# Show the features whose IV lies in the closed range [0.1, 0.5].
ceui_iv[(ceui_iv.iv >= 0.1) & (ceui_iv.iv <= 0.5)]

IV 值落在 0.1~0.5 范围内(含端点)的特征如下:

take_amount_in_later_12_month_highest	0.180730
trans_days_interval_filter	0.145248
trans_days_interval	0.128173
rank_trad_1_month	0.102168
avg_price_last_12_month	0.321003
trans_top_time_last_1_month	0.105876
trans_top_time_last_6_month	0.104981
consume_top_time_last_6_month	0.100463
trans_fail_top_count_enum_last_6_month	0.288356
trans_fail_top_count_enum_last_12_month	0.285487
trans_day_last_12_month	0.237879
loans_count	0.159526
loans_settle_count	0.110633
loans_overdue_count	0.377598
history_suc_fee	0.170674
latest_one_month_suc	0.147711
latest_one_month_fail	0.333086
loans_long_time	0.196917
consfin_credit_limit	0.380944
consfin_max_limit	0.183655
latest_query_day	0.264939
loans_latest_day	0.215142

将两组特征取并集(去重合并):

# Union of the random-forest selection and the IV-based selection.
# NOTE(review): `c3` is not defined anywhere in the visible code; presumably
# it is the IV-filtered table with feature names as its index -- confirm.
gini_or_iv = set(fea_gini) | set(c3.index)
new_features = pd.Series(list(gini_or_iv))
# Restrict the feature matrix to the selected columns.
X_new = X[new_features]

3.评分(acc、auc):

from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split repeatable.
# NOTE(review): this splits the full `X`, not the reduced `X_new` built above.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=2333)
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

from sklearn.preprocessing import StandardScaler
# Standardise the features: the scaler is fitted on the training rows only,
# and that same fitted scaler is then applied to the test rows. Both frames
# are replaced by the NumPy arrays the scaler returns.
std = StandardScaler()
X_train = std.fit_transform(X_train.values)
X_test = std.transform(X_test.values)

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier

# Candidate classifiers. The module-level names are kept so later code can
# still refer to the fitted models directly.
# The liblinear solver is requested explicitly because the L1 penalty is not
# supported by scikit-learn's current default solver (lbfgs).
lr   = LogisticRegression(C = 0.1, penalty = 'l1', solver = 'liblinear')
sm   = svm.SVC(C = 0.1, kernel = 'linear')
tr   = DecisionTreeClassifier(max_depth=5,min_samples_split=50,min_samples_leaf=60, max_features=9, random_state =2333)
rfc  = RandomForestClassifier(n_estimators=500,max_depth=5, random_state=2333)
gbdtc= GradientBoostingClassifier(n_estimators= 250, subsample=0.9,random_state=2333)
xgbc = XGBClassifier(max_depth=3, min_child_weight=5)
lgbc = LGBMClassifier( max_depth=3, min_child_weight=11)

# Explicit name -> estimator mapping (replaces eval() on variable names);
# insertion order fixes the column order of the score table.
models = {
    'lr': lr,
    'sm': sm,
    'tr': tr,
    'rfc': rfc,
    'gbdtc': gbdtc,
    'xgbc': xgbc,
    'lgbc': lgbc,
}

# One column per model (flat column index), one row per metric.
table_1 = pd.DataFrame(index = ['acc','auc'], columns = list(models))
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # AUC needs a continuous score, not hard class labels. Use the
    # probability of the second class when the model exposes it, otherwise
    # the decision-function margin (e.g. SVC without probability=True).
    # NOTE(review): assumes a binary target whose positive label is the
    # second entry of `classes_` -- confirm.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = model.decision_function(X_test)
    i_acc = accuracy_score(y_test, y_pred)
    auc_i = roc_auc_score(y_test, y_score)
    # Record the scores inside the loop so every model gets its own values.
    table_1.loc['acc', name] = i_acc
    table_1.loc['auc', name] = auc_i

得到评分表格(全部特征)如下:
在这里插入图片描述
同样条件下用挑选出的特征得到的评分表格:
在这里插入图片描述
参考:https://yezuolin.com/2018/11/IV&RandomForestClassifier/

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值