task2-特征工程
1.利用随机森林挑选特征
from sklearn.ensemble import RandomForestClassifier

# Rank every column of X by the fitted forest's feature_importances_ and keep
# the 20 highest-ranked names.
# A fixed seed keeps the ranking (and therefore the selected set) reproducible;
# 2333 matches the seed used by the other models in this file.
rfc = RandomForestClassifier(random_state=2333)
rfc.fit(X, y)
rfc_impc = pd.Series(rfc.feature_importances_, index=X.columns).sort_values(ascending=False)
# .iloc makes the positional "first 20" explicit on a label-indexed Series.
fea_gini = rfc_impc.iloc[:20].index
fea_gini
选出特征如下:
{'abs',
'apply_score',
'consfin_avg_limit',
'consfin_credit_limit',
'first_transaction_time',
'historical_trans_amount',
'history_fail_fee',
'latest_query_day',
'loans_overdue_count',
'loans_score',
'max_cumulative_consume_later_1_month',
'pawns_auctions_trusts_consume_last_6_month',
'repayment_capability',
'trans_activity_day',
'trans_amount_3_month',
'trans_amount_increase_rate_lately',
'trans_day_last_12_month',
'trans_fail_top_count_enum_last_12_month',
'trans_fail_top_count_enum_last_1_month',
'trans_fail_top_count_enum_last_6_month'}
2.利用iv值挑选特征
import math
import numpy as np
from scipy import stats
from sklearn.utils.multiclass import type_of_target
def woe(X, y, event=1):
    """Compute the information value (IV) of every column of ``X``.

    Columns that ``type_of_target`` classifies as ``'continuous'`` are first
    binned with ``discrete``; every column is then scored with
    ``woe_single_x``.

    Args:
        X: DataFrame of features; ``X.columns`` and ``X[feature].values``
            are read.
        y: Target labels, passed unchanged to ``woe_single_x``.
        event: Label value treated as the positive class.

    Returns:
        DataFrame indexed by feature name whose single column ``0`` holds the
        IV of each feature.
    """
    iv_dict = {}
    for feature in X.columns:
        x = X[feature].values
        # 1) Discretize continuous features before scoring them.
        if type_of_target(x) == 'continuous':
            x = discrete(x)
        # 2) Only the IV is reported; the per-bin WOE mapping is discarded.
        _, iv = woe_single_x(x, y, feature, event)
        iv_dict[feature] = iv
    return pd.DataFrame.from_dict(iv_dict, orient='index')
def discrete(x, n_bins=5):
    """Discretize numeric values into percentile-based bins labelled 1..n_bins.

    The bin edges are the ``k * 100 / n_bins`` percentiles of ``x`` for
    ``k = 0..n_bins``. Both edges of a bin are inclusive and bins are filled in
    ascending order, so a value lying exactly on a shared edge ends up in the
    higher bin.

    Args:
        x: Array-like of numeric values.
        n_bins: Number of bins; the default of 5 gives 20-percentile steps.

    Returns:
        Float array with the shape of ``x`` holding the bin label of each
        value. Entries that fall in no bin (e.g. NaN) keep the initial 0.

    Raises:
        ValueError: If ``n_bins`` is smaller than 1.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be at least 1")
    x = np.asarray(x)
    # Compute each edge once instead of twice per bin.
    edges = [stats.scoreatpercentile(x, k * 100.0 / n_bins) for k in range(n_bins + 1)]
    res = np.zeros(x.shape)
    for i in range(n_bins):
        # The range test already is the membership mask; no value lookup needed.
        res[(x >= edges[i]) & (x <= edges[i + 1])] = i + 1
    return res
def woe_single_x(x, y, feature, event=1):
    """Compute the per-bin weight of evidence (WOE) and the IV of one feature.

    Each distinct value of ``x`` is treated as one bin. For a bin,
    ``WOE = log(rate_event / rate_non_event)`` where the rates are the bin's
    share of all events and of all non-events; a rate of zero is replaced by
    0.0001 so the logarithm stays finite. The IV is the sum over bins of
    ``(rate_event - rate_non_event) * WOE``.

    Args:
        x: 1-D array-like of (already discretized) feature values.
        y: Target labels aligned with ``x`` by position.
        feature: Feature name; kept for interface compatibility, not used.
        event: Label value treated as the positive class.

    Returns:
        Tuple ``(woe_dict, iv)``: a mapping from bin value to WOE, and the IV.

    Raises:
        ValueError: If ``y`` contains no events or no non-events (which
            includes empty input), because both rates would divide by zero.
    """
    x = np.asarray(x)
    # Work on positions, not index labels: y may carry a non-default index
    # (e.g. after filtering or a train/test split), and label-based lookup of
    # positional indices would then select the wrong rows.
    is_event = np.asarray(y) == event
    event_total = int(is_event.sum())
    non_event_total = int(is_event.size) - event_total
    if event_total == 0 or non_event_total == 0:
        raise ValueError("y must contain both event and non-event samples")

    floor = 0.0001  # stand-in for a zero rate
    iv = 0.0
    woe_dict = {}
    for x1 in set(x):  # iterate over the bins
        in_bin = x == x1
        bin_size = int(in_bin.sum())
        if bin_size == 0:
            # NaN never compares equal to itself, so it forms no bin.
            continue
        event_count = int(is_event[in_bin].sum())
        non_event_count = bin_size - event_count
        rate_event = event_count / event_total
        rate_non_event = non_event_count / non_event_total
        # A bin is never empty here, so at most one of the rates is zero.
        if rate_event == 0:
            rate_event = floor
        elif rate_non_event == 0:
            rate_non_event = floor
        woei = math.log(rate_event / rate_non_event)
        woe_dict[x1] = woei
        iv += (rate_event - rate_non_event) * woei
    return woe_dict, iv
# Build a two-column table (feature, iv) from the per-feature IV frame.
ceui_iv = (
    woe(X, y, event=1)
    .reset_index()
    .rename(columns={0: 'iv', 'index': 'feature'})
)
# Show the features whose IV lies in the closed interval [0.1, 0.5].
ceui_iv[ceui_iv.iv.between(0.1, 0.5)]
选取 IV 值在 0.1~0.5 之间(含两端)的特征如下:
take_amount_in_later_12_month_highest 0.180730
trans_days_interval_filter 0.145248
trans_days_interval 0.128173
rank_trad_1_month 0.102168
avg_price_last_12_month 0.321003
trans_top_time_last_1_month 0.105876
trans_top_time_last_6_month 0.104981
consume_top_time_last_6_month 0.100463
trans_fail_top_count_enum_last_6_month 0.288356
trans_fail_top_count_enum_last_12_month 0.285487
trans_day_last_12_month 0.237879
loans_count 0.159526
loans_settle_count 0.110633
loans_overdue_count 0.377598
history_suc_fee 0.170674
latest_one_month_suc 0.147711
latest_one_month_fail 0.333086
loans_long_time 0.196917
consfin_credit_limit 0.380944
consfin_max_limit 0.183655
latest_query_day 0.264939
loans_latest_day 0.215142
将随机森林选出的特征与按 IV 值选出的特征取并集,作为最终特征集:
# Union of the random-forest selection and the features whose IV lies in
# [0.1, 0.5]. The IV-selected names are read from ceui_iv, since no `c3`
# is defined in this file.
iv_features = ceui_iv.loc[ceui_iv.iv.between(0.1, 0.5), 'feature']
# Sorting gives a stable column order; iterating a raw set does not.
new_features = pd.Series(sorted(set(fea_gini) | set(iv_features)))
X_new = X[new_features]
3.评分(acc、auc):
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2333)

# Fit the scaler on the training split only, then apply it to the test split.
std = StandardScaler()
X_train = std.fit_transform(X_train.values)
X_test = std.transform(X_test.values)

# The default lbfgs solver does not support the L1 penalty; liblinear does.
lr = LogisticRegression(C=0.1, penalty='l1', solver='liblinear')
sm = svm.SVC(C=0.1, kernel='linear')
tr = DecisionTreeClassifier(max_depth=5, min_samples_split=50, min_samples_leaf=60, max_features=9, random_state=2333)
rfc = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=2333)
gbdtc = GradientBoostingClassifier(n_estimators=250, subsample=0.9, random_state=2333)
xgbc = XGBClassifier(max_depth=3, min_child_weight=5)
lgbc = LGBMClassifier(max_depth=3, min_child_weight=11)

# Name -> estimator mapping replaces eval() on variable names.
models = {'lr': lr, 'sm': sm, 'tr': tr, 'rfc': rfc, 'gbdtc': gbdtc, 'xgbc': xgbc, 'lgbc': lgbc}

# One column per model; a flat list of names keeps the columns a plain Index.
table_1 = pd.DataFrame(index=['acc', 'auc'], columns=list(models), dtype=float)
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # AUC needs a continuous score, not hard labels. Use the probability of
    # the second class when available, otherwise the decision function.
    # NOTE(review): assumes a binary target whose positive class is
    # classes_[1] -- confirm against the data.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = model.decision_function(X_test)
    # Store each model's scores inside the loop so every column is its own.
    table_1.loc['acc', name] = accuracy_score(y_test, y_pred)
    table_1.loc['auc', name] = roc_auc_score(y_test, y_score)
得到评分表格(全部特征)如下:
同样条件下用挑选出的特征得到的评分表格:
参考:https://yezuolin.com/2018/11/IV&RandomForestClassifier/