分别用IV和随机森林选取的特征值做预测

  • 读取数据
    该数据已经补齐缺失值
# Load the data set whose missing values have already been filled
# (produced by an earlier step of the pipeline); GBK because the
# source file contains Chinese text.
data=pd.read_csv('/home/infisa/wjht/project/DataWhale/output/data_fill',encoding='gbk')
# 'status' is the binary target; every remaining column is a candidate feature.
y=data['status']
X=data.drop('status',axis=1)

*IV值法选取特征值

def woe(X, y, event=1):
    """Compute the information value (IV) of every column in X.

    Continuous columns are first discretised into 5 quantile buckets,
    then the per-bucket WOE/IV is computed by woe_single_x.

    :param X: pandas DataFrame of candidate features
    :param y: target Series of event/non-event labels
    :param event: label that denotes the positive class
    :return: dict mapping feature name -> IV value
    """
    iv_dict = {}
    for feature in X.columns:
        x = X[feature].values
        # WOE only makes sense over discrete buckets, so bucket
        # continuous features first.
        if type_of_target(x) == 'continuous':
            x = discrete(x)
        # Only the IV is needed by callers; the per-bucket WOE dict that
        # the original accumulated into an unused list is discarded.
        _, iv = woe_single_x(x, y, feature, event)
        iv_dict[feature] = iv

    return iv_dict


def discrete(x):
    """Discretise a 1-D numeric array into 5 equal-frequency buckets.

    Values falling in [P(20*i), P(20*(i+1))] are labelled i+1 (1..5).
    A value sitting exactly on an interior boundary ends up in the
    higher bucket, because later iterations overwrite earlier labels
    (same as the original implementation).

    :param x: 1-D numpy array of numeric values
    :return: float array of the same shape with bucket labels 1..5
    """
    res = np.zeros(x.shape)
    # Compute all six quantile breakpoints in one call instead of twice
    # per iteration; np.percentile replaces the deprecated
    # scipy.stats.scoreatpercentile and uses the same linear interpolation.
    points = np.percentile(x, [0, 20, 40, 60, 80, 100])
    for i in range(5):
        # Direct boolean mask: every occurrence of a value inside the
        # interval is the same value, so the original in1d round-trip
        # selected exactly this mask.
        res[(x >= points[i]) & (x <= points[i + 1])] = i + 1
    return res


def woe_single_x(x, y, feature, event=1):
    """Compute per-bucket WOE values and the total IV for one feature.

    :param x: 1-D array of (discretised) feature values
    :param y: target Series; `event` marks the positive class
    :param feature: feature name (unused; kept for interface compatibility)
    :param event: label of the positive class
    :return: (woe_dict, iv) where woe_dict maps bucket value -> WOE
    """
    event_total = sum(y == event)
    non_event_total = y.shape[-1] - event_total

    iv = 0
    woe_dict = {}
    for x1 in set(x):  # iterate over the buckets
        # np.where yields *positions*, so select positionally with iloc.
        # The original y.reindex(...) treated those positions as index
        # labels and silently produced NaNs (zero counts) whenever y's
        # index was not 0..n-1.
        y1 = y.iloc[np.where(x == x1)[0]]
        event_count = sum(y1 == event)
        non_event_count = y1.shape[-1] - event_count
        rate_event = event_count / event_total
        rate_non_event = non_event_count / non_event_total

        # A bucket is never empty, so at most one of the two rates can be
        # zero; clamp it to avoid log(0) / division by zero.
        if rate_event == 0:
            rate_event = 0.0001
        elif rate_non_event == 0:
            rate_non_event = 0.0001
        woei = math.log(rate_event / rate_non_event)
        woe_dict[x1] = woei
        iv += (rate_event - rate_non_event) * woei
    return woe_dict, iv
    
  • 得到iv
# Silence the warnings emitted while computing WOE/IV over many columns.
warnings.filterwarnings("ignore")
# IV value of every candidate feature against the target.
iv_dict = woe(X, y)
  • Iv选取特征
def iv_choose(iv):
    """Return the names of features whose IV lies in the useful range.

    IV below 0.02 indicates almost no predictive power, while IV above
    0.6 is suspiciously high (often a sign of leakage), so both
    extremes are excluded.

    :param iv: dict mapping feature name -> IV value
    :return: list of selected feature names (insertion order preserved)
    """
    # Chained comparison replaces the bitwise `&` on booleans; the
    # comprehension replaces the manual append loop.
    return [name for name, value in iv.items() if 0.02 < value < 0.6]


# Keep only the features whose IV falls in (0.02, 0.6).
# Fix: the dict computed above is named iv_dict, not iv (NameError).
iv_list = iv_choose(iv_dict)
  • 得到选取的特征列表,共50个特征
['trans_top_time_last_1_month', 'trans_days_interval', 'loans_overdue_count', 'loans_product_count', 'trans_amount_3_month', 'pawns_auctions_trusts_consume_last_1_month', 'latest_six_month_apply', 'consume_top_time_last_6_month', 'trans_day_last_12_month', 'history_fail_fee', 'latest_query_time_month', 'latest_one_month_apply', 'consfin_max_limit', 'loans_long_time', 'top_trans_count_last_1_month', 'latest_one_month_fail', 'consfin_product_count', 'latest_three_month_loan', 'avg_price_top_last_12_valid_month', 'loans_org_count_behavior', 'latest_query_day', 'trans_fail_top_count_enum_last_6_month', 'trans_fail_top_count_enum_last_12_month', 'trans_fail_top_count_enum_last_1_month', 'latest_one_month_suc', 'loans_max_limit', 'apply_credibility', 'consume_top_time_last_1_month', 'loans_settle_count', 'consfin_credit_limit', 'rank_trad_1_month', 'consfin_avg_limit', 'trans_amount_increase_rate_lately', 'latest_six_month_loan', 'take_amount_in_later_12_month_highest', 'trans_top_time_last_6_month', 'trans_days_interval_filter', 'loans_credit_limit', 'avg_price_last_12_month']
  • 导出数据
# Build the IV data set from the *selected* features plus the target.
# Fix: the original data.drop(iv_list, ...) removed the selected features
# and kept everything else -- the opposite of the stated intent.
data_iv = data[iv_list + ['status']]
print(data_iv.shape)
data_iv.to_csv(OUTPUT_DATA_PATH + 'data_iv')
  • 随机森林选取特征
feat_lables = X.columns
# A large forest gives stable importance estimates; n_jobs stays at 1 to
# preserve the original behaviour.
forest = RandomForestClassifier(n_estimators=10000, n_jobs=1)
forest.fit(X, y)
importance = forest.feature_importances_
imp_result = np.argsort(importance)[::-1]  # column indices, most important first

# Fix: the original paired feat_lables[i] (file order) with
# importance[imp_result[i]] (sorted order), printing mismatched
# name/score pairs. Index both through the sorted positions.
for rank, idx in enumerate(imp_result, start=1):
    print("%2d. %-*s %f" % (rank, 30, feat_lables[idx], importance[idx]))

# Drop every feature whose importance is below the threshold
# (importance is aligned with X.columns, so the mask is positionally correct).
threshold = 0.01
data_index = list(X.columns[importance < threshold])
X.drop(data_index, axis=1, inplace=True)
data_for = pd.concat([X, y], axis=1)
  • 所选的特征列,共49个特征
['low_volume_percent', 'middle_volume_percent', 'take_amount_in_later_12_month_highest', 'regional_mobility', 'student_feature', 'is_high_user', 'avg_consume_less_12_valid_month', 'top_trans_count_last_1_month', 'avg_price_top_last_12_valid_month', 'cross_consume_count_last_1_month', 'consume_mini_time_last_1_month', 'max_consume_count_later_6_month', 'railway_consume_count_last_12_month', 'pawns_auctions_trusts_consume_last_1_month', 'jewelry_consume_count_last_6_month', 'query_org_count', 'query_finance_count', 'query_cash_count', 'latest_one_month_apply', 'latest_three_month_apply', 'latest_six_month_apply', 'loans_credibility_behavior', 'loans_org_count_behavior', 'consfin_org_count_behavior', 'loans_cash_count', 'latest_one_month_loan', 'latest_three_month_loan', 'loans_credibility_limit', 'loans_org_count_current', 'loans_product_count', 'consfin_credibility', 'consfin_org_count_current', 'consfin_product_count', 'reg_preference_for_trad', 'latest_query_time_year', 'latest_query_time_month', 'latest_query_time_weekday', 'loans_latest_time_year', 'loans_latest_time_month', 'loans_latest_time_weekday']
  • 导出随机森林所选特数据表
# Export the random-forest-selected data set (kept features + status).
data_for.to_csv(OUTPUT_DATA_PATH+'data_forest')

*导入经过IV值和随机森林选取的数据集,分别做预测

import pandas as pd
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
# 引入要用到的评价函数
from sklearn.metrics import precision_score,roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc
# 引入用到的分类算法
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings


warnings.filterwarnings("ignore")
# Reload the IV-selected data set exported earlier; GBK matches the export.
data_iv=pd.read_csv('/home/infisa/wjht/project/DataWhale/output/data_iv',encoding='gbk')
y_iv=data_iv['status']
X_iv=data_iv.drop('status',axis=1)

# Reload the random-forest-selected data set.
data_for=pd.read_csv('/home/infisa/wjht/project/DataWhale/output/data_forest',encoding='gbk')
y_forest=data_for['status']
X_forest=data_for.drop('status',axis=1)

# Standardise each feature column, then make a 70/30 split with a fixed
# seed so both data sets are split the same way.
X_iv=scale(X_iv,axis=0)
X_forest=scale(X_forest,axis=0)
Xiv_train, Xiv_test, yiv_train, yiv_test = train_test_split(X_iv, y_iv, test_size=0.3, random_state=2018)
Xfor_train, Xfor_test, yfor_train, yfor_test = train_test_split(X_forest, y_forest, test_size=0.3, random_state=2018)


# Evaluation helper covering several metrics at once.
def multi_score(model, x_test, y_test):
    """Print accuracy / precision / recall / F1 / AUC for a fitted
    classifier and return them as a dict.

    AUC uses decision_function scores when the model provides them
    (e.g. LinearSVC); otherwise the positive-class predict_proba column.
    Returning the dict is backward compatible: the old version returned
    None and callers ignored the result.

    :param model: fitted sklearn-style classifier
    :param x_test: test feature matrix
    :param y_test: true test labels
    :return: dict of metric label -> score
    """
    test_predict = model.predict(x_test)
    if hasattr(model, 'decision_function'):
        pre_test = model.decision_function(x_test)
    else:
        pre_test = model.predict_proba(x_test)[:, 1]
    scores = {
        "准确率": accuracy_score(y_test, test_predict),
        "精确率": precision_score(y_test, test_predict),
        "召回率": recall_score(y_test, test_predict),
        "F1-score": f1_score(y_test, test_predict),
        "AUC": roc_auc_score(y_test, pre_test),
    }
    for label, value in scores.items():
        print(label, value)
    # The original also recomputed AUC via roc_curve/auc and discarded the
    # result; that dead code is removed.
    return scores


# Train each classifier in turn on the random-forest-selected split.
# (The commented-out fit calls are the IV-selected alternative.)
# LogisticRegression
lr = LogisticRegression(random_state =2018)
# lr.fit(Xiv_train, yiv_train)
lr.fit(Xfor_train, yfor_train)
# Decision tree
dt = DecisionTreeClassifier(random_state=2018)
# dt.fit(Xiv_train, yiv_train)
dt.fit(Xfor_train, yfor_train)
# SVM (linear kernel)
svm_linearSVC = LinearSVC(random_state=2018)
# svm_linearSVC.fit(Xiv_train, yiv_train)
svm_linearSVC.fit(Xfor_train, yfor_train)
# Random forest
rfc = RandomForestClassifier(n_estimators=100, random_state=2018)
# rfc.fit(Xiv_train, yiv_train)
rfc.fit(Xfor_train, yfor_train)
# GBDT
gbc = GradientBoostingClassifier(random_state=2018)
# gbc.fit(Xiv_train, yiv_train)
gbc.fit(Xfor_train, yfor_train)
# xgboost
xgbc = XGBClassifier(random_state=2018)
# xgbc.fit(Xiv_train, yiv_train)
xgbc.fit(Xfor_train, yfor_train)
# lightgbm
lgbc = LGBMClassifier(random_state=2018)
# lgbc.fit(Xiv_train, yiv_train)
lgbc.fit(Xfor_train, yfor_train)

# Evaluate every trained model with the same set of metrics.
# Fix: look the models up through an explicit dict instead of eval()-ing
# their variable names.
models = {
    "lr": lr,
    "dt": dt,
    "svm_linearSVC": svm_linearSVC,
    "rfc": rfc,
    "gbc": gbc,
    "xgbc": xgbc,
    "lgbc": lgbc,
}
for name, model in models.items():
    print(name)
    # Swap in (Xiv_test, yiv_test) to score the IV-selected data set.
    multi_score(model, Xfor_test, yfor_test)
  • IV数据级预测的结果
'''
lr
准确率 0.7435178696566223
精确率 0.4672897196261682
召回率 0.1392757660167131
F1-score 0.2145922746781116
AUC 0.6885726059695575
dt
准确率 0.6391030133146461
精确率 0.3078817733990148
召回率 0.34818941504178275
F1-score 0.32679738562091504
AUC 0.5425404004047866
svm_linearSVC
准确率 0.7484232655921513
精确率 0.5
召回率 0.10863509749303621
F1-score 0.17848970251716245
AUC 0.6836510072715511
rfc
准确率 0.759635599159075
精确率 0.5833333333333334
召回率 0.15598885793871867
F1-score 0.24615384615384617
AUC 0.6700298895183249
gbc
准确率 0.7526278906797477
精确率 0.5230769230769231
召回率 0.1894150417827298
F1-score 0.278118609406953
AUC 0.6936585187735387
xgbc
准确率 0.7505255781359496
精确率 0.5109489051094891
召回率 0.19498607242339833
F1-score 0.28225806451612906
AUC 0.702247191011236
lgbc
准确率 0.7449194113524877
精确率 0.48520710059171596
召回率 0.22841225626740946
F1-score 0.3106060606060606
AUC 0.6772714469030703
'''
  • 使用随机森林数据集预测的结果
'''
lr
准确率 0.7876664330763841
精确率 0.6261261261261262
召回率 0.3871866295264624
F1-score 0.4784853700516351
AUC 0.7791409762866055
dt
准确率 0.6902592852137351
精确率 0.3893333333333333
召回率 0.40668523676880225
F1-score 0.3978201634877384
AUC 0.5961328805566858
svm_linearSVC
准确率 0.7862648913805186
精确率 0.6363636363636364
召回率 0.35097493036211697
F1-score 0.4524236983842011
AUC 0.7803146484721396
rfc
准确率 0.7834618079887876
精确率 0.6666666666666666
召回率 0.2785515320334262
F1-score 0.3929273084479371
AUC 0.7608447309943351
gbc
准确率 0.7806587245970568
精确率 0.6116504854368932
召回率 0.35097493036211697
F1-score 0.44601769911504424
AUC 0.7705183979635484
xgbc
准确率 0.7883672039243167
精确率 0.6376811594202898
召回率 0.36768802228412256
F1-score 0.4664310954063604
AUC 0.7721980532690683
lgbc
准确率 0.7631394533987386
精确率 0.5458515283842795
召回率 0.34818941504178275
F1-score 0.4251700680272109
AUC 0.7555866796031424
'''
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值
>