数据挖掘 —— 金融数据(二)

特征工程
任务二
特征衍生
特征挑选:分别用IV值和随机森林等进行特征选择
……以及你能想到的其他特征工程处理

# 导入需要的包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
# 忽略警告
import warnings
warnings.filterwarnings('ignore')
# Load the new, already-preprocessed data and preview its first 10 rows
data = pd.read_csv(r'new_data.csv')
data.head(10)
Unnamed: 0low_volume_percentmiddle_volume_percenttake_amount_in_later_12_month_highesttrans_amount_increase_rate_latelytrans_activity_monthtrans_activity_daytransd_mcctrans_days_interval_filtertrans_days_interval...loans_max_limitloans_avg_limitconsfin_credit_limitconsfin_credibilityconsfin_org_count_currentconsfin_product_countconsfin_max_limitconsfin_avg_limitlatest_query_dayloans_latest_day
000.010.9900.900.550.31317.027.026.0...2900.01688.01200.075.01.02.01200.01200.012.018.0
110.020.9420001.281.000.45819.030.014.0...3500.01758.015100.080.05.06.022800.09360.04.02.0
220.040.9601.001.000.11413.068.022.0...1600.01250.04200.087.01.01.04200.04200.02.06.0
330.000.9620000.130.570.77722.014.06.0...3200.01541.016300.080.05.05.030000.012180.02.04.0
440.010.9900.461.000.17513.066.042.0...2300.01630.08300.079.02.02.08400.08250.022.0120.0
550.020.9820007.591.000.73327.08.011.0...5300.01941.011200.080.010.012.020400.08130.03.04.0
660.020.98023.670.940.08710.054.053.0...2200.02200.07600.073.02.02.016800.08900.01.03.0
770.020.9800.250.880.30219.020.020.0...2300.00.00.078.03.03.020400.00.023.04.0
880.030.6500.310.760.47215.021.014.0...5300.04750.05500.079.08.011.019200.07987.024.07.0
990.010.995000.801.000.08815.036.035.0...2800.01520.00.00.00.00.00.00.018.0142.0

10 rows × 82 columns

# Drop the leftover 'Unnamed: 0' column, then separate the target
# ('status') from the predictor columns.
data = data.drop(columns=['Unnamed: 0'])
label = data['status']
features = data.drop(columns=['status'])
方差筛选
from sklearn.feature_selection import VarianceThreshold

# Fit a zero-threshold VarianceThreshold and tabulate the variance it
# measured for every feature column.
variance_select = VarianceThreshold(threshold=0).fit(features)
variances = variance_select.variances_
variances_df = pd.DataFrame(
    {'features': features.columns, 'variance': variances},
    columns=['features', 'variance'],
)

阈值的选择需要根据模型效果来判断,一般先移除方差接近0的特征,再使用其他方法

variances_df
featuresvariance
0low_volume_percent1.723472e-03
1middle_volume_percent2.097334e-02
2take_amount_in_later_12_month_highest1.539431e+07
3trans_amount_increase_rate_lately4.814812e+05
4trans_activity_month3.876929e-02
5trans_activity_day2.895997e-02
6transd_mcc2.001861e+01
7trans_days_interval_filter5.157110e+02
8trans_days_interval2.712707e+02
9regional_mobility7.922850e-01
10repayment_capability2.726128e+09
11is_high_user1.102422e-02
12number_of_trans_from_20111.011116e+02
13first_transaction_time2.190825e+08
14historical_trans_amount1.026942e+11
15historical_trans_day9.931745e+03
16rank_trad_1_month6.964543e-02
17trans_amount_3_month1.035010e+10
18avg_consume_less_12_valid_month1.932965e+00
19abs7.292569e+08
20top_trans_count_last_1_month1.228786e-01
21avg_price_last_12_month5.864391e+05
22avg_price_top_last_12_valid_month9.861509e-03
23reg_preference_for_trad7.628764e-01
24trans_top_time_last_1_month2.831578e+01
25trans_top_time_last_6_month1.677847e+02
26consume_top_time_last_1_month2.979555e+01
27consume_top_time_last_6_month1.720437e+02
28cross_consume_count_last_1_month5.031203e+00
29trans_fail_top_count_enum_last_1_month3.640022e+00
.........
50loans_score3.624338e+03
51loans_credibility_behavior4.687256e+00
52loans_count5.847206e+02
53loans_settle_count4.510908e+02
54loans_overdue_count9.629793e+00
55loans_org_count_behavior5.400224e+01
56consfin_org_count_behavior8.469347e+00
57loans_cash_count2.860550e+01
58latest_one_month_loan2.151156e+00
59latest_three_month_loan1.166045e+01
60latest_six_month_loan1.156775e+02
61history_suc_fee8.684877e+02
62history_fail_fee6.083847e+02
63latest_one_month_suc3.633371e+00
64latest_one_month_fail1.431061e+01
65loans_long_time1.235436e+03
66loans_credit_limit4.711130e+05
67loans_credibility_limit1.106184e+02
68loans_org_count_current2.860550e+01
69loans_product_count3.298047e+01
70loans_max_limit2.106647e+06
71loans_avg_limit5.131267e+05
72consfin_credit_limit5.587290e+07
73consfin_credibility1.982972e+02
74consfin_org_count_current8.469347e+00
75consfin_product_count1.118530e+01
76consfin_max_limit1.927557e+08
77consfin_avg_limit3.398962e+07
78latest_query_day1.331995e+03
79loans_latest_day2.834898e+03

80 rows × 2 columns

卡方检验
from sklearn.feature_selection import SelectKBest, chi2

# chi2 needs non-negative inputs. Inspecting the data showed that only
# latest_query_day and loans_latest_day contain a few negative values, so
# those are clamped to 0. A single .loc assignment is used instead of chained
# indexing so the write is guaranteed to land on `features` itself.
features.loc[features['latest_query_day'] < 0, 'latest_query_day'] = 0
features.loc[features['loans_latest_day'] < 0, 'loans_latest_day'] = 0

# k is set to the number of features so nothing is dropped here; the fitted
# selector is only used to inspect the score (and p-value) of every feature.
chi2_select = SelectKBest(chi2, k=len(features.columns))
chi2_select.fit(features, label)
# The dict key must match the requested column name ('chi2'); a mismatch
# leaves the score column filled with NaN.
chi2_df = pd.DataFrame(
    {'features': features.columns, 'chi2': chi2_select.scores_},
    columns=['features', 'chi2'],
)
chi2_df = chi2_df.sort_values('chi2', ascending=False)

卡方检验要求特征取值非负;对于含有负值的特征,可以先将其移除,待其余特征筛选完成后再把它们加回,一起进入下一步处理

查看相关性
# Correlation of every column with the label, strongest positive first
corr_matrix = data.corr()
corr_label = corr_matrix['status'].sort_values(ascending=False)

# Thresholds for feature selection: print only the columns whose correlation
# with 'status' is above p_threshold or below n_threshold.
p_threshold = 0.05
n_threshold = -0.05
for key, val in corr_label.items():
    if not (val > p_threshold or val < n_threshold):
        continue
    print(key, val)
status 1.0
trans_fail_top_count_enum_last_1_month 0.333338602447
history_fail_fee 0.306272727874
loans_overdue_count 0.266961275508
latest_one_month_fail 0.248837102262
rank_trad_1_month 0.146643133764
trans_fail_top_count_enum_last_6_month 0.134436135122
trans_fail_top_count_enum_last_12_month 0.118328333343
top_trans_count_last_1_month 0.117298890096
trans_day_last_12_month 0.0789739277861
avg_price_top_last_12_valid_month 0.0712593317035
latest_six_month_loan 0.0602267749783
latest_query_day 0.0574192009417
query_cash_count 0.0530359685433
low_volume_percent 0.0519110920093
latest_three_month_loan 0.0514113147178
avg_price_last_12_month -0.0511256913739
consfin_max_limit -0.0615505998767
consume_top_time_last_1_month -0.0656096911256
trans_top_time_last_1_month -0.0677936620685
consfin_credit_limit -0.0760572568302
consfin_avg_limit -0.081569312019
latest_one_month_suc -0.131443212351
apply_score -0.232479690992
loans_score -0.245092114728
# Split the data into a training set and a test set (70% / 30%)
y = data['status']
x = data.drop(columns=['status'])
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=2018,
)
print("Training Size:{}".format(X_train.shape))
print('Testing Size:{}'.format(X_test.shape))
Training Size:(3327, 80)
Testing Size:(1427, 80)

IV和WOE详解学习参考:https://blog.csdn.net/kevin7658/article/details/50780391
在二分类问题中,IV值(Information Value)主要用来对输入变量进行编码和预测能力评估

| IV值 | 预测能力 |
| --- | --- |
| < 0.02 | 无用 |
| 0.02 ~ 0.1 | 弱预测 |
| 0.1 ~ 0.3 | 中等预测 |
| 0.3 ~ 0.5 | 强预测 |
| > 0.5 | 可疑 |

WOE(weight of evidence,证据权重),是对原始变量的一种编码形式
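$$WOE_i = \ln\frac{py_i}{pn_i} = \ln\frac{y_i / y_T}{n_i / n_T}$$
其中 $y_i$、$n_i$ 分别为第 $i$ 个分组中响应、未响应的用户数,$y_T$、$n_T$ 分别为全部样本中响应、未响应的用户数。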
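WOE 衡量了"当前分组中响应用户/所有响应用户"和"当前分组中未响应用户/所有未响应用户"的差异。
IV值的计算以WOE为基础,相当于是WOE值的一个加权求和。第 $i$ 个分组的IV值为:
$$IV_i = (py_i - pn_i) \cdot WOE_i = (py_i - pn_i) \cdot \ln\frac{py_i}{pn_i}$$
其中 $py_i$ 为该分组中响应用户占所有响应用户的比例,$pn_i$ 为该分组中未响应用户占所有未响应用户的比例。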
IV在WOE前多乘了一个因子:
1)保证了IV的值不是负数;
2)很好的考虑了分组中样本占整体的比例(比例越低,这个分组对变量整体预测能力的贡献越低)
计算IV值
$$IV = \sum_{i=1}^{n} IV_i$$
其中 $IV_i$ 为第 $i$ 个分组的IV值,$n$ 为分组个数。

import math
import numpy as np
from scipy import stats
from sklearn.utils.multiclass import type_of_target

def woe(X, y, event=1):
    """Compute the information value (IV) of every column of ``X``.

    Columns that ``type_of_target`` classifies as ``'continuous'`` are first
    bucketed with ``discrete``; every other column is used as-is, one bin per
    distinct value. The per-bin WOE mappings produced along the way are not
    returned.

    Args:
        X: DataFrame of features (accessed via ``X.columns`` / ``X[col].values``).
        y: Labels, forwarded unchanged to ``woe_single_x``.
        event: Label value treated as the positive ("event") class.

    Returns:
        dict mapping each column name to its IV.
    """
    # NOTE(review): per the scikit-learn docs, type_of_target reports
    # 'continuous' only for float data containing non-integer values, so
    # integer-valued columns appear to skip binning — confirm this is intended.
    iv_dict = {}
    for feature in X.columns:
        values = X[feature].values
        if type_of_target(values) == 'continuous':
            values = discrete(values)
        _, iv_dict[feature] = woe_single_x(values, y, feature, event)
    return iv_dict
        
def discrete(x, n_bins=5):
    """Bucket a numeric array into ``n_bins`` percentile-based bins.

    The cut points are the 0th, (100/n_bins)th, ..., 100th percentiles of
    ``x``. Each bin is closed on both sides and bins are assigned in ascending
    order, so a value sitting exactly on a shared cut point ends up in the
    higher of the two bins.

    Args:
        x: Array-like of numeric values.
        n_bins: Number of bins; the default of 5 gives quintiles.

    Returns:
        Float ndarray with the same shape as ``x`` holding bin labels
        ``1..n_bins``.

    Raises:
        ValueError: If ``n_bins`` is smaller than 1.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be a positive integer")
    x = np.asarray(x)
    res = np.zeros(x.shape)
    if x.size == 0:
        return res
    # Compute every cut point once instead of twice per bin.
    edges = [stats.scoreatpercentile(x, p)
             for p in np.linspace(0, 100, n_bins + 1)]
    for i in range(n_bins):
        # A direct range mask selects the same elements as the former
        # np.in1d round trip, without the deprecated call or the extra scan.
        mask = (x >= edges[i]) & (x <= edges[i + 1])
        res[mask] = i + 1    # values inside bin i are labelled i + 1
    return res

def woe_single_x(x, y, feature,event = 1):
    """Compute the per-bin WOE and the total IV of one discrete feature.

    Rows of ``x`` and ``y`` are matched by position, so ``y`` may be a pandas
    Series with an arbitrary (e.g. shuffled) index.

    Args:
        x: 1-D array of bin labels / category values, one per sample.
        y: Labels aligned with ``x`` by position (Series or array-like).
        feature: Name of the feature; only used in error messages.
        event: Label value treated as the positive ("event") class.

    Returns:
        Tuple ``(woe_dict, iv)`` where ``woe_dict`` maps each distinct value
        of ``x`` to its WOE and ``iv`` is the feature's information value.
        Empty input yields ``({}, 0)``.

    Raises:
        ValueError: If ``y`` does not contain both event and non-event rows,
            in which case WOE/IV are undefined.
    """
    x = np.asarray(x)
    labels = np.asarray(y)
    if labels.size == 0:
        return {}, 0

    is_event = labels == event
    event_total = int(np.count_nonzero(is_event))
    non_event_total = labels.size - event_total
    if event_total == 0 or non_event_total == 0:
        raise ValueError(
            "cannot compute WOE/IV for {!r}: y must contain both event and "
            "non-event samples".format(feature)
        )

    iv = 0
    woe_dict = {}
    for x1 in set(x):    # iterate over the bins
        # Select rows by position with a boolean mask. Label-based lookup
        # (reindex) would pick the wrong rows once y's index is shuffled.
        in_bin = x == x1
        bin_size = int(np.count_nonzero(in_bin))
        if bin_size == 0:
            # Only happens for keys that never compare equal (NaN).
            continue
        event_count = int(np.count_nonzero(is_event & in_bin))
        non_event_count = bin_size - event_count
        rate_event = event_count / event_total
        rate_non_event = non_event_count / non_event_total

        # Replace a zero rate with a small constant so the log stays finite.
        if rate_event == 0:
            rate_event = 0.0001
        elif rate_non_event == 0:
            rate_non_event = 0.0001
        woei = math.log(rate_event / rate_non_event)
        woe_dict[x1] = woei
        iv += (rate_event - rate_non_event) * woei
    return woe_dict, iv
# Feature selection by IV: score every training feature, then rank the
# (feature, IV) pairs from highest to lowest IV.
iv_dict = woe(X_train, y_train)
iv = sorted(
    iv_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
iv
[('historical_trans_amount', 2.6609646134512865),
 ('trans_amount_3_month', 2.5546436077538357),
 ('repayment_capability', 2.3272292519672519),
 ('pawns_auctions_trusts_consume_last_6_month', 2.2207773896414862),
 ('first_transaction_day', 2.1632919045099226),
 ('first_transaction_time', 2.1632919045099208),
 ('abs', 1.966985825643712),
 ('consfin_avg_limit', 1.692832469252038),
 ('loans_avg_limit', 1.4613631156793805),
 ('max_cumulative_consume_later_1_month', 1.4598660465564153),
 ('consume_mini_time_last_1_month', 1.3787845037197419),
 ('historical_trans_day', 1.1181905151873197),
 ('consfin_credit_limit', 0.87117239524244494),
 ('pawns_auctions_trusts_consume_last_1_month', 0.8530625616084101),
 ('avg_price_last_12_month', 0.7281431950917352),
 ('loans_score', 0.66248415979820885),
 ('loans_latest_day', 0.63015158005847938),
 ('apply_score', 0.62779816536133459),
 ('history_suc_fee', 0.51153872259602406),
 ('latest_query_day', 0.4918927409937115),
 ('trans_days_interval_filter', 0.48819446807893713),
 ('loans_long_time', 0.48029141695051775),
 ('loans_count', 0.47950109452443113),
 ('trans_top_time_last_6_month', 0.46125407327719448),
 ('consfin_max_limit', 0.4486198753883251),
 ('history_fail_fee', 0.44575826921280298),
 ('take_amount_in_later_12_month_highest', 0.4407207265219969),
 ('consume_top_time_last_6_month', 0.4158480097512699),
 ('loans_settle_count', 0.40804854630249232),
 ('trans_days_interval', 0.39158870373595028),
 ('loans_max_limit', 0.38328027252467706),
 ('trans_day_last_12_month', 0.37280971925428658),
 ('query_sum_count', 0.36655641812361395),
 ('number_of_trans_from_2011', 0.35116979855985458),
 ('latest_six_month_apply', 0.35015468562891283),
 ('latest_six_month_loan', 0.34722084728410074),
 ('loans_product_count', 0.32397462359606016),
 ('latest_three_month_apply', 0.31636627681852769),
 ('loans_credit_limit', 0.29628915057605282),
 ('apply_credibility', 0.29565953570477754),
 ('loans_org_count_behavior', 0.29323686235094837),
 ('query_org_count', 0.29227035471616658),
 ('transd_mcc', 0.29112368987904647),
 ('latest_one_month_apply', 0.28723141233549943),
 ('latest_three_month_loan', 0.28041640099961634),
 ('loans_credibility_limit', 0.27812326660847181),
 ('consfin_org_count_behavior', 0.25997712592137667),
 ('consfin_org_count_current', 0.25997712592137667),
 ('loans_credibility_behavior', 0.25684177248722784),
 ('loans_cash_count', 0.25639224912498743),
 ('loans_org_count_current', 0.25639224912498743),
 ('max_consume_count_later_6_month', 0.25330625624264236),
 ('trans_top_time_last_1_month', 0.25278744961596039),
 ('loans_overdue_count', 0.25244469195399577),
 ('consfin_credibility', 0.25212673276061065),
 ('consume_top_time_last_1_month', 0.25190545938772502),
 ('query_cash_count', 0.2505945564771927),
 ('latest_one_month_fail', 0.25041575086113804),
 ('trans_fail_top_count_enum_last_12_month', 0.2484962281097749),
 ('consfin_product_count', 0.24840677004038769),
 ('trans_fail_top_count_enum_last_6_month', 0.24459065905835767),
 ('latest_one_month_loan', 0.24357719668455058),
 ('cross_consume_count_last_1_month', 0.24338410524382642),
 ('avg_consume_less_12_valid_month', 0.24153027060850193),
 ('query_finance_count', 0.24100208671673942),
 ('reg_preference_for_trad', 0.23906447403181369),
 ('trans_fail_top_count_enum_last_1_month', 0.23519543933090667),
 ('railway_consume_count_last_12_month', 0.23504608134223881),
 ('latest_one_month_suc', 0.23313434057293608),
 ('trans_activity_day', 0.23168003248070962),
 ('low_volume_percent', 0.22857880126543986),
 ('avg_price_top_last_12_valid_month', 0.22804418697211443),
 ('jewelry_consume_count_last_6_month', 0.22615342289100443),
 ('middle_volume_percent', 0.22570726622757747),
 ('trans_amount_increase_rate_lately', 0.22543255469737311),
 ('regional_mobility', 0.22209882955283974),
 ('rank_trad_1_month', 0.22174758276579848),
 ('trans_activity_month', 0.22020446065315732),
 ('top_trans_count_last_1_month', 0.21999826689565763),
 ('is_high_user', 0.21752150441707879)]
def iv_choose(iv, lower=0.02, upper=0.6):
    """Return the features whose IV lies strictly between two bounds.

    Args:
        iv: Mapping of feature name to information value.
        lower: Exclusive lower bound; features at or below it are dropped.
        upper: Exclusive upper bound; features at or above it are dropped.

    Returns:
        List of feature names, in the iteration order of ``iv``.
    """
    return [name for name, value in iv.items() if lower < value < upper]


# Filter the features by the IV scores computed on the training set,
# then show how many were kept and which ones.
iv_list = iv_choose(iv_dict)
print(len(iv_list))
iv_list
62





['low_volume_percent',
 'middle_volume_percent',
 'take_amount_in_later_12_month_highest',
 'trans_amount_increase_rate_lately',
 'trans_activity_month',
 'trans_activity_day',
 'transd_mcc',
 'trans_days_interval_filter',
 'trans_days_interval',
 'regional_mobility',
 'is_high_user',
 'number_of_trans_from_2011',
 'rank_trad_1_month',
 'avg_consume_less_12_valid_month',
 'top_trans_count_last_1_month',
 'avg_price_top_last_12_valid_month',
 'reg_preference_for_trad',
 'trans_top_time_last_1_month',
 'trans_top_time_last_6_month',
 'consume_top_time_last_1_month',
 'consume_top_time_last_6_month',
 'cross_consume_count_last_1_month',
 'trans_fail_top_count_enum_last_1_month',
 'trans_fail_top_count_enum_last_6_month',
 'trans_fail_top_count_enum_last_12_month',
 'max_consume_count_later_6_month',
 'railway_consume_count_last_12_month',
 'jewelry_consume_count_last_6_month',
 'trans_day_last_12_month',
 'apply_credibility',
 'query_org_count',
 'query_finance_count',
 'query_cash_count',
 'query_sum_count',
 'latest_one_month_apply',
 'latest_three_month_apply',
 'latest_six_month_apply',
 'loans_credibility_behavior',
 'loans_count',
 'loans_settle_count',
 'loans_overdue_count',
 'loans_org_count_behavior',
 'consfin_org_count_behavior',
 'loans_cash_count',
 'latest_one_month_loan',
 'latest_three_month_loan',
 'latest_six_month_loan',
 'history_suc_fee',
 'history_fail_fee',
 'latest_one_month_suc',
 'latest_one_month_fail',
 'loans_long_time',
 'loans_credit_limit',
 'loans_credibility_limit',
 'loans_org_count_current',
 'loans_product_count',
 'loans_max_limit',
 'consfin_credibility',
 'consfin_org_count_current',
 'consfin_product_count',
 'consfin_max_limit',
 'latest_query_day']

随机森林选取特征

from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split and rank the features by the
# importance the fitted model reports, highest first.
rfc = RandomForestClassifier(random_state=2018).fit(X_train, y_train)
importance = pd.Series(
    data=rfc.feature_importances_,
    index=x.columns,
).sort_values(ascending=False)
importance
trans_fail_top_count_enum_last_1_month        0.067599
history_fail_fee                              0.041441
loans_score                                   0.033226
apply_score                                   0.024346
trans_fail_top_count_enum_last_6_month        0.023965
trans_day_last_12_month                       0.021230
trans_amount_3_month                          0.020492
loans_overdue_count                           0.019551
first_transaction_time                        0.018474
abs                                           0.018429
latest_one_month_fail                         0.017917
historical_trans_day                          0.017571
trans_amount_increase_rate_lately             0.015686
max_cumulative_consume_later_1_month          0.015285
latest_query_day                              0.014965
pawns_auctions_trusts_consume_last_6_month    0.014888
loans_count                                   0.014848
loans_avg_limit                               0.014819
avg_price_last_12_month                       0.014458
loans_latest_day                              0.014370
first_transaction_day                         0.014048
consfin_max_limit                             0.013881
loans_long_time                               0.013819
number_of_trans_from_2011                     0.013500
trans_activity_month                          0.013334
trans_days_interval_filter                    0.013190
loans_max_limit                               0.012967
loans_settle_count                            0.012967
trans_days_interval                           0.012737
middle_volume_percent                         0.012705
                                                ...   
consfin_org_count_current                     0.009565
rank_trad_1_month                             0.009436
loans_product_count                           0.009289
pawns_auctions_trusts_consume_last_1_month    0.009274
query_org_count                               0.009239
loans_cash_count                              0.009040
loans_credibility_limit                       0.008462
max_consume_count_later_6_month               0.008326
query_finance_count                           0.008000
consume_top_time_last_1_month                 0.007986
loans_org_count_behavior                      0.007921
loans_credibility_behavior                    0.007709
latest_three_month_loan                       0.007701
latest_one_month_suc                          0.007681
low_volume_percent                            0.007610
query_cash_count                              0.007516
top_trans_count_last_1_month                  0.007101
consfin_product_count                         0.007084
loans_org_count_current                       0.006887
consfin_org_count_behavior                    0.006272
latest_six_month_loan                         0.006270
latest_one_month_apply                        0.006140
avg_consume_less_12_valid_month               0.006009
regional_mobility                             0.005574
cross_consume_count_last_1_month              0.004980
latest_one_month_loan                         0.002922
reg_preference_for_trad                       0.002207
railway_consume_count_last_12_month           0.000141
jewelry_consume_count_last_6_month            0.000138
is_high_user                                  0.000000
dtype: float64
len(importance)
80
# Keep only the features whose score exceeds a given threshold
def select_features(scores,attributes, threshold = 0.01):
    '''
    Pair each score with its attribute and select by threshold.

    scores: iterable of importance scores
    attributes: iterable of feature names, in the same order as scores
    threshold: a feature is kept only if its score is strictly greater
    return: (names of the kept features in descending score order,
             list of all (score, feature) pairs sorted in descending order)
    '''
    ranked = sorted(zip(scores, attributes), reverse=True)
    kept = [name for value, name in ranked if value > threshold]
    return kept, ranked
# `importance` has already been sorted by score, so each score must be paired
# with the name stored in its own index. Zipping it with data.columns would
# use the original column order (which also contains 'status') and attach the
# scores to the wrong features.
features_seleted,feature_score_sorted = select_features(importance.values, importance.index)
# Show the (score, feature) pairs ranked by importance
feature_score_sorted
[(0.06759857529152126, 'low_volume_percent'),
 (0.04144132375795051, 'middle_volume_percent'),
 (0.033226341812976498, 'take_amount_in_later_12_month_highest'),
 (0.024346147281823001, 'trans_amount_increase_rate_lately'),
 (0.02396461689852037, 'trans_activity_month'),
 (0.021229800352657799, 'trans_activity_day'),
 (0.020492073581023017, 'transd_mcc'),
 (0.019551465016905793, 'trans_days_interval_filter'),
 (0.018473500098720008, 'trans_days_interval'),
 (0.018429471987365054, 'regional_mobility'),
 (0.017917067189165657, 'repayment_capability'),
 (0.01757101867594172, 'is_high_user'),
 (0.015685719822459497, 'number_of_trans_from_2011'),
 (0.015285233399844795, 'first_transaction_time'),
 (0.014965346331371771, 'historical_trans_amount'),
 (0.014887904705946475, 'historical_trans_day'),
 (0.014848478987436912, 'rank_trad_1_month'),
 (0.014818943757584562, 'trans_amount_3_month'),
 (0.014458491945750176, 'avg_consume_less_12_valid_month'),
 (0.014369872836001269, 'abs'),
 (0.014048245049498597, 'top_trans_count_last_1_month'),
 (0.013881305649718716, 'avg_price_last_12_month'),
 (0.013818585375387446, 'avg_price_top_last_12_valid_month'),
 (0.013499605784536378, 'reg_preference_for_trad'),
 (0.01333403483971948, 'trans_top_time_last_1_month'),
 (0.013189673621842641, 'trans_top_time_last_6_month'),
 (0.012967350610280825, 'consume_top_time_last_1_month'),
 (0.012967031812736776, 'consume_top_time_last_6_month'),
 (0.012737321621440773, 'cross_consume_count_last_1_month'),
 (0.012704743549929004, 'trans_fail_top_count_enum_last_1_month'),
 (0.012670739873021739, 'trans_fail_top_count_enum_last_6_month'),
 (0.012634942017596049, 'trans_fail_top_count_enum_last_12_month'),
 (0.012547159590595519, 'consume_mini_time_last_1_month'),
 (0.012269127921896999, 'max_cumulative_consume_later_1_month'),
 (0.012251014193021116, 'max_consume_count_later_6_month'),
 (0.012130445200419506, 'railway_consume_count_last_12_month'),
 (0.012034854528026892, 'pawns_auctions_trusts_consume_last_1_month'),
 (0.011854647991772738, 'pawns_auctions_trusts_consume_last_6_month'),
 (0.011706539361219558, 'jewelry_consume_count_last_6_month'),
 (0.01166026942923009, 'status'),
 (0.011377310777428085, 'first_transaction_day'),
 (0.011259220840075928, 'trans_day_last_12_month'),
 (0.011153859215878038, 'apply_score'),
 (0.010988313523754335, 'apply_credibility'),
 (0.010748172884177704, 'query_org_count'),
 (0.010443991633735872, 'query_finance_count'),
 (0.010106738553680085, 'query_cash_count'),
 (0.0097070363548293296, 'query_sum_count'),
 (0.0096860508706849251, 'latest_one_month_apply'),
 (0.0095788427733895475, 'latest_three_month_apply'),
 (0.0095651360382182811, 'latest_six_month_apply'),
 (0.009435867222090371, 'loans_score'),
 (0.0092886391460863804, 'loans_credibility_behavior'),
 (0.0092742649699127776, 'loans_count'),
 (0.0092388186178661829, 'loans_settle_count'),
 (0.009040150558181137, 'loans_overdue_count'),
 (0.008461666363636015, 'loans_org_count_behavior'),
 (0.0083264175951640769, 'consfin_org_count_behavior'),
 (0.00799967023354658, 'loans_cash_count'),
 (0.0079861813469583494, 'latest_one_month_loan'),
 (0.0079209175329646251, 'latest_three_month_loan'),
 (0.0077087109921875323, 'latest_six_month_loan'),
 (0.0077013706125157568, 'history_suc_fee'),
 (0.0076812466427810602, 'history_fail_fee'),
 (0.0076104715634482956, 'latest_one_month_suc'),
 (0.0075162527300091219, 'latest_one_month_fail'),
 (0.0071011562089886267, 'loans_long_time'),
 (0.0070837738212370079, 'loans_credit_limit'),
 (0.0068867085612011308, 'loans_credibility_limit'),
 (0.0062722923476042358, 'loans_org_count_current'),
 (0.0062699820118575483, 'loans_product_count'),
 (0.0061403277931418362, 'loans_max_limit'),
 (0.0060089335095311162, 'loans_avg_limit'),
 (0.0055742146918062018, 'consfin_credit_limit'),
 (0.004979817893285372, 'consfin_credibility'),
 (0.0029223544733663646, 'consfin_org_count_current'),
 (0.0022066949392423817, 'consfin_product_count'),
 (0.00014142014941099571, 'consfin_max_limit'),
 (0.00013797225326978062, 'consfin_avg_limit'),
 (0.0, 'latest_query_day')]
# Final selected features
# NOTE(review): if features_seleted contains 'status', the concat below adds
# the label column a second time — confirm the selection excludes it.
new_features = data[features_seleted]
# The 0.01 in the message is hard-coded; it matches select_features' default threshold.
print('经过随机森林特征选择后,当阈值为{}时,最后保留的特征数为{}个'.format(0.01,new_features.shape[1]))
# Re-attach the label column and write the result to CSV without the index
task3_dataset = pd.concat([new_features,y],axis=1)
task3_dataset.to_csv('task2_proc.csv',index = False)
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值