数据挖掘 —— 金融数据（二）

最新推荐文章于 2023-12-22 02:55:43 发布

qmys

最新推荐文章于 2023-12-22 02:55:43 发布

阅读量793

点赞数

分类专栏：数据挖掘

本文链接：https://blog.csdn.net/mys_mys/article/details/98970161

版权

数据挖掘专栏收录该内容

6 篇文章 1 订阅

订阅专栏

特征工程
任务二
特征衍生
特征挑选：分别用IV值和随机森林等进行特征选择
……以及你能想到的其他特征工程处理

# 导入需要的包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
# 忽略警告
import warnings
warnings.filterwarnings('ignore')

# Read in the new, already-preprocessed data
data = pd.read_csv(r'new_data.csv')
# Preview the first 10 rows
data.head(10)

	Unnamed: 0	low_volume_percent	middle_volume_percent	take_amount_in_later_12_month_highest	trans_amount_increase_rate_lately	trans_activity_month	trans_activity_day	transd_mcc	trans_days_interval_filter	trans_days_interval	...	loans_max_limit	loans_avg_limit	consfin_credit_limit	consfin_credibility	consfin_org_count_current	consfin_product_count	consfin_max_limit	consfin_avg_limit	latest_query_day	loans_latest_day
0	0	0.01	0.99	0	0.90	0.55	0.313	17.0	27.0	26.0	...	2900.0	1688.0	1200.0	75.0	1.0	2.0	1200.0	1200.0	12.0	18.0
1	1	0.02	0.94	2000	1.28	1.00	0.458	19.0	30.0	14.0	...	3500.0	1758.0	15100.0	80.0	5.0	6.0	22800.0	9360.0	4.0	2.0
2	2	0.04	0.96	0	1.00	1.00	0.114	13.0	68.0	22.0	...	1600.0	1250.0	4200.0	87.0	1.0	1.0	4200.0	4200.0	2.0	6.0
3	3	0.00	0.96	2000	0.13	0.57	0.777	22.0	14.0	6.0	...	3200.0	1541.0	16300.0	80.0	5.0	5.0	30000.0	12180.0	2.0	4.0
4	4	0.01	0.99	0	0.46	1.00	0.175	13.0	66.0	42.0	...	2300.0	1630.0	8300.0	79.0	2.0	2.0	8400.0	8250.0	22.0	120.0
5	5	0.02	0.98	2000	7.59	1.00	0.733	27.0	8.0	11.0	...	5300.0	1941.0	11200.0	80.0	10.0	12.0	20400.0	8130.0	3.0	4.0
6	6	0.02	0.98	0	23.67	0.94	0.087	10.0	54.0	53.0	...	2200.0	2200.0	7600.0	73.0	2.0	2.0	16800.0	8900.0	1.0	3.0
7	7	0.02	0.98	0	0.25	0.88	0.302	19.0	20.0	20.0	...	2300.0	0.0	0.0	78.0	3.0	3.0	20400.0	0.0	23.0	4.0
8	8	0.03	0.65	0	0.31	0.76	0.472	15.0	21.0	14.0	...	5300.0	4750.0	5500.0	79.0	8.0	11.0	19200.0	7987.0	24.0	7.0
9	9	0.01	0.99	500	0.80	1.00	0.088	15.0	36.0	35.0	...	2800.0	1520.0	0.0	0.0	0.0	0.0	0.0	0.0	18.0	142.0

10 rows × 82 columns

# Drop the 'Unnamed: 0' column (presumably a row index saved with the CSV)
data = data.drop(columns=['Unnamed: 0'])

# Separate the 'status' column from the remaining predictor columns
features = data.drop(columns=['status'])
label = data['status']

方差筛选

from sklearn.feature_selection import VarianceThreshold

# Fit a zero-threshold variance filter in order to read each feature's variance
variance_select = VarianceThreshold(threshold=0).fit(features)
variances = variance_select.variances_
variances_df = pd.DataFrame(
    {'features': features.columns, 'variance': variances},
    columns=['features', 'variance'],
)

阈值的选择需要根据模型效果来判断，一般先移除方差接近0的特征，再使用其他方法进一步筛选

# Display the per-feature variance table
variances_df

	features	variance
0	low_volume_percent	1.723472e-03
1	middle_volume_percent	2.097334e-02
2	take_amount_in_later_12_month_highest	1.539431e+07
3	trans_amount_increase_rate_lately	4.814812e+05
4	trans_activity_month	3.876929e-02
5	trans_activity_day	2.895997e-02
6	transd_mcc	2.001861e+01
7	trans_days_interval_filter	5.157110e+02
8	trans_days_interval	2.712707e+02
9	regional_mobility	7.922850e-01
10	repayment_capability	2.726128e+09
11	is_high_user	1.102422e-02
12	number_of_trans_from_2011	1.011116e+02
13	first_transaction_time	2.190825e+08
14	historical_trans_amount	1.026942e+11
15	historical_trans_day	9.931745e+03
16	rank_trad_1_month	6.964543e-02
17	trans_amount_3_month	1.035010e+10
18	avg_consume_less_12_valid_month	1.932965e+00
19	abs	7.292569e+08
20	top_trans_count_last_1_month	1.228786e-01
21	avg_price_last_12_month	5.864391e+05
22	avg_price_top_last_12_valid_month	9.861509e-03
23	reg_preference_for_trad	7.628764e-01
24	trans_top_time_last_1_month	2.831578e+01
25	trans_top_time_last_6_month	1.677847e+02
26	consume_top_time_last_1_month	2.979555e+01
27	consume_top_time_last_6_month	1.720437e+02
28	cross_consume_count_last_1_month	5.031203e+00
29	trans_fail_top_count_enum_last_1_month	3.640022e+00
...	...	...
50	loans_score	3.624338e+03
51	loans_credibility_behavior	4.687256e+00
52	loans_count	5.847206e+02
53	loans_settle_count	4.510908e+02
54	loans_overdue_count	9.629793e+00
55	loans_org_count_behavior	5.400224e+01
56	consfin_org_count_behavior	8.469347e+00
57	loans_cash_count	2.860550e+01
58	latest_one_month_loan	2.151156e+00
59	latest_three_month_loan	1.166045e+01
60	latest_six_month_loan	1.156775e+02
61	history_suc_fee	8.684877e+02
62	history_fail_fee	6.083847e+02
63	latest_one_month_suc	3.633371e+00
64	latest_one_month_fail	1.431061e+01
65	loans_long_time	1.235436e+03
66	loans_credit_limit	4.711130e+05
67	loans_credibility_limit	1.106184e+02
68	loans_org_count_current	2.860550e+01
69	loans_product_count	3.298047e+01
70	loans_max_limit	2.106647e+06
71	loans_avg_limit	5.131267e+05
72	consfin_credit_limit	5.587290e+07
73	consfin_credibility	1.982972e+02
74	consfin_org_count_current	8.469347e+00
75	consfin_product_count	1.118530e+01
76	consfin_max_limit	1.927557e+08
77	consfin_avg_limit	3.398962e+07
78	latest_query_day	1.331995e+03
79	loans_latest_day	2.834898e+03

80 rows × 2 columns

卡方检验

from sklearn.feature_selection import SelectKBest, chi2

# Inspection of the data showed that only latest_query_day and
# loans_latest_day contain a few negative values; chi2 needs non-negative
# input, so those values are floored at 0.
# A single .loc assignment is used because chained indexing
# (features[col][mask] = 0) may write to a temporary copy and leave
# `features` unchanged.
for col in ('latest_query_day', 'loans_latest_day'):
    features.loc[features[col] < 0, col] = 0

# Set k to the number of features so nothing is dropped here; features are
# then chosen by looking at the p-values and scores.
chi2_select = SelectKBest(chi2, k=len(features.columns))
chi2_select.fit(features, label)
# The dict key must match the requested column name 'chi2'; a mismatching
# key yields a column filled with NaN.
chi2_df = pd.DataFrame({'features': features.columns, 'chi2': chi2_select.scores_}, columns=['features', 'chi2'])
chi2_df = chi2_df.sort_values('chi2', ascending=False)

卡方检验要求特征取值非负；含有负值的特征可以先移除，待特征选择完成后再与选出的特征合并，进入下一步处理

查看相关性

# Correlation of every column with 'status', largest value first
corr_matrix = data.corr()
corr_label = corr_matrix['status'].sort_values(ascending=False)

# Thresholds for the selection: only columns whose correlation is above
# p_threshold or below n_threshold are printed
p_threshold = 0.05
n_threshold = -0.05
for key, val in corr_label.items():
    if not (val > p_threshold or val < n_threshold):
        continue
    print(key, val)

status 1.0
trans_fail_top_count_enum_last_1_month 0.333338602447
history_fail_fee 0.306272727874
loans_overdue_count 0.266961275508
latest_one_month_fail 0.248837102262
rank_trad_1_month 0.146643133764
trans_fail_top_count_enum_last_6_month 0.134436135122
trans_fail_top_count_enum_last_12_month 0.118328333343
top_trans_count_last_1_month 0.117298890096
trans_day_last_12_month 0.0789739277861
avg_price_top_last_12_valid_month 0.0712593317035
latest_six_month_loan 0.0602267749783
latest_query_day 0.0574192009417
query_cash_count 0.0530359685433
low_volume_percent 0.0519110920093
latest_three_month_loan 0.0514113147178
avg_price_last_12_month -0.0511256913739
consfin_max_limit -0.0615505998767
consume_top_time_last_1_month -0.0656096911256
trans_top_time_last_1_month -0.0677936620685
consfin_credit_limit -0.0760572568302
consfin_avg_limit -0.081569312019
latest_one_month_suc -0.131443212351
apply_score -0.232479690992
loans_score -0.245092114728

# Split into training and test sets (30% held out, fixed random_state)
x = data.drop(columns=['status'])
y = data['status']
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=2018
)
for template, part in (("Training Size:{}", X_train), ('Testing Size:{}', X_test)):
    print(template.format(part.shape))

Training Size:(3327, 80)
Testing Size:(1427, 80)

IV和WOE详解学习参考：https://blog.csdn.net/kevin7658/article/details/50780391
在二分类问题中，IV值（Information Value）主要用来对输入变量进行编码和预测能力评估

IV值	预测能力
< 0.02	无用
0.02 ~ 0.1	弱预测
0.1 ~ 0.3	中等预测
0.3 ~ 0.5	强预测
> 0.5	可疑

WOE（weight of evidence，证据权重），是对原始变量的一种编码形式
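$$WOE_i = \ln\left(\frac{py_i}{pn_i}\right) = \ln\left(\frac{y_i / y_T}{n_i / n_T}\right)$$
其中 $y_i$、$n_i$ 分别是第 $i$ 个分组中的响应用户数和未响应用户数，$y_T$、$n_T$ 分别是全部响应用户数和全部未响应用户数。
WOE衡量了"当前分组中响应用户/所有响应用户"和"当前分组中未响应用户/所有未响应用户"的差异。IV值的计算以WOE为基础，相当于是WOE值的一个加权求和。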

IV在WOE前多乘了一个因子:
1）保证了IV的值不是负数;
2）很好的考虑了分组中样本占整体的比例（比例越低，这个分组对变量整体预测能力的贡献越低）
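计算IV值
$$IV = \sum_{i=1}^{n} (py_i - pn_i) \times WOE_i = \sum_{i=1}^{n} (py_i - pn_i) \ln\left(\frac{py_i}{pn_i}\right)$$
其中 $py_i$ 为第 $i$ 个分组中响应用户占全部响应用户的比例，$pn_i$ 为第 $i$ 个分组中未响应用户占全部未响应用户的比例。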

import math
import numpy as np
from scipy import stats
from sklearn.utils.multiclass import type_of_target

def woe(X, y, event=1):
    """Compute the information value (IV) of every column of ``X``.

    Columns that ``type_of_target`` classifies as ``'continuous'`` are first
    bucketed with ``discrete``; every other column is passed on unchanged, so
    each of its distinct values forms its own group.

    Args:
        X: DataFrame of features (its ``columns`` are iterated).
        y: Labels for the rows of ``X``.
        event: Label value treated as the event (positive) class.

    Returns:
        dict mapping each column name to the IV returned by ``woe_single_x``.
    """
    iv_dict = {}
    for feature in X.columns:
        x = X[feature].values
        # 1) Discretize continuous features
        if type_of_target(x) == 'continuous':
            x = discrete(x)
        # 2) Compute the feature's WOE and IV; only the IV is kept because
        #    the per-group WOE mapping is not part of the return value
        _, iv = woe_single_x(x, y, feature, event)
        iv_dict[feature] = iv

    return iv_dict
        
def discrete(x):
    """Bucket a 1-D numeric array into five percentile-based groups.

    The bucket edges are the 0th, 20th, 40th, 60th, 80th and 100th
    percentiles of ``x``. Values between edge ``i`` and edge ``i + 1``
    (both inclusive) are labelled ``i + 1``.

    Args:
        x: 1-D NumPy array of numeric values.

    Returns:
        Float array with the same shape as ``x`` holding the bucket label of
        every element; elements that match no bucket keep the initial 0.
    """
    # All six edges in one call instead of two percentile calls per bucket
    edges = stats.scoreatpercentile(x, [0, 20, 40, 60, 80, 100])
    res = np.zeros(x.shape)
    for i in range(5):
        # The comparison itself is the membership mask, so no extra
        # np.in1d pass over the data is needed. Buckets are filled in
        # ascending order, so a value sitting exactly on a shared edge ends
        # up in the higher bucket.
        res[(x >= edges[i]) & (x <= edges[i + 1])] = i + 1
    return res

def woe_single_x(x, y, feature, event=1):
    """Compute the WOE of every group of one feature and the feature's IV.

    Args:
        x: 1-D array of feature values; each distinct value is one group.
        y: Labels aligned position-by-position with ``x`` (pandas Series or
            array-like). Index labels of a Series are ignored.
        feature: Column name; kept for interface compatibility, not used.
        event: Label value treated as the event (positive) class.

    Returns:
        Tuple ``(woe_dict, iv)``: ``woe_dict`` maps each group value to its
        WOE, ``iv`` is the sum of ``(rate_event - rate_non_event) * WOE``
        over all groups.

    Raises:
        ValueError: if ``y`` contains no events or no non-events, in which
            case the rates are undefined.
    """
    x = np.asarray(x)
    # Select by position, not by index label: after a shuffled train/test
    # split the labels of ``y`` no longer coincide with the positions
    # obtained from comparing ``x``, so ``y.reindex(positions)`` would pick
    # the wrong rows and introduce NaN.
    is_event = np.asarray(y) == event
    event_total = int(is_event.sum())
    non_event_total = is_event.shape[0] - event_total
    if event_total == 0 or non_event_total == 0:
        raise ValueError("y must contain both event and non-event samples")

    iv = 0
    woe_dict = {}
    for x1 in np.unique(x):    # iterate over the groups
        in_group = x == x1
        event_count = int(is_event[in_group].sum())
        non_event_count = int(in_group.sum()) - event_count
        rate_event = event_count / event_total
        rate_non_event = non_event_count / non_event_total

        # Replace a zero rate with a small constant so the logarithm stays
        # finite. The two checks are independent so that an empty group
        # (e.g. a NaN value, which equals nothing) cannot divide by zero.
        if rate_event == 0:
            rate_event = 0.0001
        if rate_non_event == 0:
            rate_non_event = 0.0001
        woei = math.log(rate_event / rate_non_event)
        woe_dict[x1] = woei
        iv += (rate_event - rate_non_event) * woei
    return woe_dict, iv

# Feature selection by IV: compute the IV of every training feature and
# list the (feature, IV) pairs from highest to lowest
iv_dict = woe(X_train, y_train)
iv = list(iv_dict.items())
iv.sort(key=lambda item: item[1], reverse=True)
iv

[('historical_trans_amount', 2.6609646134512865),
 ('trans_amount_3_month', 2.5546436077538357),
 ('repayment_capability', 2.3272292519672519),
 ('pawns_auctions_trusts_consume_last_6_month', 2.2207773896414862),
 ('first_transaction_day', 2.1632919045099226),
 ('first_transaction_time', 2.1632919045099208),
 ('abs', 1.966985825643712),
 ('consfin_avg_limit', 1.692832469252038),
 ('loans_avg_limit', 1.4613631156793805),
 ('max_cumulative_consume_later_1_month', 1.4598660465564153),
 ('consume_mini_time_last_1_month', 1.3787845037197419),
 ('historical_trans_day', 1.1181905151873197),
 ('consfin_credit_limit', 0.87117239524244494),
 ('pawns_auctions_trusts_consume_last_1_month', 0.8530625616084101),
 ('avg_price_last_12_month', 0.7281431950917352),
 ('loans_score', 0.66248415979820885),
 ('loans_latest_day', 0.63015158005847938),
 ('apply_score', 0.62779816536133459),
 ('history_suc_fee', 0.51153872259602406),
 ('latest_query_day', 0.4918927409937115),
 ('trans_days_interval_filter', 0.48819446807893713),
 ('loans_long_time', 0.48029141695051775),
 ('loans_count', 0.47950109452443113),
 ('trans_top_time_last_6_month', 0.46125407327719448),
 ('consfin_max_limit', 0.4486198753883251),
 ('history_fail_fee', 0.44575826921280298),
 ('take_amount_in_later_12_month_highest', 0.4407207265219969),
 ('consume_top_time_last_6_month', 0.4158480097512699),
 ('loans_settle_count', 0.40804854630249232),
 ('trans_days_interval', 0.39158870373595028),
 ('loans_max_limit', 0.38328027252467706),
 ('trans_day_last_12_month', 0.37280971925428658),
 ('query_sum_count', 0.36655641812361395),
 ('number_of_trans_from_2011', 0.35116979855985458),
 ('latest_six_month_apply', 0.35015468562891283),
 ('latest_six_month_loan', 0.34722084728410074),
 ('loans_product_count', 0.32397462359606016),
 ('latest_three_month_apply', 0.31636627681852769),
 ('loans_credit_limit', 0.29628915057605282),
 ('apply_credibility', 0.29565953570477754),
 ('loans_org_count_behavior', 0.29323686235094837),
 ('query_org_count', 0.29227035471616658),
 ('transd_mcc', 0.29112368987904647),
 ('latest_one_month_apply', 0.28723141233549943),
 ('latest_three_month_loan', 0.28041640099961634),
 ('loans_credibility_limit', 0.27812326660847181),
 ('consfin_org_count_behavior', 0.25997712592137667),
 ('consfin_org_count_current', 0.25997712592137667),
 ('loans_credibility_behavior', 0.25684177248722784),
 ('loans_cash_count', 0.25639224912498743),
 ('loans_org_count_current', 0.25639224912498743),
 ('max_consume_count_later_6_month', 0.25330625624264236),
 ('trans_top_time_last_1_month', 0.25278744961596039),
 ('loans_overdue_count', 0.25244469195399577),
 ('consfin_credibility', 0.25212673276061065),
 ('consume_top_time_last_1_month', 0.25190545938772502),
 ('query_cash_count', 0.2505945564771927),
 ('latest_one_month_fail', 0.25041575086113804),
 ('trans_fail_top_count_enum_last_12_month', 0.2484962281097749),
 ('consfin_product_count', 0.24840677004038769),
 ('trans_fail_top_count_enum_last_6_month', 0.24459065905835767),
 ('latest_one_month_loan', 0.24357719668455058),
 ('cross_consume_count_last_1_month', 0.24338410524382642),
 ('avg_consume_less_12_valid_month', 0.24153027060850193),
 ('query_finance_count', 0.24100208671673942),
 ('reg_preference_for_trad', 0.23906447403181369),
 ('trans_fail_top_count_enum_last_1_month', 0.23519543933090667),
 ('railway_consume_count_last_12_month', 0.23504608134223881),
 ('latest_one_month_suc', 0.23313434057293608),
 ('trans_activity_day', 0.23168003248070962),
 ('low_volume_percent', 0.22857880126543986),
 ('avg_price_top_last_12_valid_month', 0.22804418697211443),
 ('jewelry_consume_count_last_6_month', 0.22615342289100443),
 ('middle_volume_percent', 0.22570726622757747),
 ('trans_amount_increase_rate_lately', 0.22543255469737311),
 ('regional_mobility', 0.22209882955283974),
 ('rank_trad_1_month', 0.22174758276579848),
 ('trans_activity_month', 0.22020446065315732),
 ('top_trans_count_last_1_month', 0.21999826689565763),
 ('is_high_user', 0.21752150441707879)]

def iv_choose(iv, lower=0.02, upper=0.6):
    """Return the feature names whose IV lies strictly between two bounds.

    Args:
        iv: Mapping of feature name to information value.
        lower: Exclusive lower bound; features at or below it are dropped.
        upper: Exclusive upper bound; features at or above it are dropped.

    Returns:
        List of the retained feature names, in the mapping's iteration order.
    """
    return [name for name, value in iv.items() if lower < value < upper]


# Apply the IV filter, print how many features remain, then display them
iv_list = iv_choose(iv_dict)
print(len(iv_list))
iv_list

62





['low_volume_percent',
 'middle_volume_percent',
 'take_amount_in_later_12_month_highest',
 'trans_amount_increase_rate_lately',
 'trans_activity_month',
 'trans_activity_day',
 'transd_mcc',
 'trans_days_interval_filter',
 'trans_days_interval',
 'regional_mobility',
 'is_high_user',
 'number_of_trans_from_2011',
 'rank_trad_1_month',
 'avg_consume_less_12_valid_month',
 'top_trans_count_last_1_month',
 'avg_price_top_last_12_valid_month',
 'reg_preference_for_trad',
 'trans_top_time_last_1_month',
 'trans_top_time_last_6_month',
 'consume_top_time_last_1_month',
 'consume_top_time_last_6_month',
 'cross_consume_count_last_1_month',
 'trans_fail_top_count_enum_last_1_month',
 'trans_fail_top_count_enum_last_6_month',
 'trans_fail_top_count_enum_last_12_month',
 'max_consume_count_later_6_month',
 'railway_consume_count_last_12_month',
 'jewelry_consume_count_last_6_month',
 'trans_day_last_12_month',
 'apply_credibility',
 'query_org_count',
 'query_finance_count',
 'query_cash_count',
 'query_sum_count',
 'latest_one_month_apply',
 'latest_three_month_apply',
 'latest_six_month_apply',
 'loans_credibility_behavior',
 'loans_count',
 'loans_settle_count',
 'loans_overdue_count',
 'loans_org_count_behavior',
 'consfin_org_count_behavior',
 'loans_cash_count',
 'latest_one_month_loan',
 'latest_three_month_loan',
 'latest_six_month_loan',
 'history_suc_fee',
 'history_fail_fee',
 'latest_one_month_suc',
 'latest_one_month_fail',
 'loans_long_time',
 'loans_credit_limit',
 'loans_credibility_limit',
 'loans_org_count_current',
 'loans_product_count',
 'loans_max_limit',
 'consfin_credibility',
 'consfin_org_count_current',
 'consfin_product_count',
 'consfin_max_limit',
 'latest_query_day']

随机森林选取特征

from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split and list its feature
# importances from highest to lowest
rfc = RandomForestClassifier(random_state=2018).fit(X_train, y_train)
importance = pd.Series(rfc.feature_importances_, index=x.columns)
importance = importance.sort_values(ascending=False)
importance

trans_fail_top_count_enum_last_1_month        0.067599
history_fail_fee                              0.041441
loans_score                                   0.033226
apply_score                                   0.024346
trans_fail_top_count_enum_last_6_month        0.023965
trans_day_last_12_month                       0.021230
trans_amount_3_month                          0.020492
loans_overdue_count                           0.019551
first_transaction_time                        0.018474
abs                                           0.018429
latest_one_month_fail                         0.017917
historical_trans_day                          0.017571
trans_amount_increase_rate_lately             0.015686
max_cumulative_consume_later_1_month          0.015285
latest_query_day                              0.014965
pawns_auctions_trusts_consume_last_6_month    0.014888
loans_count                                   0.014848
loans_avg_limit                               0.014819
avg_price_last_12_month                       0.014458
loans_latest_day                              0.014370
first_transaction_day                         0.014048
consfin_max_limit                             0.013881
loans_long_time                               0.013819
number_of_trans_from_2011                     0.013500
trans_activity_month                          0.013334
trans_days_interval_filter                    0.013190
loans_max_limit                               0.012967
loans_settle_count                            0.012967
trans_days_interval                           0.012737
middle_volume_percent                         0.012705
                                                ...   
consfin_org_count_current                     0.009565
rank_trad_1_month                             0.009436
loans_product_count                           0.009289
pawns_auctions_trusts_consume_last_1_month    0.009274
query_org_count                               0.009239
loans_cash_count                              0.009040
loans_credibility_limit                       0.008462
max_consume_count_later_6_month               0.008326
query_finance_count                           0.008000
consume_top_time_last_1_month                 0.007986
loans_org_count_behavior                      0.007921
loans_credibility_behavior                    0.007709
latest_three_month_loan                       0.007701
latest_one_month_suc                          0.007681
low_volume_percent                            0.007610
query_cash_count                              0.007516
top_trans_count_last_1_month                  0.007101
consfin_product_count                         0.007084
loans_org_count_current                       0.006887
consfin_org_count_behavior                    0.006272
latest_six_month_loan                         0.006270
latest_one_month_apply                        0.006140
avg_consume_less_12_valid_month               0.006009
regional_mobility                             0.005574
cross_consume_count_last_1_month              0.004980
latest_one_month_loan                         0.002922
reg_preference_for_trad                       0.002207
railway_consume_count_last_12_month           0.000141
jewelry_consume_count_last_6_month            0.000138
is_high_user                                  0.000000
dtype: float64

# Number of features that received an importance score
len(importance)

# Extract features by applying a threshold to their scores
def select_features(scores, attributes, threshold=0.01):
    '''
    Pair scores with attributes, rank the pairs, and keep the high scorers.

    scores: iterable of scores
    attributes: iterable of feature names, paired with scores by position
                (pairing stops at the shorter of the two)
    threshold: a feature is kept only if its score is strictly greater
    return: (list of kept feature names,
             all (score, feature) pairs sorted from highest to lowest)
    '''
    ranked = sorted(zip(scores, attributes), reverse=True)
    kept = [name for value, name in ranked if value > threshold]
    return kept, ranked

# `importance` has been sorted by score, so the names that belong to its
# values are its own index. Pairing it with data.columns (original column
# order, and still containing 'status') attaches every score to the wrong
# feature and lets the label itself be "selected".
features_seleted, feature_score_sorted = select_features(importance.values, importance.index)

# Show the (score, feature) pairs ordered by feature importance
feature_score_sorted

[(0.06759857529152126, 'low_volume_percent'),
 (0.04144132375795051, 'middle_volume_percent'),
 (0.033226341812976498, 'take_amount_in_later_12_month_highest'),
 (0.024346147281823001, 'trans_amount_increase_rate_lately'),
 (0.02396461689852037, 'trans_activity_month'),
 (0.021229800352657799, 'trans_activity_day'),
 (0.020492073581023017, 'transd_mcc'),
 (0.019551465016905793, 'trans_days_interval_filter'),
 (0.018473500098720008, 'trans_days_interval'),
 (0.018429471987365054, 'regional_mobility'),
 (0.017917067189165657, 'repayment_capability'),
 (0.01757101867594172, 'is_high_user'),
 (0.015685719822459497, 'number_of_trans_from_2011'),
 (0.015285233399844795, 'first_transaction_time'),
 (0.014965346331371771, 'historical_trans_amount'),
 (0.014887904705946475, 'historical_trans_day'),
 (0.014848478987436912, 'rank_trad_1_month'),
 (0.014818943757584562, 'trans_amount_3_month'),
 (0.014458491945750176, 'avg_consume_less_12_valid_month'),
 (0.014369872836001269, 'abs'),
 (0.014048245049498597, 'top_trans_count_last_1_month'),
 (0.013881305649718716, 'avg_price_last_12_month'),
 (0.013818585375387446, 'avg_price_top_last_12_valid_month'),
 (0.013499605784536378, 'reg_preference_for_trad'),
 (0.01333403483971948, 'trans_top_time_last_1_month'),
 (0.013189673621842641, 'trans_top_time_last_6_month'),
 (0.012967350610280825, 'consume_top_time_last_1_month'),
 (0.012967031812736776, 'consume_top_time_last_6_month'),
 (0.012737321621440773, 'cross_consume_count_last_1_month'),
 (0.012704743549929004, 'trans_fail_top_count_enum_last_1_month'),
 (0.012670739873021739, 'trans_fail_top_count_enum_last_6_month'),
 (0.012634942017596049, 'trans_fail_top_count_enum_last_12_month'),
 (0.012547159590595519, 'consume_mini_time_last_1_month'),
 (0.012269127921896999, 'max_cumulative_consume_later_1_month'),
 (0.012251014193021116, 'max_consume_count_later_6_month'),
 (0.012130445200419506, 'railway_consume_count_last_12_month'),
 (0.012034854528026892, 'pawns_auctions_trusts_consume_last_1_month'),
 (0.011854647991772738, 'pawns_auctions_trusts_consume_last_6_month'),
 (0.011706539361219558, 'jewelry_consume_count_last_6_month'),
 (0.01166026942923009, 'status'),
 (0.011377310777428085, 'first_transaction_day'),
 (0.011259220840075928, 'trans_day_last_12_month'),
 (0.011153859215878038, 'apply_score'),
 (0.010988313523754335, 'apply_credibility'),
 (0.010748172884177704, 'query_org_count'),
 (0.010443991633735872, 'query_finance_count'),
 (0.010106738553680085, 'query_cash_count'),
 (0.0097070363548293296, 'query_sum_count'),
 (0.0096860508706849251, 'latest_one_month_apply'),
 (0.0095788427733895475, 'latest_three_month_apply'),
 (0.0095651360382182811, 'latest_six_month_apply'),
 (0.009435867222090371, 'loans_score'),
 (0.0092886391460863804, 'loans_credibility_behavior'),
 (0.0092742649699127776, 'loans_count'),
 (0.0092388186178661829, 'loans_settle_count'),
 (0.009040150558181137, 'loans_overdue_count'),
 (0.008461666363636015, 'loans_org_count_behavior'),
 (0.0083264175951640769, 'consfin_org_count_behavior'),
 (0.00799967023354658, 'loans_cash_count'),
 (0.0079861813469583494, 'latest_one_month_loan'),
 (0.0079209175329646251, 'latest_three_month_loan'),
 (0.0077087109921875323, 'latest_six_month_loan'),
 (0.0077013706125157568, 'history_suc_fee'),
 (0.0076812466427810602, 'history_fail_fee'),
 (0.0076104715634482956, 'latest_one_month_suc'),
 (0.0075162527300091219, 'latest_one_month_fail'),
 (0.0071011562089886267, 'loans_long_time'),
 (0.0070837738212370079, 'loans_credit_limit'),
 (0.0068867085612011308, 'loans_credibility_limit'),
 (0.0062722923476042358, 'loans_org_count_current'),
 (0.0062699820118575483, 'loans_product_count'),
 (0.0061403277931418362, 'loans_max_limit'),
 (0.0060089335095311162, 'loans_avg_limit'),
 (0.0055742146918062018, 'consfin_credit_limit'),
 (0.004979817893285372, 'consfin_credibility'),
 (0.0029223544733663646, 'consfin_org_count_current'),
 (0.0022066949392423817, 'consfin_product_count'),
 (0.00014142014941099571, 'consfin_max_limit'),
 (0.00013797225326978062, 'consfin_avg_limit'),
 (0.0, 'latest_query_day')]

# The finally selected features
new_features = data[features_seleted]
# Report how many features were kept (the 0.01 here only labels the message)
print('经过随机森林特征选择后，当阈值为{}时，最后保留的特征数为{}个'.format(0.01,new_features.shape[1]))

经过随机森林特征选择后，当阈值为0.01时，最后保留的特征数为47个

# Put the selected features and the label side by side and write them to CSV
# (presumably the input of the next task -- note the file is named task2_proc.csv)
task3_dataset = pd.concat([new_features,y],axis=1)
task3_dataset.to_csv('task2_proc.csv',index = False)

qmys

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
数据挖掘 —— 金融数据（二）

特征工程任务二特征衍生特征挑选：分别用IV值和随机森林等进行特征选择……以及你能想到特征工程处理# 导入需要的包import numpy as npimport pandas as pdimport matplotlib.pyplot as pltimport seaborn as snsfrom sklearn.model_selection import train_tes...
复制链接

扫一扫

专栏目录