金融逾期
任务1.1
1.将数据集按7:3划分为训练集和测试集:调用train_test_split函数并设置test_size=0.3即可,随机种子通过random_state参数设置。
2.然后将整个数据集分开,status为目标列y,剩余特征即为x。
3.接下来就很简单了:分别建立3个模型,直接看得分。但是很神奇的地方是,做出来的结果中,svm和lr的得分一模一样。
4.经过查询发现有可能是因为模型参数原因,故调整svm模型中超参数C,发现仍然没有效果
5.再次查找资料,发现有可能是数据的原因,故对数据进行标准化。最后终于得到了不一样的结论。
6.一开始的做法可能有点问题,所以最后加了一小段代码,改用f1得分来评价(一般分类问题中,用这个评价指标的比较多)。值得一提的是,按f1得分排序,最终的结果是lr>决策树>svm。
PS:第一次使用markdown码文,可能比较简陋,大家多多谅解,此文主要起到抛砖引玉的作用,大家应该做的比我好。感谢。最后附上代码
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
# Load the full dataset from the CSV in the working directory.
data = pd.read_csv('data_all.csv')

# Notebook-style inspection; these bare expressions display nothing when the
# file is run as a plain script.
data.head()
data.columns

# Feature columns used as model inputs, listed explicitly in selection order.
feature_columns = [
    'low_volume_percent',
    'middle_volume_percent',
    'take_amount_in_later_12_month_highest',
    'trans_amount_increase_rate_lately',
    'trans_activity_month',
    'trans_activity_day',
    'transd_mcc',
    'trans_days_interval_filter',
    'trans_days_interval',
    'regional_mobility',
    'repayment_capability',
    'is_high_user',
    'number_of_trans_from_2011',
    'first_transaction_time',
    'historical_trans_amount',
    'historical_trans_day',
    'rank_trad_1_month',
    'trans_amount_3_month',
    'avg_consume_less_12_valid_month',
    'abs',
    'top_trans_count_last_1_month',
    'avg_price_last_12_month',
    'avg_price_top_last_12_valid_month',
    'trans_top_time_last_1_month',
    'trans_top_time_last_6_month',
    'consume_top_time_last_1_month',
    'consume_top_time_last_6_month',
    'cross_consume_count_last_1_month',
    'trans_fail_top_count_enum_last_1_month',
    'trans_fail_top_count_enum_last_6_month',
    'trans_fail_top_count_enum_last_12_month',
    'consume_mini_time_last_1_month',
    'max_cumulative_consume_later_1_month',
    'max_consume_count_later_6_month',
    'railway_consume_count_last_12_month',
    'pawns_auctions_trusts_consume_last_1_month',
    'pawns_auctions_trusts_consume_last_6_month',
    'jewelry_consume_count_last_6_month',
    'first_transaction_day',
    'trans_day_last_12_month',
    'apply_score',
    'apply_credibility',
    'query_org_count',
    'query_finance_count',
    'query_cash_count',
    'query_sum_count',
    'latest_one_month_apply',
    'latest_three_month_apply',
    'latest_six_month_apply',
    'loans_score',
    'loans_credibility_behavior',
    'loans_count',
    'loans_settle_count',
    'loans_overdue_count',
    'loans_org_count_behavior',
    'consfin_org_count_behavior',
    'loans_cash_count',
    'latest_one_month_loan',
    'latest_three_month_loan',
    'latest_six_month_loan',
    'history_suc_fee',
    'history_fail_fee',
    'latest_one_month_suc',
    'latest_one_month_fail',
    'loans_long_time',
    'loans_credit_limit',
    'loans_credibility_limit',
    'loans_org_count_current',
    'loans_product_count',
    'loans_max_limit',
    'loans_avg_limit',
    'consfin_credit_limit',
    'consfin_credibility',
    'consfin_org_count_current',
    'consfin_product_count',
    'consfin_max_limit',
    'consfin_avg_limit',
    'latest_query_day',
    'loans_latest_day',
    'reg_preference_for_trad',
    'latest_query_time_month',
    'latest_query_time_weekday',
    'loans_latest_time_month',
    'loans_latest_time_weekday',
]

# 'status' is the target column; the listed features form the input matrix.
y = data['status']
x = data[feature_columns]
# NOTE: a shorter way to build x would be data.drop('status', axis=1). That
# keeps every non-target column, so it matches the explicit list above only if
# the CSV contains no other columns -- verify before switching.
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=2018
)

# Notebook-style shape checks; these bare expressions have no visible effect
# when the file is run as a plain script.
x_train.shape, y_train.shape
x_test.shape, y_test.shape

# Baseline: three classifiers with default hyperparameters, trained on the
# raw (unscaled) features.
clf1 = svm.SVC()  # support vector machine
clf1.fit(x_train, y_train)
clf2 = LogisticRegression()  # logistic regression
clf2.fit(x_train, y_train)
clf3 = DecisionTreeClassifier()  # decision tree
clf3.fit(x_train, y_train)

# Score each model on the held-out split with the estimator's own .score().
score1 = clf1.score(x_test, y_test)
score2 = clf2.score(x_test, y_test)
score3 = clf3.score(x_test, y_test)
print(score1, score2, score3)
# score1 (SVM) and score2 (LR) came out identical on the raw features, which
# prompted the standardized rerun below.
from sklearn import preprocessing

# Standardize with statistics learned from the training split ONLY, then
# apply that same transform to the test split. Scaling each split on its own
# (as preprocessing.scale(x_test) would) standardizes the test rows with their
# own mean/std, so the models get evaluated on features that were transformed
# differently from the ones they were trained on.
scaler = preprocessing.StandardScaler()
x_scaled = scaler.fit_transform(x_train)
x_scaled_test = scaler.transform(x_test)

clf1 = svm.SVC().fit(x_scaled, y_train)  # support vector machine
clf2 = LogisticRegression().fit(x_scaled, y_train)  # logistic regression
clf3 = DecisionTreeClassifier().fit(x_scaled, y_train)  # decision tree

score1 = clf1.score(x_scaled_test, y_test)
score2 = clf2.score(x_scaled_test, y_test)
score3 = clf3.score(x_scaled_test, y_test)
print(score1, score2, score3)
# The author's original run (no hyperparameter tuning, no cross-validation)
# ranked the scores lr > svm > decision tree. That run scaled the test split
# independently, so re-check the ranking against the output printed above.
from sklearn.metrics import f1_score

# Predict on the standardized test split with each fitted model
# (clf1: SVM, clf2: logistic regression, clf3: decision tree).
y_pre1, y_pre2, y_pre3 = (
    model.predict(x_scaled_test) for model in (clf1, clf2, clf3)
)

# F1 score of each model's predictions against the true labels.
s1, s2, s3 = (
    f1_score(y_test, predicted) for predicted in (y_pre1, y_pre2, y_pre3)
)
print(s1, s2, s3)