关闭

机器学习项目实战之贷款申请最大利润

标签: 机器学习
571人阅读 评论(0) 收藏 举报
分类:
import pandas as pd
# Load the raw export. skiprows=1 skips the first line of the file
# (presumably a non-header banner line -- confirm against the CSV).
loans_2007 = pd.read_csv("LoanStats3a.csv", skiprows=1)
# Data cleaning: filter out features that are not useful.
# Keep only the columns that have at least half of their values present.
# Floor division keeps the threshold an integer on Python 3 as well.
half_count = len(loans_2007) // 2
loans_2007 = loans_2007.dropna(thresh=half_count, axis=1)
loans_2007 = loans_2007.drop(['desc', 'url'], axis=1)
# Backslashes must be escaped: in "D:\test" the "\t" is read as a tab
# character, which would produce a different path from the one the
# following step reads ("D:\\test\\machineLearning\\loans_2007.csv").
loans_2007.to_csv("D:\\test\\machineLearning\\loans_2007.csv", index=False)
import pandas as pd
# Reload the filtered data set from disk.
loans_2007 = pd.read_csv("D:\\test\\machineLearning\\loans_2007.csv")
# drop_duplicates() returns a new DataFrame rather than modifying the
# frame in place, so the result has to be assigned back to take effect.
loans_2007 = loans_2007.drop_duplicates()
# Show the first record and the number of columns.
print(loans_2007.iloc[0])
print(loans_2007.shape[1])
id                                1077501
member_id                      1.2966e+06
loan_amnt                            5000
funded_amnt                          5000
funded_amnt_inv                      4975
term                            36 months
int_rate                           10.65%
installment                        162.87
grade                                   B
sub_grade                              B2
emp_title                             NaN
emp_length                      10+ years
home_ownership                       RENT
annual_inc                          24000
verification_status              Verified
issue_d                          Dec-2011
loan_status                    Fully Paid
pymnt_plan                              n
purpose                       credit_card
title                            Computer
zip_code                            860xx
addr_state                             AZ
dti                                 27.65
delinq_2yrs                             0
earliest_cr_line                 Jan-1985
inq_last_6mths                          1
open_acc                                3
pub_rec                                 0
revol_bal                           13648
revol_util                          83.7%
total_acc                               9
initial_list_status                     f
out_prncp                               0
out_prncp_inv                           0
total_pymnt                       5863.16
total_pymnt_inv                   5833.84
total_rec_prncp                      5000
total_rec_int                      863.16
total_rec_late_fee                      0
recoveries                              0
collection_recovery_fee                 0
last_pymnt_d                     Jan-2015
last_pymnt_amnt                    171.62
last_credit_pull_d               Nov-2016
collections_12_mths_ex_med              0
policy_code                             1
application_type               INDIVIDUAL
acc_now_delinq                          0
chargeoff_within_12_mths                0
delinq_amnt                             0
pub_rec_bankruptcies                    0
tax_liens                               0
Name: 0, dtype: object
52
# Data preprocessing: discard the columns that will not be used as
# model features, all in a single drop call.
unused_columns = [
    'id', 'member_id', 'funded_amnt', 'funded_amnt_inv', 'grade',
    'sub_grade', 'emp_title', 'last_pymnt_d', 'last_pymnt_amnt',
    'zip_code', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
    'total_pymnt_inv', 'total_rec_prncp',
    'total_rec_int', 'total_rec_late_fee', 'recoveries',
    'collection_recovery_fee', 'issue_d',
]
loans_2007 = loans_2007.drop(unused_columns, axis=1)
# Show the first record and the remaining column count.
print(loans_2007.iloc[0])
print(loans_2007.shape[1])
loan_amnt                            5000
term                            36 months
int_rate                           10.65%
installment                        162.87
emp_length                      10+ years
home_ownership                       RENT
annual_inc                          24000
verification_status              Verified
loan_status                    Fully Paid
pymnt_plan                              n
purpose                       credit_card
title                            Computer
addr_state                             AZ
dti                                 27.65
delinq_2yrs                             0
earliest_cr_line                 Jan-1985
inq_last_6mths                          1
open_acc                                3
pub_rec                                 0
revol_bal                           13648
revol_util                          83.7%
total_acc                               9
initial_list_status                     f
last_credit_pull_d               Nov-2016
collections_12_mths_ex_med              0
policy_code                             1
application_type               INDIVIDUAL
acc_now_delinq                          0
chargeoff_within_12_mths                0
delinq_amnt                             0
pub_rec_bankruptcies                    0
tax_liens                               0
Name: 0, dtype: object
32
# loan_status holds the current state of each loan; show how many loans
# fall into each state.
status_counts = loans_2007["loan_status"].value_counts()
print(status_counts)
Fully Paid                                             33902
Charged Off                                             5658
Does not meet the credit policy. Status:Fully Paid      1988
Does not meet the credit policy. Status:Charged Off      761
Current                                                  201
Late (31-120 days)                                        10
In Grace Period                                            9
Late (16-30 days)                                          5
Default                                                    1
Name: loan_status, dtype: int64
# Binary classification target: keep only loans whose outcome is
# "Fully Paid" (repaid, encoded as 1 below) or "Charged Off" (written
# off as a loss, encoded as 0 below); every other status is discarded.
loans_2007 = loans_2007[loans_2007['loan_status'].isin(['Fully Paid', 'Charged Off'])]
# Convert the status strings to numbers.
# The outer key of the replacement dict is the column to operate on;
# the inner dict maps old values to new values.
status_replace = {
    'loan_status': {
        'Fully Paid': 1,
        'Charged Off': 0,
    }
}
loans_2007 = loans_2007.replace(status_replace)
orig_columns = loans_2007.columns
# A column with a single distinct non-null value carries no information,
# so collect such columns and drop them as well.
drop_columns = [
    col for col in orig_columns
    if len(loans_2007[col].dropna().unique()) == 1
]

loans_2007 = loans_2007.drop(drop_columns, axis=1)
# Parenthesised print works on both Python 2 and Python 3.
print(drop_columns)
print(loans_2007.shape)
['initial_list_status', 'collections_12_mths_ex_med', 'policy_code', 'application_type', 'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt', 'tax_liens']
(39560, 24)
# Check for missing values: count the nulls in every column.
null_count = loans_2007.isnull().sum()
# Parenthesised print works on both Python 2 and Python 3.
print(null_count)
loan_amnt                 0
term                      0
int_rate                  0
installment               0
emp_length                0
home_ownership            0
annual_inc                0
verification_status       0
loan_status               0
pymnt_plan                0
purpose                   0
title                    10
addr_state                0
dti                       0
delinq_2yrs               0
earliest_cr_line          0
inq_last_6mths            0
open_acc                  0
pub_rec                   0
revol_bal                 0
revol_util               50
total_acc                 0
last_credit_pull_d        2
pub_rec_bankruptcies    697
dtype: int64
# pub_rec_bankruptcies is removed as a whole column; the remaining
# columns are cleaned row-wise instead.
loans_2007 = loans_2007.drop("pub_rec_bankruptcies", axis=1)
# Drop the rows that still contain a missing value.
loans_2007 = loans_2007.dropna(axis=0)
# Summarise how many columns there are of each dtype.
print(loans_2007.dtypes.value_counts())
object     12
float64    10
int64       1
dtype: int64
# sklearn only accepts numeric data, not strings. The columns reported
# as dtype "object" above therefore have to be converted to numeric
# values; list them with a sample row to see what they contain.
object_columns_df = loans_2007.select_dtypes(include=["object"])
print(object_columns_df.iloc[0])
term                     36 months
int_rate                    10.65%
emp_length               10+ years
home_ownership                RENT
verification_status       Verified
pymnt_plan                       n
purpose                credit_card
title                     Computer
addr_state                      AZ
earliest_cr_line          Jan-1985
revol_util                   83.7%
last_credit_pull_d        Nov-2016
Name: 0, dtype: object
# Print the distribution of values for each candidate categorical column.
cols = ['home_ownership', 'verification_status', 'emp_length', 'term', 'addr_state']
for c in cols:
    print(loans_2007[c].value_counts())
RENT        18780
MORTGAGE    17574
OWN          3045
OTHER          96
NONE            3
Name: home_ownership, dtype: int64
Not Verified       16856
Verified           12705
Source Verified     9937
Name: verification_status, dtype: int64
10+ years    8821
< 1 year     4563
2 years      4371
3 years      4074
4 years      3409
5 years      3270
1 year       3227
6 years      2212
7 years      1756
8 years      1472
9 years      1254
n/a          1069
Name: emp_length, dtype: int64
 36 months    29041
 60 months    10457
Name: term, dtype: int64
CA    7070
NY    3788
FL    2856
TX    2714
NJ    1838
IL    1517
PA    1504
VA    1400
GA    1393
MA    1336
OH    1208
MD    1049
AZ     874
WA     834
CO     786
NC     780
CT     747
MI     722
MO     682
MN     611
NV     492
SC     470
WI     453
AL     446
OR     445
LA     435
KY     325
OK     298
KS     269
UT     256
AR     243
DC     211
RI     198
NM     188
WV     176
HI     172
NH     172
DE     113
MT      84
WY      83
AK      79
SD      63
VT      54
MS      19
TN      17
IN       9
ID       6
IA       5
NE       5
ME       3
Name: addr_state, dtype: int64
# Ordinal encoding for employment length: the number of years, with
# both "< 1 year" and "n/a" mapped to 0.
mapping_dict = {
    "emp_length": {
        "10+ years": 10,
        "9 years": 9,
        "8 years": 8,
        "7 years": 7,
        "6 years": 6,
        "5 years": 5,
        "4 years": 4,
        "3 years": 3,
        "2 years": 2,
        "1 year": 1,
        "< 1 year": 0,
        "n/a": 0
    }
}
loans_2007 = loans_2007.drop(["last_credit_pull_d", "earliest_cr_line", "addr_state", "title"], axis=1)
# Strip the trailing "%" and convert to floating point.
loans_2007["int_rate"] = loans_2007["int_rate"].str.rstrip("%").astype("float")
loans_2007["revol_util"] = loans_2007["revol_util"].str.rstrip("%").astype("float")
loans_2007 = loans_2007.replace(mapping_dict)
# One-hot encode the remaining string-valued columns. emp_length is
# deliberately NOT listed here: it has just been mapped to integers, and
# get_dummies passes numeric columns through unchanged, so listing it
# would only duplicate the column and the drop below would then remove
# the feature entirely.
cat_columns = ['home_ownership', 'verification_status', 'purpose', 'term']
dummy_df = pd.get_dummies(loans_2007[cat_columns])
loans_2007 = pd.concat([loans_2007, dummy_df], axis=1)
# The original string columns are replaced by their dummy columns.
loans_2007 = loans_2007.drop(cat_columns, axis=1)
# For a lending project, high accuracy on its own means little: a single
# bad loan can end up causing a large loss.
# So ROC-style metrics (true/false positive rates) have to be considered.

# The steps above are the preprocessing pipeline; load the data set that
# has already been cleaned.
loans = pd.read_csv("D:\\test\\machineLearning\\cleaned_loans2007.csv")
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in print would emit a stray "None".
loans.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39498 entries, 0 to 39497
Data columns (total 37 columns):
loan_amnt                              39498 non-null float64
int_rate                               39498 non-null float64
installment                            39498 non-null float64
annual_inc                             39498 non-null float64
loan_status                            39498 non-null int64
dti                                    39498 non-null float64
delinq_2yrs                            39498 non-null float64
inq_last_6mths                         39498 non-null float64
open_acc                               39498 non-null float64
pub_rec                                39498 non-null float64
revol_bal                              39498 non-null float64
revol_util                             39498 non-null float64
total_acc                              39498 non-null float64
home_ownership_MORTGAGE                39498 non-null int64
home_ownership_NONE                    39498 non-null int64
home_ownership_OTHER                   39498 non-null int64
home_ownership_OWN                     39498 non-null int64
home_ownership_RENT                    39498 non-null int64
verification_status_Not Verified       39498 non-null int64
verification_status_Source Verified    39498 non-null int64
verification_status_Verified           39498 non-null int64
purpose_car                            39498 non-null int64
purpose_credit_card                    39498 non-null int64
purpose_debt_consolidation             39498 non-null int64
purpose_educational                    39498 non-null int64
purpose_home_improvement               39498 non-null int64
purpose_house                          39498 non-null int64
purpose_major_purchase                 39498 non-null int64
purpose_medical                        39498 non-null int64
purpose_moving                         39498 non-null int64
purpose_other                          39498 non-null int64
purpose_renewable_energy               39498 non-null int64
purpose_small_business                 39498 non-null int64
purpose_vacation                       39498 non-null int64
purpose_wedding                        39498 non-null int64
term_ 36 months                        39498 non-null int64
term_ 60 months                        39498 non-null int64
dtypes: float64(12), int64(25)
memory usage: 11.1 MB
None
#使用逻辑回归来分析数据,逻辑回归是一个非常经典的二分类
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_predict,KFold

# Baseline: plain logistic regression with cross-validated predictions.
lr = LogisticRegression()
cols = loans.columns
# Every column except the target is used as a feature.
train_cols = cols.drop("loan_status")
features = loans[train_cols]
target = loans["loan_status"]
kf = KFold(features.shape[0], random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)
# False positive: predicted 1 while the actual status is 0.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positive: predicted 1 and the actual status is 1.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negative: predicted 0 while the actual status is 1.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negative: predicted 0 and the actual status is 0.
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])

# Rates: the true positive rate is very high, which is where money is
# earned, but the false positive rate is very high too, so the chance of
# losing money is also very high.
# Analysis: almost every applicant is predicted as worth lending to,
# because the classes in the sample are imbalanced.
# Remedy 1: data augmentation -- add more examples of the minority class,
# either generated or collected.
tpr = tp / float(tp + fn)
# FPR is the share of actual negatives that were predicted positive, so
# the denominator is fp + tn (not fp + fn). The FPR figures shown in the
# original write-up were produced with the fp + fn denominator.
fpr = fp / float(fp + tn)
print(tpr)
print(fpr)
print(predictions[:20])
0.999143506896
0.994872701556
0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
dtype: int64
# Pass class_weight to adjust the weights of the positive and negative
# samples.
lr = LogisticRegression(class_weight='balanced')
kf = KFold(features.shape[0], random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)
# False positive: predicted 1 while the actual status is 0.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positive: predicted 1 and the actual status is 1.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negative: predicted 0 while the actual status is 1.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negative: predicted 0 and the actual status is 0.
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])
tpr = tp / float(tp + fn)
# FPR is the share of actual negatives that were predicted positive, so
# the denominator is fp + tn (not fp + fn).
fpr = fp / float(fp + tn)
# After re-weighting, the trained model is more meaningful, but the TPR
# is still not high enough and the FPR not low enough.
print(tpr)
print(fpr)
print(predictions[:20])
0.670368292035
0.1674623303
0     1
1     0
2     0
3     0
4     1
5     0
6     0
7     0
8     0
9     0
10    1
11    0
12    1
13    1
14    0
15    0
16    1
17    1
18    1
19    0
dtype: int64
# The run above used the library's built-in weighting; the weights can
# also be specified by hand. Here class 0 is weighted five times as
# heavily as class 1.
penalty = {
    0: 5,
    1: 1
}
lr = LogisticRegression(class_weight=penalty)
kf = KFold(features.shape[0], random_state=1)

predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)
# False positive: predicted 1 while the actual status is 0.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positive: predicted 1 and the actual status is 1.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negative: predicted 0 while the actual status is 1.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negative: predicted 0 and the actual status is 0.
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])
tpr = tp / float(tp + fn)
# FPR is the share of actual negatives that were predicted positive, so
# the denominator is fp + tn (not fp + fn).
fpr = fp / float(fp + tn)
# The class weights have a large effect on the final result; in practice
# they should be tuned to the requirements at hand.
print(tpr)
print(fpr)
print(predictions[:20])
0.718686316784
0.215662055336
0     1
1     0
2     0
3     1
4     1
5     0
6     0
7     0
8     0
9     0
10    1
11    0
12    1
13    1
14    0
15    0
16    1
17    1
18    1
19    0
dtype: int64
# Analyse the data with a random forest, using balanced class weights.
from sklearn.ensemble import RandomForestClassifier

# The variable keeps the name `lr` used by the earlier runs even though
# it now holds a random forest.
lr = RandomForestClassifier(class_weight="balanced", random_state=1)
kf = KFold(features.shape[0], random_state=1)

predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)
# False positive: predicted 1 while the actual status is 0.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positive: predicted 1 and the actual status is 1.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negative: predicted 0 while the actual status is 1.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negative: predicted 0 and the actual status is 0.
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])
tpr = tp / float(tp + fn)
# FPR is the share of actual negatives that were predicted positive, so
# the denominator is fp + tn (not fp + fn).
fpr = fp / float(fp + tn)
# The random forest does not perform very well either.
print(tpr)
print(fpr)
print(predictions[:20])
0.973862193213
0.857050557261
0     1
1     1
2     1
3     1
4     1
5     0
6     1
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    0
dtype: int64
# Analyse the data with a random forest, with the number of trees set
# explicitly to 10.
# NOTE(review): the output recorded for this run is identical to the
# previous run without n_estimators, which suggests 10 was already the
# default in the sklearn version used -- raise n_estimators to actually
# test a larger forest.
from sklearn.ensemble import RandomForestClassifier

# The variable keeps the name `lr` used by the earlier runs even though
# it holds a random forest.
lr = RandomForestClassifier(n_estimators=10, class_weight="balanced", random_state=1)
kf = KFold(features.shape[0], random_state=1)

predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)
# False positive: predicted 1 while the actual status is 0.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positive: predicted 1 and the actual status is 1.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negative: predicted 0 while the actual status is 1.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negative: predicted 0 and the actual status is 0.
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])
tpr = tp / float(tp + fn)
# FPR is the share of actual negatives that were predicted positive, so
# the denominator is fp + tn (not fp + fn).
fpr = fp / float(fp + tn)
# The result is not very good either.
print(tpr)
print(fpr)
print(predictions[:20])
0.973862193213
0.857050557261
0     1
1     1
2     1
3     1
4     1
5     0
6     1
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    0
dtype: int64
#实际中:换算法模型,去掉一些特征,生成一些新的特征,调模型的参数,比如权重等来实现更好的效果
0
0

查看评论
* 以上用户言论只代表其个人观点,不代表CSDN网站的观点或立场
    个人资料
    • 访问:19965次
    • 积分:769
    • 等级:
    • 排名:千里之外
    • 原创:59篇
    • 转载:2篇
    • 译文:0篇
    • 评论:6条
    文章存档
    最新评论