1. 数据清洗过滤无用特征
1.1 设定的nan个数界限保留整列
- dropna(thresh=n)非nan最少n个才能保留
import pandas as pd

# Load the raw LendingClub export; the first CSV line is a notice, not the
# header, so skip it.
loans_2007 = pd.read_csv('LoanStats3a.csv', skiprows=1)
# Keep only columns with at least half non-NaN values.
# FIX: dropna(thresh=...) requires an integer -- `len(df) / 2` produces a
# float under Python 3, which newer pandas versions reject.
half_count = len(loans_2007) // 2
loans_2007 = loans_2007.dropna(thresh=half_count, axis=1)
# 'desc' (free text) and 'url' carry no predictive signal; drop them.
loans_2007 = loans_2007.drop(['desc', 'url'], axis=1)
loans_2007.to_csv('loans_2007.csv', index=False)
1.2 查看数据,第一行与列数
import pandas as pd

# Reload the filtered data set, then preview the first record and report
# how many columns survived the NaN filtering.
loans_2007 = pd.read_csv("loans_2007.csv")
# loans_2007.drop_duplicates()  # optional de-duplication step, not applied
print(loans_2007.iloc[0])
print(loans_2007.shape[1])  # number of columns
id 1077501
member_id 1.2966e+06
loan_amnt 5000
funded_amnt 5000
funded_amnt_inv 4975
term 36 months
int_rate 10.65%
installment 162.87
grade B
sub_grade B2
emp_title NaN
emp_length 10+ years
home_ownership RENT
annual_inc 24000
verification_status Verified
issue_d Dec-2011
loan_status Fully Paid
pymnt_plan n
purpose credit_card
title Computer
zip_code 860xx
addr_state AZ
dti 27.65
delinq_2yrs 0
earliest_cr_line Jan-1985
inq_last_6mths 1
open_acc 3
pub_rec 0
revol_bal 13648
revol_util 83.7%
total_acc 9
initial_list_status f
out_prncp 0
out_prncp_inv 0
total_pymnt 5863.16
total_pymnt_inv 5833.84
total_rec_prncp 5000
total_rec_int 863.16
total_rec_late_fee 0
recoveries 0
collection_recovery_fee 0
last_pymnt_d Jan-2015
last_pymnt_amnt 171.62
last_credit_pull_d Nov-2016
collections_12_mths_ex_med 0
policy_code 1
application_type INDIVIDUAL
acc_now_delinq 0
chargeoff_within_12_mths 0
delinq_amnt 0
pub_rec_bankruptcies 0
tax_liens 0
Name: 0, dtype: object
52
1.3 删除无用列
# Remove columns that are identifiers, redundant with the target, or leak
# post-origination information (payment outcomes), all in one pass.
useless_cols = [
    "id", "member_id", "funded_amnt", "funded_amnt_inv", "grade",
    "sub_grade", "emp_title", "issue_d",
    "zip_code", "out_prncp", "out_prncp_inv", "total_pymnt",
    "total_pymnt_inv", "total_rec_prncp",
    "total_rec_int", "total_rec_late_fee", "recoveries",
    "collection_recovery_fee", "last_pymnt_d", "last_pymnt_amnt",
]
loans_2007 = loans_2007.drop(useless_cols, axis=1)
print(loans_2007.iloc[0])
print(loans_2007.shape[1])
loan_amnt 5000
term 36 months
int_rate 10.65%
installment 162.87
emp_length 10+ years
home_ownership RENT
annual_inc 24000
verification_status Verified
loan_status Fully Paid
pymnt_plan n
purpose credit_card
title Computer
addr_state AZ
dti 27.65
delinq_2yrs 0
earliest_cr_line Jan-1985
inq_last_6mths 1
open_acc 3
pub_rec 0
revol_bal 13648
revol_util 83.7%
total_acc 9
initial_list_status f
last_credit_pull_d Nov-2016
collections_12_mths_ex_med 0
policy_code 1
application_type INDIVIDUAL
acc_now_delinq 0
chargeoff_within_12_mths 0
delinq_amnt 0
pub_rec_bankruptcies 0
tax_liens 0
Name: 0, dtype: object
32
2. 数据预处理
2.1 保留loan_status中两值变为0和1 (target)
# Keep only finished loans and binarize the target column:
# "Fully Paid" -> 1 (repaid), "Charged Off" -> 0 (defaulted).
print(loans_2007['loan_status'].value_counts())
finished = loans_2007['loan_status'].isin(["Fully Paid", "Charged Off"])
loans_2007 = loans_2007[finished]
loans_2007 = loans_2007.replace(
    {"loan_status": {"Fully Paid": 1, "Charged Off": 0}}
)
2.2 删除唯一属性的列
# Drop any column holding a single unique value -- a constant column carries
# no information for a classifier.
orig_columns = loans_2007.columns
drop_columns = []
for col in orig_columns:
    # dropna() first: otherwise NaN would count as an extra "unique" value
    # and constant-but-sparse columns would be missed.
    col_series = loans_2007[col].dropna().unique()
    if len(col_series) == 1:
        drop_columns.append(col)
loans_2007 = loans_2007.drop(drop_columns, axis=1)
print(drop_columns)
# FIX: the original used the Python 2 print statement
# (`print loans_2007.shape`), a SyntaxError under Python 3.
print(loans_2007.shape)
loans_2007.to_csv('filtered_loans_2007.csv', index=False)
2.3 求出每列的空值总和,并去除空值
import pandas as pd

# Count missing values per column, drop the one column with many NaNs,
# then discard any remaining rows that still contain a NaN.
loans = pd.read_csv('filtered_loans_2007.csv')
null_counts = loans.isnull().sum()
print(null_counts)
loans = loans.drop("pub_rec_bankruptcies", axis=1)  # column with many NaNs
loans = loans.dropna(axis=0)  # rows with any remaining NaN
2.4 查看数据各个类型的数量,并转换字符类型的列的值
# Show how many columns exist of each dtype, then preview the first record
# of the string-typed columns that still need conversion.
print(loans.dtypes.value_counts())
object_columns_df = loans.select_dtypes(include="object")
print(object_columns_df.iloc[0])
2.5 查看列的各个特征值的数量,判断是更改还是删除
# Inspect the category frequencies of the candidate string columns to decide
# whether each should be encoded or dropped.
for column in ('home_ownership', 'verification_status', 'emp_length',
               'term', 'addr_state'):
    print(loans[column].value_counts())
RENT 18780
MORTGAGE 17574
OWN 3045
OTHER 96
NONE 3
Name: home_ownership, dtype: int64
Not Verified 16856
Verified 12705
Source Verified 9937
Name: verification_status, dtype: int64
10+ years 8821
< 1 year 4563
2 years 4371
3 years 4074
4 years 3409
5 years 3270
1 year 3227
6 years 2212
7 years 1756
8 years 1472
9 years 1254
n/a 1069
Name: emp_length, dtype: int64
36 months 29041
60 months 10457
Name: term, dtype: int64
CA 7070
NY 3788
FL 2856
TX 2714
NJ 1838
IL 1517
PA 1504
VA 1400
GA 1393
MA 1336
OH 1208
MD 1049
AZ 874
WA 834
CO 786
NC 780
CT 747
MI 722
MO 682
MN 611
NV 492
SC 470
WI 453
AL 446
OR 445
LA 435
KY 325
OK 298
KS 269
UT 256
AR 243
DC 211
RI 198
NM 188
WV 176
HI 172
NH 172
DE 113
MT 84
WY 83
AK 79
SD 63
VT 54
MS 19
TN 17
IN 9
ID 6
IA 5
NE 5
ME 3
Name: addr_state, dtype: int64
# 'purpose' is a curated category set while 'title' is free text full of
# near-duplicate spellings; compare their distributions before choosing one.
for column in ("purpose", "title"):
    print(loans[column].value_counts())
debt_consolidation 18533
credit_card 5099
other 3963
home_improvement 2965
major_purchase 2181
small_business 1815
car 1544
wedding 945
medical 692
moving 581
vacation 379
house 378
educational 320
renewable_energy 103
Name: purpose, dtype: int64
Debt Consolidation 2168
Debt Consolidation Loan 1706
Personal Loan 658
Consolidation 509
debt consolidation 502
Credit Card Consolidation 356
Home Improvement 354
Debt consolidation 333
Small Business Loan 322
Credit Card Loan 313
Personal 308
Consolidation Loan 255
Home Improvement Loan 246
personal loan 234
personal 220
Loan 212
Wedding Loan 209
consolidation 200
Car Loan 200
...
Name: title, dtype: int64
2.6 更改字符的类型
# Map employment-length strings onto ordinal integers; both "< 1 year" and
# "n/a" collapse to 0.
mapping_dict = {
    "emp_length": {
        "10+ years": 10,
        "9 years": 9,
        "8 years": 8,
        "7 years": 7,
        "6 years": 6,
        "5 years": 5,
        "4 years": 4,
        "3 years": 3,
        "2 years": 2,
        "1 year": 1,
        "< 1 year": 0,
        "n/a": 0
    }
}
# Drop columns whose value sets are too large to encode usefully.
loans = loans.drop(["last_credit_pull_d", "earliest_cr_line", "addr_state", "title"], axis=1)
# Strip the trailing '%' and convert the percentage columns to floats.
for pct_col in ("int_rate", "revol_util"):
    loans[pct_col] = loans[pct_col].str.rstrip("%").astype("float")
loans = loans.replace(mapping_dict)
2.7 字符one-hot编码
# One-hot encode the remaining categorical columns, then remove the
# originals together with the near-constant 'pymnt_plan' column.
cat_columns = ["home_ownership", "verification_status", "emp_length", "purpose", "term"]
dummy_df = pd.get_dummies(loans[cat_columns])
loans = pd.concat([loans, dummy_df], axis=1)
loans = loans.drop(cat_columns + ["pymnt_plan"], axis=1)
loans.to_csv('cleaned_loans2007.csv', index=False)
3. 获得最大利润的条件与做法
3.1 查看数据的类型
import pandas as pd

# Reload the fully cleaned data set and confirm the dtypes of every column.
loans = pd.read_csv("cleaned_loans2007.csv")
print(loans.info())
全部为数值类型(已无object列)
3.2 建立数据集
# Split into feature matrix and target vector: 'loan_status' is the label,
# everything else is a feature.
target = loans["loan_status"]
features = loans[loans.columns.drop("loan_status")]
3.3 LogisticRegression简单模型
- 交叉验证
- cross_val_predict 为每个输入数据点生成交叉验证的估计值
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, KFold

# Baseline model: plain logistic regression scored with out-of-fold
# predictions from 5-fold cross-validation (one prediction per row).
lr = LogisticRegression()
kf = KFold(5, shuffle=True, random_state=1)
predictions = pd.Series(cross_val_predict(lr, features, target, cv=kf))

actual = loans["loan_status"]
# Confusion-matrix counts.
fp_filter = (predictions == 1) & (actual == 0)  # predicted repay, actually defaulted
print(fp_filter)
fp = len(predictions[fp_filter])
tp_filter = (predictions == 1) & (actual == 1)  # predicted repay, did repay
tp = len(predictions[tp_filter])
fn_filter = (predictions == 0) & (actual == 1)  # predicted default, actually repaid
fn = len(predictions[fn_filter])
tn_filter = (predictions == 0) & (actual == 0)  # predicted default, did default
tn = len(predictions[tn_filter])

# True-positive rate ~ money earned; false-positive rate ~ money lost.
tpr = tp / float(tp + fn)
fpr = fp / float(fp + tn)
print(tpr)
print(fpr)
print(predictions[:20])
打印:
0 1
1 1
2 1
3 1
4 1
5 1
6 1
7 1
8 1
9 1
10 1
11 1
12 1
13 1
14 1
15 1
16 1
17 1
18 1
19 1
0.6501668684840072
0.36815038127327543
直接构建模型效果差
3.4 LogisticRegression模型添加权重
- class_weight="balanced"
from sklearn.linear_model import LogisticRegression
# FIX: KFold is used below but was never imported in this snippet,
# which raises NameError when the section is run on its own.
from sklearn.model_selection import cross_val_predict, KFold

# Logistic regression with class_weight="balanced" so the minority
# "Charged Off" class (label 0) is not drowned out by the majority class.
lr = LogisticRegression(class_weight="balanced")
kf = KFold(5, shuffle=True, random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)
# False positives: predicted fully-paid but actually charged off.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positives.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negatives: predicted charged-off but actually fully paid.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negatives.
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])
# Rates: tpr ~ profit captured, fpr ~ losses incurred.
tpr = tp / float(tp + fn)
fpr = fp / float(fp + tn)
print(tpr)
print(fpr)
print(predictions[:20])

# Overall accuracy of the cross-validated predictions.
loans['predicted_label'] = predictions
matches = loans["predicted_label"] == loans["loan_status"]
correct_predictions = loans[matches]
print('len(correct_predictions)', len(correct_predictions))
print('float(len(admissions)', float(len(loans)))
accuracy = len(correct_predictions) / float(len(loans))
print('准确率', accuracy)
0.6501668684840072
0.36815038127327543
0 1
1 0
2 0
3 1
4 1
5 0
6 0
7 0
8 0
9 0
10 1
11 0
12 1
13 1
14 0
15 0
16 1
17 1
18 1
19 0
len(correct_predictions) 25577
float(len(admissions) 39498.0
准确率 0.6475517747734062
模型效果一般
- class_weight=penalty
from sklearn.linear_model import LogisticRegression
# FIX: KFold is used below but was never imported in this snippet,
# which raises NameError when the section is run on its own.
from sklearn.model_selection import cross_val_predict, KFold

# Hand-tuned class weights: misclassifying a charged-off loan (label 0)
# costs 5x as much as misclassifying a fully-paid one.
penalty = {
    0: 5,
    1: 1
}
lr = LogisticRegression(class_weight=penalty)
kf = KFold(5, shuffle=True, random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)
# False positives: predicted fully-paid but actually charged off.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positives.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negatives: predicted charged-off but actually fully paid.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negatives.
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])
# Rates.
tpr = tp / float(tp + fn)
fpr = fp / float(fp + tn)
print('#自己定义权重项', tpr)
print('#自己定义权重项', fpr)

# Overall accuracy of the cross-validated predictions.
loans['predicted_label'] = predictions
matches = loans["predicted_label"] == loans["loan_status"]
correct_predictions = loans[matches]
print('len(correct_predictions)', len(correct_predictions))
print('float(len(admissions)', float(len(loans)))
accuracy = len(correct_predictions) / float(len(loans))
print('#自己定义权重项准确率', accuracy)
#自己定义权重项 0.6933459346111817
#自己定义权重项 0.45664124844830645
len(correct_predictions) 26540
float(len(admissions) 39498.0
#自己定义权重项准确率 0.6719327560889159
3.5 随机森林
from sklearn.ensemble import RandomForestClassifier
# FIX: KFold is used below but was never imported in this snippet,
# which raises NameError when the section is run on its own.
from sklearn.model_selection import cross_val_predict, KFold

# Random forest with balanced class weights, evaluated the same way as the
# logistic-regression models above.
rf = RandomForestClassifier(n_estimators=10, class_weight="balanced", random_state=1)
kf = KFold(5, shuffle=True, random_state=1)
predictions = cross_val_predict(rf, features, target, cv=kf)
predictions = pd.Series(predictions)
# False positives: predicted fully-paid but actually charged off.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positives.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negatives: predicted charged-off but actually fully paid.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negatives.
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])
# Rates.
tpr = tp / float(tp + fn)
fpr = fp / float(fp + tn)
print('随机森林', tpr)
print('随机森林', fpr)
print(predictions[:20])

# Overall accuracy of the cross-validated predictions.
loans['predicted_label'] = predictions
matches = loans["predicted_label"] == loans["loan_status"]
correct_predictions = loans[matches]
print('len(correct_predictions)', len(correct_predictions))
print('float(len(admissions)', float(len(loans)))
accuracy = len(correct_predictions) / float(len(loans))
print('随机森林准确率', accuracy)
随机森林 0.9744824123571281
随机森林 0.9313708104273808
0 1
1 1
2 1
3 1
4 1
5 1
6 1
7 1
8 1
9 1
10 1
11 1
12 1
13 1
14 1
15 1
16 1
17 1
18 1
19 1
len(correct_predictions) 33382
float(len(admissions) 39498.0
随机森林准确率 0.8451567167957871
从预测结果来看也不是很理想,
可以调节正负样本权重以及模型的参数进行优化,也可以选择比如SVM等模型进行对比。