1. 数据清洗过滤无用特征
1.1 设定的nan个数界限保留整列
- dropna(thresh=n)非nan最少n个才能保留
import pandas as pd

# Load the raw LendingClub export; the first CSV line is a notice, not the
# header, so skip it.
loans_2007 = pd.read_csv('LoanStats3a.csv', skiprows=1)
# Keep only columns with at least half non-NaN values.
# FIX: dropna(thresh=...) requires an integer -- `len(df) / 2` produces a
# float under Python 3, which newer pandas versions reject.
half_count = len(loans_2007) // 2
loans_2007 = loans_2007.dropna(thresh=half_count, axis=1)
# 'desc' (free text) and 'url' carry no predictive signal; drop them.
loans_2007 = loans_2007.drop(['desc', 'url'], axis=1)
loans_2007.to_csv('loans_2007.csv', index=False)
1.2 查看数据,第一行与列数
import pandas as pd

# Reload the filtered data set, then preview the first record and report
# how many columns survived the NaN filtering.
loans_2007 = pd.read_csv("loans_2007.csv")
# loans_2007.drop_duplicates()  # optional de-duplication step, not applied
print(loans_2007.iloc[0])
print(loans_2007.shape[1])  # number of columns
id 1077501
member_id 1.2966e+06
loan_amnt 5000
funded_amnt 5000
funded_amnt_inv 4975
term 36 months
int_rate 10.65%
installment 162.87
grade B
sub_grade B2
emp_title NaN
emp_length 10+ years
home_ownership RENT
annual_inc 24000
verification_status Verified
issue_d Dec-2011
loan_status Fully Paid
pymnt_plan n
purpose credit_card
title Computer
zip_code 860xx
addr_state AZ
dti 27.65
delinq_2yrs 0
earliest_cr_line Jan-1985
inq_last_6mths 1
open_acc 3
pub_rec 0
revol_bal 13648
revol_util 83.7%
total_acc 9
initial_list_status f
out_prncp 0
out_prncp_inv 0
total_pymnt 5863.16
total_pymnt_inv 5833.84
total_rec_prncp 5000
total_rec_int 863.16
total_rec_late_fee 0
recoveries 0
collection_recovery_fee 0
last_pymnt_d Jan-2015
last_pymnt_amnt 171.62
last_credit_pull_d Nov-2016
collections_12_mths_ex_med 0
policy_code 1
application_type INDIVIDUAL
acc_now_delinq 0
chargeoff_within_12_mths 0
delinq_amnt 0
pub_rec_bankruptcies 0
tax_liens 0
Name: 0, dtype: object
52
1.3 删除无用列
# Remove columns that are identifiers, redundant with the target, or leak
# post-origination information (payment outcomes), all in one pass.
useless_cols = [
    "id", "member_id", "funded_amnt", "funded_amnt_inv", "grade",
    "sub_grade", "emp_title", "issue_d",
    "zip_code", "out_prncp", "out_prncp_inv", "total_pymnt",
    "total_pymnt_inv", "total_rec_prncp",
    "total_rec_int", "total_rec_late_fee", "recoveries",
    "collection_recovery_fee", "last_pymnt_d", "last_pymnt_amnt",
]
loans_2007 = loans_2007.drop(useless_cols, axis=1)
print(loans_2007.iloc[0])
print(loans_2007.shape[1])
loan_amnt 5000
term 36 months
int_rate 10.65%
installment 162.87
emp_length 10+ years
home_ownership RENT
annual_inc 24000
verification_status Verified
loan_status Fully Paid
pymnt_plan n
purpose credit_card
title Computer
addr_state AZ
dti 27.65
delinq_2yrs 0
earliest_cr_line Jan-1985
inq_last_6mths 1
open_acc 3
pub_rec 0
revol_bal 13648
revol_util 83.7%
total_acc 9
initial_list_status f
last_credit_pull_d Nov-2016
collections_12_mths_ex_med 0
policy_code 1
application_type INDIVIDUAL
acc_now_delinq 0
chargeoff_within_12_mths 0
delinq_amnt 0
pub_rec_bankruptcies 0
tax_liens 0
Name: 0, dtype: object
32
2. 数据预处理
2.1 保留loan_status中两值变为0和1 (target)
# Keep only finished loans and binarize the target column:
# "Fully Paid" -> 1 (repaid), "Charged Off" -> 0 (defaulted).
print(loans_2007['loan_status'].value_counts())
finished = loans_2007['loan_status'].isin(["Fully Paid", "Charged Off"])
loans_2007 = loans_2007[finished]
loans_2007 = loans_2007.replace(
    {"loan_status": {"Fully Paid": 1, "Charged Off": 0}}
)
2.2 删除唯一属性的列
# Drop any column holding a single unique value -- a constant column carries
# no information for a classifier.
orig_columns = loans_2007.columns
drop_columns = []
for col in orig_columns:
    # dropna() first: otherwise NaN would count as an extra "unique" value
    # and constant-but-sparse columns would be missed.
    col_series = loans_2007[col].dropna().unique()
    if len(col_series) == 1:
        drop_columns.append(col)
loans_2007 = loans_2007.drop(drop_columns, axis=1)
print(drop_columns)
# FIX: the original used the Python 2 print statement
# (`print loans_2007.shape`), a SyntaxError under Python 3.
print(loans_2007.shape)
loans_2007.to_csv('filtered_loans_2007.csv', index=False)
2.3 求出每列的空值总和,并去除空值
import pandas as pd

# Count missing values per column, drop the one column with many NaNs,
# then discard any remaining rows that still contain a NaN.
loans = pd.read_csv('filtered_loans_2007.csv')
null_counts = loans.isnull().sum()
print(null_counts)
loans = loans.drop("pub_rec_bankruptcies", axis=1)  # column with many NaNs
loans = loans.dropna(axis=0)  # rows with any remaining NaN
2.4 查看数据各个类型的数量,并转换字符类型的列的值
# Show how many columns exist of each dtype, then preview the first record
# of the string-typed columns that still need conversion.
print(loans.dtypes.value_counts())
object_columns_df = loans.select_dtypes(include="object")
print(object_columns_df.iloc[0])
2.5 查看列的各个特征值的数量,判断是更改还是删除
# Inspect the category frequencies of the candidate string columns to decide
# whether each should be encoded or dropped.
for column in ('home_ownership', 'verification_status', 'emp_length',
               'term', 'addr_state'):
    print(loans[column].value_counts())
RENT 18780
MORTGAGE 17574
OWN 3045
OTHER 96
NONE 3
Name: home_ownership, dtype: int64
Not Verified 16856
Verified 12705
Source Verified 9937
Name: verification_status, dtype: int64
10+ years 8821
< 1 year 4563
2 years 4371
3 years 4074
4 years 3409
5 years 3270
1 year 3227
6 years 2212
7 years 1756
8 years 1472
9 years 1254
n/a 1069
Name: emp_length, dtype: int64
36 months 29041
60 months 10457
Name: term, dtype: int64
CA 7070
NY 3788
FL 2856
TX 2714
NJ 1838
IL 1517
PA 1504
VA 1400
GA 1393
MA 1336
OH 1208
MD 1049
AZ 874
WA 834
CO 786
NC 780
CT 747
MI 722
MO 682
MN 611
NV 492
SC 470
WI 453
AL 446
OR 445
LA 435
KY 325
OK 298
KS 269
UT 256
AR 243
DC 211
RI 198
NM 188
WV 176
HI 172
NH 172
DE 113
MT 84
WY 83
AK 79
SD 63
VT 54
MS 19
TN 17
IN 9
ID 6
IA 5
NE 5
ME 3
Name: addr_state, dtype: int64
# 'purpose' is a curated category set while 'title' is free text full of
# near-duplicate spellings; compare their distributions before choosing one.
for column in ("purpose", "title"):
    print(loans[column].value_counts())
debt_consolidation 18533
credit_card 5099
other 3963
home_improvement 2965
major_purchase 2181
small_business 1815
car 1544
wedding 945
medical 692
moving 581
vacation 379
house 378
educational 320
renewable_energy 103
Name: purpose, dtype: int64
Debt Consolidation 2168
Debt Consolidation Loan 1706
Personal Loan 658
Consolidation 509
debt consolidation 502
Credit Card Consolidation 356
Home Improvement 354
Debt consolidation 333
Small Business Loan 322
Credit Card Loan 313
Personal 308
Consolidation Loan 255
Home Improvement Loan 246
personal loan 234
personal 220
Loan 212
Wedding Loan 209
consolidation 200
Car Loan 200
...
Name: title, dtype: int64
2.6 更改字符的类型
# Map employment-length strings onto ordinal integers; both "< 1 year" and
# "n/a" collapse to 0.
mapping_dict = {
    "emp_length": {
        "10+ years": 10,
        "9 years": 9,
        "8 years": 8,
        "7 years": 7,
        "6 years": 6,
        "5 years": 5,
        "4 years": 4,
        "3 years": 3,
        "2 years": 2,
        "1 year": 1,
        "< 1 year": 0,
        "n/a": 0
    }
}
# Drop columns whose value sets are too large to encode usefully.
loans = loans.drop(["last_credit_pull_d", "earliest_cr_line", "addr_state", "title"], axis=1)
# Strip the trailing '%' and convert the percentage columns to floats.
for pct_col in ("int_rate", "revol_util"):
    loans[pct_col] = loans[pct_col].str.rstrip("%").astype("float")
loans = loans.replace(mapping_dict)
2.7 字符one-hot编码
# One-hot encode the remaining categorical columns, then remove the
# originals together with the near-constant 'pymnt_plan' column.
cat_columns = ["home_ownership", "verification_status", "emp_length", "purpose", "term"]
dummy_df = pd.get_dummies(loans[cat_columns])
loans = pd.concat([loans, dummy_df], axis=1)
loans = loans.drop(cat_columns + ["pymnt_plan"], axis=1)
loans.to_csv('cleaned_loans2007.csv', index=False)
3. 获得最大利润的条件与做法
3.1 查看数据的类型
import pandas as pd

# Reload the fully cleaned data set and confirm the dtypes of every column.
loans = pd.read_csv("cleaned_loans2007.csv")
print(loans.info())
全部为数值类型(已无object列)
3.2 建立数据集
# Split into feature matrix and target vector: 'loan_status' is the label,
# everything else is a feature.
target = loans["loan_status"]
features = loans[loans.columns.drop("loan_status")]
3.3 LogisticRegression简单模型
- 交叉验证
- cross_val_predict 为每个输入数据点生成交叉验证的估计值
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, KFold

# Baseline model: plain logistic regression scored with out-of-fold
# predictions from 5-fold cross-validation (one prediction per row).
lr = LogisticRegression()
kf = KFold(5, shuffle=True, random_state=1)
predictions = pd.Series(cross_val_predict(lr, features, target, cv=kf))

actual = loans["loan_status"]
# Confusion-matrix counts.
fp_filter = (predictions == 1) & (actual == 0)  # predicted repay, actually defaulted
print(fp_filter)
fp = len(predictions[fp_filter])
tp_filter = (predictions == 1) & (actual == 1)  # predicted repay, did repay
tp = len(predictions[tp_filter])
fn_filter = (predictions == 0) & (actual == 1)  # predicted default, actually repaid
fn = len(predictions[fn_filter])
tn_filter = (predictions == 0) & (actual == 0)  # predicted default, did default
tn = len(predictions[tn_filter])

# True-positive rate ~ money earned; false-positive rate ~ money lost.
tpr = tp / float(tp + fn)
fpr = fp / float(fp + tn)
print(tpr)
print(fpr)
print(predictions[:20])
打印:
0 1
1 1
2 1
3 1
4 1
5 1
6 1
7 1
8 1
9 1
10 1
11 1
12 1
13 1
14 1
15 1
16 1
17 1
18 1
19 1
0.6501668684840072
0.36815038127327543
直接构建模型效果差
3.4 LogisticRegression模型添加权重
- class_weight="balanced"
from sklearn.linear_model import LogisticRegression
# FIX: KFold is used below but was never imported in this snippet,
# which raises NameError when the section is run on its own.
from sklearn.model_selection import cross_val_predict, KFold

# Logistic regression with class_weight="balanced" so the minority
# "Charged Off" class (label 0) is not drowned out by the majority class.
lr = LogisticRegression(class_weight="balanced")
kf = KFold(5, shuffle=True, random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)
# False positives: predicted fully-paid but actually charged off.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positives.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negatives: predicted charged-off but actually fully paid.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negatives.
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])
# Rates: tpr ~ profit captured, fpr ~ losses incurred.
tpr = tp / float(tp + fn)
fpr = fp / float(fp + tn)
print(tpr)
print(fpr)
print(predictions[:20])

# Overall accuracy of the cross-validated predictions.
loans['predicted_label'] = predictions
matches = loans["predicted_label"] == loans["loan_status"]
correct_predictions = loans[matches]
print('len(correct_predictions)', len(correct_predictions))
print('float(len(admissions)', float(len(loans)))
accuracy = len(correct_predictions) / float(len(loans))
print('准确率', accuracy)
0.6501668684840072
0.36815038127327543
0 1
1 0
2 0
3 1
4 1
5 0
6 0
7 0
8 0
9 0
10 1
11 0
12 1
13 1
14 0
15 0
16 1
17 1
18 1
19 0
len(correct_predictions) 25577
float(len(admissions) 39498.0
准确率 0.6475517747734062
模型效果一般
- class_weight=penalty
from sklearn.linear_model import LogisticRegression
# FIX: KFold is used below but was never imported in this snippet,
# which raises NameError when the section is run on its own.
from sklearn.model_selection import cross_val_predict, KFold

# Hand-tuned class weights: misclassifying a charged-off loan (label 0)
# costs 5x as much as misclassifying a fully-paid one.
penalty = {
    0: 5,
    1: 1
}
lr = LogisticRegression(class_weight=penalty)
kf = KFold(5, shuffle=True, random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)
# False positives: predicted fully-paid but actually charged off.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positives.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negatives: predicted charged-off but actually fully paid.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negatives.
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])
# Rates.
tpr = tp / float(tp + fn)
fpr = fp / float(fp + tn)
print('#自己定义权重项', tpr)
print('#自己定义权重项', fpr)

# Overall accuracy of the cross-validated predictions.
loans['predicted_label'] = predictions
matches = loans["predicted_label"] == loans["loan_status"]
correct_predictions = loans[matches]
print('len(correct_predictions)', len(correct_predictions))
print('float(len(admissions)', float(len(loans)))
accuracy = len(correct_predictions) / float(len(loans))
print('#自己定义权重项准确率', accuracy)
#自己定义权重项 0.6933459346111817
#自己定义权重项 0.45664124844830645
len(correct_predictions) 26540
float(len(admissions) 39498.0
#自己定义权重项准确率 0.6719327560889159
3.5 随机森林
from sklearn.ensemble import RandomForestClassifier
# FIX: KFold is used below but was never imported in this snippet,
# which raises NameError when the section is run on its own.
from sklearn.model_selection import cross_val_predict, KFold

# Random forest with balanced class weights, evaluated the same way as the
# logistic-regression models above.
rf = RandomForestClassifier(n_estimators=10, class_weight="balanced", random_state=1)
kf = KFold(5, shuffle=True, random_state=1)
predictions = cross_val_predict(rf, features, target, cv=kf)
predictions = pd.Series(predictions)
# False positives: predicted fully-paid but actually charged off.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positives.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negatives: predicted charged-off but actually fully paid.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negatives.
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])
# Rates.
tpr = tp / float(tp + fn)
fpr = fp / float(fp + tn)
print('随机森林', tpr)
print('随机森林', fpr)
print(predictions[:20])

# Overall accuracy of the cross-validated predictions.
loans['predicted_label'] = predictions
matches = loans["predicted_label"] == loans["loan_status"]
correct_predictions = loans[matches]
print('len(correct_predictions)', len(correct_predictions))
print('float(len(admissions)', float(len(loans)))
accuracy = len(correct_predictions) / float(len(loans))
print('随机森林准确率', accuracy)
随机森林 0.9744824123571281
随机森林 0.9313708104273808
0 1
1 1
2 1
3 1
4 1
5 1
6 1
7 1
8 1
9 1
10 1
11 1
12 1
13 1
14 1
15 1
16 1
17 1
18 1
19 1
len(correct_predictions) 33382
float(len(admissions) 39498.0
随机森林准确率 0.8451567167957871
从预测结果来看也不是很理想,
可以调节正负样本权重以及模型的参数进行优化,也可以选择比如SVM等模型进行对比。