Python数据分析与机器学习-贷款申请最大化利润

源码下载:

http://download.csdn.net/download/adam_zs/10230326

import pandas as pd

# loans_2007 = pd.read_csv('LoanStats3a.csv', skiprows=1)
# print(loans_2007.shape) #(42538, 111)
# print(len(loans_2007)) #42538
# half_count = len(loans_2007) / 2
# loans_2007 = loans_2007.dropna(thresh=half_count, axis=1)
# loans_2007 = loans_2007.drop(['desc', 'url'], axis=1)
# loans_2007.to_csv('loans_2007.csv', index=False)

# LoanStats3a = pd.read_csv('LoanStats3a.csv', skiprows=1)
# loans_2007 = pd.read_csv('loans_2007.csv')
# print(LoanStats3a.shape)
# print(loans_2007.shape)

# loans_2007 = pd.read_csv('loans_2007.csv')
# print(loans_2007.iloc[0])

# print(loans_2007.shape) #(42538, 52)
# drop_columns = ["id", "member_id", "funded_amnt", "funded_amnt_inv", "grade", "sub_grade", "emp_title", "issue_d",
#                 "zip_code", "out_prncp", "out_prncp_inv", "total_pymnt", "total_pymnt_inv", "total_rec_prncp",
#                 "total_rec_int", "total_rec_late_fee", "recoveries", "collection_recovery_fee", "last_pymnt_d",
#                 "last_pymnt_amnt"]
# loans_2007 = loans_2007.drop(drop_columns, axis=1)
# print(loans_2007.shape) #(42538, 32)

# loan_status 贷款状态
# print(loans_2007['loan_status'].value_counts())
'''
sys:1: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.
Fully Paid                                             33902  loan repaid in full
Charged Off                                             5658  written off as a loss (borrower stopped repaying)
Does not meet the credit policy. Status:Fully Paid      1988
Does not meet the credit policy. Status:Charged Off      761
Current                                                  201
Late (31-120 days)                                        10
In Grace Period                                            9
Late (16-30 days)                                          5
Default                                                    1
'''

# loans_2007 = loans_2007[(loans_2007['loan_status'] == 'Fully Paid') | (loans_2007['loan_status'] == 'Charged Off')]
# status_replace = {
#     "loan_status": {
#         "Fully Paid": 1,
#         "Charged Off": 0
#     }
# }
# loans_2007 = loans_2007.replace(status_replace)

# 删除列值一样的列
# orig_columns = loans_2007.columns
# drop_columns = []
# for col in orig_columns:
#     col_series = loans_2007[col].dropna().unique()
#     if len(col_series) == 1:
#         drop_columns.append(col)
# print(drop_columns)
# loans_2007 = loans_2007.drop(drop_columns, axis=1)

# loans_2007.to_csv("filtered_loans_2007.csv", index=False)

# loans = pd.read_csv('filtered_loans_2007.csv')
# null_counts = loans.isnull().sum()
# print(null_counts)
# print(loans.shape) #(39560, 24)
# loans.drop(['pub_rec_bankruptcies'], inplace=True, axis=1)
# loans.dropna(axis=0, inplace=True)
# print(loans.shape) #(38428, 23)
# print(loans.dtypes.value_counts())
'''
object     12
float64    10
int64       1
dtype: int64
'''
# object_columns_df = loans.select_dtypes(include=['object'])
# print(object_columns_df.iloc[0])

# loans_columns = loans.columns
# for col in loans_columns:
#     print(loans[col].value_counts())

# loans = loans.drop(["last_credit_pull_d", "earliest_cr_line", "addr_state", "title"], axis=1)
# print(loans['emp_length'].value_counts())
# Nested replacement table: column name -> {raw text value -> numeric code}.
# Employment length is turned into a number of years; both "< 1 year" and the
# missing marker "n/a" are coded as 0.
mapping_dict = {
    "emp_length": dict(
        [("10+ years", 10)]
        # "9 years" down to "2 years" follow the regular "<n> years" pattern.
        + [("{} years".format(years), years) for years in range(9, 1, -1)]
        + [("1 year", 1), ("< 1 year", 0), ("n/a", 0)]
    )
}
# loans["int_rate"] = loans["int_rate"].str.rstrip("%").astype("float")
# loans["revol_util"] = loans["revol_util"].str.rstrip("%").astype("float")
# loans = loans.replace(mapping_dict)  # replace() returns a new frame; keep the result
# cat_columns = ["home_ownership", "verification_status", "emp_length", "purpose", "term"]
# dummy_df = pd.get_dummies(loans[cat_columns])
# loans = pd.concat([loans, dummy_df], axis=1)
# loans = loans.drop(cat_columns, axis=1)
# loans = loans.drop("pymnt_plan", axis=1)
# loans.to_csv('cleaned_loans2007.csv', index=False)

# loans = pd.read_csv("cleaned_loans2007.csv")
# print(loans.info())

# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import KFold, cross_val_predict
#
# Work on a 100-row random subset of the cleaned loan table (presumably to
# keep experimentation fast -- raise or drop n for a real evaluation).
loans = pd.read_csv('cleaned_loans2007.csv')
# random_state pins the subset so repeated runs score the same rows.
# reset_index gives the subset a plain 0..n-1 index, so a Series built
# positionally from a NumPy array (e.g. model predictions) lines up with it
# row by row instead of being matched against the original row labels.
loans = loans.sample(n=100, random_state=1).reset_index(drop=True)
cols = loans.columns
# Every column except the label is a model input.
features = loans[cols.drop('loan_status')]
# Label column; per the status_replace mapping in the preprocessing notes it
# is expected to hold 1 for "Fully Paid" and 0 for "Charged Off".
target = loans['loan_status']

# lr = LogisticRegression(class_weight='balanced')  # class_weight 调整正负样本的比例,balanced正负样本平衡
# kf = KFold(n_splits=3, shuffle=True, random_state=1)  # first argument is the fold count, not the sample count
# predictions = cross_val_predict(lr, features, target, cv=kf)
# predictions = pd.Series(predictions)
#
# # True positives.
# tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
# tp = len(predictions[tp_filter])
#
# # False positives.
# fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
# fp = len(predictions[fp_filter])
#
# # False negatives.
# fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
# fn = len(predictions[fn_filter])
#
# # True negatives
# tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
# tn = len(predictions[tn_filter])
#
# # Rates
# tpr = tp / float((tp + fn))
# fpr = fp / float((fp + tn))

# Random forest: cross-validated predictions and the resulting
# true-positive / false-positive rates.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_predict

# In sklearn.model_selection the first KFold argument is the number of folds,
# not the number of samples: passing features.shape[0] silently requested
# leave-one-out. random_state is only meaningful (and only accepted by current
# scikit-learn) together with shuffle=True.
kf = KFold(n_splits=3, shuffle=True, random_state=1)
# class_weight='balanced' re-weights the positive and negative classes so the
# minority class is not ignored.
rf = RandomForestClassifier(class_weight='balanced', random_state=1)

# One out-of-fold prediction per row, returned in the row order of `features`.
predictions = cross_val_predict(rf, features, target, cv=kf)
# Carry the target's index over to the predictions: comparisons between two
# Series align on index labels, so a fresh 0..n-1 index would pair each
# prediction with the wrong row (or with no row) after sampling.
predictions = pd.Series(predictions, index=target.index)

# False positives.
fp_filter = (predictions == 1) & (target == 0)
fp = int(fp_filter.sum())

# True positives.
tp_filter = (predictions == 1) & (target == 1)
tp = int(tp_filter.sum())

# False negatives.
fn_filter = (predictions == 0) & (target == 1)
fn = int(fn_filter.sum())

# True negatives
tn_filter = (predictions == 0) & (target == 0)
tn = int(tn_filter.sum())

# Rates. A small sample can contain no actual positives (or no actual
# negatives); report NaN in that case instead of raising ZeroDivisionError.
tpr = tp / float(tp + fn) if (tp + fn) else float('nan')
fpr = fp / float(fp + tn) if (fp + tn) else float('nan')


  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值