文章目录
一、导入数据
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'svg'
import toad
from toad.plot import bin_plot, badrate_plot
import math
from imblearn.over_sampling import SMOTE, RandomOverSampler
import seaborn as sns
sns.set()
# Data source: Kaggle project "give me some credit"
credit_df0 = pd.read_csv('data/GiveMeSomeCredit/cs-training.csv')
# Preview the dataset
credit_df0.head()
# View descriptive statistics for every column
toad.detect(credit_df0)
列名含义:
- SeriousDlqin2yrs:两年内是否发生90天及以上的严重逾期(目标变量)
- RevolvingUtilizationOfUnsecuredLines:除了房贷车贷之外的信用卡账面金额(即贷款金额)/信用卡总额度
- age:贷款人年龄
- NumberOfTime30-59DaysPastDueNotWorse:30-59天逾期但不糟糕次数
- DebtRatio:负债比率
- MonthlyIncome:月收入
- NumberOfOpenCreditLinesAndLoans:开放式信贷和贷款数量,开放式贷款(分期付款如汽车贷款或抵押贷款)和信贷(如信用卡)的数量
- NumberOfTimes90DaysLate:借款者有90天或更高逾期的次数
- NumberRealEstateLoansOrLines:不动产贷款或额度数量
- NumberOfTime60-89DaysPastDueNotWorse:60-89天逾期但不糟糕次数
- NumberOfDependents:不包括本人在内的家属数量
二、EDA
# Drop the row-number column
credit_df1 = credit_df0.drop(['Unnamed: 0'], axis=1)
# Shorten the column names (DebtRatio and MonthlyIncome keep their original names)
colnames = {
    'SeriousDlqin2yrs': 'Isdlq',
    'age': 'Age',
    'RevolvingUtilizationOfUnsecuredLines': 'Revol',
    'NumberOfTime30-59DaysPastDueNotWorse': 'Num30-59late',
    'NumberOfOpenCreditLinesAndLoans': 'Numopen',
    'NumberOfTimes90DaysLate': 'Num90late',
    'NumberRealEstateLoansOrLines': 'Numestate',
    'NumberOfTime60-89DaysPastDueNotWorse': 'Num60-89late',
    'NumberOfDependents': 'Numdepend',
}
credit_df1.rename(columns=colnames, inplace=True)
credit_df1.head()
# Plot the class counts of the target.
# The column is passed by keyword because seaborn >= 0.12 treats the first
# positional argument as `data`, not `x`.
sns.countplot(x='Isdlq', data=credit_df1)
# Print 100 * mean(Isdlq), rounded to two decimals.
# The f-string is kept on one line: before Python 3.12 a replacement field
# cannot span lines inside a single-quoted f-string.
print(f"好坏比:{np.round(100 * credit_df1['Isdlq'].mean(), 2)}%")
# Recorded output: 好坏比:6.68%
2.1 查看Revol特征
# Descriptive statistics for Revol (credit utilization ratio), including the 99% and 99.9% percentiles
credit_df1['Revol'].describe([0.99, 0.999])
# The string below appears to be the recorded output of the call above
"""
count 150000.000000
mean 6.048438
std 249.755371
min 0.000000
50% 0.154181
99% 1.092956
99.9% 1571.006000
max 50708.000000
Name: Revol, dtype: float64
"""
明显分布异常
# Plot the distribution of Revol for values below 1.
# histplot replaces the deprecated distplot; kde=True and stat='density'
# reproduce distplot's default histogram-plus-KDE view.
sns.histplot(credit_df1.loc[credit_df1['Revol'] < 1, 'Revol'], bins=10, kde=True, stat='density')
# 定义一个分箱并统计箱内坏客户率的函数
def show_rate_by_box(df, target_name, feature_name, bins):
    """Bin a feature and report the target's mean and row count per bin.

    Args:
        df: DataFrame containing both columns.
        target_name: Column whose per-bin mean and count are computed.
        feature_name: Column to cut into intervals.
        bins: Bin specification passed to ``pd.cut``; intervals are
            left-closed and right-open (``right=False``).

    Returns:
        DataFrame indexed by the intervals of ``feature_name`` with columns
        ``('mean', target_name)`` and ``('count', target_name)``.
    """
    binned_feature = pd.cut(df[feature_name], bins=bins, right=False)
    temp = pd.concat([df[target_name], binned_feature], axis=1)
    return pd.pivot_table(
        temp,
        index=[feature_name],
        values=[target_name],
        aggfunc=['mean', 'count'],
    )
按理说Revol不应该大于1,所以我们重点查看大于1的数据违约率如何
# Coarse first-pass bins to inspect the default rate per interval; used as a reference for later binning
revol_bins = [0, 0.5, 1, 1.5, 2, 5, 10, 20, 30, 40, 50, 100, 1000, 5000, math.inf]
temp = show_rate_by_box(credit_df1, 'Isdlq', 'Revol', bins=revol_bins)
# Show the table that was just computed instead of running the pivot a second time
temp
# Bar chart of the per-bin mean of Isdlq for easier reading
plt.figure(figsize=(15, 5))
sns.barplot(x=temp.index, y=temp[('mean', 'Isdlq')])
结论:1到20坏客户比率明显上升,20以上又降下来,将异常值阈值确定为20。高于20后续统一删除。
2.2 Age
# Descriptive statistics for Age, including the 1% percentile
credit_df1['Age'].describe([0.01])
# The string below appears to be the recorded output of the call above
"""
count 150000.000000
mean 52.295207
std 14.771866
min 0.000000
1% 24.000000
50% 52.000000
max 109.000000
Name: Age, dtype: float64
"""
年龄小于18岁不符合业务逻辑,后续准备统一排除
# Count the rows with Age below 18 (the rows slated for removal)
len(credit_df1[credit_df1['Age'] < 18])
# Recorded output: 1
# Plot the Age distribution.
# histplot replaces the deprecated distplot; kde=True and stat='density'
# reproduce distplot's default histogram-plus-KDE view.
sns.histplot(credit_df1['Age'], kde=True, stat='density')
2.3 DebtRatio
# Descriptive statistics for DebtRatio, including the 1%, 99% and 99.9% percentiles
credit_df1['DebtRatio'].describe([0.01, 0.99, 0.999])
# The string below appears to be the recorded output of the call above
"""
count 150000.000000
mean 353.005076
std 2037.818523
min 0.000000
1% 0.000000
50% 0.366508
99% 4979.040000
99.9% 10613.074000
max 329664.000000
Name: DebtRatio, dtype: float64
"""
# 画图查看分布
sns.distplot(credit_df1[</