import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Load the training set from the working directory and preview the first rows.
train_path = 'train.csv'
data = pd.read_csv(train_path)
data.head()
id | loanAmnt | term | interestRate | installment | grade | subGrade | employmentTitle | employmentLength | homeOwnership | ... | n5 | n6 | n7 | n8 | n9 | n10 | n11 | n12 | n13 | n14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 35000.0 | 5 | 19.52 | 917.97 | E | E2 | 320.0 | 2 years | 2 | ... | 9.0 | 8.0 | 4.0 | 12.0 | 2.0 | 7.0 | 0.0 | 0.0 | 0.0 | 2.0 |
1 | 1 | 18000.0 | 5 | 18.49 | 461.90 | D | D2 | 219843.0 | 5 years | 0 | ... | NaN | NaN | NaN | NaN | NaN | 13.0 | NaN | NaN | NaN | NaN |
2 | 2 | 12000.0 | 5 | 16.99 | 298.17 | D | D3 | 31698.0 | 8 years | 0 | ... | 0.0 | 21.0 | 4.0 | 5.0 | 3.0 | 11.0 | 0.0 | 0.0 | 0.0 | 4.0 |
3 | 3 | 11000.0 | 3 | 7.26 | 340.96 | A | A4 | 46854.0 | 10+ years | 1 | ... | 16.0 | 4.0 | 7.0 | 21.0 | 6.0 | 9.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 4 | 3000.0 | 3 | 12.99 | 101.07 | C | C2 | 54.0 | NaN | 1 | ... | 4.0 | 9.0 | 10.0 | 15.0 | 7.0 | 12.0 | 0.0 | 0.0 | 0.0 | 4.0 |
5 rows × 47 columns
# Show the column labels of the raw frame as read from the CSV.
data.columns
Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
'annualIncome', 'verificationStatus', 'issueDate', 'isDefault',
'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years',
'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec',
'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc',
'initialListStatus', 'applicationType', 'earliesCreditLine', 'title',
'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8',
'n9', 'n10', 'n11', 'n12', 'n13', 'n14'],
dtype='object')
import copy

# Work on a copy so `data` keeps the column names read from the CSV.
df = copy.copy(data)
col_name = data.columns
# Display labels, assigned positionally: the order must match the CSV columns.
named_cols = [
    '信用证标识', '贷款金额', '贷款期限', '贷款利率', '分期付款金额', '贷款等级',
    '贷款等级之子级', '就业职称', '就业年限(年)', '房屋所有权状况', '年收入',
    '验证状态', '贷款发放的月份', '违约状态', '贷款用途类别', '邮政编码的前3位数字',
    '地区编码', '债务收入比', '违约事件数', 'fico所属的下限范围',
    'fico所属的上限范围', '未结信用额度的数量', '贬损公共记录的数量',
    '公开记录清除的数量', '信贷周转余额合计', '循环额度利用率',
    '当前的信用额度总数', '贷款的初始列表状态', '申请方式', '信用额度开立的月份',
    '贷款名称', '公开策略',
]
# The anonymised features n0..n14 keep their names.
anonymous_cols = ['n' + str(idx) for idx in range(15)]
new_col = named_cols + anonymous_cols
df.columns = new_col
df.head()
信用证标识 | 贷款金额 | 贷款期限 | 贷款利率 | 分期付款金额 | 贷款等级 | 贷款等级之子级 | 就业职称 | 就业年限(年) | 房屋所有权状况 | ... | n5 | n6 | n7 | n8 | n9 | n10 | n11 | n12 | n13 | n14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 35000.0 | 5 | 19.52 | 917.97 | E | E2 | 320.0 | 2 years | 2 | ... | 9.0 | 8.0 | 4.0 | 12.0 | 2.0 | 7.0 | 0.0 | 0.0 | 0.0 | 2.0 |
1 | 1 | 18000.0 | 5 | 18.49 | 461.90 | D | D2 | 219843.0 | 5 years | 0 | ... | NaN | NaN | NaN | NaN | NaN | 13.0 | NaN | NaN | NaN | NaN |
2 | 2 | 12000.0 | 5 | 16.99 | 298.17 | D | D3 | 31698.0 | 8 years | 0 | ... | 0.0 | 21.0 | 4.0 | 5.0 | 3.0 | 11.0 | 0.0 | 0.0 | 0.0 | 4.0 |
3 | 3 | 11000.0 | 3 | 7.26 | 340.96 | A | A4 | 46854.0 | 10+ years | 1 | ... | 16.0 | 4.0 | 7.0 | 21.0 | 6.0 | 9.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 4 | 3000.0 | 3 | 12.99 | 101.07 | C | C2 | 54.0 | NaN | 1 | ... | 4.0 | 9.0 | 10.0 | 15.0 | 7.0 | 12.0 | 0.0 | 0.0 | 0.0 | 4.0 |
5 rows × 47 columns
# (number of rows, number of columns) of the relabelled frame.
df.shape
(800000, 47)
df.describe() # summary statistics (count, mean, std, min, quartiles, max) for the features and the target
信用证标识 | 贷款金额 | 贷款期限 | 贷款利率 | 分期付款金额 | 就业职称 | 房屋所有权状况 | 年收入 | 验证状态 | 违约状态 | ... | n5 | n6 | n7 | n8 | n9 | n10 | n11 | n12 | n13 | n14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 800000.000000 | 800000.000000 | 800000.000000 | 800000.000000 | 800000.000000 | 799999.000000 | 800000.000000 | 8.000000e+05 | 800000.000000 | 800000.000000 | ... | 759730.000000 | 759730.000000 | 759730.000000 | 759729.000000 | 759730.000000 | 766761.000000 | 730248.000000 | 759730.000000 | 759730.000000 | 759730.000000 |
mean | 399999.500000 | 14416.818875 | 3.482745 | 13.238391 | 437.947723 | 72005.351714 | 0.614213 | 7.613391e+04 | 1.009683 | 0.199513 | ... | 8.107937 | 8.575994 | 8.282953 | 14.622488 | 5.592345 | 11.643896 | 0.000815 | 0.003384 | 0.089366 | 2.178606 |
std | 230940.252015 | 8716.086178 | 0.855832 | 4.765757 | 261.460393 | 106585.640204 | 0.675749 | 6.894751e+04 | 0.782716 | 0.399634 | ... | 4.799210 | 7.400536 | 4.561689 | 8.124610 | 3.216184 | 5.484104 | 0.030075 | 0.062041 | 0.509069 | 1.844377 |
min | 0.000000 | 500.000000 | 3.000000 | 5.310000 | 15.690000 | 0.000000 | 0.000000 | 0.000000e+00 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 199999.750000 | 8000.000000 | 3.000000 | 9.750000 | 248.450000 | 427.000000 | 0.000000 | 4.560000e+04 | 0.000000 | 0.000000 | ... | 5.000000 | 4.000000 | 5.000000 | 9.000000 | 3.000000 | 8.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
50% | 399999.500000 | 12000.000000 | 3.000000 | 12.740000 | 375.135000 | 7755.000000 | 1.000000 | 6.500000e+04 | 1.000000 | 0.000000 | ... | 7.000000 | 7.000000 | 7.000000 | 13.000000 | 5.000000 | 11.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 |
75% | 599999.250000 | 20000.000000 | 3.000000 | 15.990000 | 580.710000 | 117663.500000 | 1.000000 | 9.000000e+04 | 2.000000 | 0.000000 | ... | 11.000000 | 11.000000 | 10.000000 | 19.000000 | 7.000000 | 14.000000 | 0.000000 | 0.000000 | 0.000000 | 3.000000 |
max | 799999.000000 | 40000.000000 | 5.000000 | 30.990000 | 1715.420000 | 378351.000000 | 5.000000 | 1.099920e+07 | 2.000000 | 1.000000 | ... | 70.000000 | 132.000000 | 79.000000 | 128.000000 | 45.000000 | 82.000000 | 4.000000 | 4.000000 | 39.000000 | 30.000000 |
8 rows × 42 columns
df.info() # show the data type and non-null count of every column
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 47 columns):
信用证标识 800000 non-null int64
贷款金额 800000 non-null float64
贷款期限 800000 non-null int64
贷款利率 800000 non-null float64
分期付款金额 800000 non-null float64
贷款等级 800000 non-null object
贷款等级之子级 800000 non-null object
就业职称 799999 non-null float64
就业年限(年) 753201 non-null object
房屋所有权状况 800000 non-null int64
年收入 800000 non-null float64
验证状态 800000 non-null int64
贷款发放的月份 800000 non-null object
违约状态 800000 non-null int64
贷款用途类别 800000 non-null int64
邮政编码的前3位数字 799999 non-null float64
地区编码 800000 non-null int64
债务收入比 799761 non-null float64
违约事件数 800000 non-null float64
fico所属的下限范围 800000 non-null float64
fico所属的上限范围 800000 non-null float64
未结信用额度的数量 800000 non-null float64
贬损公共记录的数量 800000 non-null float64
公开记录清除的数量 799595 non-null float64
信贷周转余额合计 800000 non-null float64
循环额度利用率 799469 non-null float64
当前的信用额度总数 800000 non-null float64
贷款的初始列表状态 800000 non-null int64
申请方式 800000 non-null int64
信用额度开立的月份 800000 non-null object
贷款名称 799999 non-null float64
公开策略 800000 non-null float64
n0 759730 non-null float64
n1 759730 non-null float64
n2 759730 non-null float64
n3 759730 non-null float64
n4 766761 non-null float64
n5 759730 non-null float64
n6 759730 non-null float64
n7 759730 non-null float64
n8 759729 non-null float64
n9 759730 non-null float64
n10 766761 non-null float64
n11 730248 non-null float64
n12 759730 non-null float64
n13 759730 non-null float64
n14 759730 non-null float64
dtypes: float64(33), int64(9), object(5)
memory usage: 286.9+ MB
df.isnull().sum() # number of missing values per column
信用证标识 0
贷款金额 0
贷款期限 0
贷款利率 0
分期付款金额 0
贷款等级 0
贷款等级之子级 0
就业职称 1
就业年限(年) 46799
房屋所有权状况 0
年收入 0
验证状态 0
贷款发放的月份 0
违约状态 0
贷款用途类别 0
邮政编码的前3位数字 1
地区编码 0
债务收入比 239
违约事件数 0
fico所属的下限范围 0
fico所属的上限范围 0
未结信用额度的数量 0
贬损公共记录的数量 0
公开记录清除的数量 405
信贷周转余额合计 0
循环额度利用率 531
当前的信用额度总数 0
贷款的初始列表状态 0
申请方式 0
信用额度开立的月份 0
贷款名称 1
公开策略 0
n0 40270
n1 40270
n2 40270
n3 40270
n4 33239
n5 40270
n6 40270
n7 40270
n8 40271
n9 40270
n10 33239
n11 69752
n12 40270
n13 40270
n14 40270
dtype: int64
# Fraction of missing values per column.
# isnull().mean() covers every column. The previous describe()-based version
# only saw the numeric columns, so object-typed columns such as '就业年限(年)'
# (which has missing values) were left out of the result.
miss_val = df.isnull().mean()
miss_val
信用证标识 0.000000
贷款金额 0.000000
贷款期限 0.000000
贷款利率 0.000000
分期付款金额 0.000000
就业职称 0.000001
房屋所有权状况 0.000000
年收入 0.000000
验证状态 0.000000
违约状态 0.000000
贷款用途类别 0.000000
邮政编码的前3位数字 0.000001
地区编码 0.000000
债务收入比 0.000299
违约事件数 0.000000
fico所属的下限范围 0.000000
fico所属的上限范围 0.000000
未结信用额度的数量 0.000000
贬损公共记录的数量 0.000000
公开记录清除的数量 0.000506
信贷周转余额合计 0.000000
循环额度利用率 0.000664
当前的信用额度总数 0.000000
贷款的初始列表状态 0.000000
申请方式 0.000000
贷款名称 0.000001
公开策略 0.000000
n0 0.050338
n1 0.050338
n2 0.050338
n3 0.050338
n4 0.041549
n5 0.050338
n6 0.050338
n7 0.050338
n8 0.050339
n9 0.050338
n10 0.041549
n11 0.087190
n12 0.050338
n13 0.050338
n14 0.050338
Name: count, dtype: float64
from matplotlib import pyplot as plt
%matplotlib inline
plt.rcParams['font.sans-serif']=['SimHei'] # 设置字体样式
plt.rcParams['axes.unicode_minus']=False # 设置字符不显示
plt.clf()
plt.figure(figsize=(10,5))
plt.tick_params(labelsize=10)
miss_val.plot(kind='bar', color='red')
plt.show()
<Figure size 432x288 with 0 Axes>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-8pm39NmL-1607247620879)(output_9_1.png)]
违约状态
from collections import Counter

# Tally how many rows carry each value of the target column.
column = '违约状态'
label_counts = Counter(df[column])
print(dict(label_counts))
{1: 159610, 0: 640390}
贷款金额
column = '贷款金额'
# Distinct-value count (NaN, if any, counts as one value) and value range.
print(df[column].nunique(dropna=False))
print('最小值和最大值:',df[column].min(), df[column].max())
plt.figure(figsize=(10,4))
# Left panel: histogram with a KDE overlay.  histplot replaces the deprecated
# distplot, and its default statistic is a count, which is what the '数量'
# axis label states (distplot drew a density).
ax_dist = plt.subplot2grid((1,2), (0,0))
sns.histplot(df[column].dropna(), bins=50, kde=True, ax=ax_dist)
ax_dist.set_xlabel(column)
ax_dist.set_ylabel('数量')
# Right panel: 2-D histogram of the column against the target.
ax_joint = plt.subplot2grid((1,2),(0,1))
sns.histplot(x=column,y='违约状态', data=df, ax=ax_joint)
plt.show()
1540
最小值和最大值: 500.0 40000.0
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-lbpeu9Jv-1607247620884)(output_13_1.png)]
# Two-column view: loan amount next to the target.
money_cols = ['贷款金额', '违约状态']
df_money = df.loc[:, money_cols]
df_money
贷款金额 | 违约状态 | |
---|---|---|
0 | 35000.0 | 1 |
1 | 18000.0 | 0 |
2 | 12000.0 | 0 |
3 | 11000.0 | 0 |
4 | 3000.0 | 0 |
... | ... | ... |
799995 | 25000.0 | 0 |
799996 | 17000.0 | 0 |
799997 | 6000.0 | 1 |
799998 | 19200.0 | 0 |
799999 | 9000.0 | 0 |
800000 rows × 2 columns
贷款期限
column = '贷款期限'
# Frequency table, computed once and reused for printing and plotting.
term_counts = df[column].value_counts()
print(df[column].nunique(dropna=False))
print(term_counts)
plt.figure(figsize=(10, 4))
# Left cell of a 1x2 grid: bar chart of the value frequencies.
plt.subplot2grid((1, 2), (0, 0))
sns.barplot(x=term_counts.index, y=term_counts.values)
plt.ylabel('数量')
plt.title(column)
# Right cell: 2-D histogram of the column against the target.
plt.subplot2grid((1, 2), (0, 1))
sns.histplot(data=df, x=column, y='违约状态')
plt.show()
2
3 606902
5 193098
Name: 贷款期限, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-pMb3R5VT-1607247620887)(output_16_1.png)]
df_time = df.loc[:, ['贷款期限', '违约状态']]
# Split the rows by loan term (3 or 5) and count, within each part, the rows
# whose 违约状态 is 0.  The figures in the comments are from the recorded run.
is_term3 = df_time['贷款期限'] == 3
is_term5 = df_time['贷款期限'] == 5
time3 = df_time[is_term3]  # 606902
time5 = df_time[is_term5]  # 193098
m3 = int((time3['违约状态'] == 0).sum())  # 509776
m5 = int((time5['违约状态'] == 0).sum())  # 130614
m5
130614
# The two plotting recipes used above, wrapped into functions.
def lisan_plot(column):  # discrete data
    """Draw a two-panel overview of one column of the global ``df``.

    Left panel: bar chart of how often each value of ``column`` occurs.
    Right panel: 2-D histogram of ``column`` against '违约状态'.
    """
    plt.figure(figsize=(10, 4))
    freq = df[column].value_counts()
    # Left cell of a 1x2 grid (row 0, column 0).
    plt.subplot2grid((1, 2), (0, 0))
    sns.barplot(x=freq.index, y=freq.values)
    plt.ylabel('数量')
    plt.title(column)
    # Right cell of the grid.
    plt.subplot2grid((1, 2), (0, 1))
    sns.histplot(data=df, x=column, y='违约状态')
    plt.show()
def lianxu_plot(column):  # continuous data
    """Draw a two-panel overview of one numeric column of the global ``df``.

    Left panel: histogram of the non-null values with a KDE overlay.
    Right panel: 2-D histogram of ``column`` against '违约状态'.
    """
    plt.figure(figsize=(10, 4))
    ax_dist = plt.subplot2grid((1, 2), (0, 0))
    # histplot replaces the deprecated distplot.  Its default statistic is a
    # count, which is what the '数量' axis label states (distplot drew a
    # density).  A fixed bin count keeps the number of bars bounded on
    # long-tailed columns.
    sns.histplot(df[column].dropna(), bins=50, kde=True, ax=ax_dist)
    ax_dist.set_xlabel(column)
    ax_dist.set_ylabel('数量')
    ax_joint = plt.subplot2grid((1, 2), (0, 1))
    sns.histplot(x=column, y='违约状态', data=df, ax=ax_joint)
    plt.show()
贷款利率
# Distinct-value count (NaN counts as one value), value range, then the plots.
column = '贷款利率'
col_values = df[column]
print(col_values.nunique(dropna=False))
lowest, highest = col_values.min(), col_values.max()
print('最小值和最大值:', lowest, highest)
lianxu_plot(column)
641
最小值和最大值: 5.31 30.99
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-qyCtjmHG-1607247620891)(output_20_1.png)]
分期付款金额
# Distinct-value count (NaN counts as one value), value range, then the plots.
column = '分期付款金额'
col_values = df[column]
print(col_values.nunique(dropna=False))
lowest, highest = col_values.min(), col_values.max()
print('最小值和最大值:', lowest, highest)
lianxu_plot(column)
72360
最小值和最大值: 15.69 1715.42
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-RIxuwWFW-1607247620893)(output_22_1.png)]
就业职称
# Distinct-value count (NaN counts as one value), value range, then the plots.
column = '就业职称'
col_values = df[column]
print(col_values.nunique(dropna=False))
lowest, highest = col_values.min(), col_values.max()
print('最小值和最大值:', lowest, highest)
lianxu_plot(column)
248684
最小值和最大值: 0.0 378351.0
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-tOnchmUR-1607247620897)(output_24_1.png)]
贷款等级
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = '贷款等级'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
lisan_plot(column)
7
B 233690
C 227118
A 139661
D 119453
E 55661
F 19053
G 5364
Name: 贷款等级, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-rcSnLvMJ-1607247620899)(output_26_1.png)]
df_lev = df[['贷款等级', '违约状态']]
# Per-grade row totals and default counts (rows with 违约状态 == 1), taken from
# the data instead of a hand-copied list of totals, so the two cannot drift
# apart.  groupby sorts the grades, giving the A..G order used before.
grade_stats = df_lev.groupby('贷款等级')['违约状态'].agg(['sum', 'count'])
sum_count = grade_stats['count'].tolist()
list_count = grade_stats['sum'].tolist()
# Default rate per grade = defaults / total rows of that grade.
list_pro = (grade_stats['sum'] / grade_stats['count']).tolist()
print(list_count)
print(list_pro)
[8432, 31079, 51106, 36296, 21390, 8641, 2666]
[0.06037476460858794, 0.13299242586332322, 0.22501959333914529, 0.3038517241090638, 0.38429061641005374, 0.45352437936283, 0.4970171513795675]
# Standardise the per-grade default rates: (rate - mean) / standard deviation.
list_pro = np.array(list_pro)
mea = np.average(list_pro)
sig = np.std(list_pro)
list_sca = [(rate - mea) / sig for rate in list_pro]
list_sca
[-1.5435597448697518,
-1.063502731464889,
-0.45513437578927235,
0.06600492564039066,
0.5977661216413234,
1.0554530382345333,
1.3429727666076647]
贷款等级之子级
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = '贷款等级之子级'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
lisan_plot(column)
35
C1 50763
B4 49516
B5 48965
B3 48600
C2 47068
C3 44751
C4 44272
B2 44227
B1 42382
C5 40264
A5 38045
A4 30928
D1 30538
D2 26528
A1 25909
D3 23410
A3 22655
A2 22124
D4 21139
D5 17838
E1 14064
E2 12746
E3 10925
E4 9273
E5 8653
F1 5925
F2 4340
F3 3577
F4 2859
F5 2352
G1 1759
G2 1231
G3 978
G4 751
G5 645
Name: 贷款等级之子级, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Xq5y4h18-1607247620901)(output_30_1.png)]
df_lev = df[['贷款等级之子级', '违约状态']]
# Row total per category, most frequent first (NaN is not counted);
# dict_count and dict_pro keep this order.
totals = df_lev['贷款等级之子级'].value_counts()
list_keys = totals.keys()
sum_count = totals.values
# Rows with 违约状态 == 1 per category; 0 where a category has none.
defaults = (
    df_lev.loc[df_lev['违约状态'] == 1, '贷款等级之子级']
    .value_counts()
    .reindex(list_keys, fill_value=0)
)
dict_count = defaults.to_dict()
dict_pro = (defaults / totals).to_dict()
ser_pro = pd.Series(dict_pro)
# Previously built from dict_pro, which made the 'count' column repeat the rates.
ser_count = pd.Series(dict_count)
# Group on the rate so the displayed mapping (rate -> category) is unchanged.
lev = pd.DataFrame({'count':ser_count, 'pro':ser_pro}).groupby('pro')
lev.groups
{0.031919410243544714: Index(['A1'], dtype='object'),
0.04569698065449286: Index(['A2'], dtype='object'),
0.05588170381814169: Index(['A3'], dtype='object'),
0.06722064148991205: Index(['A4'], dtype='object'),
0.08539886975949533: Index(['A5'], dtype='object'),
0.10292105138974093: Index(['B1'], dtype='object'),
0.11226174056571778: Index(['B2'], dtype='object'),
0.12923868312757203: Index(['B3'], dtype='object'),
0.1486388238145246: Index(['B4'], dtype='object'),
0.16564893291126315: Index(['B5'], dtype='object'),
0.19135984870870515: Index(['C1'], dtype='object'),
0.20689215602957423: Index(['C2'], dtype='object'),
0.22457598712877924: Index(['C3'], dtype='object'),
0.25011293820021685: Index(['C4'], dtype='object'),
0.2615487780647725: Index(['C5'], dtype='object'),
0.27798153120702074: Index(['D1'], dtype='object'),
0.2975723763570567: Index(['D2'], dtype='object'),
0.30401537804357115: Index(['D3'], dtype='object'),
0.32286295472822746: Index(['D4'], dtype='object'),
0.3347348357439175: Index(['D5'], dtype='object'),
0.35523321956769055: Index(['E1'], dtype='object'),
0.37690255766514985: Index(['E2'], dtype='object'),
0.3874599542334096: Index(['E3'], dtype='object'),
0.4022430712822172: Index(['E4'], dtype='object'),
0.4191609846296082: Index(['E5'], dtype='object'),
0.4264978902953587: Index(['F1'], dtype='object'),
0.45599078341013827: Index(['F2'], dtype='object'),
0.45680738048644115: Index(['F3'], dtype='object'),
0.4661739624786811: Index(['G1'], dtype='object'),
0.4774396642182581: Index(['F4'], dtype='object'),
0.4809098294069862: Index(['G2'], dtype='object'),
0.48299319727891155: Index(['F5'], dtype='object'),
0.5194274028629857: Index(['G3'], dtype='object'),
0.521970705725699: Index(['G4'], dtype='object'),
0.5488372093023256: Index(['G5'], dtype='object')}
# Standardise the per-category default rates: (rate - mean) / standard deviation.
# Despite its name, list_count holds the rates stored in dict_pro.
list_count = np.array(list(dict_pro.values()))
mea = np.average(list_count)
sig = np.std(list_count)
list_sca = [(rate - mea) / sig for rate in list_count]
list_sca
[-0.6724894468126269,
-0.9444795882115417,
-0.8361820464298809,
-1.0679936429813712,
-0.5736005630623201,
-0.4610136639169653,
-0.29842863815922327,
-1.1760800247888685,
-1.2355489953023386,
-0.22562056041576206,
-1.3471067276140738,
-1.4628412843030267,
-0.12099884635799843,
0.003729358447040597,
-1.687592139462477,
0.044749744640706815,
-1.5350324189562223,
-1.599875064942625,
0.16474581355957,
0.24033001604831888,
0.37083620892377905,
0.5087974668230881,
0.576012796804475,
0.6701318425525934,
0.7778424055582643,
0.8245539723898411,
1.0123251269541762,
1.017524121004523,
1.1488828087583802,
1.1842402526907807,
1.077157942215608,
1.1709761626209723,
1.4162043676228493,
1.4323967063357694,
1.6034465377665814]
就业年限(年)
column = '就业年限(年)'
# Frequency table, computed once and reused for printing and plotting.
length_counts = df[column].value_counts()
print(df[column].nunique(dropna=False))
print(length_counts)
plt.figure(figsize=(10, 4))
# Left cell of a 1x2 grid: bar chart of the value frequencies.
plt.subplot2grid((1, 2), (0, 0))
sns.barplot(x=length_counts.index, y=length_counts.values)
plt.ylabel('数量')
plt.title(column)
# Right cell: violin plot with the target on x and the column on y.
plt.subplot2grid((1, 2), (0, 1))
sns.violinplot(data=df, x='违约状态', y=column)
plt.show()
12
10+ years 262753
2 years 72358
< 1 year 64237
3 years 64152
1 year 52489
5 years 50102
4 years 47985
6 years 37254
8 years 36192
7 years 35407
9 years 30272
Name: 就业年限(年), dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-EEZJuj2n-1607247620902)(output_34_1.png)]
df_lev = df[['就业年限(年)', '违约状态']]
# Row total per category, most frequent first (NaN is not counted);
# dict_count and dict_pro keep this order.
totals = df_lev['就业年限(年)'].value_counts()
list_keys = totals.keys()
sum_count = totals.values
# Rows with 违约状态 == 1 per category; 0 where a category has none.
defaults = (
    df_lev.loc[df_lev['违约状态'] == 1, '就业年限(年)']
    .value_counts()
    .reindex(list_keys, fill_value=0)
)
dict_count = defaults.to_dict()
dict_pro = (defaults / totals).to_dict()
ser_pro = pd.Series(dict_pro)
# Previously built from dict_pro, which made the 'count' column repeat the rates.
ser_count = pd.Series(dict_count)
# Group on the rate so the displayed mapping (rate -> category) is unchanged.
lev = pd.DataFrame({'count':ser_count, 'pro':ser_pro}).groupby('pro')
lev.groups
{0.18703877786362097: Index(['10+ years'], dtype='object'),
0.1928115101734042: Index(['6 years'], dtype='object'),
0.19504617730957155: Index(['7 years'], dtype='object'),
0.19574068899445132: Index(['5 years'], dtype='object'),
0.19722590627763043: Index(['8 years'], dtype='object'),
0.19829113264561843: Index(['4 years'], dtype='object'),
0.19840116279069767: Index(['9 years'], dtype='object'),
0.20003316841261506: Index(['2 years'], dtype='object'),
0.20149021074947002: Index(['3 years'], dtype='object'),
0.20488192163394928: Index(['< 1 year'], dtype='object'),
0.20518584846348759: Index(['1 year'], dtype='object')}
# Standardise the per-category default rates: (rate - mean) / standard deviation.
# Despite its name, list_count holds the rates stored in dict_pro.
list_count = np.array(list(dict_pro.values()))
mea = np.average(list_count)
sig = np.std(list_count)
list_sca = [(rate - mea) / sig for rate in list_count]
list_sca
[-2.1517339984033783,
0.4389442686578067,
1.4056352069893359,
0.7294332994473556,
1.4662287861258856,
-0.4168429341752717,
0.09163639107898441,
-1.0008303003707661,
-0.1207366933409439,
-0.5553070138451993,
0.11357298783614711]
房屋所有权状况
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = '房屋所有权状况'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
lisan_plot(column)
6
0 395732
1 317660
2 86309
3 185
5 81
4 33
Name: 房屋所有权状况, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-d1oBIYpv-1607247620904)(output_38_1.png)]
df_lev = df[['房屋所有权状况', '违约状态']]
# Row total per category, most frequent first (NaN is not counted);
# dict_count and dict_pro keep this order.
totals = df_lev['房屋所有权状况'].value_counts()
list_keys = totals.keys()
sum_count = totals.values
# Rows with 违约状态 == 1 per category; 0 where a category has none.
defaults = (
    df_lev.loc[df_lev['违约状态'] == 1, '房屋所有权状况']
    .value_counts()
    .reindex(list_keys, fill_value=0)
)
dict_count = defaults.to_dict()
dict_pro = (defaults / totals).to_dict()
ser_pro = pd.Series(dict_pro)
# Previously built from dict_pro, which made the 'count' column repeat the rates.
ser_count = pd.Series(dict_count)
# Group on the rate so the displayed mapping (rate -> category) is unchanged.
lev = pd.DataFrame({'count':ser_count, 'pro':ser_pro}).groupby('pro')
lev.groups
{0.15151515151515152: Int64Index([4], dtype='int64'),
0.17153528145310462: Int64Index([0], dtype='int64'),
0.20540540540540542: Int64Index([3], dtype='int64'),
0.20779988181997242: Int64Index([2], dtype='int64'),
0.23210665491405905: Int64Index([1], dtype='int64'),
0.2345679012345679: Int64Index([5], dtype='int64')}
年收入
# Distinct-value count (NaN counts as one value), value range, then the plots.
column = '年收入'
col_values = df[column]
print(col_values.nunique(dropna=False))
lowest, highest = col_values.min(), col_values.max()
print('最小值和最大值:', lowest, highest)
lianxu_plot(column)
44926
最小值和最大值: 0.0 10999200.0
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-jSkJF3GR-1607247620905)(output_41_1.png)]
验证状态
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = '验证状态'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
lisan_plot(column)
3
1 309810
2 248968
0 241222
Name: 验证状态, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-danQpWog-1607247620907)(output_43_1.png)]
df_lev = df[['验证状态', '违约状态']]
# Row total per category, most frequent first (NaN is not counted);
# dict_count and dict_pro keep this order.
totals = df_lev['验证状态'].value_counts()
list_keys = totals.keys()
sum_count = totals.values
# Rows with 违约状态 == 1 per category; 0 where a category has none.
defaults = (
    df_lev.loc[df_lev['违约状态'] == 1, '验证状态']
    .value_counts()
    .reindex(list_keys, fill_value=0)
)
dict_count = defaults.to_dict()
dict_pro = (defaults / totals).to_dict()
ser_pro = pd.Series(dict_pro)
# Previously built from dict_pro, which made the 'count' column repeat the rates.
ser_count = pd.Series(dict_count)
# Group on the rate so the displayed mapping (rate -> category) is unchanged.
lev = pd.DataFrame({'count':ser_count, 'pro':ser_pro}).groupby('pro')
lev.groups
{0.14722123189427166: Int64Index([0], dtype='int64'),
0.2094122203931442: Int64Index([1], dtype='int64'),
0.2378578773175669: Int64Index([2], dtype='int64')}
贷款用途类别
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = '贷款用途类别'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
lisan_plot(column)
14
0 464096
4 175433
2 52129
5 46276
3 17579
9 9238
1 9106
8 8657
10 5652
7 5373
6 4354
12 1363
11 554
13 190
Name: 贷款用途类别, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-gX2wWXtO-1607247620908)(output_46_1.png)]
df_lev = df[['贷款用途类别', '违约状态']]
# Row total per category, most frequent first (NaN is not counted);
# dict_count and dict_pro keep this order.
totals = df_lev['贷款用途类别'].value_counts()
list_keys = totals.keys()
sum_count = totals.values
# Rows with 违约状态 == 1 per category; 0 where a category has none.
defaults = (
    df_lev.loc[df_lev['违约状态'] == 1, '贷款用途类别']
    .value_counts()
    .reindex(list_keys, fill_value=0)
)
dict_count = defaults.to_dict()
dict_pro = (defaults / totals).to_dict()
ser_pro = pd.Series(dict_pro)
# Previously built from dict_pro, which made the 'count' column repeat the rates.
ser_count = pd.Series(dict_count)
# Group on the rate so the displayed mapping (rate -> category) is unchanged.
lev = pd.DataFrame({'count':ser_count, 'pro':ser_pro}).groupby('pro')
lev.groups
{0.11151870873074102: Int64Index([12], dtype='int64'),
0.14612452350698857: Int64Index([8], dtype='int64'),
0.1692782999777693: Int64Index([4], dtype='int64'),
0.17571793051852136: Int64Index([2], dtype='int64'),
0.18943057056715398: Int64Index([3], dtype='int64'),
0.18947368421052632: Int64Index([13], dtype='int64'),
0.19616601526149266: Int64Index([7], dtype='int64'),
0.20965511280145216: Int64Index([5], dtype='int64'),
0.2113700613666138: Int64Index([0], dtype='int64'),
0.21833730244641697: Int64Index([9], dtype='int64'),
0.22576940744143317: Int64Index([6], dtype='int64'),
0.22965322009907999: Int64Index([10], dtype='int64'),
0.23826714801444043: Int64Index([11], dtype='int64'),
0.29518998462552165: Int64Index([1], dtype='int64')}
邮政编码的前3位数字
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = '邮政编码的前3位数字'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
lianxu_plot(column)
933
134.0 8956
19.0 8611
51.0 8161
31.0 7337
4.0 7176
...
858.0 1
937.0 1
938.0 1
915.0 1
916.0 1
Name: 邮政编码的前3位数字, Length: 932, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-gdH0wAhu-1607247620909)(output_49_1.png)]
def hexing_plot(column):
    """Draw a two-panel overview of one numeric column of the global ``df``.

    Left panel: histogram of the non-null values with a KDE overlay.
    Right panel: violin plot of ``column`` for each value of '违约状态'.
    """
    plt.figure(figsize=(10, 4))
    ax_dist = plt.subplot2grid((1, 2), (0, 0))
    # histplot replaces the deprecated distplot.  Its default statistic is a
    # count, which is what the '数量' axis label states (distplot drew a
    # density).  A fixed bin count keeps the number of bars bounded.
    sns.histplot(df[column].dropna(), bins=50, kde=True, ax=ax_dist)
    ax_dist.set_title(column)
    ax_dist.set_ylabel('数量')
    ax_violin = plt.subplot2grid((1, 2), (0, 1))
    sns.violinplot(x='违约状态', y=column, data=df, ax=ax_violin)
    plt.show()
地区编码
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = '地区编码'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
hexing_plot(column)
51
8 116921
14 65768
13 65041
21 56671
2 30513
30 28634
0 27180
19 26198
3 25766
9 22902
7 22600
23 20919
10 19604
12 18543
26 18432
22 17722
18 17286
4 14177
11 12929
24 12776
32 12065
38 11982
36 11644
27 10521
17 9863
35 9669
5 9581
20 9124
43 7701
42 7267
15 6690
37 5998
45 5932
16 4325
28 4036
44 3963
33 3817
6 3496
39 2943
40 2287
31 2261
34 2136
25 2102
48 1880
41 1778
1 1624
29 1560
47 1213
49 1001
46 953
50 6
Name: 地区编码, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-rZfDiIoa-1607247620911)(output_52_1.png)]
债务收入比
# Distinct-value count (NaN counts as one value), value range, then the plots.
column = '债务收入比'
col_values = df[column]
print(col_values.nunique(dropna=False))
lowest, highest = col_values.min(), col_values.max()
print('最小值和最大值:', lowest, highest)
lianxu_plot(column)
6322
最小值和最大值: -1.0 999.0
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-qYyC7Ra6-1607247620913)(output_54_1.png)]
违约事件数
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = '违约事件数'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
lianxu_plot(column)
30
0.0 645715
1.0 102586
2.0 29944
3.0 10919
4.0 4808
5.0 2504
6.0 1399
7.0 770
8.0 443
9.0 293
10.0 192
11.0 144
12.0 86
14.0 53
13.0 50
15.0 23
16.0 20
18.0 13
19.0 10
20.0 7
17.0 6
21.0 3
22.0 3
26.0 2
29.0 2
30.0 1
25.0 1
39.0 1
27.0 1
24.0 1
Name: 违约事件数, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-BlfaLo4c-1607247620915)(output_56_1.png)]
fico所属的下限范围
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = 'fico所属的下限范围'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
hexing_plot(column)
39
660.0 71974
670.0 69815
665.0 69619
675.0 61993
680.0 61294
685.0 53668
690.0 52046
695.0 47065
700.0 43292
705.0 39030
710.0 34296
715.0 30154
720.0 26036
725.0 20766
730.0 18186
735.0 14377
740.0 12585
745.0 10336
750.0 9419
755.0 8191
760.0 6716
765.0 6163
770.0 5311
775.0 4882
780.0 4286
785.0 3476
790.0 3284
795.0 2637
800.0 2261
805.0 1942
810.0 1408
815.0 1111
820.0 827
825.0 636
830.0 412
835.0 249
840.0 140
845.0 116
630.0 1
Name: fico所属的下限范围, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Jjhg6vKs-1607247620916)(output_58_1.png)]
fico所属的上限范围
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = 'fico所属的上限范围'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
hexing_plot(column)
39
664.0 71974
674.0 69815
669.0 69619
679.0 61993
684.0 61294
689.0 53668
694.0 52046
699.0 47065
704.0 43292
709.0 39030
714.0 34296
719.0 30154
724.0 26036
729.0 20766
734.0 18186
739.0 14377
744.0 12585
749.0 10336
754.0 9419
759.0 8191
764.0 6716
769.0 6163
774.0 5311
779.0 4882
784.0 4286
789.0 3476
794.0 3284
799.0 2637
804.0 2261
809.0 1942
814.0 1408
819.0 1111
824.0 827
829.0 636
834.0 412
839.0 249
844.0 140
850.0 116
634.0 1
Name: fico所属的上限范围, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-PuN32qHD-1607247620917)(output_60_1.png)]
未结信用额度的数量
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = '未结信用额度的数量'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
lianxu_plot(column)
75
9.0 70865
10.0 69171
8.0 68201
11.0 64069
7.0 61355
...
86.0 1
70.0 1
82.0 1
81.0 1
77.0 1
Name: 未结信用额度的数量, Length: 75, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-yth0Yydr-1607247620919)(output_62_1.png)]
贬损公共记录的数量
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = '贬损公共记录的数量'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
lianxu_plot(column)
32
0.0 664765
1.0 113186
2.0 14520
3.0 4432
4.0 1535
5.0 757
6.0 367
7.0 165
8.0 105
9.0 51
10.0 37
11.0 23
12.0 14
15.0 7
13.0 6
18.0 4
16.0 4
19.0 2
21.0 2
20.0 2
22.0 2
24.0 2
49.0 2
14.0 2
63.0 1
47.0 1
46.0 1
86.0 1
54.0 1
28.0 1
40.0 1
17.0 1
Name: 贬损公共记录的数量, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-wqjctG5t-1607247620920)(output_64_1.png)]
公开记录清除的数量
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = '公开记录清除的数量'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
lisan_plot(column)
12
0.0 700076
1.0 93639
2.0 4566
3.0 945
4.0 248
5.0 80
6.0 23
7.0 11
8.0 3
9.0 3
12.0 1
Name: 公开记录清除的数量, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-NbIhPFzM-1607247620922)(output_66_1.png)]
信贷周转余额合计
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = '信贷周转余额合计'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
lianxu_plot(column)
71116
0.0 3945
4784.0 73
6325.0 66
6018.0 65
5723.0 65
...
212442.0 1
53111.0 1
86466.0 1
212455.0 1
225652.0 1
Name: 信贷周转余额合计, Length: 71116, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-rlgsYsre-1607247620924)(output_68_1.png)]
循环额度利用率
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = '循环额度利用率'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
lianxu_plot(column)
1287
0.00 4170
55.00 1589
61.00 1579
53.00 1575
57.00 1575
...
130.20 1
126.20 1
56.26 1
128.70 1
107.90 1
Name: 循环额度利用率, Length: 1286, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-o91BQueL-1607247620925)(output_70_1.png)]
当前的信用额度总数
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = '当前的信用额度总数'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
lianxu_plot(column)
134
20.0 28956
21.0 28820
19.0 28697
22.0 28671
18.0 28188
...
151.0 1
117.0 1
115.0 1
162.0 1
121.0 1
Name: 当前的信用额度总数, Length: 134, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-lhHyMGYa-1607247620927)(output_72_1.png)]
贷款的初始列表状态
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = '贷款的初始列表状态'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
lisan_plot(column)
2
0 466438
1 333562
Name: 贷款的初始列表状态, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-bVnFKWJo-1607247620928)(output_74_1.png)]
df_lev = df[['贷款的初始列表状态', '违约状态']]
# Row total per category, most frequent first (NaN is not counted);
# dict_count and dict_pro keep this order.
totals = df_lev['贷款的初始列表状态'].value_counts()
list_keys = totals.keys()
sum_count = totals.values
# Rows with 违约状态 == 1 per category; 0 where a category has none.
defaults = (
    df_lev.loc[df_lev['违约状态'] == 1, '贷款的初始列表状态']
    .value_counts()
    .reindex(list_keys, fill_value=0)
)
dict_count = defaults.to_dict()
dict_pro = (defaults / totals).to_dict()
ser_pro = pd.Series(dict_pro)
# Previously built from dict_pro, which made the 'count' column repeat the rates.
ser_count = pd.Series(dict_count)
# Group on the rate so the displayed mapping (rate -> category) is unchanged.
lev = pd.DataFrame({'count':ser_count, 'pro':ser_pro}).groupby('pro')
lev.groups
{0.1960235278598881: Int64Index([1], dtype='int64'),
0.2020075551305854: Int64Index([0], dtype='int64')}
申请方式
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = '申请方式'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
lisan_plot(column)
2
0 784586
1 15414
Name: 申请方式, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-VBnvQBjY-1607247620929)(output_77_1.png)]
df_lev = df[['申请方式', '违约状态']]
# Row total per category, most frequent first (NaN is not counted);
# dict_count and dict_pro keep this order.
totals = df_lev['申请方式'].value_counts()
list_keys = totals.keys()
sum_count = totals.values
# Rows with 违约状态 == 1 per category; 0 where a category has none.
defaults = (
    df_lev.loc[df_lev['违约状态'] == 1, '申请方式']
    .value_counts()
    .reindex(list_keys, fill_value=0)
)
dict_count = defaults.to_dict()
dict_pro = (defaults / totals).to_dict()
ser_pro = pd.Series(dict_pro)
# Previously built from dict_pro, which made the 'count' column repeat the rates.
ser_count = pd.Series(dict_count)
# Group on the rate so the displayed mapping (rate -> category) is unchanged.
lev = pd.DataFrame({'count':ser_count, 'pro':ser_pro}).groupby('pro')
lev.groups
{0.19849321807934375: Int64Index([0], dtype='int64'),
0.2513948358635007: Int64Index([1], dtype='int64')}
信用额度开立的月份
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = '信用额度开立的月份'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
lisan_plot(column)
720
Aug-2001 5567
Aug-2002 5403
Sep-2003 5403
Oct-2001 5258
Aug-2000 5246
...
Oct-1954 1
Feb-1960 1
Jul-1955 1
Dec-1960 1
Aug-1946 1
Name: 信用额度开立的月份, Length: 720, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-N5Odn5Xs-1607247620931)(output_80_1.png)]
贷款名称
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = '贷款名称'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
lianxu_plot(column)
39645
0.0 393334
4.0 148211
5.0 44748
6.0 39742
3.0 14432
...
16819.0 1
16820.0 1
16821.0 1
16822.0 1
14356.0 1
Name: 贷款名称, Length: 39644, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-1AttggSo-1607247620932)(output_82_1.png)]
公开策略
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = '公开策略'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
lisan_plot(column)
1
1.0 800000
Name: 公开策略, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-N43FDbVw-1607247620933)(output_84_1.png)]
n0
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = 'n0'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
lianxu_plot(column)
40
0.0 579427
1.0 96105
2.0 38793
3.0 17657
4.0 10521
5.0 6132
6.0 3893
7.0 2472
8.0 1567
9.0 1002
10.0 676
11.0 423
12.0 331
13.0 185
14.0 166
15.0 84
16.0 76
17.0 45
18.0 34
19.0 23
20.0 18
22.0 15
23.0 15
21.0 13
24.0 12
26.0 10
25.0 9
29.0 5
31.0 4
28.0 3
27.0 3
34.0 2
33.0 2
30.0 2
32.0 1
51.0 1
39.0 1
38.0 1
35.0 1
Name: n0, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-j8OA4Xbj-1607247620935)(output_86_1.png)]
n1
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = 'n1'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
lianxu_plot(column)
34
3.0 163008
2.0 154693
4.0 126573
1.0 87122
5.0 84396
6.0 52103
7.0 30905
8.0 18148
0.0 16266
9.0 10688
10.0 6328
11.0 3766
12.0 2265
13.0 1366
14.0 732
15.0 494
16.0 295
17.0 222
18.0 128
19.0 83
20.0 36
21.0 30
22.0 25
23.0 17
24.0 12
26.0 9
25.0 6
30.0 5
27.0 3
28.0 2
33.0 2
32.0 1
29.0 1
Name: n1, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-oJhrwrBX-1607247620937)(output_88_1.png)]
# Distinct-value count (NaN counts as one value), frequency table, then plots.
column = 'n2'
col_values = df[column]
print(col_values.nunique(dropna=False))
print(col_values.value_counts())
lianxu_plot(column)
51
4.0 117211
5.0 108116
3.0 106623
6.0 88160
2.0 70566
7.0 68102
8.0 50430
9.0 35775
1.0 26544
10.0 25155
11.0 17424
12.0 12317
13.0 8579
14.0 6132
15.0 4392
0.0 3271
16.0 3058
17.0 2237
18.0 1479
19.0 1179
20.0 814
21.0 545
22.0 414
23.0 303
24.0 216
25.0 168
26.0 129
27.0 90
28.0 64
29.0 54
30.0 39
31.0 27
32.0 24
34.0 17
33.0 16
36.0 13
35.0 10
38.0 7
37.0 6
39.0 5
43.0 4
42.0 4
40.0 2
41.0 2
44.0 2
46.0 1
63.0 1
60.0 1
57.0 1
52.0 1
Name: n2, dtype: int64
![png](output_89_1.png)
# Profile feature 'n3': distinct-value count, frequency table, then a plot.
column = 'n3'
# nunique(dropna=False) counts NaN as its own value, same as len(unique()).
print(df[column].nunique(dropna=False))
# The frequency table leaves NaN out, so it may list one entry fewer.
print(df[column].value_counts(dropna=True))
# lianxu_plot is a plotting helper defined elsewhere in this file.
lianxu_plot(column)
51
4.0 117211
5.0 108116
3.0 106623
6.0 88160
2.0 70566
7.0 68102
8.0 50430
9.0 35775
1.0 26544
10.0 25155
11.0 17424
12.0 12317
13.0 8579
14.0 6132
15.0 4392
0.0 3271
16.0 3058
17.0 2237
18.0 1479
19.0 1179
20.0 814
21.0 545
22.0 414
23.0 303
24.0 216
25.0 168
26.0 129
27.0 90
28.0 64
29.0 54
30.0 39
31.0 27
32.0 24
34.0 17
33.0 16
36.0 13
35.0 10
38.0 7
37.0 6
39.0 5
43.0 4
42.0 4
40.0 2
41.0 2
44.0 2
46.0 1
63.0 1
60.0 1
57.0 1
52.0 1
Name: n3, dtype: int64
![png](output_90_1.png)
# Profile feature 'n4': distinct-value count, frequency table, then a plot.
column = 'n4'
# nunique(dropna=False) counts NaN as its own value, same as len(unique()).
print(df[column].nunique(dropna=False))
# The frequency table leaves NaN out, so it may list one entry fewer.
print(df[column].value_counts(dropna=True))
# lianxu_plot is a plotting helper defined elsewhere in this file.
lianxu_plot(column)
47
3.0 134529
4.0 128078
2.0 105568
5.0 103239
6.0 75699
7.0 52687
1.0 49656
8.0 35717
9.0 23893
10.0 15962
11.0 10682
0.0 7785
12.0 7122
13.0 4804
14.0 3253
15.0 2374
16.0 1527
17.0 1114
18.0 839
19.0 553
20.0 407
21.0 286
22.0 238
23.0 157
24.0 145
25.0 114
26.0 65
27.0 59
28.0 40
29.0 33
30.0 29
31.0 22
32.0 17
33.0 14
35.0 11
36.0 10
37.0 6
38.0 5
34.0 5
40.0 5
39.0 4
42.0 2
41.0 2
46.0 2
49.0 1
43.0 1
Name: n4, dtype: int64
![png](output_91_1.png)
# Profile feature 'n5': distinct-value count, frequency table, then a plot.
column = 'n5'
# nunique(dropna=False) counts NaN as its own value, same as len(unique()).
print(df[column].nunique(dropna=False))
# The frequency table leaves NaN out, so it may list one entry fewer.
print(df[column].value_counts(dropna=True))
# lianxu_plot is a plotting helper defined elsewhere in this file.
lianxu_plot(column)
66
5.0 76576
6.0 75679
4.0 71175
7.0 71021
8.0 64347
...
59.0 1
68.0 1
70.0 1
63.0 1
66.0 1
Name: n5, Length: 65, dtype: int64
![png](output_92_1.png)
# Profile feature 'n6': distinct-value count, frequency table, then a plot.
column = 'n6'
# nunique(dropna=False) counts NaN as its own value, same as len(unique()).
print(df[column].nunique(dropna=False))
# The frequency table leaves NaN out, so it may list one entry fewer.
print(df[column].value_counts(dropna=True))
# lianxu_plot is a plotting helper defined elsewhere in this file.
lianxu_plot(column)
108
4.0 64692
3.0 64139
5.0 61863
2.0 58022
6.0 57240
...
100.0 1
110.0 1
90.0 1
101.0 1
128.0 1
Name: n6, Length: 107, dtype: int64
![png](output_93_1.png)
# Profile feature 'n7': distinct-value count, frequency table, then a plot.
column = 'n7'
# nunique(dropna=False) counts NaN as its own value, same as len(unique()).
print(df[column].nunique(dropna=False))
# The frequency table leaves NaN out, so it may list one entry fewer.
print(df[column].value_counts(dropna=True))
# lianxu_plot is a plotting helper defined elsewhere in this file.
lianxu_plot(column)
71
6.0 83331
5.0 80569
7.0 79026
8.0 71219
4.0 68256
...
79.0 1
71.0 1
72.0 1
53.0 1
58.0 1
Name: n7, Length: 70, dtype: int64
![png](output_94_1.png)
# Profile feature 'n8': distinct-value count, frequency table, then a plot.
column = 'n8'
# nunique(dropna=False) counts NaN as its own value, same as len(unique()).
print(df[column].nunique(dropna=False))
# The frequency table leaves NaN out, so it may list one entry fewer.
print(df[column].value_counts(dropna=True))
# lianxu_plot is a plotting helper defined elsewhere in this file.
lianxu_plot(column)
103
11.0 44808
10.0 44653
9.0 43882
12.0 43358
8.0 42024
...
128.0 1
87.0 1
78.0 1
83.0 1
127.0 1
Name: n8, Length: 102, dtype: int64
![png](output_95_1.png)
# Profile feature 'n9': distinct-value count, frequency table, then a plot.
column = 'n9'
# nunique(dropna=False) counts NaN as its own value, same as len(unique()).
print(df[column].nunique(dropna=False))
# The frequency table leaves NaN out, so it may list one entry fewer.
print(df[column].value_counts(dropna=True))
# lianxu_plot is a plotting helper defined elsewhere in this file.
lianxu_plot(column)
45
4.0 118134
5.0 108976
3.0 107222
6.0 88847
2.0 70902
7.0 68413
8.0 50342
9.0 35601
1.0 26464
10.0 24868
11.0 17092
12.0 11982
13.0 8207
14.0 5836
15.0 4066
0.0 3228
16.0 2839
17.0 2010
18.0 1286
19.0 1045
20.0 697
21.0 461
22.0 316
23.0 240
24.0 167
25.0 132
26.0 96
27.0 64
28.0 51
29.0 42
30.0 24
32.0 15
31.0 15
34.0 12
33.0 11
35.0 5
36.0 5
43.0 4
39.0 4
38.0 3
37.0 3
45.0 1
42.0 1
44.0 1
Name: n9, dtype: int64
![png](output_96_1.png)
# Profile feature 'n10': distinct-value count, frequency table, then a plot.
column = 'n10'
# nunique(dropna=False) counts NaN as its own value, same as len(unique()).
print(df[column].nunique(dropna=False))
# The frequency table leaves NaN out, so it may list one entry fewer.
print(df[column].value_counts(dropna=True))
# lianxu_plot is a plotting helper defined elsewhere in this file.
lianxu_plot(column)
77
9.0 68023
10.0 66407
8.0 65287
11.0 61601
7.0 58455
...
70.0 1
82.0 1
81.0 1
74.0 1
77.0 1
Name: n10, Length: 76, dtype: int64
![png](output_97_1.png)
# Profile feature 'n11': distinct-value count, frequency table, then a plot.
column = 'n11'
# nunique(dropna=False) counts NaN as its own value, same as len(unique()).
print(df[column].nunique(dropna=False))
# The frequency table leaves NaN out, so it may list one entry fewer.
print(df[column].value_counts(dropna=True))
# lianxu_plot is a plotting helper defined elsewhere in this file.
lianxu_plot(column)
6
0.0 729682
1.0 540
2.0 24
4.0 1
3.0 1
Name: n11, dtype: int64
![png](output_98_1.png)
# Profile feature 'n12': distinct-value count, frequency table, then a plot.
column = 'n12'
# nunique(dropna=False) counts NaN as its own value, same as len(unique()).
print(df[column].nunique(dropna=False))
# The frequency table leaves NaN out, so it may list one entry fewer.
print(df[column].value_counts(dropna=True))
# lianxu_plot is a plotting helper defined elsewhere in this file.
lianxu_plot(column)
6
0.0 757315
1.0 2281
2.0 115
3.0 16
4.0 3
Name: n12, dtype: int64
![png](output_99_1.png)
# Profile feature 'n14': distinct-value count, frequency table, then a plot.
column = 'n14'
# nunique(dropna=False) counts NaN as its own value, same as len(unique()).
print(df[column].nunique(dropna=False))
# The frequency table leaves NaN out, so it may list one entry fewer.
print(df[column].value_counts(dropna=True))
# lianxu_plot is a plotting helper defined elsewhere in this file.
lianxu_plot(column)
32
1.0 187501
2.0 171704
0.0 128053
3.0 124621
4.0 73869
5.0 36018
6.0 17388
7.0 9460
8.0 4902
9.0 2692
10.0 1423
11.0 835
12.0 489
13.0 278
14.0 194
15.0 101
16.0 74
17.0 43
18.0 17
20.0 17
19.0 16
21.0 8
23.0 7
24.0 5
22.0 4
25.0 4
26.0 3
30.0 1
28.0 1
27.0 1
29.0 1
Name: n14, dtype: int64
![png](output_100_1.png)