数据及代码链接—提取码:1234
1.数据说明与预处理
import pandas as pd
import matplotlib. pyplot as plt
# Load data/bank-full.csv; the file uses ';' as its field separator.
bank = pd.read_csv('data/bank-full.csv', delimiter=';')

# First look at the data: leading rows, numeric summary, object-column summary.
print(bank.head(5))
print(bank.describe())
print(bank.describe(include=['O']))
# info() writes its report itself and returns None, so the outer print also
# emits a literal "None" line after the report.
print(bank.info())

# Count how many entries in each object-typed column hold the placeholder
# string 'unknown'.
for col in bank.select_dtypes(include=['object']).columns:
    print(col, ':', (bank[col] == 'unknown').sum())

# Class distribution of the target column 'y'; computed once and reused for
# both the printed table and the pie chart.
class_counts = bank['y'].value_counts()
print('样本类别分布情况:\n', class_counts)

# Use the SimHei font so the Chinese title can be rendered
# (the font must be installed on the machine).
plt.rcParams['font.sans-serif'] = ['SimHei']
fig, ax = plt.subplots(1, 1, figsize=(4, 4))
colors = ["#FA5858", "#64FE2E"]
# Take the wedge labels from the counts themselves so a label can never be
# attached to the wrong wedge; `colors` and `explode` assume two classes.
labels = list(class_counts.index)
ax.set_title('是否认购定期存款', fontsize=16)
class_counts.plot.pie(
    explode=[0, 0.25],
    autopct='%.2f%%',
    ax=ax,
    shadow=True,
    colors=colors,
    labels=labels,
    fontsize=14,
    startangle=25,
)
plt.axis('off')
plt.show()
   age           job  marital  education  ...  pdays  previous  poutcome   y
0   58    management  married   tertiary  ...     -1         0   unknown  no
1   44    technician   single  secondary  ...     -1         0   unknown  no
2   33  entrepreneur  married  secondary  ...     -1         0   unknown  no
3   47   blue-collar  married    unknown  ...     -1         0   unknown  no
4   33       unknown   single    unknown  ...     -1         0   unknown  no
[5 rows x 17 columns]
                age        balance  ...         pdays      previous
count  45211.000000   45211.000000  ...  45211.000000  45211.000000
mean      40.936210    1362.272058  ...     40.197828      0.580323
std       10.618762    3044.765829  ...    100.128746      2.303441
min       18.000000   -8019.000000  ...     -1.000000      0.000000
25%       33.000000      72.000000  ...     -1.000000      0.000000
50%       39.000000     448.000000  ...     -1.000000      0.000000
75%       48.000000    1428.000000  ...     -1.000000      0.000000
max       95.000000  102127.000000  ...    871.000000    275.000000
[8 rows x 7 columns]
                job  marital  education  ...  month  poutcome      y
count         45211    45211      45211  ...  45211     45211  45211
unique           12        3          4  ...     12         4      2
top     blue-collar  married  secondary  ...    may   unknown     no
freq           9732    27214      23202  ...  13766     36959  39922
[4 rows x 10 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
age          45211 non-null int64
job          45211 non-null object
marital      45211 non-null object
education    45211 non-null object
default      45211 non-null object
balance      45211 non-null int64
housing      45211 non-null object
loan         45211 non-null object
contact      45211 non-null object
day          45211 non-null int64
month        45211 non-null object
duration     45211 non-null int64
campaign     45211 non-null int64
pdays        45211 non-null int64
previous     45211 non-null int64
poutcome     45211 non-null object
y            45211 non-null object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB
None
job : 288
marital : 0
education : 1857
default : 0
housing : 0
loan : 0
contact : 13020
month : 0
poutcome : 36959
y : 0
样本类别分布情况:
no 39922
yes 5289
Name: y, dtype: int64
2.探索性分析
bank. hist( bins= 25 , figsize= ( 14 , 10 )