import pandas as pd
import numpy as np
import seaborn as sns
# Load the dataset from the Excel workbook that sits next to this notebook.
income = pd.read_excel(r'./income.xlsx')

# Preview the first rows to check that the columns were parsed as expected.
income.head()
age
workclass
fnlwgt
education
education-num
marital-status
occupation
relationship
race
sex
capital-gain
capital-loss
hours-per-week
native-country
income
0
39
State-gov
77516
Bachelors
13
Never-married
Adm-clerical
Not-in-family
White
Male
2174
0
40
United-States
<=50K
1
50
Self-emp-not-inc
83311
Bachelors
13
Married-civ-spouse
Exec-managerial
Husband
White
Male
0
0
13
United-States
<=50K
2
38
Private
215646
HS-grad
9
Divorced
Handlers-cleaners
Not-in-family
White
Male
0
0
40
United-States
<=50K
3
53
Private
234721
11th
7
Married-civ-spouse
Handlers-cleaners
Husband
Black
Male
0
0
40
United-States
<=50K
4
28
Private
338409
Bachelors
13
Married-civ-spouse
Prof-specialty
Wife
Black
Female
0
0
40
Cuba
<=50K
# Count missing values per column with the vectorized isnull().sum() instead
# of a Python-level apply/lambda. The result is printed explicitly: when this
# statement is not the last expression of a notebook cell, its value would
# otherwise be computed and silently discarded.
print(income.isnull().sum())

# Summarize each column's dtype and non-null count, plus overall memory usage.
income.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age 32561 non-null int64
workclass 30725 non-null object
fnlwgt 32561 non-null int64
education 32561 non-null object
education-num 32561 non-null int64
marital-status 32561 non-null object
occupation 30718 non-null object
relationship 32561 non-null object
race 32561 non-null object
sex 32561 non-null object
capital-gain 32561 non-null int64
capital-loss 32561 non-null int64
hours-per-week 32561 non-null int64
native-country 31978 non-null object
income 32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
从上面的输出可以看出,workclass、occupation、native-country 三列存在缺失值;这三列都是类别型变量,因此用各自的众数进行填充。
# Impute the categorical columns that contain missing values with their most
# frequent value (the first entry returned by mode()), modifying the frame
# in place.
income.fillna(
    value={
        column: income[column].mode()[0]
        for column in ('workclass', 'occupation', 'native-country')
    },
    inplace=True,
)

# Preview the first rows after imputation.
income.head()
age
workclass
fnlwgt
education
education-num
marital-status
occupation
relationship
race
sex
capital-gain
capital-loss
hours-per-week
native-country
income
0
39
State-gov
77516
Bachelors
13
Never-married
Adm-clerical
Not-in-family
White
Male
2174
0
40
United-States
<=50K
1
50
Self-emp-not-inc
83311
Bachelors
13
Married-civ-spouse
Exec-managerial
Husband
White
Male
0
0
13
United-States
<=50K
2
38
Private
215646
HS-grad
9
Divorced
Handlers-cleaners
Not-in-family
White
Male
0
0
40
United-States
<=50K
3
53
Private
234721
11th
7
Married-civ-spouse
Handlers-cleaners
Husband
Black
Male
0
0
40
United-States
<=50K
4
28
Private
338409
Bachelors
13
Married-civ-spouse
Prof-specialty
Wife
Black
Female
0
0
40
Cuba
<=50K
# Summary statistics (count, mean, std, min, quartiles, max) with default
# settings; for this frame the output covers the numeric columns.
income.describe()
age
fnlwgt
education-num
capital-gain
capital-loss
hours-per-week
count
32561.000000
3.256100e+04
32561.000000
32561.000000
32561.000000
32561.000000
mean
38.581647
1.897784e+05
10.080679
1077.648844
87.303830
40.437456
std
13.640433
1.055500e+05
2.572720
7385.292085
402.960219
12.347429
min
17.000000
1.228500e+04
1.000000
0.000000
0.000000
1.000000
25%
28.000000
1.178270e+05
9.000000
0.000000
0.000000
40.000000
50%
37.000000
1.783560e+05
10.000000
0.000000
0.000000
40.000000
75%
48.000000
2.370510e+05
12.000000
0.000000
0.000000
45.000000
max
90.000000
1.484705e+06
16.000000
99999.000000
4356.000000
99.000000
# Summary of the object-dtype (categorical) columns: count, number of unique
# values, most frequent value and its frequency.
income.describe(include=['object'])
workclass
education
marital-status
occupation
relationship
race
sex
native-country
income
count
32561
32561
32561
32561
32561
32561
32561
32561
32561
unique
8
16
7
14
6
5
2
41
2
top
Private
HS-grad
Married-civ-spouse
Prof-specialty
Husband
White
Male
United-States
<=50K
freq
24532
10501
14976
5983
13193
27816
21790
29753
24720
绘制不同收入水平下的年龄和每周工作时长(hours-per-week)的核密度图
import matplotlib.pyplot as plt

plt.style.use('ggplot')

# Two stacked panels: the top one for age, the bottom one for weekly hours.
fig, axes = plt.subplots(2, 1)

# For each panel, overlay one kernel-density curve per income class.
# Each tuple is (value matched in the income column, legend label, line style);
# the matched values carry a leading space, exactly as in the original filter.
for panel, column in zip(axes, ('age', 'hours-per-week')):
    for income_value, legend_label, line_style in (
        (' <=50K', '<=50K', '-'),
        (' >50K', '>50K', '--'),
    ):
        subset = income.loc[income['income'] == income_value, column]
        subset.plot(
            kind='kde',
            label=legend_label,
            ax=panel,
            legend=True,
            linestyle=line_style,
        )

plt.show()