一. 了解数据集
任务目标:建立分类模型预测一个人的收入能否超过五万美元
人口普查数据集: https://archive.ics.uci.edu/ml/datasets/adult
# Data-handling and plotting libraries used by the rest of the notebook.
import pandas as pd
import numpy as np
import matplotlib. pyplot as plt
import seaborn as sns
# Apply seaborn's white-grid theme to all subsequent figures.
sns. set_style( 'whitegrid' )
# IPython/Jupyter magic that renders matplotlib figures inline in the notebook.
# NOTE: this line is not plain Python; it only works when run under IPython.
% matplotlib inline
import warnings
# Suppress every warning category for the whole session. This also hides
# deprecation warnings raised by the libraries imported above.
warnings. filterwarnings( 'ignore' )
# Make 'MicroSoft YaHei' the sans-serif font (presumably so that Chinese text
# in plot labels renders), and draw the minus sign as an ASCII hyphen rather
# than the Unicode minus glyph.
plt. rcParams[ 'font.sans-serif' ] = [ 'MicroSoft YaHei' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
# Column names for the two data files, which are read with header=None.
# 'predclass' is the income label that the notebook sets out to predict.
headers = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
    'hours-per-week', 'native-country', 'predclass',
]

# Fields are separated by a comma followed by one whitespace character. A regex
# separator requires the python parsing engine; the pattern is a raw string so
# that '\s' is not treated as a (invalid) string escape. '?' is read as NaN.
training_raw = pd.read_csv('dataset/adult.data', header=None, names=headers,
                           sep=r',\s', na_values=['?'], engine='python')
# skiprows=1 drops the first line of the test file (presumably a non-data
# banner line -- verify against the file).
test_raw = pd.read_csv('dataset/adult.test', header=None, names=headers,
                       sep=r',\s', na_values=['?'], engine='python', skiprows=1)

# Stack training and test rows into a single frame with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# yields the same frame as the former append + reset_index + drop('index').
dataset_raw = pd.concat([training_raw, test_raw], ignore_index=True)

# Quick look at the combined data: shape, a sample of rows, dtypes / non-null
# counts, and summary statistics for numeric and object columns.
print(dataset_raw.shape)
dataset_raw.iloc[10:15]
dataset_raw.info()
dataset_raw.describe()
dataset_raw.describe(include=['O'])
二. 单特征分析
import math
def plot_distribution(dataset, cols, width, height, hspace, wspace):
    """Plot the distribution of every column of *dataset* on one figure.

    Object-dtype columns are drawn as horizontal count plots with their tick
    labels truncated to 18 characters; all other columns are drawn with
    ``sns.distplot``.

    Args:
        dataset: DataFrame whose columns are plotted, one subplot per column.
        cols: Number of subplot columns in the grid; the number of rows is
            derived from the column count of *dataset*.
        width: Figure width, passed to ``figsize``.
        height: Figure height, passed to ``figsize``.
        hspace: Vertical spacing passed to ``Figure.subplots_adjust``.
        wspace: Horizontal spacing passed to ``Figure.subplots_adjust``.

    Returns:
        None. The figure is created through pyplot as a side effect.
    """
    fig = plt.figure(figsize=(width, height))
    fig.subplots_adjust(left=None, bottom=None, right=None, top=None,
                        wspace=wspace, hspace=hspace)
    rows = math.ceil(dataset.shape[1] / cols)
    for i, column in enumerate(dataset.columns):
        # add_subplot makes the new axes current, so the seaborn and pyplot
        # calls below draw onto it.
        ax = fig.add_subplot(rows, cols, i + 1)
        ax.set_title(column)
        # The `np.object` alias was removed in NumPy 1.24; the builtin
        # `object` is the same dtype and works on every NumPy version.
        if dataset.dtypes[column] == object:
            g = sns.countplot(y=column, data=dataset)
            # Truncate long category names so they do not crowd the panel.
            substrings = [s.get_text()[:18] for s in g.get_yticklabels()]
            g.set(yticklabels=substrings)
        else:
            # NOTE(review): distplot is deprecated since seaborn 0.11; it is
            # kept here so the rendered plots stay unchanged.
            g = sns.distplot(dataset[column])
        plt.xticks(rotation=25)
    plt.tight_layout()
# Draw one distribution panel per raw column on a three-column grid.
_grid_layout = {'cols': 3, 'width': 24, 'height': 20, 'hspace': 0.2, 'wspace': 0.5}
plot_distribution(dataset_raw, **_grid_layout)

# Show the missing-value pattern of the raw data as a matrix plot.
import missingno as msno
msno.matrix(dataset_raw, figsize=(16, 5))
三. 数据清洗与特征提取组合
# Recode the income label in place: both spellings of each class (with and
# without a trailing period) collapse to 0 for '<=50K' and 1 for '>50K'.
_label_codes = {'<=50K': 0, '<=50K.': 0, '>50K': 1, '>50K.': 1}
for _label, _code in _label_codes.items():
    dataset_raw.loc[dataset_raw['predclass'] == _label, 'predclass'] = _code

# Start two feature frames that share the encoded target column.
# NOTE(review): the names suggest a binned and a continuous feature set --
# confirm against the later cells that fill them.
dataset_bin = pd.DataFrame()
dataset_con = pd.DataFrame()
dataset_bin['predclass'] = dataset_raw['predclass']
dataset_con['predclass'] = dataset_raw['predclass']

# Bar chart of how many rows fall into each target class.
fig = plt.figure(figsize=(20, 3))
sns.countplot(y='predclass', data=dataset_bin)