In [1]:
import numpy as np
import pandas as pd
In [2]:
# Load the train and test splits from CSV files in the working directory,
# then report each frame's (rows, columns).
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

n_train_rows, n_train_cols = train_df.shape
n_test_rows, n_test_cols = test_df.shape
print('train_df:%s,%s' % (n_train_rows, n_train_cols))
print('test_df:%s,%s' % (n_test_rows, n_test_cols))
In [3]:
# Inspect the distinct values of the dependent variable in both splits.
# Only the final expression of a cell is echoed, so both arrays are returned
# together as a (train, test) tuple; as two separate statements the train-side
# result was computed and then silently discarded.
train_df.income_level.unique(), test_df.income_level.unique()
Out[3]:
In [4]:
# Recode the target as 0/1 to simplify the analysis.
# The train frame is matched against integer labels and the test frame against
# string labels -- presumably the two CSVs store the target differently;
# TODO confirm against the unique() output of the previous cell.
for frame, low_label, high_label in (
    (train_df, -50000, 50000),
    (test_df, '-50000', '50000+.'),
):
    # Both masks are taken before writing; the written 0/1 values never match
    # either raw label, so this selects the same rows as sequential updates.
    is_low = frame['income_level'] == low_label
    is_high = frame['income_level'] == high_label
    frame.loc[is_low, 'income_level'] = 0
    frame.loc[is_high, 'income_level'] = 1
In [5]:
# Check how imbalanced the classes are: with a 0/1 target, sum/count is the
# share of rows labelled 1, expressed here as a percentage.
train_target = train_df['income_level']
test_target = test_df['income_level']

a = train_target.sum() * 100.0 / train_target.count()
b = test_target.sum() * 100.0 / test_target.count()

print('train_df (1,0):(%s,%s)' % (a, 100 - a))
print('test_df (1,0):(%s,%s)' % (b, 100 - b))
In [6]:
# Print pandas' built-in summary of the training frame (DataFrame.info).
train_df.info()
In [7]:
import matplotlib.pyplot as plt
def num_tr(filed, n):
    """Draw a histogram of one column of the module-level ``train_df``.

    Parameters
    ----------
    filed
        Label of the ``train_df`` column to plot; also used as the figure
        title. (The parameter spelling is kept so existing keyword callers
        keep working.)
    n
        Forwarded unchanged to ``Series.hist`` as ``bins``.

    Returns
    -------
    None
        The figure is rendered through ``plt.show()``.
    """
    # Bind the histogram and the title to an explicit Axes instead of relying
    # on pyplot's implicit "current figure", and avoid keeping an unused
    # figure handle around.
    _, ax = plt.subplots(figsize=(10, 5))
    train_df[filed].hist(bins=n, ax=ax)
    ax.set_title('%s' % filed)
    plt.show()
In [8]:
# Histogram of train_df['age'] with 100 bins, drawn by num_tr.
num_tr('age',100)
In [9]:
# NOTE(review): the code below is disabled by wrapping it in a bare string
# literal, so nothing in this cell executes; the string is simply the cell's
# only expression. If enabled, it would add an `age_class` column to both
# frames by cutting `age` at the edges -1, 10, ..., 100 with labels 0-9.
# Consider either re-enabling it or deleting the cell.
'''
#创建年龄分组字段
labels=[0,1,2,3,4,5,6,7,8,9]
train_df['age_class']=pd.cut(train_df['age'],bins=[-1,10,20,30,40,50,60,70,80,90,100],labels=labels)
test_df['age_class']=pd.cut(test_df['age'],bins=[-1,10,20,30,40,50,60,70,80,90,100],labels=labels)
'''
Out[9]:
In [10]:
# NOTE(review): disabled plotting code kept as a bare string literal; nothing
# here executes. If enabled, it would draw a bar chart of row counts grouped by
# (age_class, income_level). It reads an `age_class` column, which in the
# visible part of this notebook is only created inside another disabled cell.
'''
fig=plt.figure(figsize=(12,6))
train_df.groupby(['age_class','income_level'])['income_level'].count().unstack().plot(kind='bar')
plt.title('income_level wrt age')
plt.show()
'''
Out[10]:
In [11]:
# Compare the age distribution of the two income classes: one kernel-density
# curve per class value, drawn on the same figure.
fig = plt.figure(figsize=(12, 6))
for income_class in (0, 1):
    class_rows = train_df['income_level'] == income_class
    train_df.loc[class_rows, 'age'].plot(kind='kde')
# Legend entries follow the drawing order above (class 0 first, then class 1).
plt.legend(('0', '1'))
plt.show()
In [12]:
# Side-by-side box plots (1 row x 2 columns) of capital gains and capital losses.
fig = plt.figure(figsize=(8, 4))
for grid_col, column in enumerate(('capital_gains', 'capital_losses')):
    # Activate the grid slot first; the Series plot then draws on the current axes.
    plt.subplot2grid((1, 2), (0, grid_col))
    train_df[column].plot(kind='box')
plt.show()
In [13]:
#查看收入水平的人群的周工作时长分布
fig=plt.figure(figsize=(8,4))
plt.subplot2grid((1,2),(0,0))
train_df.weeks_worked_in_year[train_df.income_level==0].hist(bins=20)
plt.subplot2grid((1,2),(0,1))
train_df.weeks_worked_in_year[train_df.income_level==1].hist(bins=20,color=