In [1]:
import pandas as pd
# Load one DataFrame per birth year (1880 through 2010) into a dict keyed by year.
data_year = {}
# NOTE(review): machine-specific absolute path; point it at the local copy of the
# pydata-book ch02/names directory before running.
path = 'C:\\Users\\yi&lei\\Documents\\电子书\\pydata-book-1st-edition\\pydata-book-1st-edition\\ch02\\names'
for year in range(1880, 2011):
    # Named `file_path` rather than `dir` so the builtin dir() is not shadowed
    # for the rest of the kernel session.
    file_path = path + '\\yob%d.txt' % year
    # header=None: no header row is read, so the column names are supplied here.
    data_year[year] = pd.read_csv(file_path, engine='python', header=None,
                                  names=['name', 'gender', 'birth'])
In [2]:
data_year[1880].head()  # 'birth' is the number of births
Out[2]:
In [3]:
# Baseline approach: start from the 1880 frame and append each later year with
# its own pd.concat call, so every call builds a new frame from everything
# accumulated so far.
later_frames = [data_year[year] for year in range(1881, 2011)]
data = data_year[1880]
for frame in later_frames:
    data = pd.concat([data, frame], ignore_index=True)
data.head()
Out[3]:
In [4]:
# (n_rows, n_columns) of the frame produced by the year-by-year concat loop.
data.shape
Out[4]:
*用concat连接多个DataFrame的更高效做法:先把各年的DataFrame收集到列表中,再调用一次concat¶
In [5]:
# Read every yearly file once, tag its rows with the year, collect the frames in
# a list, and join them with a single pd.concat call.
data_list = []
for year in range(1880, 2011):
    # `file_path` / `year_frame` are used instead of `dir` / `data_year` so that
    # neither the builtin dir() nor the year -> DataFrame dict built in the first
    # cell is overwritten (re-running the earlier cells keeps working).
    file_path = path + '\\yob%d.txt' % year
    year_frame = pd.read_csv(file_path, engine='python', header=None,
                             names=['name', 'gender', 'birth'])
    year_frame['year'] = year
    data_list.append(year_frame)
data = pd.concat(data_list, ignore_index=True)
In [6]:
# (n_rows, n_columns) after the single-concat load, which also adds a 'year' column.
data.shape
Out[6]:
按性别统计每年的出生人数¶
In [7]:
# Total births per (year, gender). The numeric 'birth' column is selected
# explicitly: from pandas 2.0 groupby reductions default to numeric_only=False,
# so a bare .sum() would also try to "sum" the string 'name' column.
data_gp = data.groupby(['year', 'gender'])[['birth']].sum()
# Pivot gender into columns and show the most recent years.
data_gp.unstack().tail()
Out[7]:
*插入prop列,用于存放每个名字占总人数的比例¶
In [10]:
# Keep the grouped object for reuse; each group is one (year, gender) pair.
data_g = data.groupby(['year', 'gender'])
# Sum only the numeric 'birth' column (pandas >= 2.0 no longer drops the
# string 'name' column automatically).
data_g[['birth']].sum().head()
Out[10]:
In [11]:
def add_prop(group):
    """Return a copy of ``group`` with a ``prop`` column added.

    ``prop`` is each row's ``birth`` value divided by the sum of ``birth`` over
    the whole group, computed in floating point.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a numeric ``birth`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra ``prop`` column; the frame passed in is
        left unmodified.
    """
    births = group['birth'].astype(float)
    # assign() builds a new frame, so the object handed in by groupby.apply is
    # never written to (no SettingWithCopyWarning, no hidden mutation).
    return group.assign(prop=births / births.sum())
# group_keys=False keeps the original row index on the result. From pandas 2.0
# the default (group_keys=True) prepends 'year' and 'gender' to the index even
# for a like-indexed result, which makes a later groupby on those column names
# ambiguous.
name = data.groupby(['year', 'gender'], group_keys=False).apply(add_prop)
In [12]:
# Preview the first rows of the frame returned by the add_prop apply.
name.head()
Out[12]:
检查每个分组内prop的总和是否为1¶
In [13]:
import numpy as np
# Sanity check: within every (year, gender) group the 'prop' values should add
# up to 1 (allclose tolerates floating-point rounding). The redundant axis=0
# argument is omitted because current pandas SeriesGroupBy.sum() rejects it.
np.allclose(name.groupby(['year', 'gender']).prop.sum(), 1)
Out[13]:
*取每对gender/year的前1000名,注意apply的用法¶
In [14]:
# Demonstration of a failure: a GroupBy object cannot be sorted directly, so the
# per-group sort has to go through apply(). The error is caught and printed so
# that it is still shown without aborting a "Restart & Run All".
try:
    name.groupby(['year', 'gender']).sort_index(by=['prop'])[:1000]
except AttributeError as err:
    print('AttributeError:', err)
In [15]:
def top1000(group):
    """Return up to 1000 rows of ``group`` with the largest ``prop`` values.

    Rows come back ordered by ``prop`` from highest to lowest; a group with
    fewer than 1000 rows is returned in full.
    """
    ranked = group.sort_values(by=['prop'], ascending=False)
    return ranked[:1000]
In [16]:
# Run top1000 on every (year, gender) group and combine the per-group results.
top_1000 = (
    name
    .groupby(['year', 'gender'])
    .apply(top1000)
)