# 利用python进行数据分析_第二章_案例3_全美婴儿名字分析

  
In [1]:
import pandas as pd
data_year = {}
path = 'C:\\Users\\yi&lei\\Documents\\电子书\\pydata-book-1st-edition\\pydata-book-1st-edition\\ch02\\names'
for i in range(1880,2011):
dir = path + '\\yob%d.txt' %i

In [2]:
data_year[1880].head() ##birth为出生人数

Out[2]:
name gender birth
0 Mary F 7065
1 Anna F 2604
2 Emma F 2003
3 Elizabeth F 1939
4 Minnie F 1746
In [3]:
data = data_year[1880]
for i in range(1881,2011):
data = pd.concat([data,data_year[i]],ignore_index=True)

Out[3]:
name gender birth
0 Mary F 7065
1 Anna F 2604
2 Emma F 2003
3 Elizabeth F 1939
4 Minnie F 1746
In [4]:
data.shape

Out[4]:
(1690784, 3)

#### *采用concat连接多个Data Frame更高效的做法¶

In [5]:
data_list = []
for i in range(1880,2011):
dir = path + '\\yob%d.txt' %i
data_year['year'] = i
data_list.append(data_year)
data = pd.concat(data_list,ignore_index=True)

In [6]:
data.shape

Out[6]:
(1690784, 4)

### 统计每年出生婴儿的性别¶

In [7]:
data_gp = data.groupby(['year','gender']).sum()
data_gp.unstack().tail()

Out[7]:
birth
gender F M
year
2006 1896468 2050234
2007 1916888 2069242
2008 1883645 2032310
2009 1827643 1973359
2010 1759010 1898382

### *插入prop列，用于存放每个名字占总人数的比例¶

In [10]:
data_g = data.groupby(['year','gender'])

Out[10]:
birth
year gender
1880 F 90993
M 110493
1881 F 91955
M 100748
1882 F 107851
In [11]:
def add_prop(group):
births = group.birth.astype(float)
group['prop'] = births/births.sum()
return group

In [12]:
name.head()

Out[12]:
name gender birth year prop
0 Mary F 7065 1880 0.077643
1 Anna F 2604 1880 0.028618
2 Emma F 2003 1880 0.022013
3 Elizabeth F 1939 1880 0.021309
4 Minnie F 1746 1880 0.019188

### 检查分组后的总值是不是为1¶

In [13]:
import numpy as np
np.allclose(name.groupby(['year','gender']).prop.sum(axis=0),1)

Out[13]:
True

### *取每对gender/year的前1000名，注意apply的用法¶

In [14]:
name.groupby(['year','gender']).sort_index(by=['prop'])[:1000]
##不能直接对groupby对象做排序，看下面的报错提示要用APPLY

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-14-a61dc3c107ca> in <module>()
----> 1 name.groupby(['year','gender']).sort_index(by=['prop'])[:1000]
2 ##不能直接对groupby对象做排序，看下面的报错提示要用APPLY

C:\PorgramFiles\Anaconda3\lib\site-packages\pandas\core\groupby.py in __getattr__(self, attr)
546             return self[attr]
547         if hasattr(self.obj, attr):
--> 548             return self._make_wrapper(attr)
549
550         raise AttributeError("%r object has no attribute %r" %

C:\PorgramFiles\Anaconda3\lib\site-packages\pandas\core\groupby.py in _make_wrapper(self, name)
560                    "using the 'apply' method".format(kind, name,
561                                                      type(self).__name__))
--> 562             raise AttributeError(msg)
563
564         # need to setup the selection

AttributeError: Cannot access callable attribute 'sort_index' of 'DataFrameGroupBy' objects, try using the 'apply' method
In [15]:
def top1000(group):
return group.sort_values(by=['prop'],ascending=False)[:1000]

In [16]:
top_1000 = name.groupby(['year','gender']).apply(top1000)

Out[16]:
name gender birth year prop
year gender
1880 F 0 Mary F 7065 1880 0.077643
1 Anna F 2604 1880 0.028618
2 Emma F 2003 1880 0.022013
3 Elizabeth F 1939 1880 0.021309
4 Minnie F 1746 1880 0.019188
5 Margaret F 1578 1880 0.017342
6 Ida F 1472 1880 0.016177
7 Alice F 1414 1880 0.015540
8 Bertha F 1320 1880 0.014507
9 Sarah F 1288 1880 0.014155
10 Annie F 1258 1880 0.013825
11 Clara F 1226 1880 0.013474
12 Ella F 1156 1880 0.012704
13 Florence F 1063 1880 0.011682
14 Cora F 1045 1880 0.011484
15 Martha F 1040 1880 0.011429
16 Laura F 1012 1880 0.011122
17 Nellie F 995 1880 0.010935
18 Grace F 982 1880 0.010792
19 Carrie F 949 1880 0.010429

### 分析命名趋势--绘制'John','Harry','Mary','Marilyn'4个名字的年数量分布曲线图¶

In [17]:
hot_name = top_1000[top_1000.name.isin(['John','Harry','Mary','Marilyn'])]
hot = hot_name.groupby(['year','gender']).birth.sum()

C:\PorgramFiles\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: 'year' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version

C:\PorgramFiles\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: 'gender' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version


Out[17]:
gender F M
year
1880 7117 11834
1881 6945 10800
1882 8203 11819
1883 8058 11036
1884 9272 11746
In [18]:
%matplotlib inline
z = hot_name.pivot_table('birth',index='year',columns='gender',aggfunc=np.sum)
z.plot(subplots=True) ##这个画的是4个名字合并后，按男女展现的曲线图

C:\PorgramFiles\Anaconda3\lib\site-packages\pandas\core\reshape\pivot.py:135: FutureWarning: 'year' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
grouped = data.groupby(keys)
C:\PorgramFiles\Anaconda3\lib\site-packages\pandas\core\reshape\pivot.py:135: FutureWarning: 'gender' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
grouped = data.groupby(keys)

Out[18]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x0000022B2C119A58>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000022B2C308D68>], dtype=object)
In [19]:
total_birth = top_1000.pivot_table('birth',index='year',columns='name',aggfunc=np.sum)

C:\PorgramFiles\Anaconda3\lib\site-packages\pandas\core\reshape\pivot.py:135: FutureWarning: 'year' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
grouped = data.groupby(keys)

Out[19]:
name Aaden Aaliyah Aarav Aaron Aarush Ab Abagail Abb Abbey Abbie ... Zoa Zoe Zoey Zoie Zola Zollie Zona Zora Zula Zuri
year
1880 NaN NaN NaN 102.0 NaN NaN NaN NaN NaN 71.0 ... 8.0 23.0 NaN NaN 7.0 NaN 8.0 28.0 27.0 NaN
1881 NaN NaN NaN 94.0 NaN NaN NaN NaN NaN 81.0 ... NaN 22.0 NaN NaN 10.0 NaN 9.0 21.0 27.0 NaN
1882 NaN NaN NaN 85.0 NaN NaN NaN NaN NaN 80.0 ... 8.0 25.0 NaN NaN 9.0 NaN 17.0 32.0 21.0 NaN
1883 NaN NaN NaN 105.0 NaN NaN NaN NaN NaN 79.0 ... NaN 23.0 NaN NaN 10.0 NaN 11.0 35.0 25.0 NaN
1884 NaN NaN NaN 97.0 NaN NaN NaN NaN NaN 98.0 ... 13.0 31.0 NaN NaN 14.0 6.0 8.0 58.0 27.0 NaN

5 rows × 6868 columns

In [20]:
y = total_birth[['John','Harry','Mary','Marilyn']].fillna(0)
y.shape

Out[20]:
(131, 4)
In [21]:
y.head()

Out[21]:
name John Harry Mary Marilyn
year
1880 9701.0 2158.0 7092.0 0.0
1881 8795.0 2002.0 6948.0 0.0
1882 9597.0 2246.0 8179.0 0.0
1883 8934.0 2116.0 8044.0 0.0
1884 9427.0 2338.0 9253.0 0.0
In [22]:
y.plot(subplots=True,figsize=(12,10),title='Number of birth per year')

Out[22]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x0000022B2C3383C8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000022B2C454470>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000022B2C46A550>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000022B2C4880F0>], dtype=object)

### 计算最流行的1000个名字所占的比例¶

In [23]:
#name中的prop列包含每个名字所占的比例

k = top_1000.pivot_table('prop',index='year',columns='gender',aggfunc=np.sum)
k.plot(title='top1000 ratio',yticks=np.linspace(0,1.2,13))

C:\PorgramFiles\Anaconda3\lib\site-packages\pandas\core\reshape\pivot.py:135: FutureWarning: 'year' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
grouped = data.groupby(keys)
C:\PorgramFiles\Anaconda3\lib\site-packages\pandas\core\reshape\pivot.py:135: FutureWarning: 'gender' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
grouped = data.groupby(keys)

Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x22b2c4d9da0>

### *计算占出生人数前50%的不同名字的数量¶

In [24]:
#先看看某一年的男孩的数据
boy_2010 = name[(name.year==2010) & (name.gender=='M')]

Out[24]:
name gender birth year prop
1676644 Jacob M 21875 2010 0.011523
1676645 Ethan M 17866 2010 0.009411
1676646 Michael M 17133 2010 0.009025
1676647 Jayden M 17030 2010 0.008971
1676648 William M 16870 2010 0.008887
In [25]:
boy_sort = boy_2010.sort_values(by=['prop'],ascending=True)
boy_sort['propcum'] = boy_sort.prop.cumsum()
num = boy_sort.propcum.searchsorted(0.5) + 1 ###此处注意要加1
num

Out[25]:
array([14024], dtype=int64)
In [26]:
num.shape


Out[26]:
(1,)
In [27]:
num[0]

Out[27]:
14024
In [28]:
#对整体样本进行统计
def tophalf(group):
a = group_n = group.sort_values(by=['prop'],ascending=True).prop.cumsum()
return a.searchsorted(0.5)[0]
halfnum = name.groupby(['year','gender']).apply(tophalf)

Out[28]:
gender F M
year
1880 904 1044
1881 900 983
1882 990 1084
1883 1015 1015
1884 1133 1109
In [29]:
halfnum.unstack().plot(title='Number of popular names in top 50%') ##书中是拿TOP1000的数据做的统计，我用全量数，所以图不一样

Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x22b2c4f26a0>

### *名字最后一个字母的变革,统计1910，1960，2010三年里最后一个字母的使用率变化¶

In [30]:
last_letter = data.name.str[-1]

Out[30]:
0    y
1    a
2    a
3    h
4    e
Name: name, dtype: object
In [31]:
table = data.pivot_table('birth',index=last_letter,columns=['gender','year'],aggfunc=np.sum)

Out[31]:
gender F ... M
year 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 ... 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010
name
a 31446.0 31581.0 36536.0 38330.0 43680.0 45408.0 49100.0 48942.0 59442.0 58631.0 ... 39124.0 38815.0 37825.0 38650.0 36838.0 36156.0 34654.0 32901.0 31430.0 28438.0
b NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 50950.0 49284.0 48065.0 45914.0 43144.0 42600.0 42123.0 39945.0 38862.0 38859.0
c NaN NaN 5.0 5.0 NaN NaN NaN NaN NaN NaN ... 27113.0 27238.0 27697.0 26778.0 26078.0 26635.0 26864.0 25318.0 24048.0 23125.0
d 609.0 607.0 734.0 810.0 916.0 862.0 1007.0 1027.0 1298.0 1374.0 ... 60838.0 55829.0 53391.0 51754.0 50670.0 51410.0 50595.0 47910.0 46172.0 44398.0
e 33378.0 34080.0 40399.0 41914.0 48089.0 49616.0 53884.0 54353.0 66750.0 66663.0 ... 145395.0 144651.0 144769.0 142098.0 141123.0 142999.0 143698.0 140966.0 135496.0 129012.0

5 rows × 262 columns

In [32]:
subtable = table.reindex(columns=[1910,1960,2010],level='year').fillna(0)

Out[32]:
gender F M
year 1910 1960 2010 1910 1960 2010
name
a 108376.0 691247.0 670605.0 977.0 5204.0 28438.0
b 0.0 694.0 450.0 411.0 3912.0 38859.0
c 5.0 49.0 946.0 482.0 15476.0 23125.0
d 6750.0 3729.0 2607.0 22111.0 262112.0 44398.0
e 133569.0 435013.0 313833.0 28655.0 178823.0 129012.0
In [33]:
subtable.sum()

Out[33]:
gender  year
F       1910     396416.0
1960    2022062.0
2010    1759010.0
M       1910     194198.0
1960    2132588.0
2010    1898382.0
dtype: float64
In [34]:
tt = subtable.div(subtable.sum(),axis=1)

Out[34]:
gender F M
year 1910 1960 2010 1910 1960 2010
name
a 0.273390 0.341853 0.381240 0.005031 0.002440 0.014980
b 0.000000 0.000343 0.000256 0.002116 0.001834 0.020470
c 0.000013 0.000024 0.000538 0.002482 0.007257 0.012181
d 0.017028 0.001844 0.001482 0.113858 0.122908 0.023387
e 0.336941 0.215133 0.178415 0.147556 0.083853 0.067959
In [35]:
import matplotlib.pyplot as plt
fig,axes = plt.subplots(2,1,figsize=(10,10))
tt['M'].plot(kind='bar',rot=0,ax=axes[0],title='male')
tt['F'].plot(kind='bar',rot=0,ax=axes[1],title='female',legend=False)

Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x22b34df5be0>

### 取出男孩中，最后一个字母为d,n,y的统计结果¶

In [36]:
tl = table.loc[['d','n','y'],'M']
##注意此处不能用tl = table.loc[['d','n','y'],['M']]
tl = tl.div(tl.sum(axis = 0))
tl

Out[36]:
year 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 ... 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010
name
d 0.266177 0.265193 0.273142 0.266999 0.272152 0.272450 0.274767 0.277674 0.270774 0.278388 ... 0.074552 0.068025 0.063007 0.060451 0.058769 0.057370 0.055274 0.052974 0.052799 0.052635
n 0.491023 0.488079 0.478688 0.481636 0.473754 0.466542 0.469170 0.463988 0.467060 0.460201 ... 0.754981 0.768022 0.782907 0.789611 0.796030 0.804582 0.809917 0.811432 0.818071 0.816452
y 0.242800 0.246727 0.248170 0.251366 0.254094 0.261007 0.256063 0.258338 0.262166 0.261411 ... 0.170467 0.163952 0.154086 0.149938 0.145201 0.138048 0.134809 0.135594 0.129130 0.130913

3 rows × 131 columns

In [37]:
yy = tl.T

Out[37]:
name d n y
year
1880 0.266177 0.491023 0.242800
1881 0.265193 0.488079 0.246727
1882 0.273142 0.478688 0.248170
1883 0.266999 0.481636 0.251366
1884 0.272152 0.473754 0.254094
In [38]:
yy.plot(xticks=np.linspace(1880,2010,10))

Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x22b2c534940>

### *找出以'lesl'开头的名字在男女中的分布趋势¶

In [45]:
uniname = data.name.unique()
uniname

Out[45]:
array(['Mary', 'Anna', 'Emma', ..., 'Zymaire', 'Zyonne', 'Zzyzx'], dtype=object)
In [57]:
mask = np.array(['lesl' in x.lower() for x in uniname])

Out[57]:
array([False, False, False, ..., False, False, False], dtype=bool)
In [61]:
resname = uniname[mask]
lname = data[data.name.isin(resname)]

Out[61]:
name gender birth year
654 Leslie F 8 1880
1108 Leslie M 79 1880
2523 Leslie F 11 1881
3072 Leslie M 92 1881
4593 Leslie F 9 1882
In [88]:
ll = lname.pivot_table('birth',index=['year'],columns='gender',aggfunc=np.sum)

Out[88]:
gender F M
year
1880 8 79
1881 11 92
1882 9 128
1883 7 125
1884 15 125
In [90]:
aa = ll.div(ll.sum(axis = 1),axis=0)

Out[90]:
gender F M
year
1880 0.091954 0.908046
1881 0.106796 0.893204
1882 0.065693 0.934307
1883 0.053030 0.946970
1884 0.107143 0.892857
In [97]:
aa.plot(style={'F':'y-','M':'c--'}) #b---blue   c---cyan  g---green    k----black m---magenta r---red  w---white    y----yellow

Out[97]:
<matplotlib.axes._subplots.AxesSubplot at 0x22b368bceb8>