In [15]:
from __future__ import division
from numpy.random import randn
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Default size (width, height) applied to every figure created afterwards.
plt.rc('figure', figsize=(12, 5))
# Print NumPy floats with 4 digits of precision.
np.set_printoptions(precision=4)
# Show the working directory; the relative 'ch02/names/...' paths below resolve against it.
%pwd
Out[15]:
In [4]:
import pandas as pd
# Load the 1880 file, supplying the column names explicitly.
names1880 = pd.read_csv('ch02/names/yob1880.txt', names=['name', 'sex', 'births'])
# Display the loaded frame.
names1880
Out[4]:
In [5]:
# Total number of births in the 1880 file for each sex.
names1880.groupby('sex')['births'].sum()
Out[5]:
In [7]:
# 2010 is the last available year right now
years = range(1880, 2011)
columns = ['name', 'sex', 'births']

# Read one file per year and tag every row with the year it came from.
pieces = []
for year in years:
    frame = pd.read_csv('ch02/names/yob%d.txt' % year, names=columns)
    frame['year'] = year
    pieces.append(frame)

# Stack the yearly frames into one DataFrame with a fresh 0..n-1 index.
names = pd.concat(pieces, ignore_index=True)
In [8]:
# Display the combined all-years frame (columns: name, sex, births, year).
names
Out[8]:
In [12]:
# Total births per year (rows) broken down by sex (columns).
# The aggregation is named with the string 'sum' rather than the Python
# builtin: recent pandas versions warn when the builtin is passed, and the
# string form selects the same groupby sum explicitly.
total_births = names.pivot_table('births', index='year',
                                 columns='sex', aggfunc='sum')
In [13]:
# Show the last rows of the per-year, per-sex totals.
total_births.tail()
Out[13]:
In [16]:
# Plot one line per column of total_births against its 'year' index.
total_births.plot(title='Total births by sex and year')
Out[16]:
In [19]:
def get_top1000(group):
    """Return the rows of ``group`` with the most births, at most 1000.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame containing a 'births' column (one year/sex group when used
        with ``groupby(...).apply``).

    Returns
    -------
    pandas.DataFrame
        Rows of ``group`` ordered by 'births' descending, truncated to the
        first 1000; shorter groups are returned whole.
    """
    # sort_index(by=...) was deprecated and later removed from pandas;
    # sort_values is the documented replacement for sorting by a column.
    return group.sort_values(by='births', ascending=False)[:1000]
# Split the full table into one group per (year, sex) combination ...
grouped = names.groupby(['year', 'sex'])
# ... and keep only the highest-birth rows of each group via get_top1000.
top1000 = grouped.apply(get_top1000)
In [29]:
# Replace the index left by groupby/apply with a plain 0..n-1 integer index.
top1000.index = np.arange(len(top1000))
In [30]:
# Display the re-indexed top-1000 table.
top1000
Out[30]:
In [31]:
# Split the top-1000 table into male and female rows using the 'sex' column.
boys = top1000[top1000['sex'] == 'M']
girls = top1000[top1000['sex'] == 'F']
In [33]:
# Births per year (rows) for every name in the top-1000 table (columns).
# Note: this rebinds `total_births`, which earlier held the per-sex totals.
# The aggregation is named with the string 'sum' rather than the Python
# builtin, which recent pandas versions warn about.
total_births = top1000.pivot_table('births', index='year', columns='name',
                                   aggfunc='sum')
total_births
Out[33]:
In [34]:
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']] # analyze the trend of these four names
# One subplot per selected name, all drawn from the per-year birth counts.
subset.plot(subplots=True, figsize=(12, 10), grid=False,
title="Number of births per year")
Out[34]:
In [35]:
# Create a new, empty pyplot figure and make it the current one.
plt.figure()
Out[35]:
In [39]:
# Share of each year's births, per sex, that is covered by the top-1000 names.
#
# The original call passed no values column to pivot_table, and no cell
# visible here adds a 'prop' column to top1000, so every remaining column was
# aggregated instead of a proportion. The plot below is titled and scaled
# (y ticks 0 to 1.2) for proportions, so compute them directly from births:
# top-1000 births divided by all births for the same year and sex.
top_births = top1000.pivot_table('births', index='year',
                                 columns='sex', aggfunc='sum')
all_births = names.pivot_table('births', index='year',
                               columns='sex', aggfunc='sum')
table = top_births / all_births
table.plot(title='Sum of table1000.prop by year and sex',
           yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))
Out[39]:
In [65]:
# extract last letter from name column
def get_last_letter(x):
    """Return the final character of the string ``x``."""
    return x[-1]


last_letters = names.name.map(get_last_letter)
# Name the Series so the pivot's row index is labelled 'last_letter'.
last_letters.name = 'last_letter'

# Births summed per last letter (rows) for every (sex, year) pair (columns).
# The aggregation is named with the string 'sum' rather than the Python
# builtin, which recent pandas versions warn about.
table = names.pivot_table('births', index=last_letters,
                          columns=['sex', 'year'], aggfunc='sum')
In [66]:
# Keep only the 1910, 1960 and 2010 columns (matched on the 'year' column level).
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
# Not the last expression of the cell, so this result is not displayed.
subtable.head()
# Column totals: births per (sex, year) over all last letters.
subtable.sum()
Out[66]:
In [67]:
# Turn counts into proportions: divide every column by its own total so each
# (sex, year) column of letter_prop sums to 1.
letter_prop = subtable.div(subtable.sum().astype(float), axis='columns')
In [68]:
import matplotlib.pyplot as plt
# Two stacked bar charts of last-letter proportions: male on top, female below.
fig, axes = plt.subplots(2, 1, figsize=(10, 8))
# Set the vertical gap on this figure object directly, in the same cell that
# draws it. A pyplot-level subplots_adjust issued from a later cell acts on
# whatever the current figure is at that point, which with the inline backend
# is normally no longer this one.
fig.subplots_adjust(hspace=0.25)
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female',
                      legend=False)
Out[68]:
In [69]:
# Set the vertical spacing between subplots of the current pyplot figure.
# NOTE(review): with the inline backend the figure drawn by the previous cell
# may already be closed, in which case this has no visible effect — confirm.
plt.subplots_adjust(hspace=0.25)
In [70]:
# Proportions for the full table: each (sex, year) column divided by its total.
letter_prop = table / table.sum().astype(float)
# Select the rows for last letters d, n and y among the male columns, then
# transpose so that years become rows and letters become columns.
# .ix was deprecated and removed from pandas; the selection here is purely
# label-based, so .loc is the exact replacement.
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T
dny_ts.head()
Out[70]:
In [71]:
# Close every open pyplot figure before drawing the next plot.
plt.close('all')
In [72]:
# Line plot of the d / n / y last-letter proportions for boys, one line per letter.
dny_ts.plot()
Out[72]:
In [73]:
# Every distinct name that appears in the top-1000 table.
all_names = top1000['name'].unique()
# True for names containing 'lesl', compared case-insensitively.
mask = np.array(['lesl' in candidate.lower() for candidate in all_names])
lesley_like = all_names[mask]
lesley_like
Out[73]:
In [74]:
# Restrict the top-1000 rows to the Lesley-like names found above.
filtered = top1000[top1000['name'].isin(lesley_like)]
# Total births recorded for each of those spellings.
filtered.groupby('name')['births'].sum()
Out[74]:
In [76]:
# Births for the Lesley-like names per year (rows), split by sex (columns).
table = filtered.pivot_table('births', index='year',
                             columns='sex', aggfunc='sum')
# Divide each row by its own total so the values become per-year shares by sex.
table = table.div(table.sum(axis=1), axis='index')
table.tail()
Out[76]:
In [77]:
# Close every open pyplot figure before drawing the next plot.
plt.close('all')
In [78]:
# Plot both sex columns in black: solid line for 'M', dashed line for 'F'.
table.plot(style={'M': 'k-', 'F': 'k--'})
Out[78]: