对《利用Python进行数据分析》(Wes McKinney 著)第二章第二部分——1880—2010 年全美婴儿姓名分析——进行代码实验。原书采用的是 Python 2.7,而我采用的是 Python 3.7,并使用 PyCharm 调试,因此对源代码做了一定的修改,并增加了部分画图代码。每一步的打印结果和图形都已与原书对照校验,结果一致(部分打印语句已被注释掉,如需查看结果可取消注释)。代码全部手工敲写,供参考。
(1)《利用Python 进行数据分析》pdf下载链接:
链接:https://pan.baidu.com/s/1NiFQb5ZIR0UpKy-K9cGeTQ
提取码:vn9m
(2)《利用Python 进行数据分析》数据集链接如下:
链接:https://pan.baidu.com/s/1jfWBk8WMHDQh6nGoSBAsmw
提取码:bgcq
第二章中的第二部分的代码还原如下:
#-*- coding:utf-8 -*-
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
### US baby names, 1880-2010 ###
# First look at the data: load the 1880 file by itself.
# Adjust the path to wherever your copy of the data set lives.
names880 = pd.read_csv(
    "PythonForDataAnalysis-master/ch02/names/yob1880.txt",
    names=["name", "sex", "births"],
)
# print(names880)
# Total births recorded in the 1880 file, split by sex.
print(names880.groupby('sex')['births'].sum())
# Column names handed to read_csv for every yearly file.
columns = ['name', 'sex', 'births']
years = range(1880, 2011)  # the upper bound is exclusive: last year is 2010
pieces = []
for year in years:
    # Adjust the path to wherever your copy of the data set lives.
    path = "PythonForDataAnalysis-master/ch02/names/yob%d.txt" % (year)
    # Tag every row with the year of the file it came from.
    frame = pd.read_csv(path, names=columns).assign(year=year)
    pieces.append(frame)
# Stack all years into one long table with a fresh 0..n-1 index.
names = pd.concat(pieces, ignore_index=True)
# print(names)
# Pivot table of total births: one row per year, one column per sex.
# The aggregation is named with the string 'sum' rather than the builtin
# ``sum``: recent pandas versions emit a FutureWarning for the builtin.
total_births = names.pivot_table('births', index='year', columns='sex',
                                 aggfunc='sum')
print(total_births.tail())
# total_births.plot(title='total births by sex and year')
plt.plot(total_births)
plt.title('total births by sex and year')
# Take the legend labels from the pivot's own column order instead of a
# hard-coded list, so they cannot drift out of sync with the plotted lines.
plt.legend(total_births.columns)
plt.show()
def add_prop(group):
    """Return *group* with a ``prop`` column added.

    ``prop`` is each row's ``births`` value divided by the sum of ``births``
    over all rows of ``group``.

    The result is a new frame built with ``assign``; ``group`` itself is left
    untouched, because pandas does not support functions passed to
    ``groupby(...).apply`` mutating the object they receive.
    """
    # Cast up front so the division is a float division whatever the dtype.
    births = group.births.astype(float)
    return group.assign(prop=births / births.sum())
# Attach to every row its share of the births within its (year, sex) group.
# This is a vectorised equivalent of
#     names.groupby(['year', 'sex']).apply(add_prop)
# On recent pandas that apply call prepends the group keys to the index, so
# 'year' and 'sex' end up as both index levels and columns and the groupby
# in the check below fails as ambiguous. transform keeps the original index
# and row order.
group_totals = names.groupby(['year', 'sex'])['births'].transform('sum')
names['prop'] = names['births'] / group_totals
# print(names)
# Sanity check: the proportions inside every group must add up to 1.
print(np.allclose(names.groupby(['year', 'sex'])['prop'].sum(), 1))
# Take a subset: the top 1000 names of each sex/year combination.
def get_top1000(group):
    """Return the first 1000 rows of *group* sorted by ``births``, largest first."""
    ranked = group.sort_values(by='births', ascending=False)
    return ranked.head(1000)
# Group once and reuse the grouping for both ways of building the subset.
grouped = names.groupby(['year', 'sex'])
# Version 1: let pandas apply get_top1000 to every group. Printed for
# comparison only; it is replaced by version 2 right below.
top1000 = grouped.apply(get_top1000)
print(top1000)
# Version 2 ("do it yourself"): take the top rows group by group and glue
# the pieces together with a fresh 0..n-1 index. This flat table is the one
# the rest of the analysis works with.
# Iterating a groupby yields (key, frame) pairs; the key here is a
# (year, sex) tuple and is not needed.
pieces = [get_top1000(group) for _, group in grouped]
top1000 = pd.concat(pieces, ignore_index=True)
print(top1000)
# Analyse naming trends.
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']
# Births per year for every name: rows are years, columns are names.
# 'sum' is passed as a string because recent pandas versions emit a
# FutureWarning when handed the builtin ``sum``.
total_births = top1000.pivot_table('births', index='year', columns='name',
                                   aggfunc='sum')
print(total_births)
names_plot = ["John", "Harry", "Mary", "Marilyn"]
# One stacked subplot per name; the grid is sized from the list so the two
# cannot get out of step.
fig, axes = plt.subplots(len(names_plot), 1, figsize=(10, 6))
# Overall title for the figure.
fig.suptitle("Number of births per year")
# atleast_1d: plt.subplots returns a bare Axes (not an array) for a 1x1 grid.
for ax, plot_name in zip(np.atleast_1d(axes), names_plot):
    ax.plot(total_births[plot_name], label=plot_name)
    ax.legend()  # show the name as the subplot's legend
# plt.plot(total_births['Mary'])
plt.show()
# Measure the growth in naming diversity: sum of prop over the top-1000
# names, per year and sex.
table = top1000.pivot_table('prop', index='year', columns='sex',
                            aggfunc='sum')
# table.plot(title = 'Sum of table1000.prop by year and sex',yticks = np.linspace(0,1.2,13),xticks = range(1880,2020,10))
plt.figure('Sum of table1000.prop by year and sex')
plt.plot(table)
plt.title('Sum of table1000.prop by year and sex')
# plt.yticks / plt.xticks are functions and have to be *called*. Assigning
# to them (plt.yticks = ...) sets no ticks at all and replaces the pyplot
# functions with arrays for the rest of the session.
plt.yticks(np.linspace(0, 1.2, 13))
plt.xticks(np.arange(1880, 2020, 10))
plt.show()
# Look only at boys' names in 2010.
df = boys[boys['year'] == 2010]
# Running total of prop, from the largest prop downwards.
prop_cumsum = df.sort_values(by='prop', ascending=False)['prop'].cumsum()
print(prop_cumsum[:10])
# Zero-based position at which the running total reaches 0.5.
print(prop_cumsum.searchsorted(0.5))
# Same calculation for 1900; the + 1 turns the zero-based position into a
# count of names.
df = boys[boys['year'] == 1900]
in1900 = df.sort_values(by='prop', ascending=False)['prop'].cumsum()
print(in1900.searchsorted(0.5) + 1)
# Diversity statistic.
def get_quantile_count(group, q=0.5):
    """Count how many of the largest ``prop`` rows are needed to reach *q*.

    Args:
        group: DataFrame with a numeric ``prop`` column.
        q: Target cumulative sum of ``prop``. Defaults to 0.5.

    Returns:
        One plus the zero-based position at which the cumulative sum of
        ``prop`` (largest first) reaches ``q``, as a scalar integer.
    """
    ranked = group.sort_values(by='prop', ascending=False)
    # Search the underlying NumPy array rather than the Series:
    # ndarray.searchsorted returns a scalar for a scalar q, whereas
    # Series.searchsorted returned a length-1 array in older pandas releases,
    # which would leave arrays instead of numbers in a groupby result.
    return ranked['prop'].cumsum().values.searchsorted(q) + 1
# Apply get_quantile_count to every (year, sex) group, then reshape so that
# each sex becomes a column and each year a row.
diversity = (
    top1000.groupby(['year', 'sex'])
    .apply(get_quantile_count)
    .unstack('sex')
)
print(diversity.head())
plt.figure('Number of population names in top 50%')
plt.plot(diversity)
plt.title("Number of population names in top 50%")
plt.legend(diversity.columns)
plt.show()
def get_last_letter(x):
    """Return the final character of the string *x*."""
    return x[-1]


# Last letter of every name, as a Series aligned with ``names``. It is given
# a name of its own because it is used as the pivot table's row key below.
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'
# Births by last letter (rows) for every (sex, year) pair (columns).
# 'sum' is passed as a string because recent pandas versions emit a
# FutureWarning when handed the builtin ``sum``.
table = names.pivot_table('births', index=last_letters,
                          columns=['sex', 'year'], aggfunc='sum')
# Look at three of the years only.
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
print(subtable.head())
print(subtable.sum())
# Bar charts, one per sex.
# plt.figure('sex bar')
# Divide every column of the sub-table by that column's own total.
letter_prop = subtable / subtable.sum().astype(float)
fig, axes = plt.subplots(2, 1, figsize=(10, 6))
male_ax, female_ax = axes
letter_prop['M'].plot(kind='bar', rot=0, ax=male_ax, title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=female_ax, title='Female',
                      legend=False)
plt.show()
# Trend over time of selected last letters among boys.
# Divide every column of the full table by that column's own total.
letter_prop = table / table.sum().astype(float)
# Rows for the letters d, n and y within the 'M' columns, transposed so that
# the letters become the columns.
# (.loc is used here because the older .ix indexer is not available.)
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].transpose()
print(dny_ts.head())
plt.plot(dny_ts)
# columns is an attribute, not a method: no parentheses.
print(dny_ts.columns)
plt.legend(dny_ts.columns)
plt.show()
# Boy names that became girl names.
all_names = top1000['name'].unique()
# Boolean mask marking the names that contain "lesl", ignoring case.
mask = np.array(['lesl' in candidate.lower() for candidate in all_names])
lesl_like = all_names[mask]
# Keep only the rows whose name is one of the lesl-like names.
filtered = top1000[top1000['name'].isin(lesl_like)]
print(top1000.name)
# Total births for each of the lesl-like names.
print(filtered.groupby('name')['births'].sum())
# Aggregate by year and sex.
table = filtered.pivot_table('births', index='year', columns='sex',
                             aggfunc='sum')
print(table.tail())
# Row-wise total per year; NaN cells are skipped, so number + NaN = number.
year_totals = table.sum(axis=1)
print(year_totals)
# Divide every row by its own total.
table = table.div(year_totals, axis=0)
print(table.tail())
# Line plot with one curve per sex.
plt.figure()
for sex in ('M', 'F'):
    plt.plot(table[sex], label=sex)
plt.legend()
plt.show()