《利用Python 进行数据分析》第二章：引言中的分析（含pdf和数据集下载链接）

本文链接：https://blog.csdn.net/zhou4411781/article/details/108556600

       对《利用Python 进行数据分析》（Wes McKinney 著）一书第二章的第二部分——1880-2010 全美婴儿姓名分析——进行代码实验。原书采用的是 Python 2.7，而我采用的是 Python 3.7，并使用 PyCharm 调试，因此对源代码做了一定的修改，并增加了部分画图代码。每一步的打印结果和图形均已与原文对照校验一致（部分打印语句已注释掉，需要查看结果时可取消注释）。代码全部手工敲写，供参考。
       （1）《利用Python 进行数据分析》pdf下载链接：
       链接：https://pan.baidu.com/s/1NiFQb5ZIR0UpKy-K9cGeTQ
       提取码：vn9m
       （2）《利用Python 进行数据分析》数据集链接如下：
       链接：https://pan.baidu.com/s/1jfWBk8WMHDQh6nGoSBAsmw
       提取码：bgcq

第二章中的第二部分的代码还原如下：

#-*- coding:utf-8 -*-
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

### US baby names, 1880-2010 ###
# Column names handed to read_csv for every yearly file.
columns = ['name', 'sex', 'births']

# Load the 1880 file on its own first (change the path to match your setup).
names880 = pd.read_csv(
    "PythonForDataAnalysis-master/ch02/names/yob1880.txt",
    names=columns,
)
# print(names880)


# Total births per sex for 1880.
print(names880.groupby('sex')['births'].sum())

# Read one frame per year and tag each with the year it came from.
years = range(1880, 2011)
pieces = [
    pd.read_csv(
        "PythonForDataAnalysis-master/ch02/names/yob%d.txt" % year,  # change the path to match your setup
        names=columns,
    ).assign(year=year)
    for year in years
]

# Stack all years into one large table with a fresh integer index.
names = pd.concat(pieces, ignore_index=True)
# print(names)

# Pivot table of total births: one row per year, one column per sex.
# The aggregation is named by the string 'sum' (the same spelling the pivot on
# `filtered` near the end of this script uses); recent pandas releases
# deprecate passing the builtin `sum` as aggfunc.
total_births = names.pivot_table('births', index='year', columns='sex', aggfunc='sum')
print(total_births.tail())

# Alternative via pandas plotting:
# total_births.plot(title='total births by sex and year')
plt.plot(total_births)
plt.title('total births by sex and year')
# Take the legend labels from the pivot's own columns so they always follow
# the order of the plotted lines.
plt.legend(total_births.columns)
plt.show()


def add_prop(group):
    """Return ``group`` with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole group, computed in floating point.

    Parameters:
        group: a DataFrame with a ``births`` column.

    Returns:
        A new DataFrame; the frame passed in is left unmodified, because
        pandas does not support functions that mutate the object handed to
        them by ``groupby(...).apply``.
    """
    births = group.births.astype(float)
    # assign() builds a copy carrying the new column instead of writing into
    # the caller's frame.
    return group.assign(prop=births / births.sum())

# Attach the `prop` column group by group. group_keys=False keeps the original
# row index instead of prepending (year, sex) index levels; with those levels
# present, pandas 2.x rejects the groupby below because 'year' and 'sex' would
# be both index levels and column labels.
names = names.groupby(['year', 'sex'], group_keys=False).apply(add_prop)
# print(names)

# Sanity check: the proportions inside every (year, sex) group should sum to 1.
print(np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1))

# Take a subset: the top names (1000 by default) of one sex/year group.
def get_top1000(group, n=1000):
    """Return the ``n`` rows of ``group`` with the largest ``births``.

    Parameters:
        group: a DataFrame with a ``births`` column.
        n: how many rows to keep; defaults to 1000 so existing calls such as
            ``grouped.apply(get_top1000)`` behave as before.

    Returns:
        The rows sorted by ``births`` in descending order, cut to the first
        ``n``. Groups with fewer than ``n`` rows are returned whole.
    """
    return group.sort_values(by='births', ascending=False)[:n]

# Variant 1: let groupby/apply run get_top1000 on every (year, sex) group.
grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)
print(top1000)

# Variant 2: do the same by hand. Each group is sorted by births (descending)
# and cut to its first 1000 rows, then the pieces are concatenated with a
# fresh integer index. This result replaces the `top1000` from variant 1.
# NOTE: the original author remarks that sort_values() is the call needed
# under Python 3.
pieces = [
    group.sort_values(by='births', ascending=False)[:1000]
    for _, group in names.groupby(['year', 'sex'])
]
top1000 = pd.concat(pieces, ignore_index=True)
print(top1000)


# Analyse naming trends
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']

# Births per year (rows) for each name (columns). The aggregation is given as
# the string 'sum' because recent pandas releases deprecate passing the
# builtin `sum` as aggfunc.
total_births = top1000.pivot_table('births', index='year', columns='name', aggfunc='sum')
print(total_births)

names_plot = ["John", "Harry", "Mary", "Marilyn"]
# One stacked subplot per name; the row count follows the list so the two
# cannot get out of step.
fig, axes = plt.subplots(len(names_plot), 1, figsize=(10, 6))
# Overall figure title
fig.suptitle("Number of births per year")
# Draw each name on its own axes, pairing axes and names directly instead of
# indexing both by position.
for ax, name in zip(axes, names_plot):
    ax.plot(total_births[name], label=name)
    ax.legend()  # show the name as the legend entry
# plt.plot(total_births['Mary'])
plt.show()

# Measure the growth in naming diversity: for each year and sex, sum the
# `prop` values of the names that made it into top1000.
table = top1000.pivot_table('prop', index='year', columns='sex', aggfunc='sum')
# Alternative via pandas plotting:
# table.plot(title = 'Sum of table1000.prop by year and sex',yticks = np.linspace(0,1.2,13),xticks = range(1880,2020,10))
plt.figure('Sum of table1000.prop by year and sex')
plt.plot(table)
plt.title('Sum of table1000.prop by year and sex')
# yticks/xticks are functions and must be called. Assigning to them (as the
# previous code did) sets no ticks and replaces the pyplot functions for the
# rest of the session.
plt.yticks(np.linspace(0, 1.2, 13))
plt.xticks(range(1880, 2020, 10))
plt.show()

# Boys' names in 2010 only: sort by prop (largest first) and accumulate.
df = boys.loc[boys['year'] == 2010]
ranked_2010 = df.sort_values(by='prop', ascending=False)
prop_cumsum = ranked_2010['prop'].cumsum()
print(prop_cumsum.iloc[:10])
# Position at which the running total reaches 0.5 (zero-based).
print(prop_cumsum.searchsorted(0.5))

# Same computation for 1900; 1 is added to the zero-based position.
df = boys.loc[boys['year'] == 1900]
ranked_1900 = df.sort_values(by='prop', ascending=False)
in1900 = ranked_1900['prop'].cumsum()
print(in1900.searchsorted(0.5) + 1)


# Diversity statistic
def get_quantile_count(group, q=0.5):
    """Count how many of the largest-``prop`` rows are needed to reach ``q``.

    The rows are ordered by ``prop`` from largest to smallest, a running
    total is taken, and the zero-based position at which ``q`` would be
    inserted into that running total is returned plus one.
    """
    ranked = group.sort_values(by='prop', ascending=False)
    running_share = ranked['prop'].cumsum()
    position = running_share.searchsorted(q)
    return position + 1

# Run get_quantile_count on every (year, sex) group, then move `sex` from the
# row index into the columns.
diversity = (
    top1000.groupby(['year', 'sex'])
    .apply(get_quantile_count)
    .unstack('sex')
)
print(diversity.head())

# The same text names the figure window and titles the plot.
diversity_title = 'Number of population names in top 50%'
plt.figure(diversity_title)
plt.plot(diversity)
plt.title(diversity_title)
plt.legend(diversity.columns)
plt.show()


def get_last_letter(x):
    """Return the last character of ``x``."""
    return x[-1]


# Last letter of every name, as a Series named 'last_letter' so the pivot's
# row index carries that name.
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'
# Births summed per last letter (rows) and per (sex, year) (columns).
# index/columns are passed by keyword for readability, and the aggregation is
# the string 'sum' because recent pandas releases deprecate passing the
# builtin `sum` as aggfunc.
table = names.pivot_table('births', index=last_letters, columns=['sex', 'year'], aggfunc='sum')

# Keep only three years of data
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
print(subtable.head())
print(subtable.sum())

# Bar charts per sex
# plt.figure('sex bar')
# Divide each column of subtable by that column's total.
column_totals = subtable.sum().astype(float)
letter_prop = subtable / column_totals
fig, axes = plt.subplots(2, 1, figsize=(10, 6))
# (column key, panel title, whether to draw a legend) for the two panels.
panels = [('M', 'Male', True), ('F', 'Female', False)]
for ax, (sex_key, panel_title, show_legend) in zip(axes, panels):
    letter_prop[sex_key].plot(kind='bar', rot=0, ax=ax, title=panel_title, legend=show_legend)
plt.show()

# Trend of selected last letters among boys
# Divide each column of the full table by that column's total.
totals_by_column = table.sum().astype(float)
letter_prop = table / totals_by_column
# Rows 'd', 'n', 'y' of the 'M' columns, transposed so the letters become
# columns. (The original author notes that ix() is unavailable under Python 3.)
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].transpose()
print(dny_ts.head())
plt.plot(dny_ts)
# `columns` is an attribute, not a method: do not write dny_ts.columns()
print(dny_ts.columns)
plt.legend(dny_ts.columns)
plt.show()

# Boy names that became girl names
all_names = top1000.name.unique()
# Boolean mask: True where the lower-cased name contains 'lesl'.
mask = np.fromiter(
    ('lesl' in candidate.lower() for candidate in all_names),
    dtype=bool,
    count=len(all_names),
)
lesl_like = all_names[mask]


# Rows of top1000 whose name is one of the matches.
filtered = top1000.loc[top1000.name.isin(lesl_like)]
print(top1000.name)
# Total births for each of the lesl-like names
births_by_name = filtered.groupby('name')['births'].sum()
print(births_by_name)

# Aggregate by sex and year
table = filtered.pivot_table('births', index='year', columns='sex', aggfunc='sum')
print(table.tail())
# Row totals (the original author notes: number + NaN = number here).
row_totals = table.sum(axis=1)
print(row_totals)
# Divide every row of the table by that row's total.
table = table.div(table.sum(axis=1), axis=0)
print(table.tail())

# Yearly curve for each sex
plt.figure()
for sex_key in ('M', 'F'):
    plt.plot(table[sex_key], label=sex_key)
plt.legend()
plt.show()