# 27CC4E55A1274AE18333ED2C9A1126DC
'''人口分析实战'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the three input tables (per the file names: state abbreviations, state
# areas and state population records).
# The Windows paths are written as raw strings so that backslash sequences such
# as "\A" or "\s" are never treated as escape sequences. The resulting path
# text is unchanged; only the invalid-escape warning (a future syntax error)
# goes away.
# The bare .head() calls are previews: their result is shown only in an
# interactive session and is discarded when the file runs as a script.
abb = pd.read_csv(r'E:\AI课程笔记\数据分析\state-abbrevs.csv')
abb.head()
areas = pd.read_csv(r'E:\AI课程笔记\数据分析\state-areas.csv')
areas.head()
populations = pd.read_csv(r'E:\AI课程笔记\数据分析\state-population.csv')
populations.head()
# Merge the abbreviation table with the population records.
# how='outer' keeps the union of keys from both tables, so rows without a
# match on the other side survive with NaN in the missing columns.
abb_populations = pd.merge(
    abb,
    populations,
    how='outer',
    left_on='abbreviation',
    right_on='state/region',
)
abb_populations.head()
# Drop the 'abbreviation' key column (the original author treats it as a
# duplicate of 'state/region').
abb_populations.drop(columns='abbreviation', inplace=True)
abb_populations.head()
# Inspect missing values per column.
abb_populations.isnull().any()
abb_populations.info()
# List the distinct state/region codes whose 'state' name is NaN.
# A single .loc[rows, column] lookup replaces the chained .loc[...][...] form.
abb_populations.loc[abb_populations['state'].isnull(), 'state/region'].unique()
# Fill in the missing state names.
# NOTE(review): 'PR' and 'USA' are presumably the codes reported by the check
# above; confirm against the data.
# `a` and `b` hold index *labels*, so they are used with the label-based .loc.
# The positional .iloc only gave the same rows while the frame still had its
# default 0..n-1 index.
a = abb_populations.loc[abb_populations['state/region'] == 'PR'].index  # labels of the PR rows
abb_populations.loc[a]
abb_populations.loc[a, "state"] = 'Puerto Rico'
b = abb_populations.loc[abb_populations['state/region'] == 'USA'].index  # labels of the USA rows
abb_populations.loc[b]
abb_populations.loc[b, "state"] = 'United States'
# Re-check which columns still contain missing values.
abb_populations.isnull().any()
# Attach the per-state area figures, joining on the 'state' column.
# The outer join keeps rows from both tables even when one side has no match.
abb_populations_areas = abb_populations.merge(areas, how='outer', on='state')
abb_populations_areas.head()
# Find the rows that have no area value and remove them in place.
missing_area = abb_populations_areas['area (sq. mi)'].isnull()
indexs = abb_populations_areas[missing_area].index
abb_populations_areas.drop(index=indexs, inplace=True)
# Re-check which columns still contain missing values.
abb_populations_areas.isnull().any()
# Select the year-2010 rows for the 'total' age group; query() filters rows
# with a boolean expression over the column names.
# NOTE(review): `pop` is not used by the density calculation below, which runs
# over every row of abb_populations_areas. Confirm that is intended.
pop = abb_populations_areas.query("year == 2010 & ages == 'total'")
# Population density per row: population divided by area in square miles.
pop_density = abb_populations_areas['population'].div(
    abb_populations_areas['area (sq. mi)']
)
abb_populations_areas["密度"] = pop_density
abb_populations_areas.head()
# Keep only the state name and its density, then show the table.
c = abb_populations_areas.loc[:, ["state", "密度"]]
print(c)
# Average the density of all rows belonging to each state.
grouped = c.groupby('state')['密度'].mean()
print(grouped)
# Order the states from highest to lowest mean density.
grouped = grouped.sort_values(ascending=False)
print(grouped)
# 08-02
# 08-11