实验:
第6课 数据分析工具
第一节 层级索引
In [30]:
import pandas as pd
import numpy as np
In [2]:
# Path to the 2016_happiness.csv dataset read by the next cell.
# NOTE(review): this is an absolute, machine-specific Windows path, so the notebook only
# runs where this exact directory exists; consider a path relative to the notebook.
filepath = r'C:\Users\ML Learning\Projects\第四章-数据分析预习内容\第四章-数据分析预习内容\第二节-数据分析工具pandas高阶\3_lesson_06\lesson_06\examples\datasets\2016_happiness.csv'
读取文件
In [3]:
# Load only the four columns this lesson works with.
data = pd.read_csv(
    filepath,
    usecols=['Country', 'Region', 'Happiness Rank', 'Happiness Score'],
)
In [7]:
# Preview the first rows.
# NOTE(review): the saved output already shows Region/Country as index levels, so this
# cell was executed (In [7]) after the set_index cell further down (In [5]). On a fresh
# top-to-bottom run, head() here shows them as ordinary columns instead.
data.head()
Out[7]:
| Region | Country | Happiness Rank | Happiness Score |
|---|---|---|---|
| Western Europe | Denmark | 1 | 7.526 |
| | Switzerland | 2 | 7.509 |
| | Iceland | 3 | 7.501 |
| | Norway | 4 | 7.498 |
| | Finland | 5 | 7.413 |
设置多个索引列
In [5]:
# Build a two-level row index: Region (outer) and Country (inner).
# The result is reassigned rather than using inplace=True, which pandas discourages
# and which prevents method chaining. Re-running this cell on the already-indexed
# frame raises KeyError because the two columns have been moved into the index.
data = data.set_index(['Region', 'Country'])
In [9]:
# Display the frame with its (Region, Country) index; pandas truncates the middle rows.
data
Out[9]:
| Region | Country | Happiness Rank | Happiness Score |
|---|---|---|---|
| Western Europe | Denmark | 1 | 7.526 |
| | Switzerland | 2 | 7.509 |
| | Iceland | 3 | 7.501 |
| | Norway | 4 | 7.498 |
| | Finland | 5 | 7.413 |
| ... | ... | ... | ... |
| Sub-Saharan Africa | Benin | 153 | 3.484 |
| Southern Asia | Afghanistan | 154 | 3.360 |
| Sub-Saharan Africa | Togo | 155 | 3.303 |
| Middle East and Northern Africa | Syria | 156 | 3.069 |
| Sub-Saharan Africa | Burundi | 157 | 2.905 |
157 rows × 2 columns
选取子集
In [11]:
# Select a single row by giving both index levels (outer: Region, inner: Country).
# The key is passed as one tuple so it is unambiguously a row label rather than a
# (row, column) pair.
data.loc[('Australia and New Zealand', 'New Zealand')]
Out[11]:
Happiness Rank 8.000
Happiness Score 7.334
Name: (Australia and New Zealand, New Zealand), dtype: float64
交换层级顺序
In [12]:
# Swap the two index levels so Country becomes the outer level.
# The result is only displayed; `data` itself keeps the (Region, Country) order.
data.swaplevel()
Out[12]:
Happiness Rank | Happiness Score | ||
---|---|---|---|
Country | Region | ||
Denmark | Western Europe | 1 | 7.526 |
Switzerland | Western Europe | 2 | 7.509 |
Iceland | Western Europe | 3 | 7.501 |
Norway | Western Europe | 4 | 7.498 |
Finland | Western Europe | 5 | 7.413 |
... | ... | ... | ... |
Benin | Sub-Saharan Africa | 153 | 3.484 |
Afghanistan | Southern Asia | 154 | 3.360 |
Togo | Sub-Saharan Africa | 155 | 3.303 |
Syria | Middle East and Northern Africa | 156 | 3.069 |
Burundi | Sub-Saharan Africa | 157 | 2.905 |
157 rows × 2 columns
层级索引排序
In [14]:
# Sort rows by the index levels: Region first, then Country within each region.
# The result is only displayed; `data` itself keeps its original row order.
data.sort_index()
Out[14]:
| Region | Country | Happiness Rank | Happiness Score |
|---|---|---|---|
| Australia and New Zealand | Australia | 9 | 7.313 |
| | New Zealand | 8 | 7.334 |
| Central and Eastern Europe | Albania | 109 | 4.655 |
| | Armenia | 121 | 4.360 |
| | Azerbaijan | 81 | 5.291 |
| ... | ... | ... | ... |
| Western Europe | Portugal | 94 | 5.123 |
| | Spain | 37 | 6.361 |
| | Sweden | 10 | 7.291 |
| | Switzerland | 2 | 7.509 |
| | United Kingdom | 23 | 6.725 |
157 rows × 2 columns
groupby()
In [15]:
# Group by a single key. 'Region' is an index level at this point (set by set_index
# earlier), and groupby accepts index level names as well as column names.
obj1 = data.groupby('Region')
print(type(obj1))
<class 'pandas.core.groupby.generic.DataFrameGroupBy'>
In [16]:
# Group by several keys at once: here both index levels, Region and Country.
obj2 = data.groupby(['Region','Country'])
print(type(obj2))
<class 'pandas.core.groupby.generic.DataFrameGroupBy'>
常用的聚合操作
In [17]:
# Mean of each column within every region.
obj1.mean()
Out[17]:
Happiness Rank | Happiness Score | |
---|---|---|
Region | ||
Australia and New Zealand | 8.500000 | 7.323500 |
Central and Eastern Europe | 78.448276 | 5.370690 |
Eastern Asia | 67.166667 | 5.624167 |
Latin America and Caribbean | 48.333333 | 6.101750 |
Middle East and Northern Africa | 78.105263 | 5.386053 |
North America | 9.500000 | 7.254000 |
Southeastern Asia | 80.000000 | 5.338889 |
Southern Asia | 111.714286 | 4.563286 |
Sub-Saharan Africa | 129.657895 | 4.136421 |
Western Europe | 29.190476 | 6.685667 |
In [18]:
# Aggregating a grouped DataFrame yields an ordinary DataFrame indexed by the group key.
type(obj1.mean())
Out[18]:
pandas.core.frame.DataFrame
In [19]:
# Maximum of each column within every region.
obj1.max()
Out[19]:
Happiness Rank | Happiness Score | |
---|---|---|
Region | ||
Australia and New Zealand | 9 | 7.334 |
Central and Eastern Europe | 129 | 6.596 |
Eastern Asia | 101 | 6.379 |
Latin America and Caribbean | 136 | 7.087 |
Middle East and Northern Africa | 156 | 7.267 |
North America | 13 | 7.404 |
Southeastern Asia | 140 | 6.739 |
Southern Asia | 154 | 5.196 |
Sub-Saharan Africa | 157 | 5.648 |
Western Europe | 99 | 7.526 |
In [20]:
# Number of rows in each region, returned as a Series.
obj1.size()
Out[20]:
Region
Australia and New Zealand 2
Central and Eastern Europe 29
Eastern Asia 6
Latin America and Caribbean 24
Middle East and Northern Africa 19
North America 2
Southeastern Asia 9
Southern Asia 7
Sub-Saharan Africa 38
Western Europe 21
dtype: int64
In [23]:
obj1.count() # per-column count of non-null values in each group
Out[23]:
Happiness Rank | Happiness Score | |
---|---|---|
Region | ||
Australia and New Zealand | 2 | 2 |
Central and Eastern Europe | 29 | 29 |
Eastern Asia | 6 | 6 |
Latin America and Caribbean | 24 | 24 |
Middle East and Northern Africa | 19 | 19 |
North America | 2 | 2 |
Southeastern Asia | 9 | 9 |
Southern Asia | 7 | 7 |
Sub-Saharan Africa | 38 | 38 |
Western Europe | 21 | 21 |
In [24]:
# Rows per (Region, Country) pair; every pair occurs once, so each size is 1.
obj2.size()
Out[24]:
Region Country
Australia and New Zealand Australia 1
New Zealand 1
Central and Eastern Europe Albania 1
Armenia 1
Azerbaijan 1
..
Western Europe Portugal 1
Spain 1
Sweden 1
Switzerland 1
United Kingdom 1
Length: 157, dtype: int64
自定义分组
In [25]:
# Custom grouping function
def get_score_group(score, low_max=4, middle_max=6):
    """Map a numeric score to a coarse label.

    Parameters
    ----------
    score : number
        The score to classify.
    low_max : number, default 4
        Scores less than or equal to this value are labelled 'low'.
    middle_max : number, default 6
        Scores above ``low_max`` and up to this value are labelled 'middle'.

    Returns
    -------
    str
        'low', 'middle' or 'high'.

    Notes
    -----
    Any value that fails both comparisons is labelled 'high'. That includes
    NaN, because every comparison with NaN is False.
    """
    if score <= low_max:
        return 'low'
    if score <= middle_max:
        return 'middle'
    return 'high'
In [26]:
# Method 1: pass the custom function itself as the grouping key.
# groupby calls a function key on each index label, so 'Happiness Score' is made
# the index first; size() then counts the rows that fall into each label.
data2 = data.set_index('Happiness Score')
data2.groupby(get_score_group).size()
Out[26]:
high 47
low 21
middle 89
dtype: int64
In [27]:
# Method 2: build an explicit grouping column by applying the function to each score.
# Note this adds 'score group' to `data` itself, so every later cell sees the new column.
data['score group'] = data['Happiness Score'].apply(get_score_group)
data.head()
Out[27]:
| Region | Country | Happiness Rank | Happiness Score | score group |
|---|---|---|---|---|
| Western Europe | Denmark | 1 | 7.526 | high |
| | Switzerland | 2 | 7.509 | high |
| | Iceland | 3 | 7.501 | high |
| | Norway | 4 | 7.498 | high |
| | Finland | 5 | 7.413 | high |
In [28]:
# Per-region maximum of every column, now including 'score group'.
# NOTE(review): 'score group' holds strings, so its max is alphabetical
# ('middle' > 'low' > 'high') and does not represent the best group reached in a region.
data.groupby('Region').max()
Out[28]:
Happiness Rank | Happiness Score | score group | |
---|---|---|---|
Region | |||
Australia and New Zealand | 9 | 7.334 | high |
Central and Eastern Europe | 129 | 6.596 | middle |
Eastern Asia | 101 | 6.379 | middle |
Latin America and Caribbean | 136 | 7.087 | middle |
Middle East and Northern Africa | 156 | 7.267 | middle |
North America | 13 | 7.404 | high |
Southeastern Asia | 140 | 6.739 | middle |
Southern Asia | 154 | 5.196 | middle |
Sub-Saharan Africa | 157 | 5.648 | middle |
Western Europe | 99 | 7.526 | middle |
In [31]:
# Same aggregation expressed through agg().
# The string alias 'max' is used instead of np.max: recent pandas releases emit a
# FutureWarning when NumPy callables are passed to agg.
data.groupby('Region').agg('max')
Out[31]:
Happiness Rank | Happiness Score | score group | |
---|---|---|---|
Region | |||
Australia and New Zealand | 9 | 7.334 | high |
Central and Eastern Europe | 129 | 6.596 | middle |
Eastern Asia | 101 | 6.379 | middle |
Latin America and Caribbean | 136 | 7.087 | middle |
Middle East and Northern Africa | 156 | 7.267 | middle |
North America | 13 | 7.404 | high |
Southeastern Asia | 140 | 6.739 | middle |
Southern Asia | 154 | 5.196 | middle |
Sub-Saharan Africa | 157 | 5.648 | middle |
Western Europe | 99 | 7.526 | middle |
In [33]:
# Pass a list of aggregations to get one result column per function.
# String aliases replace np.max/np.min/np.mean, since passing NumPy callables to agg is
# deprecated in recent pandas. The result columns are labelled 'max', 'min' and 'mean'
# (older pandas labelled the NumPy callables 'amax' and 'amin').
data.groupby('Region')['Happiness Score'].agg(['max', 'min', 'mean'])
Out[33]:
amax | amin | mean | |
---|---|---|---|
Region | |||
Australia and New Zealand | 7.334 | 7.313 | 7.323500 |
Central and Eastern Europe | 6.596 | 4.217 | 5.370690 |
Eastern Asia | 6.379 | 4.907 | 5.624167 |
Latin America and Caribbean | 7.087 | 4.028 | 6.101750 |
Middle East and Northern Africa | 7.267 | 3.069 | 5.386053 |
North America | 7.404 | 7.104 | 7.254000 |
Southeastern Asia | 6.739 | 3.907 | 5.338889 |
Southern Asia | 5.196 | 3.360 | 4.563286 |
Sub-Saharan Africa | 5.648 | 2.905 | 4.136421 |
Western Europe | 7.526 | 5.033 | 6.685667 |
In [34]:
# Use a dict to apply a different aggregation to each column.
# String aliases replace np.mean/np.max, since passing NumPy callables to agg is
# deprecated in recent pandas.
data.groupby('Region').agg({'Happiness Score': 'mean', 'Happiness Rank': 'max'})
Out[34]:
Happiness Score | Happiness Rank | |
---|---|---|
Region | ||
Australia and New Zealand | 7.323500 | 9 |
Central and Eastern Europe | 5.370690 | 129 |
Eastern Asia | 5.624167 | 101 |
Latin America and Caribbean | 6.101750 | 136 |
Middle East and Northern Africa | 5.386053 | 156 |
North America | 7.254000 | 13 |
Southeastern Asia | 5.338889 | 140 |
Southern Asia | 4.563286 | 154 |
Sub-Saharan Africa | 4.136421 | 157 |
Western Europe | 6.685667 | 99 |
In [35]:
# Custom aggregation function
def max_min_diff(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` must provide ``max()`` and ``min()`` methods, as a pandas Series does.
    """
    return x.max() - x.min()
# Apply the custom aggregation to the 'Happiness Rank' values of each region.
data.groupby('Region')['Happiness Rank'].agg(max_min_diff)
Out[35]:
Region
Australia and New Zealand 1
Central and Eastern Europe 102
Eastern Asia 67
Latin America and Caribbean 122
Middle East and Northern Africa 145
North America 7
Southeastern Asia 118
Southern Asia 70
Sub-Saharan Africa 91
Western Europe 98
Name: Happiness Rank, dtype: int64
In [ ]: