# 排序与排名
import numpy
from pandas import Series, DataFrame
# Sorting a Series: by index label and by value.
print("series 排序")
x = Series(range(4), index=list('bacd'))
print(x)
# Recorded output:
# series 排序
# b    0
# a    1
# c    2
# d    3
# dtype: int64

# Order the entries by their index labels.
by_label = x.sort_index()
print(by_label)
# Recorded output:
# a    1
# b    0
# c    2
# d    3
# dtype: int64

# Order the entries by their values.
by_value = x.sort_values()
print(by_value)
# Recorded output:
# b    0
# a    1
# c    2
# d    3
# dtype: int64
# Sorting a DataFrame: by row labels, by column labels, and by column values.
print("dataframe 按索引排序")
frame = DataFrame(
    numpy.arange(8).reshape(2, 4),
    index=list('ba'),
    columns=['A', 'B', 'C', 'D'],
)
print(frame)
# Recorded output:
# dataframe 按索引排序
#    A  B  C  D
# b  0  1  2  3
# a  4  5  6  7

# Default axis: sort by the row labels.
rows_sorted = frame.sort_index()
print(rows_sorted)
# Recorded output:
#    A  B  C  D
# a  4  5  6  7
# b  0  1  2  3

# axis=1: sort by the column labels (ascending).
columns_sorted = frame.sort_index(axis='columns')
print(columns_sorted)
# Recorded output:
#    A  B  C  D
# b  0  1  2  3
# a  4  5  6  7

# axis=1 with ascending=False: sort the column labels in descending order.
columns_reversed = frame.sort_index(axis='columns', ascending=False)
print(columns_reversed)
# Recorded output:
#    D  C  B  A
# b  3  2  1  0
# a  7  6  5  4

print("按列的值排序")
# Sort the rows by the values in column 'B' (ascending).
by_b = frame.sort_values(by='B')
print(by_b)
# Recorded output:
# 按列的值排序
#    A  B  C  D
# b  0  1  2  3
# a  4  5  6  7

# Same column, descending order.
by_b_desc = frame.sort_values(by='B', ascending=False)
print(by_b_desc)
# Recorded output:
#    A  B  C  D
# a  4  5  6  7
# b  0  1  2  3
# Sorting a DataFrame by one column and by several columns.
# The column order is given explicitly so the printed frames match the recorded
# output regardless of how pandas orders dict keys.
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]}, columns=['a', 'b'])
print(frame)
# Recorded output:
#    a  b
# 0  0  4
# 1  1  7
# 2  0 -3
# 3  1  2

# Sort the rows by the values in column 'b'.
print(frame.sort_values(by='b'))
# Recorded output:
#    a  b
# 2  0 -3
# 3  1  2
# 0  0  4
# 1  1  7

# Sort by column 'a' first, then by column 'b' to break ties.
print(frame.sort_values(by=['a', 'b']))
# Recorded output:
#    a  b
# 2  0 -3
# 0  0  4
# 3  1  2
# 1  1  7
# 排名: 根据值的大小或出现次序进行排名, 得到一组排名值 (rank 函数)
# Ranking a Series with the default (average), 'first' and 'max' tie methods.
obj = Series([4, 2, 0, 4], index=['a', 'b', 'c', 'd'])
print('rank排名:默认按照值的升序, 排名从1 开始')
# Positions when ordered by ascending value: c:0 (1), b:2 (2), a:4 (3), d:4 (4).
print(obj.rank())
# Recorded output:
# a    3.5
# b    2.0
# c    1.0
# d    3.5
# dtype: float64
#
# Note: rank() reports the position each value takes in the sorted Series.
# Equal values share the average of their positions by default. The two 4s
# occupy positions 3 and 4, so both receive (3 + 4) / 2 = 3.5.

# method='first': ties are ranked in order of appearance, no averaging.
print(obj.rank(method='first'))
# Recorded output:
# a    3.0
# b    2.0
# c    1.0
# d    4.0
# dtype: float64

# Descending order; tied values all receive the largest rank of their group.
print(obj.rank(ascending=False, method='max'))
# Recorded output:
# a    2.0
# b    3.0
# c    4.0
# d    2.0
# dtype: float64
#
# Explanation: in descending order the positions are a:4 (1), d:4 (2), b:2 (3),
# c:0 (4). a and d are tied and take the largest of their positions (2);
# b and c get 3 and 4.
# Ranking a DataFrame down each column and across each row.
# The column order is given explicitly so the printed frames match the recorded
# output regardless of how pandas orders dict keys.
frame = DataFrame(
    {
        'b': [4.3, 7, -3, 2],
        'a': [0, 1, 0, 1],
        'c': [-2, 5, 8, -2.5],
    },
    columns=['a', 'b', 'c'],
)
print(frame)
# Recorded output:
#    a    b    c
# 0  0  4.3 -2.0
# 1  1  7.0  5.0
# 2  0 -3.0  8.0
# 3  1  2.0 -2.5

# Default: values are ranked within each column.
print(frame.rank())
# Recorded output:
#      a    b    c
# 0  1.5  3.0  2.0
# 1  3.5  4.0  3.0
# 2  1.5  1.0  4.0
# 3  3.5  2.0  1.0

# axis=1: values are ranked within each row (ascending by default).
print(frame.rank(axis=1))
# Recorded output:
#      a    b    c
# 0  2.0  3.0  1.0
# 1  1.0  3.0  2.0
# 2  2.0  1.0  3.0
# 3  2.0  3.0  1.0
#
# Note: in every case the values are ordered first (ascending by default) and
# the ranks are then assigned from that ordering.
# 索引重复的情况
# Working with duplicated index labels: select by label, then by position.
print(" 重复索引: 进行两次索引")
obj = Series([0, 1, 2, 3, 4], index=['a', 'a', 'b', 'b', 'c'])
print(obj.index.is_unique)
# Recorded output:
# 重复索引: 进行两次索引
# False

df = DataFrame(numpy.arange(12).reshape((4, 3)), index=['a', 'a', 'b', 'b'])
print(df)
# Recorded output:
#    0   1   2
# a  0   1   2
# a  3   4   5
# b  6   7   8
# b  9  10  11

# The removed `.ix` indexer is replaced by its explicit equivalents:
# `.loc['b']` selects every row labelled 'b', `.iloc[k]` picks the k-th of them.
print(df.loc['b'].iloc[0])
# Recorded output:
# 0    6
# 1    7
# 2    8
print(df.loc['b'].iloc[1])
# Recorded output (the integer dtype shown was int32 in the original notes):
# 0     9
# 1    10
# 2    11
# Name: b, dtype: int32
# 汇总和计算描述统计
# 常用方法选项: (原表格缺失)
# 常用汇总统计函数 I: (原表格缺失)
# 常用汇总统计函数 II: (原表格缺失)
# Summary statistics on a DataFrame that contains missing values.
print("求和")
df = DataFrame(
    [[1, numpy.nan], [7, 4], [numpy.nan, numpy.nan], [0, 1]],
    index=list('abcd'),
    columns=['one', 'two'],
)
print(df)
# Recorded output:
#    one  two
# a  1.0  NaN
# b  7.0  4.0
# c  NaN  NaN
# d  0.0  1.0

# Column totals; missing values are skipped because skipna defaults to True.
column_totals = df.sum()
print(column_totals)
# Recorded output:
# one    8.0
# two    5.0
# dtype: float64

# With skipna=False any missing value makes the whole column total NaN.
column_totals_strict = df.sum(skipna=False)
print(column_totals_strict)
# Recorded output:
# one   NaN
# two   NaN
# dtype: float64

# Row totals (missing values skipped).
row_totals = df.sum(axis='columns')
print(row_totals)
# Recorded output:
# a     1.0
# b    11.0
# c     0.0
# d     1.0
# dtype: float64

# Row totals without skipping missing values.
row_totals_strict = df.sum(axis='columns', skipna=False)
print(row_totals_strict)
# Recorded output:
# a     NaN
# b    11.0
# c     NaN
# d     1.0
# dtype: float64

print("求平均数")
# Column means.
column_means = df.mean()
print(column_means)
# Recorded output:
# one    2.666667
# two    2.500000
# dtype: float64

# Row means; missing values are skipped (skipna defaults to True).
row_means = df.mean(axis='columns')
print(row_means)
# Recorded output:
# a    1.0
# b    5.5
# c    NaN
# d    0.5
# dtype: float64

# Row means without skipping missing values.
row_means_strict = df.mean(axis='columns', skipna=False)
print(row_means_strict)
# Recorded output:
# a    NaN
# b    5.5
# c    NaN
# d    0.5
# dtype: float64

print(" 计算每一列最大值的索引")
# Row label at which each column reaches its maximum.
print(df.idxmax())
# Recorded output:
# one    b
# two    b
# dtype: object

print("计算每一列的累加和")
# Running total down each column.
print(df.cumsum())
# Recorded output:
#    one  two
# a  1.0  NaN
# b  8.0  4.0
# c  NaN  NaN
# d  8.0  5.0

print("对DataFrame 每列计算汇总统计")
# Per-column descriptive statistics.
print(df.describe())
# Recorded output:
#             one      two
# count  3.000000  2.00000
# mean   2.666667  2.50000
# std    3.785939  2.12132
# min    0.000000  1.00000
# 25%    0.500000  1.75000
# 50%    1.000000  2.50000
# 75%    4.000000  3.25000
# max    7.000000  4.00000
# Descriptive statistics for a Series.
print("series 汇总")
obj = Series([2, 4, 8, 4], index=list('aabc'))
print(obj)
# Recorded output:
# a    2
# a    4
# b    8
# c    4
# dtype: int64

summary = obj.describe()
print(summary)
# Recorded output:
# count    4.000000
# mean     4.500000
# std      2.516611
# min      2.000000
# 25%      3.500000
# 50%      4.000000
# 75%      5.000000
# max      8.000000
# dtype: float64
# 去重和成员出现计数
# 主要方法: unique, value_counts, isin
# Distinct values, value frequencies and membership tests on a Series.
print("去重")
obj = Series(list('cadbbc'))
distinct = obj.unique()
print(distinct)
# Recorded output:
# ['c' 'a' 'd' 'b']

frequencies = obj.value_counts()
print(frequencies)
# Recorded output (from the original notes; the order of tied counts may
# differ between pandas versions):
# b    2
# c    2
# d    1
# a    1
# dtype: int64

print("判断元素存在")
# Boolean mask: True where the element is 'b' or 'c'.
mask = obj.isin(['b', 'c'])
print(mask)
# Recorded output:
# 0     True
# 1    False
# 2    False
# 3     True
# 4     True
# 5     True
# dtype: bool

print("只打印包含的元素b 和c")
# Keep only the elements selected by the mask.
selected = obj[mask]
print(selected)
# Recorded output:
# 0    c
# 3    b
# 4    b
# 5    c
# dtype: object
import pandas as pd
# Counting how often each value occurs per column and per row of a DataFrame.
data = DataFrame({
    'Qu1': [1, 3, 4, 3, 4],
    'Qu2': [2, 3, 1, 2, 3],
    'Qu3': [1, 15, 2, 4, 4],
})
print(data)
# Recorded output:
#    Qu1  Qu2  Qu3
# 0    1    2    1
# 1    3    3   15
# 2    4    1    2
# 3    3    2    4
# 4    4    3    4

print("计算每列数字出现的次数, 缺失值为0")
# Series.value_counts replaces the deprecated top-level pd.value_counts.
# Values that never occur in a column come back as NaN and are filled with 0.
column_counts = data.apply(pd.Series.value_counts).fillna(0)
print(column_counts)
# Recorded output:
#     Qu1  Qu2  Qu3
# 1   1.0  1.0  1.0
# 2   0.0  2.0  1.0
# 3   2.0  2.0  0.0
# 4   2.0  0.0  2.0
# 15  0.0  0.0  1.0

print("计算每行中各个数字出现的次数, 缺失值为0")
# Same count applied row by row; the distinct values become the columns.
row_counts = data.apply(pd.Series.value_counts, axis=1).fillna(0)
print(row_counts)
# Recorded output:
#     1    2    3    4    15
# 0  2.0  1.0  0.0  0.0  0.0
# 1  0.0  0.0  2.0  0.0  1.0
# 2  1.0  1.0  0.0  1.0  0.0
# 3  0.0  1.0  1.0  1.0  0.0
# 4  0.0  0.0  1.0  2.0  0.0
#
# Note: in the per-column result the index holds the distinct values and each
# cell holds how often that value occurs in the column; the value 15 (present
# only in Qu3) makes the pattern easy to see.
# 处理缺失数据
# • NaN (Not a Number) 表示浮点数和非浮点数组中的缺失数据, None 也被当作 NA 处理。
# 处理缺失数据的函数:
# • dropna 函数: DataFrame 默认丢弃任何含有缺失值的行。how 参数控制丢弃条件 ('any' / 'all'), axis 参数选择轴, thresh 参数指定保留一行 (列) 所需的最少非 NaN 值个数。
# • fillna 函数: inplace 参数决定返回新对象还是就地修改。
# Detecting missing values in a Series and dropping them.
print("作为Null 处理的值")
string_data = Series(['a', 'b', numpy.nan, 'd'])
print(string_data)
# Recorded output:
# 作为Null 处理的值
# 0      a
# 1      b
# 2    NaN
# 3      d

# Element-wise test for missing values.
is_missing = string_data.isnull()
print(is_missing)
# Recorded output:
# 0    False
# 1    False
# 2     True
# 3    False
# dtype: bool

# None is treated as a missing value as well.
string_data[0] = None
print(string_data)
# Recorded output:
# 0    None
# 1       b
# 2     NaN
# 3       d
# dtype: object
is_missing = string_data.isnull()
print(is_missing)
# Recorded output:
# 0     True
# 1    False
# 2     True
# 3    False
# dtype: bool

# Short alias for NaN used by the examples that follow.
from numpy import nan as NA
print("丢弃缺失数据NaN")
data = Series([1, NA, 3.5, NA, 7])
without_missing = data.dropna()
print(without_missing)
# Recorded output:
# 丢弃缺失数据NaN
# 0    1.0
# 2    3.5
# 4    7.0
# dtype: float64
# Dropping rows and columns with missing values from a DataFrame.
print(" 对丢弃的NaN 的处理")
rows = [
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
]
data = DataFrame(rows)
print(data)
# Recorded output:
#      0    1    2
# 0  1.0  6.5  3.0
# 1  1.0  NaN  NaN
# 2  NaN  NaN  NaN
# 3  NaN  6.5  3.0

print("默认只要某行有NA 就全部删除")
# Default how='any': a row is dropped as soon as it contains one NA.
complete_rows = data.dropna()
print(complete_rows)
# Recorded output:
#      0    1    2
# 0  1.0  6.5  3.0

print("某行 全部为NA才全部删除")
# how='all': a row is dropped only when every value in it is NA.
non_empty_rows = data.dropna(how='all')
print(non_empty_rows)
# Recorded output:
#      0    1    2
# 0  1.0  6.5  3.0
# 1  1.0  NaN  NaN
# 3  NaN  6.5  3.0

# Blank out column 0 entirely to demonstrate column-wise dropping.
data[0] = NA
print(data)
# Recorded output:
#     0    1    2
# 0 NaN  6.5  3.0
# 1 NaN  NaN  NaN
# 2 NaN  NaN  NaN
# 3 NaN  6.5  3.0

# axis=1 with how='all': drop the columns whose values are all NA.
non_empty_columns = data.dropna(axis='columns', how='all')
print(non_empty_columns)
# Recorded output:
#      1    2
# 0  6.5  3.0
# 1  NaN  NaN
# 2  NaN  NaN
# 3  6.5  3.0
# Dropping rows that have too few non-missing values (dropna with thresh).
data = DataFrame(numpy.arange(21).reshape((7, 3)))
print(data)
# Recorded output:
#     0   1   2
# 0   0   1   2
# 1   3   4   5
# 2   6   7   8
# 3   9  10  11
# 4  12  13  14
# 5  15  16  17
# 6  18  19  20

# Columns 1 and 2 are about to receive NaN, which integer columns cannot hold;
# cast them to float explicitly rather than rely on implicit upcasting.
data = data.astype({1: 'float64', 2: 'float64'})
# `.loc` replaces the removed `.ix`. Label slices include the end point, so
# rows 0-4 of column 1 and rows 0-2 of column 2 are set to NA.
data.loc[:4, 1] = NA
data.loc[:2, 2] = NA
print(" 改变后的data ")
print(data)
# Recorded output:
#     0     1     2
# 0   0   NaN   NaN
# 1   3   NaN   NaN
# 2   6   NaN   NaN
# 3   9   NaN  11.0
# 4  12   NaN  14.0
# 5  15  16.0  17.0
# 6  18  19.0  20.0

# thresh=2 keeps the rows that have at least two non-NA values; the message
# below now says so (it previously described the opposite condition).
print("保留至少有2个非NA 元素的行")
print(data.dropna(thresh=2))
# Recorded output:
#     0     1     2
# 3   9   NaN  11.0
# 4  12   NaN  14.0
# 5  15  16.0  17.0
# 6  18  19.0  20.0
# Filling missing values with 0, first as a copy and then in place.
print("填充0")
df = DataFrame(numpy.arange(9).reshape((3, 3)))
print(df)
# Recorded output:
#    0  1  2
# 0  0  1  2
# 1  3  4  5
# 2  6  7  8

# Columns 1 and 2 are about to receive NaN, which integer columns cannot hold;
# cast them to float explicitly rather than rely on implicit upcasting.
df = df.astype({1: 'float64', 2: 'float64'})
# `.loc` replaces the removed `.ix`; label slices include the end point.
df.loc[:1, 1] = NA
df.loc[:2, 2] = NA
print(df)
# Recorded output:
#    0    1   2
# 0  0  NaN NaN
# 1  3  NaN NaN
# 2  6  7.0 NaN

# inplace defaults to False, so fillna returns a new frame.
print(df.fillna(0))
# Recorded output:
#    0    1    2
# 0  0  0.0  0.0
# 1  3  0.0  0.0
# 2  6  7.0  0.0

# The original frame is unchanged by the call above.
print(df)
# Recorded output:
#    0    1   2
# 0  0  NaN NaN
# 1  3  NaN NaN
# 2  6  7.0 NaN

print("就地修改")
# inplace=True modifies df itself.
df.fillna(0, inplace=True)
print(df)
# Recorded output:
#    0    1    2
# 0  0  0.0  0.0
# 1  3  0.0  0.0
# 2  6  7.0  0.0
# Filling missing values per column and by back-filling.
# Columns 1 and 2 are cast to float up front because they will receive NaN.
df = DataFrame(numpy.arange(9).reshape((3, 3))).astype({1: 'float64', 2: 'float64'})
# `.loc` replaces the removed `.ix`; label slices include the end point.
df.loc[:1, 1] = NA
df.loc[:2, 2] = NA
print(df)
# Recorded output:
#    0    1   2
# 0  0  NaN NaN
# 1  3  NaN NaN
# 2  6  7.0 NaN

print("不同行列填充不同的值")
# A dict maps column labels to fill values: 0.5 for column 1, -1 for column 2.
print(df.fillna({1: 0.5, 2: -1}))
# Recorded output:
#    0    1    2
# 0  0  0.5 -1.0
# 1  3  0.5 -1.0
# 2  6  7.0 -1.0

# Back-fill: each gap takes the next valid value below it. DataFrame.bfill
# replaces the deprecated fillna(method='bfill'). Column 2 has no valid value,
# so it stays NaN.
print(df.bfill())
# Recorded output:
#    0    1   2
# 0  0  7.0 NaN
# 1  3  7.0 NaN
# 2  6  7.0 NaN

# limit=1: a valid value is propagated across at most one consecutive gap.
print(df.bfill(limit=1))
# Recorded output:
#    0    1   2
# 0  0  NaN NaN
# 1  3  7.0 NaN
# 2  6  7.0 NaN
# Filling missing values with a statistic computed from the data itself.
print("用统计数据填充")
data = Series([1, NA, 2, NA, 3])
# The mean skips missing values, so it is computed from 1, 2 and 3.
average = data.mean()
print(data.fillna(average))
# Recorded output:
# 0    1.0
# 1    2.0
# 2    2.0
# 3    2.0
# 4    3.0
# dtype: float64
# 多层次化索引
# 使用 MultiIndex 对 Series 和 DataFrame 进行多层次索引, 并通过 stack 与 unstack 在 Series 和 DataFrame 之间转换。
import numpy
from pandas import Series, DataFrame, MultiIndex
# A Series with a two-level index: selection, slicing, unstack and stack.
print("series 的多层索引")
outer_labels = ['a', 'a', 'b', 'b', 'c', 'c', 'd', 'd']
inner_labels = [1, 2, 1, 2, 1, 2, 1, 2]
data = Series(numpy.arange(8), index=[outer_labels, inner_labels])
print(data)
# Recorded output:
# series 的多层索引
# a  1    0
#    2    1
# b  1    2
#    2    3
# c  1    4
#    2    5
# d  1    6
#    2    7
# dtype: int32

print(data.index)
# Recorded output (repr from the pandas version used in the original notes):
# MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2]],
#            labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]])

# Selecting an outer label returns the inner-level Series beneath it.
print(data['b'])
# Recorded output:
# 1    2
# 2    3
# dtype: int32
print(data['d'])
# Recorded output:
# 1    6
# 2    7
# dtype: int32

print("闭区间")
# Label slices on the outer level include both end points.
print(data['b':'c'])
# Recorded output:
# 闭区间
# b  1    2
#    2    3
# c  1    4
#    2    5
# dtype: int32

print("数组索引不区分标签")
# A plain integer slice is positional and ignores the index levels.
print(data[:2])
# Recorded output:
# 数组索引不区分标签
# a  1    0
#    2    1
# dtype: int32

print("将serise转化为DataFrame")
# unstack moves the inner index level into the columns.
wide = data.unstack()
print(wide)
# Recorded output:
# 将serise转化为DataFrame
#    1  2
# a  0  1
# b  2  3
# c  4  5
# d  6  7

print("将dataframe 转化为series")
# stack reverses unstack: the columns become the inner index level again.
print(wide.stack())
# Recorded output:
# 将dataframe 转化为series
# a  1    0
#    2    1
# b  1    2
#    2    3
# c  1    4
#    2    5
# d  1    6
#    2    7
# dtype: int32
# A DataFrame with two-level row and column indexes.
print("dataframe 多层次索引")
frame = DataFrame(
    numpy.arange(12).reshape((4, 3)),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[['A', 'A', 'B'], ['A1', 'A2', 'B1']],
)
print(frame)  # two row-index levels and two column-index levels
# Recorded output:
#      A      B
#     A1  A2 B1
# a 1  0   1  2
#   2  3   4  5
# b 1  6   7  8
#   2  9  10 11

print(frame.index)
# The recorded output below lists the column MultiIndex as well, so it is
# printed too.
print(frame.columns)
# Recorded output (repr from the pandas version used in the original notes):
# MultiIndex(levels=[['a', 'b'], [1, 2]],
#            labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
# MultiIndex(levels=[['A', 'B'], ['A1', 'A2', 'B1']],
#            labels=[[0, 0, 1], [0, 1, 2]])

# Name the levels of both axes.
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'more']
print(frame)
# Recorded output:
# state      A      B
# more      A1  A2 B1
# key1 key2
# a    1     0   1  2
#      2     3   4  5
# b    1     6   7  8
#      2     9  10 11

# `.loc` replaces the removed `.ix`. The row key is passed as an explicit tuple
# so it cannot be mistaken for a (row, column) pair.
print(frame.loc[('a', 1), :]['B'])
# Recorded output:
# more
# B1    2
# Name: (a, 1), dtype: int32
print(frame.loc[('a', 1), :]['A']['A1'])
# Recorded output:
# 0

print("直接使用 MultiIndex 创建层次索引结构index")
print(MultiIndex.from_arrays(
    [['A', 'A', 'B'], ['Gree', 'Red', 'Green']],
    names=['state', 'color'],
))
# Recorded output (repr from the pandas version used in the original notes):
# 直接使用 MultiIndex 创建层次索引结构index
# MultiIndex(levels=[['A', 'B'], ['Gree', 'Green', 'Red']],
#            labels=[[0, 0, 1], [0, 2, 1]],
#            names=['state', 'color'])
# 将索引层进行交换: swaplevel 函数。对某个索引层进行排序: sort_index(level=...) (旧版本中为 sortlevel 函数)
# Swapping index levels, sorting by one level and aggregating by one level.
print("索引层交换")
frame = DataFrame(
    numpy.arange(12).reshape((4, 3)),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[['A', 'A', 'B'], ['A1', 'A2', 'B1']],
)
frame.index.names = ['key1', 'key2']
print(frame)
# Recorded output:
#            A      B
#           A1  A2 B1
# key1 key2
# a    1     0   1  2
#      2     3   4  5
# b    1     6   7  8
#      2     9  10 11

# swaplevel exchanges two index levels; the row order is left as it was.
frame_swapped = frame.swaplevel('key1', 'key2')
print("交换索引层")
print(frame_swapped)
# Recorded output:
#            A      B
#           A1  A2 B1
# key2 key1
# 1    a     0   1  2
# 2    a     3   4  5
# 1    b     6   7  8
# 2    b     9  10 11

# Swapping the same two levels again (here by position) restores the layout.
print(frame_swapped.swaplevel(0, 1))
# Recorded output:
#            A      B
#           A1  A2 B1
# key1 key2
# a    1     0   1  2
#      2     3   4  5
# b    1     6   7  8
#      2     9  10 11

print("对某个索引层进行排序")
# Sort the rows by the values of the 'key2' level.
print(frame.sort_index(level="key2"))
# Recorded output:
#            A      B
#           A1  A2 B1
# key1 key2
# a    1     0   1  2
# b    1     6   7  8
# a    2     3   4  5
# b    2     9  10 11

# Aggregating by one index level.
print("根据索引层进行统计")
# groupby(level=...).sum() replaces the removed DataFrame.sum(level=...).
level_totals = frame.groupby(level='key2').sum()
print(level_totals)
# Recorded output:
#        A      B
#       A1  A2 B1
# key2
# 1      6   8 10
# 2     12  14 16
# 将某些列转化为层次化的行索引 (列名成为索引名, 列的值成为索引值): set_index 函数; 将行索引重置并恢复为普通列: reset_index 函数
# Moving columns into a hierarchical row index and back again.
print("将列索引转换行层次索引")
frame = DataFrame({
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one'] * 3 + ['two'] * 4,
    'd': [0, 1, 2, 0, 1, 2, 3],
})
print(frame)
# Recorded output:
#    a  b    c  d
# 0  0  7  one  0
# 1  1  6  one  1
# 2  2  5  one  2
# 3  3  4  two  0
# 4  4  3  two  1
# 5  5  2  two  2
# 6  6  1  two  3

index_columns = ['c', 'd']
# Columns 'c' and 'd' become the two levels of the row index.
print(frame.set_index(index_columns))
# Recorded output:
#        a  b
# c   d
# one 0  0  7
#     1  1  6
#     2  2  5
# two 0  3  4
#     1  4  3
#     2  5  2
#     3  6  1

# drop=False keeps 'c' and 'd' as ordinary columns as well.
print(frame.set_index(index_columns, drop=False))
# Recorded output:
#        a  b    c  d
# c   d
# one 0  0  7  one  0
#     1  1  6  one  1
#     2  2  5  one  2
# two 0  3  4  two  0
#     1  4  3  two  1
#     2  5  2  two  2
#     3  6  1  two  3

print("恢复列")
# reset_index turns the index levels back into columns.
frame2 = frame.set_index(index_columns)
print(frame2.reset_index())
# Recorded output:
#      c  d  a  b
# 0  one  0  0  7
# 1  one  1  1  6
# 2  one  2  2  5
# 3  two  0  3  4
# 4  two  1  4  3
# 5  two  2  5  2
# 6  two  3  6  1