pandas索引
1、创建s序列
import pandas as pd
import numpy as np
# Build a five-element Series of draws from np.random.randn, labelled 'a'..'e'.
s = pd.Series(data=np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s  # bare expression: shows the Series when run interactively
# Sample output from one run (the numbers are random).
'''
a -0.206894
b 1.042893
c -0.495746
d -0.178118
e 0.531067
dtype: float64
'''
2、s序列的索引
# Look at the labels of s.
s.index
'''Index(['a', 'b', 'c', 'd', 'e'], dtype='object')'''
# Name the index; the name is printed as a header line above the labels.
s.index.names = ['alpha']
s
'''
alpha
a -0.206894
b 1.042893
c -0.495746
d -0.178118
e 0.531067
dtype: float64
'''
3、创建df的DataFrame
# 4x3 frame of draws from np.random.randn with named columns; no row index is
# supplied, so pandas assigns the default one.
df = pd.DataFrame(data=np.random.randn(4, 3), columns=['one', 'two', 'three'])
df  # bare expression: shows the frame when run interactively
4、df的行索引,列索引,以及命名
# Row labels of df.
df.index
'''
RangeIndex(start=0, stop=4, step=1)
'''
# Column labels of df.
df.columns
'''
Index(['one', 'two', 'three'], dtype='object')
'''
# Name both axes; the names show up in the header of the printed frame.
df.index.name = 'row'
df.columns.name = 'col'
print(df)
# Illustrative output for the 4-row one/two/three frame (the numbers are random
# and differ on every run). The output previously pasted here belonged to a
# different, 7-row frame with columns a/b/c/d.
'''
col       one       two     three
row
0    0.469112 -0.282863 -1.509059
1   -1.135632  1.212112 -0.173215
2    0.119209 -1.044236 -0.861849
3   -2.104569 -0.494929  1.071804
'''
5、创建多级索引
# Two parallel lists: outer-level labels and inner-level labels.
a = [
    ['a', 'a', 'a', 'b', 'b', 'c', 'c'],
    [1, 2, 3, 1, 2, 2, 3],
]
# Pair them up position by position into (outer, inner) tuples.
t = list(zip(a[0], a[1]))
t
'''
[('a', 1), ('a', 2), ('a', 3), ('b', 1), ('b', 2), ('c', 2), ('c', 3)]
'''
# Build a two-level index from the tuples and name the levels.
index = pd.MultiIndex.from_tuples(t, names=['level1', 'level2'])
index
'''
MultiIndex(levels=[['a', 'b', 'c'], [1, 2, 3]],
codes=[[0, 0, 0, 1, 1, 2, 2], [0, 1, 2, 0, 1, 1, 2]],
names=['level1', 'level2'])
'''
# Series of seven np.random.rand draws carrying the two-level index.
s = pd.Series(data=np.random.rand(7), index=index)
s
'''
level1 level2
a 1 0.561636
2 0.907344
3 0.782276
b 1 0.674353
2 0.203832
c 2 0.591604
3 0.615422
dtype: float64
'''
# Frame of random integers with a two-level index on both rows and columns.
df = pd.DataFrame(np.random.randint(1, 10, (4, 3)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['one', 'one', 'two'], ['blue', 'red', 'blue']])
df.index.names = ['row-1', 'row-2']
df.columns.names = ['col-1', 'col-2']
df
'''
col-1 one two
col-2 blue red blue
row-1 row-2
a 1 1 1 3
2 4 4 7
b 1 6 4 9
2 2 7 3
'''
# Selecting on the outer row level alone still gives a DataFrame.
type(df.loc['a'])
'''
pandas.core.frame.DataFrame
'''
# A full (outer, inner) row key picks one row, returned as a Series.
df.loc['a',1]
'''
col-1 col-2
one blue 1
red 1
two blue 3
Name: (a, 1), dtype: int32
'''
# Swap the two row-index levels.
df2 = df.swaplevel('row-1','row-2')
print(df2)
'''
col-1 one two
col-2 blue red blue
row-2 row-1
1 a 1 1 3
2 a 4 4 7
1 b 6 4 9
2 b 2 7 3
'''
# Sort along the column axis. The axis is passed by keyword because recent
# pandas releases reject a positional axis argument to sort_index.
df2.sort_index(axis=1)
'''
col-1 one two
col-2 blue red blue
row-2 row-1
1 a 1 1 3
2 a 4 4 7
1 b 6 4 9
2 b 2 7 3
'''
# Sum within each value of the outer row level. The `level=` argument of
# DataFrame.sum was removed in pandas 2.0; groupby(level=0).sum() gives the
# same result on old and new versions.
print(df.groupby(level=0).sum())
'''
col-1 one two
col-2 blue red blue
row-1
a 5 5 10
b 8 11 12
'''
# Small fixed frame: 'a' counts up, 'b' counts down, 'c' is a category label
# and 'd' restarts from 0 within each 'c' value.
df = pd.DataFrame({
    'a': range(0, 7),
    'b': range(7, 0, -1),
    'c': ['one'] * 3 + ['two'] * 4,
    'd': [*range(3), *range(4)],
})
df
'''
a b c d
0 0 7 one 0
1 1 6 one 1
2 2 5 one 2
3 3 4 two 0
4 4 3 two 1
5 5 2 two 2
6 6 1 two 3
'''
# Turn the row-index levels back into ordinary columns, then order the columns
# by label. The axis is given by keyword because recent pandas releases reject
# a positional axis argument to sort_index.
print(df2.reset_index().sort_index(axis='columns'))
'''
col-1 one row-1 row-2 two
col-2 blue red blue
0 1 1 a 1 3
1 4 4 a 2 7
2 6 4 b 1 9
3 2 7 b 2 3
'''
pandas分组运算
1、分组计算三部曲:
拆分、应用、合并
拆分:根据什么进行分组
应用:每个分组进行什么样的计算
合并:把每个分组的计算结果合并起来。
2、对Series,DataFrame的分组
import pandas as pd
import numpy as np
# Demo frame for grouping: two label columns and two columns of random
# integers from np.random.randint(1, 10, ...).
df = pd.DataFrame({
    'key1': list('aabba'),
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randint(1, 10, size=5),
    'data2': np.random.randint(1, 10, size=5),
})
print(df)
# Sample output from one run (data1/data2 are random).
'''
key1 key2 data1 data2
0 a one 9 3
1 a two 3 8
2 b one 2 5
3 b two 3 4
4 a one 9 7
'''
对Series的分组
# Mean of data1 within each key1 group.
df['data1'].groupby(by=df['key1']).mean()
'''
key1
a 7.0
b 2.5
'''
# A hand-made list also works as the grouping key; its labels are applied to
# the rows by position.
key = [1, 2, 1, 1, 2]
df['data1'].groupby(by=key).mean()
'''
1 4.666667
2 6.000000
'''
# The key may be a list of several keys, giving a two-level result (sum).
df['data1'].groupby(by=[df['key1'], df['key2']]).sum()
'''
key1 key2
a one 18
two 3
b one 2
two 3
'''
# Number of rows in each (key1, key2) group.
df['data1'].groupby(by=[df['key1'], df['key2']]).size()
'''
key1 key2
a one 2
two 1
b one 1
two 1
Name: data1, dtype: int64
'''
# Sum data1 within each (key1, key2) pair, then pivot the result into a DataFrame.
# NOTE: despite its name, `mean` holds group sums, not means.
mean = df.groupby(['key1', 'key2'])['data1'].sum()
mean
'''
key1 key2
a one 18
two 3
b one 2
two 3
'''
# Move the innermost index level (key2) into the columns.
print(mean.unstack(level=-1))
'''
key2 one two
key1
a 18 3
b 2 3
'''
# Iterating a groupby yields (group label, sub-frame) pairs.
# The loop body is indented; it had lost its indentation, which is a syntax error.
for name, group in df.groupby('key1'):
    print(name)
    print(group)
'''
a
key1 key2 data1 data2
0 a one 9 3
1 a two 3 8
4 a one 9 7
b
key1 key2 data1 data2
2 b one 2 5
3 b two 3 4
'''
# Collect the groups into a dict that maps each key1 label to its sub-frame.
{label: frame for label, frame in df.groupby('key1')}
'''
{'a': key1 key2 data1 data2
0 a one 9 3
1 a two 3 8
4 a one 9 7, 'b': key1 key2 data1 data2
2 b one 2 5
3 b two 3 4}
'''
# Data type of every column.
df.dtypes
'''
key1 object
key2 object
data1 int32
data2 int32
'''
# Group the columns by their dtype and add them up row by row.
# groupby(axis=1) is deprecated in recent pandas releases; the documented
# replacement is to transpose, group the rows, and transpose back.
# Only the numeric columns are kept, so the string key columns stay out of the
# sums, matching the sample output below.
numeric_cols = df.select_dtypes(include='number')
numeric_cols.T.groupby(numeric_cols.dtypes).sum().T
'''
int32
0 27
1 19
2 26
3 24
4 24
'''
3、通过函数进行分组
df = pd.DataFrame(np.random.randint(1,10,(5,5)),
columns = ['a','b','c','d','e'],
index = ['Alice','Bob','Candy','Dark','Emily'])
df.ix[1,1:3] = np