pandas索引
>import pandas as pd
>import numpy as np
>s.index.name='alpha'#给一维数组索引定义名字
#行索引
>df.index
#列索引
>df.columns
#索引值有重复项时
>s.index.is_unique #判断是否有重复索引
>s.groupby(s.index).sum() #将重复项求和
#多级索引
>pd.MultiIndex
#索引类
分组计算
拆分-应用-合并
拆分:根据什么进行分组
应用:每个分组进行什么样的计算
合并:把每个分组的计算结果合并起来
>df=pd.DataFrame({'key1':['a','a','b','b','a'],'key2':['one','two','one','two','one'],'data1':np.random.randint(1,10,5),'data2':np.random.randint(1,10,5)})
df
Out[10]:
key1 key2 data1 data2
0 a one 4 2
1 a two 9 1
2 b one 7 8
3 b two 9 8
4 a one 2 2
>df['data1'].groupby(df['key1']).mean() #也可以多层分组:传入分组键的列表,如 groupby([df['key1'],df['key2']])
Out[13]:
key1
a 5
b 8
Name: data1, dtype: int32
>df.groupby('key1').sum()['data1']
Out[15]:
key1
a 15
b 16
Name: data1, dtype: int32
>sum=df.groupby(['key1','key2']).sum()['data1']
sum.unstack()
Out[24]:
key2 one two
key1
a 6 9
b 7 9
#用for循环迭代,分成两个分组
>for name,group in df.groupby('key1'):
    print (name)
    print (group)
a
key1 key2 data1 data2
0 a one 4 2
1 a two 9 1
4 a one 2 2
b
key1 key2 data1 data2
2 b one 7 8
3 b two 9 8
通过字典分组
>df=pd.DataFrame(np.random.randint(1,10,(5,5)),columns=['a','b','c','d','e'],index=['Alice','Bob','Candy','Dark','Emily'])
df.iloc[1,1:3]=np.NaN
df
Out[37]:
a b c d e
Alice 3 2.0 9.0 1 6
Bob 3 NaN NaN 4 7
Candy 8 3.0 8.0 4 9
Dark 5 8.0 9.0 3 1
Emily 7 7.0 8.0 9 6
#注意分组聚合时对缺失值(NaN)的处理
>mapping={'a':'red','b':'red','c':'blue','d':'orange','e':'blue'}
grouped=df.groupby(mapping,axis=1)
grouped.sum()
Out[46]: #求和时NaN会被跳过,效果相当于把NaN当成0处理
blue orange red
Alice 15.0 1.0 5.0
Bob 7.0 4.0 3.0
Candy 17.0 4.0 11.0
Dark 10.0 3.0 13.0
Emily 14.0 9.0 14.0
通过函数分组
>def _group_key(idx):
    print(idx)
    return (idx)
df.groupby(_group_key)
Alice
Bob
Candy
Dark
Emily
聚合运算
#内置聚合函数
>df.groupby('key1').describe()
#自定义聚合函数
>grouped=df.groupby('key1')
>def peak_range(s):
> print(type(s))
> return s.max()-s.min()
>grouped.agg(peak_range)