import pandas as pd
import datetime
import numpy as np
# rank function
# method='average' gives tied values the mean of the positions they occupy;
# this sample has no ties, so the ranks are simply 1..4 in sort order.
S = pd.Series(data=[3, 5, 1, 9])
S.rank(method='average')
0 2.0
1 3.0
2 1.0
3 4.0
dtype: float64
# Same method on data with ties: the two 4s share rank 3.5 and the two 5s
# share rank 5.5 (the average of the positions each pair spans).
S = pd.Series(data=[4, 5, 1, 9, 2, 5, 4])
S.rank(method='average')
0 3.5
1 5.5
2 1.0
3 7.0
4 2.0
5 5.5
6 3.5
dtype: float64
# method='min': every member of a tie gets the lowest rank in the tied group.
S = pd.Series(data=[4, 5, 1, 9, 2, 5, 4])
S.rank(method='min')
0 3.0
1 5.0
2 1.0
3 7.0
4 2.0
5 5.0
6 3.0
dtype: float64
# method='max': every member of a tie gets the highest rank in the tied group.
S = pd.Series(data=[4, 5, 1, 9, 2, 5, 4])
S.rank(method='max')
0 4.0
1 6.0
2 1.0
3 7.0
4 2.0
5 6.0
6 4.0
dtype: float64
# method='first': ties are broken by order of appearance, so the earlier
# occurrence of a repeated value receives the lower rank.
S = pd.Series(data=[4, 5, 1, 9, 2, 5, 4])
S.rank(method='first')
0 3.0
1 5.0
2 1.0
3 7.0
4 2.0
5 6.0
6 4.0
dtype: float64
# method='dense': like 'min', but the rank only increases by 1 between
# distinct values, so no rank numbers are skipped after a tie.
S = pd.Series(data=[4, 5, 1, 9, 2, 5, 4])
S.rank(method='dense')
0 3.0
1 4.0
2 1.0
3 5.0
4 2.0
5 4.0
6 3.0
dtype: float64
# groupby: grouped aggregation
df = pd.DataFrame({
    'a': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C'],
    'b': [1, 3, 5, 7, 9, 2, 4, 6, 8],
    'c': [1, 2, 3, 4, 5, 6, 7, 8, 9],
})

# Passing a renaming dict ({'new_name': 'func'}) to SeriesGroupBy.agg is
# deprecated in pandas, so aggregate with a list of function names and rename
# the resulting columns instead. reset_index() turns the group key back into
# an ordinary column, which is what as_index=False produced.

# Mean of 'b' within each 'a' group.
mean_by_a = (
    df.groupby('a')['b']
    .agg(['mean'])
    .rename(columns={'mean': 'mean_value'})
    .reset_index()
)
mean_by_a

# Mean and max of 'b' within each 'a' group.
stats_by_a = (
    df.groupby('a')['b']
    .agg(['mean', 'max'])
    .rename(columns={'mean': 'mean_value', 'max': 'max_vale'})
    .reset_index()
)
stats_by_a
a mean_value max_vale 0 A 4 7 1 B 6 9 2 C 5 8
# Rank 'b' inside each 'a' group in descending order, so the largest 'b'
# of every group gets rank 1.0, and store the result as a new column.
b_by_group = df.groupby('a')['b']
df['rank'] = b_by_group.rank(ascending=False)
df
a b c rank 0 A 1 1 3.0 1 B 3 2 3.0 2 C 5 3 2.0 3 A 7 4 1.0 4 B 9 5 1.0 5 C 2 6 3.0 6 A 4 7 2.0 7 B 6 8 2.0 8 C 8 9 1.0
# 对分组进行迭代 (iterating over groups)
# Sample frame with two key columns and two columns of random integers:
# data_1 holds five draws from [0, 8), data_2 five draws from [5, 10).
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data_1': [np.random.randint(0, 8) for _ in range(5)],
    'data_2': [np.random.randint(5, 10) for _ in range(5)],
})
df
data_1 data_2 key1 key2 0 2 7 a one 1 3 5 a two 2 5 9 b one 3 1 6 b two 4 4 6 a one
# Max and mean of data_1 for every (key1, key2) pair.
# The renaming-dict form of SeriesGroupBy.agg ({'new_name': 'func'}) is
# deprecated in pandas, so aggregate with a list of function names and rename
# the columns afterwards. reset_index() restores key1/key2 as ordinary
# columns, matching what as_index=False produced.
tmp = (
    df.groupby(['key1', 'key2'])['data_1']
    .agg(['max', 'mean'])
    .rename(columns={'max': 'max_value', 'mean': 'mean_value'})
    .reset_index()
)
tmp
key1 key2 max_value mean_value 0 a one 4 3 1 a two 3 3 2 b one 5 5 3 b two 1 1
# Iterating over a GroupBy yields (group key, sub-frame) pairs.
# The loop variable is deliberately not called `df`, so the frame bound to
# `df` at module level is not overwritten; print() with a single argument
# behaves the same on Python 2 and Python 3.
for key, group in tmp.groupby('key1', as_index=False):
    print(key)
    print(group)
a
key1 key2 max_value mean_value
0 a one 4 3
1 a two 3 3
b
key1 key2 max_value mean_value
2 b one 5 5
3 b two 1 1
# DataFrame.iterrows() function
# Each iteration yields (index label, row); type(row) is printed to show
# what kind of object the row is. print() with a single argument behaves the
# same on Python 2 and Python 3.
for index, row in tmp.iterrows():
    print(index)
    print(row)
    print(type(row))
0
key1 a
key2 one
max_value 4
mean_value 3
Name: 0, dtype: object
<class 'pandas.core.series.Series'>
1
key1 a
key2 two
max_value 3
mean_value 3
Name: 1, dtype: object
<class 'pandas.core.series.Series'>
2
key1 b
key2 one
max_value 5
mean_value 5
Name: 2, dtype: object
<class 'pandas.core.series.Series'>
3
key1 b
key2 two
max_value 1
mean_value 1
Name: 3, dtype: object
<class 'pandas.core.series.Series'>
# Long-format sample data: one row per (item_id, day) with that day's
# 'buy' and 'cnt' values; three items, three days each.
tmp = pd.DataFrame({
    'item_id': ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C'],
    'day': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    'buy': [5, 4, 8, 6, 4, 2, 12, 18, 10],
    'cnt': [10, 20, 32, 30, 16, 4, 16, 30, 20],
})
tmp
buy cnt day item_id 0 5 10 1 A 1 4 20 2 A 2 8 32 3 A 3 6 30 1 B 4 4 16 2 B 5 2 4 3 B 6 12 16 1 C 7 18 30 2 C 8 10 20 3 C
# Reshape the per-day rows of each item into one wide record per item:
# {item_id: key, 'item_idbuy<day>': buy, 'item_idcnt<day>': cnt, ...}.
item = 'item_id'
features = []
# The group frame is not named `df`, so the module-level `df` is not clobbered.
for key, group in tmp.groupby(item, as_index=False):
    feature = {item: key}
    for _, row in group.iterrows():
        # Day number as a string suffix, shared by the buy and cnt keys.
        day = str(int(row['day']))
        feature[item + 'buy' + day] = row['buy']
        feature[item + 'cnt' + day] = row['cnt']
    # One record per item, appended after all of its days were collected.
    features.append(feature)
features
[{'item_id': 'A',
'item_idbuy1': 5L,
'item_idbuy2': 4L,
'item_idbuy3': 8L,
'item_idcnt1': 10L,
'item_idcnt2': 20L,
'item_idcnt3': 32L},
{'item_id': 'B',
'item_idbuy1': 6L,
'item_idbuy2': 4L,
'item_idbuy3': 2L,
'item_idcnt1': 30L,
'item_idcnt2': 16L,
'item_idcnt3': 4L},
{'item_id': 'C',
'item_idbuy1': 12L,
'item_idbuy2': 18L,
'item_idbuy3': 10L,
'item_idcnt1': 16L,
'item_idcnt2': 30L,
'item_idcnt3': 20L}]
# Build a DataFrame from the list of per-item dicts: each dict becomes one
# row and each dict key becomes a column.
pd.DataFrame(features)
item_id item_idbuy1 item_idbuy2 item_idbuy3 item_idcnt1 item_idcnt2 item_idcnt3 0 A 5 4 8 10 20 32 1 B 6 4 2 30 16 4 2 C 12 18 10 16 30 20