import pandas as pd
import numpy as np
data = pd. read_excel( r'F:\数据分析—pandas\meal_order_detail.xlsx' ) . head( 10 )
print ( data. columns)
data1 = data[ 'place_order_time' ] . astype( 'str' )
print ( data1)
Index(['detail_id', 'order_id', 'dishes_id', 'logicprn_name',
'parent_class_name', 'dishes_name', 'itemis_add', 'counts', 'amounts',
'cost', 'place_order_time', 'discount_amt', 'discount_reason',
'kick_back', 'add_inprice', 'add_info', 'bar_code', 'picture_file',
'emp_id'],
dtype='object')
0 2016-08-01 11:05:36
1 2016-08-01 11:07:07
2 2016-08-01 11:07:40
3 2016-08-01 11:11:11
4 2016-08-01 11:11:30
5 2016-08-01 11:15:57
6 2016-08-01 11:19:12
7 2016-08-01 11:22:21
8 2016-08-01 11:22:53
9 2016-08-01 11:23:56
Name: place_order_time, dtype: object
1 将字符串类型转换为标准时间格式
data2 = pd. to_datetime( data[ 'place_order_time' ] )
print ( data2)
print ( data2. dtype)
0 2016-08-01 11:05:36
1 2016-08-01 11:07:07
2 2016-08-01 11:07:40
3 2016-08-01 11:11:11
4 2016-08-01 11:11:30
5 2016-08-01 11:15:57
6 2016-08-01 11:19:12
7 2016-08-01 11:22:21
8 2016-08-01 11:22:53
9 2016-08-01 11:23:56
Name: place_order_time, dtype: datetime64[ns]
datetime64[ns]
print ( data1. dt. week. head( ) )
0 31
1 31
2 31
3 31
4 31
Name: place_order_time, dtype: int64
time1 = data[ 'place_order_time' ] . head( )
time2 = data[ 'place_order_time' ] . tail( ) . reset_index( )
time3 = time2[ 'place_order_time' ]
print ( time1)
print ( time2)
print ( time3- time1)
0 2016-08-01 11:05:36
1 2016-08-01 11:07:07
2 2016-08-01 11:07:40
3 2016-08-01 11:11:11
4 2016-08-01 11:11:30
Name: place_order_time, dtype: datetime64[ns]
index place_order_time
0 5 2016-08-01 11:15:57
1 6 2016-08-01 11:19:12
2 7 2016-08-01 11:22:21
3 8 2016-08-01 11:22:53
4 9 2016-08-01 11:23:56
0 00:10:21
1 00:12:05
2 00:14:41
3 00:11:42
4 00:12:26
Name: place_order_time, dtype: timedelta64[ns]
print ( ( data[ 'place_order_time' ] + pd. Timedelta( days= 1 ) ) . head( ) )
print ( ( data[ 'place_order_time' ] - pd. Timedelta( hours= 3 ) ) . head( ) )
0 2016-08-02 11:05:36
1 2016-08-02 11:07:07
2 2016-08-02 11:07:40
3 2016-08-02 11:11:11
4 2016-08-02 11:11:30
Name: place_order_time, dtype: datetime64[ns]
0 2016-08-01 08:05:36
1 2016-08-01 08:07:07
2 2016-08-01 08:07:40
3 2016-08-01 08:11:11
4 2016-08-01 08:11:30
Name: place_order_time, dtype: datetime64[ns]
分组聚合
data. groupby( by= 'order_id' ) [ 'amounts' , 'counts' ] . sum ( ) . head( )
data. groupby( by= 'order_id' ) [ 'amounts' , 'counts' ] . sum ( ) . sort_values( by= 'counts' , ascending= False )
data. groupby( by= 'order_id' ) [ [ 'amounts' ] ] . sum ( ) . sort_values( by= 'amounts' , ascending= False )
amounts order_id 301 311 417 165
data[ [ 'amounts' , 'counts' ] ] . agg( [ np. sum , np. mean, np. std, np. var] )
amounts counts sum 476.000000 10.0 mean 47.600000 1.0 std 24.905153 0.0 var 620.266667 0.0
data[ [ 'amounts' , 'counts' ] ] . agg( { 'amounts' : np. mean, 'counts' : np. sum } )
amounts 47.6
counts 10.0
dtype: float64
data[ [ 'amounts' , 'counts' ] ] . agg( { 'amounts' : np. mean, 'counts' : [ np. mean, np. std] } )
amounts counts mean 47.6 1.0 std NaN 0.0
data. groupby( by= 'order_id' ) [ [ 'counts' , 'amounts' ] ] . agg( [ np. mean, np. sum ] ) [ 'amounts' ] [ 'mean' ]
order_id
301 62.2
417 33.0
Name: mean, dtype: float64
data. loc[ : , ( 'amounts' , 'counts' ) ] . agg( { 'amounts' : np. mean, 'counts' : np. sum } )
amounts 47.6
counts 10.0
dtype: float64
series中字符串处理的方法
aa= data[ 'dishes_name' ] . str . strip( )
bb= data[ 'dishes_name' ] . str . contains( '番茄' )
print ( aa)
print ( bb)
0 蒜蓉生蚝
1 蒙古烤羊腿
2 大蒜苋菜
3 芝麻烤紫菜
4 蒜香包
5 白斩鸡
6 香烤牛排
7 干锅田鸡
8 桂圆枸杞鸽子汤
9 番茄有机花菜
Name: dishes_name, dtype: object
0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False
9 True
Name: dishes_name, dtype: bool
六、pandas中的去重
data. drop_duplicates( subset= [ 'dishes_name' , 'dishes_id' ] ) . head( )
data[ 'dishes_name' ] . drop_duplicates( ) . head( )
0 蒜蓉生蚝
1 蒙古烤羊腿
2 大蒜苋菜
3 芝麻烤紫菜
4 蒜香包
Name: dishes_name, dtype: object
def transform ( value) :
values = value* 2
return values
data[ 'counts' ] . transform( transform)
0 2
1 2
2 2
3 2
4 2
5 2
6 2
7 2
8 2
9 2
Name: counts, dtype: int64
三、透视表(重点)
pd. pivot_table( data, index= [ 'order_id' , 'dishes_name' ] , aggfunc= [ np. mean, np. sum ] , values = [ 'counts' , 'add_inprice' ] ) . head( )
mean sum add_inprice counts add_inprice counts order_id dishes_name 301 干锅田鸡 0 1 0 1 桂圆枸杞鸽子汤 0 1 0 1 番茄有机花菜 0 1 0 1 白斩鸡 0 1 0 1 香烤牛排 0 1 0 1
pd. pivot_table( data,
columns= [ 'order_id' ] ,
index= [ 'dishes_name' ] ,
values= [ 'counts' ] ,
aggfunc= [ np. sum ] ,
fill_value = 0 ,
margins = False ,
margins_name = 'ALL'
)
sum counts order_id 301 417 dishes_name 大蒜苋菜 0 1 干锅田鸡 1 0 桂圆枸杞鸽子汤 1 0 番茄有机花菜 1 0 白斩鸡 1 0 芝麻烤紫菜 0 1 蒙古烤羊腿 0 1 蒜蓉生蚝 0 1 蒜香包 0 1 香烤牛排 1 0
四、 交叉表(了解)
pd. crosstab( index = data[ 'order_id' ] , columns = data[ 'dishes_name' ] , values= data[ 'counts' ] , aggfunc= np. sum )
dishes_name 大蒜苋菜 干锅田鸡 桂圆枸杞鸽子汤 番茄有机花菜 白斩鸡 芝麻烤紫菜 蒙古烤羊腿 蒜蓉生蚝 蒜香包 香烤牛排 order_id 301 NaN 1.0 1.0 1.0 1.0 NaN NaN NaN NaN 1.0 417 1.0 NaN NaN NaN NaN 1.0 1.0 1.0 1.0 NaN
五、表格合并方法
df = pd. concat( df1, df2, df3. . . . dfn, axis = 0 )
df = pd. concat( df1, df2, df3. . . . dfn, axis = 1 )
File "<ipython-input-57-aea27ea24be5>", line 3
df = pd.concat(df1,df2,df3....dfn,axis = 0)#将n个表纵向合并,列一致,一般用纵向拼接
^
SyntaxError: invalid syntax
left = pd. DataFrame( { 'key1' : [ 'K0' , 'K0' , 'K1' , 'K3' ] ,
'key2' : [ 'K0' , 'K1' , 'K0' , 'K1' ] ,
'A' : [ 'A0' , 'A1' , 'A2' , 'A3' ] ,
'B' : [ 'B0' , 'B1' , 'B2' , 'B3' ] } )
right = pd. DataFrame( { 'key1' : [ 'K0' , 'K1' , 'K1' , 'K2' ] ,
'key2' : [ 'K0' , 'K0' , 'K0' , 'K0' ] ,
'C' : [ 'C0' , 'C1' , 'C2' , 'C3' ] ,
'D' : [ 'D0' , 'D1' , 'D2' , 'D3' ] } )
print ( left)
print ( right)
all_1 = pd. concat( ( left, right) , join = 'inner' , axis= 0 , verify_integrity= False )
all_2 = pd. concat( ( left, right) , join = 'outer' , axis= 1 )
print ( all_1)
print ( all_2)
key1 key2 A B
0 K0 K0 A0 B0
1 K0 K1 A1 B1
2 K1 K0 A2 B2
3 K3 K1 A3 B3
key1 key2 C D
0 K0 K0 C0 D0
1 K1 K0 C1 D1
2 K1 K0 C2 D2
3 K2 K0 C3 D3
key1 key2
0 K0 K0
1 K0 K1
2 K1 K0
3 K3 K1
0 K0 K0
1 K1 K0
2 K1 K0
3 K2 K0
key1 key2 A B key1 key2 C D
0 K0 K0 A0 B0 K0 K0 C0 D0
1 K0 K1 A1 B1 K1 K0 C1 D1
2 K1 K0 A2 B2 K1 K0 C2 D2
3 K3 K1 A3 B3 K2 K0 C3 D3
print ( left)
print ( right)
pd. merge( left, right, on = 'key1' , how = 'inner' )
key1 key2 A B
0 K0 K0 A0 B0
1 K0 K1 A1 B1
2 K1 K0 A2 B2
3 K3 K1 A3 B3
key1 key2 C D
0 K0 K0 C0 D0
1 K1 K0 C1 D1
2 K1 K0 C2 D2
3 K2 K0 C3 D3
key1 key2_x A B key2_y C D 0 K0 K0 A0 B0 K0 C0 D0 1 K0 K1 A1 B1 K0 C0 D0 2 K1 K0 A2 B2 K0 C1 D1 3 K1 K0 A2 B2 K0 C2 D2
pd. merge( left, right, on = 'key1' , how= 'outer' )
key1 key2_x A B key2_y C D 0 K0 K0 A0 B0 K0 C0 D0 1 K0 K1 A1 B1 K0 C0 D0 2 K1 K0 A2 B2 K0 C1 D1 3 K1 K0 A2 B2 K0 C2 D2 4 K3 K1 A3 B3 NaN NaN NaN 5 K2 NaN NaN NaN K0 C3 D3
pd. merge( left, right, on = 'key1' , how = 'left' )
key1 key2_x A B key2_y C D 0 K0 K0 A0 B0 K0 C0 D0 1 K0 K1 A1 B1 K0 C0 D0 2 K1 K0 A2 B2 K0 C1 D1 3 K1 K0 A2 B2 K0 C2 D2 4 K3 K1 A3 B3 NaN NaN NaN
pd. merge( left, right, on = 'key1' , how= 'right' )
key1 key2_x A B key2_y C D 0 K0 K0 A0 B0 K0 C0 D0 1 K0 K1 A1 B1 K0 C0 D0 2 K1 K0 A2 B2 K0 C1 D1 3 K1 K0 A2 B2 K0 C2 D2 4 K2 NaN NaN NaN K0 C3 D3
pd. merge( left, right, on = [ 'key1' , 'key2' ] , how= 'right' )
key1 key2 A B C D 0 K0 K0 A0 B0 C0 D0 1 K1 K0 A2 B2 C1 D1 2 K1 K0 A2 B2 C2 D2 3 K2 K0 NaN NaN C3 D3
pd. merge( left, right, right_on= 'key1' , left_on= 'key2' , how= 'right' )
key1_x key2_x A B key1_y key2_y C D 0 K0 K0 A0 B0 K0 K0 C0 D0 1 K1 K0 A2 B2 K0 K0 C0 D0 2 K0 K1 A1 B1 K1 K0 C1 D1 3 K3 K1 A3 B3 K1 K0 C1 D1 4 K0 K1 A1 B1 K1 K0 C2 D2 5 K3 K1 A3 B3 K1 K0 C2 D2 6 NaN NaN NaN NaN K2 K0 C3 D3
补充:更改表格名的方法
left. rename( columns= { 'key1' : 'key222' } , inplace= False )
print ( left)
key1 key2 A B
0 K0 K0 A0 B0
1 K0 K1 A1 B1
2 K1 K0 A2 B2
3 K3 K1 A3 B3
两个表格的合并(重叠合并)
dict1 = { 'ID' : [ 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 ] ,
'System' : [ 'W10' , 'w10' , np. nan, 'w10' , np. nan, np. nan, 'w7' , 'w7' , 'w8' ] }
dict2 = { 'ID' : [ 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 ] ,
'System' : [ np. nan, np. nan, 'w7' , 'w7' , 'w7' , 'w7' , 'w8' , np. nan, np. nan] }
df1 = pd. DataFrame( dict1)
df2 = pd. DataFrame( dict2)
print ( df1)
print ( df2)
ID System
0 1 W10
1 2 w10
2 3 NaN
3 4 w10
4 5 NaN
5 6 NaN
6 7 w7
7 8 w7
8 9 w8
ID System
0 1 NaN
1 2 NaN
2 3 w7
3 4 w7
4 5 w7
5 6 w7
6 7 w8
7 8 NaN
8 9 NaN
df1. combine_first( df2)
ID System 0 1 W10 1 2 w10 2 3 w7 3 4 w10 4 5 w7 5 6 w7 6 7 w7 7 8 w7 8 9 w8