import numpy as np
import pandas as pd
分组与聚合
DataFrame.groupby(key, as_index=False) as_index=False添加一列索引,默认为true key:分组的列数据,可以多个
col = pd. DataFrame( { 'color' : [ 'white' , 'red' , 'green' , 'red' , 'green' ] , 'object' : [ 'pen' , 'pencil' , 'pencil' , 'ashtray' , 'pen' ] , 'price1' : [ 5.56 , 4.20 , 1.30 , 0.56 , 2.75 ] , 'price2' : [ 4.75 , 4.12 , 1.60 , 0.75 , 3.15 ] } )
col
color object price1 price2 0 white pen 5.56 4.75 1 red pencil 4.20 4.12 2 green pencil 1.30 1.60 3 red ashtray 0.56 0.75 4 green pen 2.75 3.15
col. groupby( [ 'color' ] ) [ 'price1' ] . mean( )
color
green 2.025
red 2.380
white 5.560
Name: price1, dtype: float64
col. groupby( [ 'color' ] , as_index= False ) [ 'price1' ] . mean( )
color
green 2.025
red 2.380
white 5.560
Name: price1, dtype: float64
实例:星巴克零售店铺数据
数据获取
starbucks = pd. read_csv( "./data/starbucks/directory.csv" )
starbucks. head( )
Brand Store Number Store Name Ownership Type Street Address City State/Province Country Postcode Phone Number Timezone Longitude Latitude 0 Starbucks 47370-257954 Meritxell, 96 Licensed Av. Meritxell, 96 Andorra la Vella 7 AD AD500 376818720 GMT+1:00 Europe/Andorra 1.53 42.51 1 Starbucks 22331-212325 Ajman Drive Thru Licensed 1 Street 69, Al Jarf Ajman AJ AE NaN NaN GMT+04:00 Asia/Dubai 55.47 25.42 2 Starbucks 47089-256771 Dana Mall Licensed Sheikh Khalifa Bin Zayed St. Ajman AJ AE NaN NaN GMT+04:00 Asia/Dubai 55.47 25.39 3 Starbucks 22126-218024 Twofour 54 Licensed Al Salam Street Abu Dhabi AZ AE NaN NaN GMT+04:00 Asia/Dubai 54.38 24.48 4 Starbucks 17127-178586 Al Ain Tower Licensed Khaldiya Area, Abu Dhabi Island Abu Dhabi AZ AE NaN NaN GMT+04:00 Asia/Dubai 54.54 24.51
进行分组聚合
count = starbucks. groupby( [ 'Country' ] ) . count( )
count
Brand Store Number Store Name Ownership Type Street Address City State/Province Postcode Phone Number Timezone Longitude Latitude Country AD 1 1 1 1 1 1 1 1 1 1 1 1 AE 144 144 144 144 144 144 144 24 78 144 144 144 AR 108 108 108 108 108 108 108 100 29 108 108 108 AT 18 18 18 18 18 18 18 18 17 18 18 18 AU 22 22 22 22 22 22 22 22 0 22 22 22 ... ... ... ... ... ... ... ... ... ... ... ... ... TT 3 3 3 3 3 3 3 3 0 3 3 3 TW 394 394 394 394 394 394 394 365 39 394 394 394 US 13608 13608 13608 13608 13608 13608 13608 13607 13122 13608 13608 13608 VN 25 25 25 25 25 25 25 25 23 25 25 25 ZA 3 3 3 3 3 3 3 3 2 3 3 3
73 rows × 12 columns
画图显示结果
count[ 'Brand' ] . plot( kind= 'bar' , figsize= ( 20 , 8 ) )
count1 = count[ 'Brand' ] . sort_values( ascending= False ) . head( 10 )
count1
Country
US 13608
CN 2734
CA 1468
JP 1237
KR 993
GB 901
MX 579
TW 394
TR 326
PH 298
Name: Brand, dtype: int64
count1. plot( kind= 'bar' , figsize= ( 8 , 5 ) )
data1 = starbucks. groupby( [ 'Country' , 'State/Province' ] ) . count( )
data1
Brand Store Number Store Name Ownership Type Street Address City Postcode Phone Number Timezone Longitude Latitude Country State/Province AD 7 1 1 1 1 1 1 1 1 1 1 1 AE AJ 2 2 2 2 2 2 0 0 2 2 2 AZ 48 48 48 48 48 48 7 20 48 48 48 DU 82 82 82 82 82 82 16 50 82 82 82 FU 2 2 2 2 2 2 1 0 2 2 2 ... ... ... ... ... ... ... ... ... ... ... ... ... US WV 25 25 25 25 25 25 25 23 25 25 25 WY 23 23 23 23 23 23 23 22 23 23 23 VN HN 6 6 6 6 6 6 6 6 6 6 6 SG 19 19 19 19 19 19 19 17 19 19 19 ZA GT 3 3 3 3 3 3 3 2 3 3 3
545 rows × 11 columns