import pandas as pd
print ( "*" * 25 + "数据合并操作" + "*" * 25 )
import numpy as np
df1= pd. DataFrame( np. ones( ( 2 , 4 ) ) , index= [ "A" , "B" ] , columns= list ( "abcd" ) )
df2= pd. DataFrame( np. zeros( ( 3 , 3 ) ) , index= [ "A" , "B" , "C" ] , columns= list ( "xyz" ) )
print ( df1)
a b c d
A 1.0 1.0 1.0 1.0
B 1.0 1.0 1.0 1.0
print ( df2)
x y z
A 0.0 0.0 0.0
B 0.0 0.0 0.0
C 0.0 0.0 0.0
df1. join( df2)
a b c d x y z A 1.0 1.0 1.0 1.0 0.0 0.0 0.0 B 1.0 1.0 1.0 1.0 0.0 0.0 0.0
df2. join( df1)
x y z a b c d A 0.0 0.0 0.0 1.0 1.0 1.0 1.0 B 0.0 0.0 0.0 1.0 1.0 1.0 1.0 C 0.0 0.0 0.0 NaN NaN NaN NaN
df3= pd. DataFrame( np. zeros( ( 3 , 3 ) ) , columns= list ( "fax" ) )
df3
f a x 0 0.0 0.0 0.0 1 0.0 0.0 0.0 2 0.0 0.0 0.0
df1. merge( df3, on= "a" )
df3. loc[ 1 , "a" ] = 1
df1. merge( df3, on= "a" )
a b c d f x 0 1.0 1.0 1.0 1.0 0.0 0.0 1 1.0 1.0 1.0 1.0 0.0 0.0
df3= pd. DataFrame( np. arange( 9 ) . reshape( 3 , 3 ) , columns= list ( "fax" ) )
df3
df1. merge( df3, on= "a" )
a b c d f x 0 1.0 1.0 1.0 1.0 0 2 1 1.0 1.0 1.0 1.0 0 2
df1. loc[ "A" , "a" ] = 100
df1. merge( df3, on= "a" )
df1. merge( df3, on= "a" , how= "outer" )
a b c d f x 0 100.0 1.0 1.0 1.0 NaN NaN 1 1.0 1.0 1.0 1.0 0.0 2.0 2 4.0 NaN NaN NaN 3.0 5.0 3 7.0 NaN NaN NaN 6.0 8.0
df1. merge( df3, on= "a" , how= "left" )
a b c d f x 0 100.0 1.0 1.0 1.0 NaN NaN 1 1.0 1.0 1.0 1.0 0.0 2.0
df1. merge( df3, on= "a" , how= "right" )
a b c d f x 0 1.0 1.0 1.0 1.0 0 2 1 4.0 NaN NaN NaN 3 5 2 7.0 NaN NaN NaN 6 8
print ( "*" * 25 + "数据分组聚合" + "*" * 25 )
*************************数据分组聚合*************************
import pandas as pd
file_path= "./starbucks_store_worldwide.csv"
df= pd. read_csv( file_path)
print ( df. head( 1 ) )
print ( df. info( ) )
Brand Store Number Store Name Ownership Type Street Address \
0 Starbucks 47370-257954 Meritxell, 96 Licensed Av. Meritxell, 96
City State/Province Country Postcode Phone Number \
0 Andorra la Vella 7 AD AD500 376818720
Timezone Longitude Latitude
0 GMT+1:00 Europe/Andorra 1.53 42.51
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25600 entries, 0 to 25599
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Brand 25600 non-null object
1 Store Number 25600 non-null object
2 Store Name 25600 non-null object
3 Ownership Type 25600 non-null object
4 Street Address 25598 non-null object
5 City 25585 non-null object
6 State/Province 25600 non-null object
7 Country 25600 non-null object
8 Postcode 24078 non-null object
9 Phone Number 18739 non-null object
10 Timezone 25600 non-null object
11 Longitude 25599 non-null float64
12 Latitude 25599 non-null float64
dtypes: float64(2), object(11)
memory usage: 2.5+ MB
None
groups= df. groupby( by= "Country" )
print ( groups)
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11bb150d0>
for i in groups:
print ( i[ 0 ] )
print ( type ( i[ 1 ] ) )
break
country_count= groups[ "Brand" ] . count( )
print ( country_count[ "US" ] )
print ( country_count[ "CN" ] )
AD
<class 'pandas.core.frame.DataFrame'>
13608
2734
china_data= df[ df[ "Country" ] == "CN" ]
groups= china_data. groupby( by= "State/Province" ) . count( ) [ "Brand" ]
print ( groups)
State/Province
11 236
12 58
13 24
14 8
15 8
21 57
22 13
23 16
31 551
32 354
33 315
34 26
35 75
36 13
37 75
41 21
42 76
43 35
44 333
45 21
46 16
50 41
51 104
52 9
53 24
61 42
62 3
63 3
64 2
91 162
92 13
Name: Brand, dtype: int64
groups= df[ "Brand" ] . groupby( by= [ df[ "Country" ] , df[ "State/Province" ] ] ) . count( )
type ( groups)
groups
Country State/Province
AD 7 1
AE AJ 2
AZ 48
DU 82
FU 2
..
US WV 25
WY 23
VN HN 6
SG 19
ZA GT 3
Name: Brand, Length: 545, dtype: int64