pandas之数据合并与分组聚合

最新推荐文章于 2024-05-06 11:32:06 发布

simon5169

最新推荐文章于 2024-05-06 11:32:06 发布

阅读量638

点赞数

分类专栏： python 文章标签： python 数据分析 numpy

本文链接：https://blog.csdn.net/blances/article/details/105620033

版权

python 专栏收录该内容

21 篇文章 0 订阅

订阅专栏

import pandas as pd
print("*"*25+"数据合并操作"+"*"*25)
#merge按行索引操作，join按列索引操作

import numpy as np
df1=pd.DataFrame(np.ones((2,4)),index=["A","B"],columns=list("abcd"))
df2=pd.DataFrame(np.zeros((3,3)),index=["A","B","C"],columns=list("xyz"))

print(df1)

     a    b    c    d
A  1.0  1.0  1.0  1.0
B  1.0  1.0  1.0  1.0

print(df2)

     x    y    z
A  0.0  0.0  0.0
B  0.0  0.0  0.0
C  0.0  0.0  0.0

df1.join(df2)

	a	b	c	d	x	y	z
A	1.0	1.0	1.0	1.0	0.0	0.0	0.0
B	1.0	1.0	1.0	1.0	0.0	0.0	0.0

df2.join(df1)

	a	b	c	d
A	1.0	1.0	1.0	1.0
B	1.0	1.0	1.0	1.0
C	NaN	NaN	NaN	NaN

df3=pd.DataFrame(np.zeros((3,3)),columns=list("fax"))
df3

	f	a	x
0	0.0	0.0	0.0
1	0.0	0.0	0.0
2	0.0	0.0	0.0

df1.merge(df3,on="a") #默认并集所以为空

	a	b	c	d	f	x

df3.loc[1,"a"]=1
df1.merge(df3,on="a")

	a	b	c	d	f	x
0	1.0	1.0	1.0	1.0	0.0	0.0
1	1.0	1.0	1.0	1.0	0.0	0.0

df3=pd.DataFrame(np.arange(9).reshape(3,3),columns=list("fax"))
df3

	f	a	x
0	0	1	2
1	3	4	5
2	6	7	8

df1.merge(df3,on="a")

	a	b	c	d	f	x
0	1.0	1.0	1.0	1.0	0	2
1	1.0	1.0	1.0	1.0	0	2

df1.loc["A","a"]=100
df1.merge(df3,on="a")

	a	b	c	d	f	x
0	1.0	1.0	1.0	1.0	0	2

df1.merge(df3,on="a",how="outer") #外连接，NaN补全

	a	b	c	d	f	x
0	100.0	1.0	1.0	1.0	NaN	NaN
1	1.0	1.0	1.0	1.0	0.0	2.0
2	4.0	NaN	NaN	NaN	3.0	5.0
3	7.0	NaN	NaN	NaN	6.0	8.0

df1.merge(df3,on="a",how="left") #左链接，以df1为准

	a	b	c	d	f	x
0	100.0	1.0	1.0	1.0	NaN	NaN
1	1.0	1.0	1.0	1.0	0.0	2.0

df1.merge(df3,on="a",how="right") #右连接，以df3为准

	a	b	c	d	f	x
0	1.0	1.0	1.0	1.0	0	2
1	4.0	NaN	NaN	NaN	3	5
2	7.0	NaN	NaN	NaN	6	8

print("*"*25+"数据分组聚合"+"*"*25)

*************************数据分组聚合*************************

import pandas as pd
file_path="./starbucks_store_worldwide.csv"
df=pd.read_csv(file_path)
print(df.head(1))
print(df.info())

       Brand  Store Number     Store Name Ownership Type     Street Address  \
0  Starbucks  47370-257954  Meritxell, 96       Licensed  Av. Meritxell, 96   

               City State/Province Country Postcode Phone Number  \
0  Andorra la Vella              7      AD    AD500    376818720   

                  Timezone  Longitude  Latitude  
0  GMT+1:00 Europe/Andorra       1.53     42.51  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25600 entries, 0 to 25599
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Brand           25600 non-null  object 
 1   Store Number    25600 non-null  object 
 2   Store Name      25600 non-null  object 
 3   Ownership Type  25600 non-null  object 
 4   Street Address  25598 non-null  object 
 5   City            25585 non-null  object 
 6   State/Province  25600 non-null  object 
 7   Country         25600 non-null  object 
 8   Postcode        24078 non-null  object 
 9   Phone Number    18739 non-null  object 
 10  Timezone        25600 non-null  object 
 11  Longitude       25599 non-null  float64
 12  Latitude        25599 non-null  float64
dtypes: float64(2), object(11)
memory usage: 2.5+ MB
None

#?统计美国和中国的星巴克数量groupby

groups=df.groupby(by="Country")

print(groups)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11bb150d0>

for i in groups:
    print(i[0])
    print(type(i[1]))
    break
country_count=groups["Brand"].count()
print(country_count["US"])
print(country_count["CN"])

AD
<class 'pandas.core.frame.DataFrame'>
13608
2734

#统计中国每个省份数量
china_data=df[df["Country"]=="CN"]
groups=china_data.groupby(by="State/Province").count()["Brand"]
print(groups)

State/Province
11    236
12     58
13     24
14      8
15      8
21     57
22     13
23     16
31    551
32    354
33    315
34     26
35     75
36     13
37     75
41     21
42     76
43     35
44    333
45     21
46     16
50     41
51    104
52      9
53     24
61     42
62      3
63      3
64      2
91    162
92     13
Name: Brand, dtype: int64

# dataframegroupby有很多方法
# count（分组中非NA值的数量）
# sum（非NA的和）
# mean（非NA平均数）
# median(非NA的算术中位数)
# std、var无便（分母n-1）标准差和方差
# min、max（非NA最大和最小）

# 按多个条件进行分组
groups=df["Brand"].groupby(by=[df["Country"],df["State/Province"]]).count()
type(groups)
groups

Country  State/Province
AD       7                  1
AE       AJ                 2
         AZ                48
         DU                82
         FU                 2
                           ..
US       WV                25
         WY                23
VN       HN                 6
         SG                19
ZA       GT                 3
Name: Brand, Length: 545, dtype: int64

simon5169

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
pandas之数据合并与分组聚合

import pandas as pdprint("*"*25+"数据合并操作"+"*"*25)#merge按行索引操作，join按列索引操作import numpy as npdf1=pd.DataFrame(np.ones((2,4)),index=["A","B"],columns=list("abcd"))df2=pd.DataFrame(np.zeros((3,3)),ind...
复制链接

扫一扫