第一章Pandas快速入门

《深入浅出Pandas》第一章代码
数据来源:https://www.gairuo.com/file/data/dataset/team.xlsx

import numpy as np
import pandas as pd
file_path = 'E:\\Data_python\\anconda_code\\Dive_into_Pandas\\data_files\\'
team_path = 'team.xlsx'
team_df = pd.read_excel(file_path + team_path)
team_df.head(5)
nameteamQ1Q2Q3Q4
0LiverE89212464
1ArryC36373757
2AckA57601884
3EorgeC93967178
4OahD65496186
team_df.tail(5)
nameteamQ1Q2Q3Q4
95GabrielC48598774
96Austin7C21313043
97Lincoln4C9893120
98EliE11745891
99BenE21434174
team_df.sample(6)
nameteamQ1Q2Q3Q4
33AdamC90324739
54Ollie3C10763036
82FinnE415532
85LiamB2802425
58LewisB4347728
27FinleyD62738468
team_df.shape
(100, 6)
team_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    100 non-null    object
 1   team    100 non-null    object
 2   Q1      100 non-null    int64 
 3   Q2      100 non-null    int64 
 4   Q3      100 non-null    int64 
 5   Q4      100 non-null    int64 
dtypes: int64(4), object(2)
memory usage: 4.8+ KB
team_df.describe()
Q1Q2Q3Q4
count100.000000100.000000100.000000100.000000
mean49.20000052.55000052.67000052.780000
std29.96260329.84518126.54367727.818524
min1.0000001.0000001.0000002.000000
25%19.50000026.75000029.50000029.500000
50%51.50000049.50000055.00000053.000000
75%74.25000077.75000076.25000075.250000
max98.00000099.00000099.00000099.000000
team_df.dtypes
name    object
team    object
Q1       int64
Q2       int64
Q3       int64
Q4       int64
dtype: object
team_df.axes
[RangeIndex(start=0, stop=100, step=1),
 Index(['name', 'team', 'Q1', 'Q2', 'Q3', 'Q4'], dtype='object')]
team_df.columns
Index(['name', 'team', 'Q1', 'Q2', 'Q3', 'Q4'], dtype='object')
# 建立索引(将team的name字段作为索引)
team_df.set_index('name', inplace=True)
team_df
teamQ1Q2Q3Q4
name
LiverE89212464
ArryC36373757
AckA57601884
EorgeC93967178
OahD65496186
..................
GabrielC48598774
Austin7C21313043
Lincoln4C9893120
EliE11745891
BenE21434174

100 rows × 5 columns

team_df['Q1']
name
Liver       89
Arry        36
Ack         57
Eorge       93
Oah         65
            ..
Gabriel     48
Austin7     21
Lincoln4    98
Eli         11
Ben         21
Name: Q1, Length: 100, dtype: int64
team_df.Q1
name
Liver       89
Arry        36
Ack         57
Eorge       93
Oah         65
            ..
Gabriel     48
Austin7     21
Lincoln4    98
Eli         11
Ben         21
Name: Q1, Length: 100, dtype: int64
team_df[['Q1','Q3']]
Q1Q3
name
Liver8924
Arry3637
Ack5718
Eorge9371
Oah6561
.........
Gabriel4887
Austin72130
Lincoln4981
Eli1158
Ben2141

100 rows × 2 columns

team_df.loc[:,['Q1','Q2']]
Q1Q2
name
Liver8921
Arry3637
Ack5760
Eorge9396
Oah6549
.........
Gabriel4859
Austin72131
Lincoln49893
Eli1174
Ben2143

100 rows × 2 columns

# 获取行
team_df[team_df.index == 'Liver']  #输出name==Liver的这行                                                                                                          
teamQ1Q2Q3Q4
name
LiverE89212464
team_df[0:3]
teamQ1Q2Q3Q4
name
LiverE89212464
ArryC36373757
AckA57601884
team_df[0:10:2]
teamQ1Q2Q3Q4
name
LiverE89212464
AckA57601884
OahD65496186
AcobB6195948
ReddieD64935772
team_df.iloc[:10, :]
teamQ1Q2Q3Q4
name
LiverE89212464
ArryC36373757
AckA57601884
EorgeC93967178
OahD65496186
HarlieC24138743
AcobB6195948
LfieA9109937
ReddieD64935772
OscarA7792667
# 指定行和列
team_df.loc['Ben', 'Q1':'Q4'] # 只看Ben的四个季度成绩
team_df.loc['Eorge':'Alexander', 'team':'Q4'] # 指定行区间
teamQ1Q2Q3Q4
name
EorgeC93967178
OahD65496186
HarlieC24138743
AcobB6195948
LfieA9109937
ReddieD64935772
OscarA7792667
LeoB1743379
LoganB9893565
ArchieC83895968
TheoC51868727
ThomasB80485641
JamesE48775211
JoshuaA6348030
HenryA91157517
WilliamC8068326
MaxE9775413
LucasA60417762
EthanD79458988
ArthurA44534240
MasonD80962649
IsaacE74232865
HarrisonB89131875
TeddyE71912148
FinleyD62738468
DanielC50507261
RileyE35265983
EdwardB57388687
JosephE67878793
AlexanderC91762679
# 条件选择
# 单一条件
team_df[team_df.team == 'C']
team_df[team_df.Q1 > 90]
team_df[team_df.index=='Qsar']
teamQ1Q2Q3Q4
name
OscarA7792667
# 多条件
team_df[(team_df.Q1 > 90) & (team_df.team == 'C')] # and 关系
teamQ1Q2Q3Q4
name
EorgeC93967178
AlexanderC91762679
Lincoln4C9893120
team_df[team_df['team']=='C'].loc[team_df.Q1 > 90] # 多重条件筛选
teamQ1Q2Q3Q4
name
EorgeC93967178
AlexanderC91762679
Lincoln4C9893120
排序
team_df.sort_values(by='Q1') # 按Q1列排序
teamQ1Q2Q3Q4
name
SebastianC1146848
HarleyB2991213
LiamB2802425
LewisB4347728
FinnE415532
..................
EorgeC93967178
AaronA9675558
ElijahB97891546
MaxE9775413
Lincoln4C9893120

100 rows × 5 columns

team_df.sort_values(by='Q3', ascending=False) # 降序
teamQ1Q2Q3Q4
name
DavidB2147992
LfieA9109937
Luke6D15979599
AcobB6195948
BlakeA7823939
..................
HarleyB2991213
Jake3C69231140
Jayden6D64211021
WilliamC8068326
Lincoln4C9893120

100 rows × 5 columns

team_df.sort_values(['team','Q2'], ascending=[True, False]) # team升序,Q2降序
teamQ1Q2Q3Q4
name
DylanA86876520
NathanA87776213
AaronA9675558
StanleyA69713997
AckA57601884
..................
LiverE89212464
Rory9E8125827
Jackson5E6101533
RomanE7312544
FinnE415532

100 rows × 5 columns

分组聚合
# 按团队分组求和
team_df.groupby('team').sum()
Q1Q2Q3Q4
team
A1066639875783
B975121812021136
C1056119410681127
D860119112411199
E96310138811033
team_df.groupby('team').mean() # 分组求平均
Q1Q2Q3Q4
team
A62.70588237.58823551.47058846.058824
B44.31818255.36363654.63636451.636364
C48.00000054.27272748.54545551.227273
D45.26315862.68421165.31578963.105263
E48.15000050.65000044.05000051.650000
# 不同列不同计算方法
team_df.groupby('team').agg({'Q1': sum,  # 求和
                             'Q2': 'count', # 计数
                             'Q3': 'mean', # 平均
                             'Q4': max}) # 最大值
Q1Q2Q3Q4
team
A10661751.47058897
B9752254.63636499
C10562248.54545598
D8601965.31578999
E9632044.05000098
数据转换
# 转置
team_df.groupby('team').sum()
Q1Q2Q3Q4
team
A1066639875783
B975121812021136
C1056119410681127
D860119112411199
E96310138811033
team_df.groupby('team').sum().T  # 二维数组可认为行转列
teamABCDE
Q110669751056860963
Q26391218119411911013
Q3875120210681241881
Q47831136112711991033
team_df.groupby('team').sum().stack()  # 分组透视到行
team    
A     Q1    1066
      Q2     639
      Q3     875
      Q4     783
B     Q1     975
      Q2    1218
      Q3    1202
      Q4    1136
C     Q1    1056
      Q2    1194
      Q3    1068
      Q4    1127
D     Q1     860
      Q2    1191
      Q3    1241
      Q4    1199
E     Q1     963
      Q2    1013
      Q3     881
      Q4    1033
dtype: int64
team_df.groupby('team').sum().unstack() # 分组后透视到列
    team
Q1  A       1066
    B        975
    C       1056
    D        860
    E        963
Q2  A        639
    B       1218
    C       1194
    D       1191
    E       1013
Q3  A        875
    B       1202
    C       1068
    D       1241
    E        881
Q4  A        783
    B       1136
    C       1127
    D       1199
    E       1033
dtype: int64
添加列
# 添加一个固定值的列
team_df['grade'] = 'Grade 1'
team_df.head(5)
teamQ1Q2Q3Q4grade
name
LiverE89212464Grade 1
ArryC36373757Grade 1
AckA57601884Grade 1
EorgeC93967178Grade 1
OahD65496186Grade 1
# 添加汇总列
team_df['total'] = team_df['Q1'] + team_df['Q2'] + team_df['Q3'] + team_df['Q4']
team_df.head(5)
teamQ1Q2Q3Q4gradetotal
name
LiverE89212464Grade 1198
ArryC36373757Grade 1167
AckA57601884Grade 1219
EorgeC93967178Grade 1338
OahD65496186Grade 1261
# 将计算结果赋给新列
team_df['total1'] = team_df.loc[:,'Q1':'Q4'].apply(lambda x : sum(x), axis=1)
team_df['total2'] = team_df.sum(axis=1) # 把所有为数字的列相加
team_df['avg'] = team_df.total/4 # 添加平均值列
team_df
teamQ1Q2Q3Q4gradetotaltotal1total2avg
name
LiverE89212464Grade 1198198118849.50
ArryC36373757Grade 1167167100241.75
AckA57601884Grade 1219219131454.75
EorgeC93967178Grade 1338338202884.50
OahD65496186Grade 1261261156665.25
.................................
GabrielC48598774Grade 1268268160867.00
Austin7C21313043Grade 112512575031.25
Lincoln4C9893120Grade 1212212127253.00
EliE11745891Grade 1234234140458.50
BenE21434174Grade 1179179107444.75

100 rows × 10 columns



评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值