《深入浅出Pandas》第一章代码 数据来源:https://www.gairuo.com/file/data/dataset/team.xlsx
import numpy as np
import pandas as pd
file_path = 'E:\\Data_python\\anconda_code\\Dive_into_Pandas\\data_files\\'
team_path = 'team.xlsx'
team_df = pd. read_excel( file_path + team_path)
team_df. head( 5 )
name team Q1 Q2 Q3 Q4 0 Liver E 89 21 24 64 1 Arry C 36 37 37 57 2 Ack A 57 60 18 84 3 Eorge C 93 96 71 78 4 Oah D 65 49 61 86
team_df. tail( 5 )
name team Q1 Q2 Q3 Q4 95 Gabriel C 48 59 87 74 96 Austin7 C 21 31 30 43 97 Lincoln4 C 98 93 1 20 98 Eli E 11 74 58 91 99 Ben E 21 43 41 74
team_df. sample( 6 )
name team Q1 Q2 Q3 Q4 33 Adam C 90 32 47 39 54 Ollie3 C 10 76 30 36 82 Finn E 4 1 55 32 85 Liam B 2 80 24 25 58 Lewis B 4 34 77 28 27 Finley D 62 73 84 68
team_df. shape
(100, 6)
team_df. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 name 100 non-null object
1 team 100 non-null object
2 Q1 100 non-null int64
3 Q2 100 non-null int64
4 Q3 100 non-null int64
5 Q4 100 non-null int64
dtypes: int64(4), object(2)
memory usage: 4.8+ KB
team_df. describe( )
Q1 Q2 Q3 Q4 count 100.000000 100.000000 100.000000 100.000000 mean 49.200000 52.550000 52.670000 52.780000 std 29.962603 29.845181 26.543677 27.818524 min 1.000000 1.000000 1.000000 2.000000 25% 19.500000 26.750000 29.500000 29.500000 50% 51.500000 49.500000 55.000000 53.000000 75% 74.250000 77.750000 76.250000 75.250000 max 98.000000 99.000000 99.000000 99.000000
team_df. dtypes
name object
team object
Q1 int64
Q2 int64
Q3 int64
Q4 int64
dtype: object
team_df. axes
[RangeIndex(start=0, stop=100, step=1),
Index(['name', 'team', 'Q1', 'Q2', 'Q3', 'Q4'], dtype='object')]
team_df. columns
Index(['name', 'team', 'Q1', 'Q2', 'Q3', 'Q4'], dtype='object')
team_df. set_index( 'name' , inplace= True )
team_df
team Q1 Q2 Q3 Q4 name Liver E 89 21 24 64 Arry C 36 37 37 57 Ack A 57 60 18 84 Eorge C 93 96 71 78 Oah D 65 49 61 86 ... ... ... ... ... ... Gabriel C 48 59 87 74 Austin7 C 21 31 30 43 Lincoln4 C 98 93 1 20 Eli E 11 74 58 91 Ben E 21 43 41 74
100 rows × 5 columns
team_df[ 'Q1' ]
name
Liver 89
Arry 36
Ack 57
Eorge 93
Oah 65
..
Gabriel 48
Austin7 21
Lincoln4 98
Eli 11
Ben 21
Name: Q1, Length: 100, dtype: int64
team_df. Q1
name
Liver 89
Arry 36
Ack 57
Eorge 93
Oah 65
..
Gabriel 48
Austin7 21
Lincoln4 98
Eli 11
Ben 21
Name: Q1, Length: 100, dtype: int64
team_df[ [ 'Q1' , 'Q3' ] ]
Q1 Q3 name Liver 89 24 Arry 36 37 Ack 57 18 Eorge 93 71 Oah 65 61 ... ... ... Gabriel 48 87 Austin7 21 30 Lincoln4 98 1 Eli 11 58 Ben 21 41
100 rows × 2 columns
team_df. loc[ : , [ 'Q1' , 'Q2' ] ]
Q1 Q2 name Liver 89 21 Arry 36 37 Ack 57 60 Eorge 93 96 Oah 65 49 ... ... ... Gabriel 48 59 Austin7 21 31 Lincoln4 98 93 Eli 11 74 Ben 21 43
100 rows × 2 columns
team_df[ team_df. index == 'Liver' ]
team Q1 Q2 Q3 Q4 name Liver E 89 21 24 64
team_df[ 0 : 3 ]
team Q1 Q2 Q3 Q4 name Liver E 89 21 24 64 Arry C 36 37 37 57 Ack A 57 60 18 84
team_df[ 0 : 10 : 2 ]
team Q1 Q2 Q3 Q4 name Liver E 89 21 24 64 Ack A 57 60 18 84 Oah D 65 49 61 86 Acob B 61 95 94 8 Reddie D 64 93 57 72
team_df. iloc[ : 10 , : ]
team Q1 Q2 Q3 Q4 name Liver E 89 21 24 64 Arry C 36 37 37 57 Ack A 57 60 18 84 Eorge C 93 96 71 78 Oah D 65 49 61 86 Harlie C 24 13 87 43 Acob B 61 95 94 8 Lfie A 9 10 99 37 Reddie D 64 93 57 72 Oscar A 77 9 26 67
team_df. loc[ 'Ben' , 'Q1' : 'Q4' ]
team_df. loc[ 'Eorge' : 'Alexander' , 'team' : 'Q4' ]
team Q1 Q2 Q3 Q4 name Eorge C 93 96 71 78 Oah D 65 49 61 86 Harlie C 24 13 87 43 Acob B 61 95 94 8 Lfie A 9 10 99 37 Reddie D 64 93 57 72 Oscar A 77 9 26 67 Leo B 17 4 33 79 Logan B 9 89 35 65 Archie C 83 89 59 68 Theo C 51 86 87 27 Thomas B 80 48 56 41 James E 48 77 52 11 Joshua A 63 4 80 30 Henry A 91 15 75 17 William C 80 68 3 26 Max E 97 75 41 3 Lucas A 60 41 77 62 Ethan D 79 45 89 88 Arthur A 44 53 42 40 Mason D 80 96 26 49 Isaac E 74 23 28 65 Harrison B 89 13 18 75 Teddy E 71 91 21 48 Finley D 62 73 84 68 Daniel C 50 50 72 61 Riley E 35 26 59 83 Edward B 57 38 86 87 Joseph E 67 87 87 93 Alexander C 91 76 26 79
team_df[ team_df. team == 'C' ]
team_df[ team_df. Q1 > 90 ]
team_df[ team_df. index== 'Qsar' ]
team Q1 Q2 Q3 Q4 name Oscar A 77 9 26 67
team_df[ ( team_df. Q1 > 90 ) & ( team_df. team == 'C' ) ]
team Q1 Q2 Q3 Q4 name Eorge C 93 96 71 78 Alexander C 91 76 26 79 Lincoln4 C 98 93 1 20
team_df[ team_df[ 'team' ] == 'C' ] . loc[ team_df. Q1 > 90 ]
team Q1 Q2 Q3 Q4 name Eorge C 93 96 71 78 Alexander C 91 76 26 79 Lincoln4 C 98 93 1 20
排序
team_df. sort_values( by= 'Q1' )
team Q1 Q2 Q3 Q4 name Sebastian C 1 14 68 48 Harley B 2 99 12 13 Liam B 2 80 24 25 Lewis B 4 34 77 28 Finn E 4 1 55 32 ... ... ... ... ... ... Eorge C 93 96 71 78 Aaron A 96 75 55 8 Elijah B 97 89 15 46 Max E 97 75 41 3 Lincoln4 C 98 93 1 20
100 rows × 5 columns
team_df. sort_values( by= 'Q3' , ascending= False )
team Q1 Q2 Q3 Q4 name David B 21 47 99 2 Lfie A 9 10 99 37 Luke6 D 15 97 95 99 Acob B 61 95 94 8 Blake A 78 23 93 9 ... ... ... ... ... ... Harley B 2 99 12 13 Jake3 C 69 23 11 40 Jayden6 D 64 21 10 21 William C 80 68 3 26 Lincoln4 C 98 93 1 20
100 rows × 5 columns
team_df. sort_values( [ 'team' , 'Q2' ] , ascending= [ True , False ] )
team Q1 Q2 Q3 Q4 name Dylan A 86 87 65 20 Nathan A 87 77 62 13 Aaron A 96 75 55 8 Stanley A 69 71 39 97 Ack A 57 60 18 84 ... ... ... ... ... ... Liver E 89 21 24 64 Rory9 E 8 12 58 27 Jackson5 E 6 10 15 33 Roman E 73 1 25 44 Finn E 4 1 55 32
100 rows × 5 columns
分组聚合
team_df. groupby( 'team' ) . sum ( )
Q1 Q2 Q3 Q4 team A 1066 639 875 783 B 975 1218 1202 1136 C 1056 1194 1068 1127 D 860 1191 1241 1199 E 963 1013 881 1033
team_df. groupby( 'team' ) . mean( )
Q1 Q2 Q3 Q4 team A 62.705882 37.588235 51.470588 46.058824 B 44.318182 55.363636 54.636364 51.636364 C 48.000000 54.272727 48.545455 51.227273 D 45.263158 62.684211 65.315789 63.105263 E 48.150000 50.650000 44.050000 51.650000
team_df. groupby( 'team' ) . agg( { 'Q1' : sum ,
'Q2' : 'count' ,
'Q3' : 'mean' ,
'Q4' : max } )
Q1 Q2 Q3 Q4 team A 1066 17 51.470588 97 B 975 22 54.636364 99 C 1056 22 48.545455 98 D 860 19 65.315789 99 E 963 20 44.050000 98
数据转换
team_df. groupby( 'team' ) . sum ( )
Q1 Q2 Q3 Q4 team A 1066 639 875 783 B 975 1218 1202 1136 C 1056 1194 1068 1127 D 860 1191 1241 1199 E 963 1013 881 1033
team_df. groupby( 'team' ) . sum ( ) . T
team A B C D E Q1 1066 975 1056 860 963 Q2 639 1218 1194 1191 1013 Q3 875 1202 1068 1241 881 Q4 783 1136 1127 1199 1033
team_df. groupby( 'team' ) . sum ( ) . stack( )
team
A Q1 1066
Q2 639
Q3 875
Q4 783
B Q1 975
Q2 1218
Q3 1202
Q4 1136
C Q1 1056
Q2 1194
Q3 1068
Q4 1127
D Q1 860
Q2 1191
Q3 1241
Q4 1199
E Q1 963
Q2 1013
Q3 881
Q4 1033
dtype: int64
team_df. groupby( 'team' ) . sum ( ) . unstack( )
team
Q1 A 1066
B 975
C 1056
D 860
E 963
Q2 A 639
B 1218
C 1194
D 1191
E 1013
Q3 A 875
B 1202
C 1068
D 1241
E 881
Q4 A 783
B 1136
C 1127
D 1199
E 1033
dtype: int64
添加列
team_df[ 'grade' ] = 'Grade 1'
team_df. head( 5 )
team Q1 Q2 Q3 Q4 grade name Liver E 89 21 24 64 Grade 1 Arry C 36 37 37 57 Grade 1 Ack A 57 60 18 84 Grade 1 Eorge C 93 96 71 78 Grade 1 Oah D 65 49 61 86 Grade 1
team_df[ 'total' ] = team_df[ 'Q1' ] + team_df[ 'Q2' ] + team_df[ 'Q3' ] + team_df[ 'Q4' ]
team_df. head( 5 )
team Q1 Q2 Q3 Q4 grade total name Liver E 89 21 24 64 Grade 1 198 Arry C 36 37 37 57 Grade 1 167 Ack A 57 60 18 84 Grade 1 219 Eorge C 93 96 71 78 Grade 1 338 Oah D 65 49 61 86 Grade 1 261
team_df[ 'total1' ] = team_df. loc[ : , 'Q1' : 'Q4' ] . apply ( lambda x : sum ( x) , axis= 1 )
team_df[ 'total2' ] = team_df. sum ( axis= 1 )
team_df[ 'avg' ] = team_df. total/ 4
team_df
team Q1 Q2 Q3 Q4 grade total total1 total2 avg name Liver E 89 21 24 64 Grade 1 198 198 1188 49.50 Arry C 36 37 37 57 Grade 1 167 167 1002 41.75 Ack A 57 60 18 84 Grade 1 219 219 1314 54.75 Eorge C 93 96 71 78 Grade 1 338 338 2028 84.50 Oah D 65 49 61 86 Grade 1 261 261 1566 65.25 ... ... ... ... ... ... ... ... ... ... ... Gabriel C 48 59 87 74 Grade 1 268 268 1608 67.00 Austin7 C 21 31 30 43 Grade 1 125 125 750 31.25 Lincoln4 C 98 93 1 20 Grade 1 212 212 1272 53.00 Eli E 11 74 58 91 Grade 1 234 234 1404 58.50 Ben E 21 43 41 74 Grade 1 179 179 1074 44.75
100 rows × 10 columns