对象操作
对象的增删改查
data = [ 10 , 11 , 12 ]
index = [ 'a' , 'b' , 'c' ]
s = pd. Series( data= data, index= index)
s
a 10
b 11
c 12
dtype: int64
查
s[ 0 ]
10
s[ 0 : 2 ]
a 10
b 11
dtype: int64
s. loc[ 'b' ]
11
s. iloc[ 1 ]
11
改
s1 = s. copy( )
s1
a 10
b 11
c 12
dtype: int64
s1[ 'a' ] = 100
s1
a 100
b 11
c 12
dtype: int64
s1. replace( to_replace= 100 , value= 101 , inplace= True )
s1
a 101
b 11
c 12
dtype: int64
s1. index
Index(['a', 'b', 'c'], dtype='object')
s1. index = [ 'a' , 'b' , 'd' ]
s1
a 101
b 11
d 12
dtype: int64
s1. rename( index= { 'a' : 'A' } , inplace= True )
s1
A 101
b 11
d 12
dtype: int64
增
data = [ 100 , 110 ]
index = [ 'h' , 'k' ]
s2 = pd. Series( data= data, index= index)
s2
h 100
k 110
dtype: int64
s3 = s1. append( s2)
s3
A 101
b 11
d 12
h 100
k 110
dtype: int64
s3[ 'j' ] = 500
s3
A 101
b 11
d 12
h 100
k 110
j 500
dtype: int64
删
data = [ 10 , 11 , 12 ]
index= [ 'a' , 'b' , 'c' ]
s1 = pd. Series( data= data, index= index)
s1
a 10
b 11
c 12
dtype: int64
del s1[ 'a' ]
s1
b 11
c 12
dtype: int64
s1. drop( [ 'b' , 'c' ] , inplace= True )
s1
Series([], dtype: int64)
Dataframe增删改查
data = [ [ 1 , 2 , 3 ] , [ 4 , 5 , 6 ] ]
index = [ 'a' , 'b' ]
columns = [ 'A' , 'B' , 'C' ]
df = pd. DataFrame( data= data, index= index, columns= columns)
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
查
df[ 'A' ]
a 1
b 4
Name: A, dtype: int64
df. loc[ 'a' ] [ 'A' ]
1
改
df. loc[ 'a' ] [ 'A' ] = 150
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
df. index = [ 'f' , 'g' ]
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
增
df. loc[ 'c' ] = [ 1 , 2 , 3 ]
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
data = [ [ 1 , 2 , 3 ] , [ 4 , 5 , 6 ] ]
index = [ 'j' , 'k' ]
columns = [ 'A' , 'B' , 'C' ]
df2 = pd. DataFrame( data= data, index= index, columns= columns)
df2
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
df3 = pd. concat( [ df, df2] , axis= 0 )
df3
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A B C f 150 2 3 g 4 5 6 c 1 2 3 j 1 2 3 k 4 5 6
df2
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
df2[ 'c' ] = [ 2 , 3 ]
df2
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
删
df2. drop( [ 'c' ] , axis= 1 , inplace= True )
df2
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
df3
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A B C f 150 2 3 g 4 5 6 c 1 2 3 j 1 2 3 k 4 5 6
df4 = pd. concat( [ df2, df3] , axis= 1 )
df4
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A B C A B C j 1.0 2.0 3.0 1 2 3 k 4.0 5.0 6.0 4 5 6 f NaN NaN NaN 150 2 3 g NaN NaN NaN 4 5 6 c NaN NaN NaN 1 2 3
del df4[ 'A' ]
df4
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
B C B C j 2.0 3.0 2 3 k 5.0 6.0 5 6 f NaN NaN 2 3 g NaN NaN 5 6 c NaN NaN 2 3
合并
left = pd. DataFrame( { 'key' : [ 'K0' , 'K1' , 'K2' , 'K3' ] ,
'A' : [ 'A0' , 'A1' , 'A2' , 'A3' ] ,
'B' : [ 'B0' , 'B1' , 'B2' , 'B3' ] } )
right = pd. DataFrame( { 'key' : [ 'K0' , 'K1' , 'K2' , 'K3' ] ,
'C' : [ 'C0' , 'C1' , 'C2' , 'C3' ] ,
'D' : [ 'D0' , 'D1' , 'D2' , 'D3' ] } )
left
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
key A B 0 K0 A0 B0 1 K1 A1 B1 2 K2 A2 B2 3 K3 A3 B3
right
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
key C D 0 K0 C0 D0 1 K1 C1 D1 2 K2 C2 D2 3 K3 C3 D3
pd. merge( left, right)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
key A B C D 0 K0 A0 B0 C0 D0 1 K1 A1 B1 C1 D1 2 K2 A2 B2 C2 D2 3 K3 A3 B3 C3 D3
pd. merge( left, right, on= 'key' )
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
key A B C D 0 K0 A0 B0 C0 D0 1 K1 A1 B1 C1 D1 2 K2 A2 B2 C2 D2 3 K3 A3 B3 C3 D3
left = pd. DataFrame( { 'key1' : [ 'K0' , 'K1' , 'K2' , 'K3' ] ,
'key2' : [ 'K0' , 'K1' , 'K2' , 'K3' ] ,
'A' : [ 'A0' , 'A1' , 'A2' , 'A3' ] ,
'B' : [ 'B0' , 'B1' , 'B2' , 'B3' ] } )
right = pd. DataFrame( { 'key1' : [ 'K0' , 'K1' , 'K2' , 'K3' ] ,
'key2' : [ 'K4' , 'K1' , 'K2' , 'K3' ] ,
'C' : [ 'C0' , 'C1' , 'C2' , 'C3' ] ,
'D' : [ 'D0' , 'D1' , 'D2' , 'D3' ] } )
left
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
key1 key2 A B 0 K0 K0 A0 B0 1 K1 K1 A1 B1 2 K2 K2 A2 B2 3 K3 K3 A3 B3
right
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
key1 key2 C D 0 K0 K4 C0 D0 1 K1 K1 C1 D1 2 K2 K2 C2 D2 3 K3 K3 C3 D3
pd. merge( left, right, on= 'key1' )
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
key1 key2_x A B key2_y C D 0 K0 K0 A0 B0 K4 C0 D0 1 K1 K1 A1 B1 K1 C1 D1 2 K2 K2 A2 B2 K2 C2 D2 3 K3 K3 A3 B3 K3 C3 D3
pd. merge( left, right, on= [ 'key1' , 'key2' ] )
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
key1 key2 A B C D 0 K1 K1 A1 B1 C1 D1 1 K2 K2 A2 B2 C2 D2 2 K3 K3 A3 B3 C3 D3
pd. merge( left, right, on= [ 'key1' , 'key2' ] , how= 'outer' )
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
key1 key2 A B C D 0 K0 K0 A0 B0 NaN NaN 1 K1 K1 A1 B1 C1 D1 2 K2 K2 A2 B2 C2 D2 3 K3 K3 A3 B3 C3 D3 4 K0 K4 NaN NaN C0 D0
pd. merge( left, right, on= [ 'key1' , 'key2' ] , how= 'left' )
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
key1 key2 A B C D 0 K0 K0 A0 B0 NaN NaN 1 K1 K1 A1 B1 C1 D1 2 K2 K2 A2 B2 C2 D2 3 K3 K3 A3 B3 C3 D3
pd. merge( left, right, on= [ 'key1' , 'key2' ] , how= 'right' )
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
key1 key2 A B C D 0 K0 K4 NaN NaN C0 D0 1 K1 K1 A1 B1 C1 D1 2 K2 K2 A2 B2 C2 D2 3 K3 K3 A3 B3 C3 D3
显示设置
pd. get_option( 'display.max_rows' )
60
pd. Series( index= range ( 0 , 100 ) )
<ipython-input-164-b590a3b37414>:1: DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
pd.Series(index=range(0,100))
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
..
95 NaN
96 NaN
97 NaN
98 NaN
99 NaN
Length: 100, dtype: float64
pd. set_option( 'display.max_rows' , 6 )
pd. Series( index= range ( 0 , 100 ) )
<ipython-input-166-ff8d23c4aa8c>:1: DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
pd.Series(index=range(0,100)) # 只显示六行
0 NaN
1 NaN
2 NaN
..
97 NaN
98 NaN
99 NaN
Length: 100, dtype: float64
pd. get_option( 'display.max_columns' )
20
pd. DataFrame( columns= range ( 0 , 30 ) )
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
0 1 2 3 4 5 6 7 8 9 ... 20 21 22 23 24 25 26 27 28 29
0 rows × 30 columns
pd. set_option( 'display.max_columns' , 5 )
pd. DataFrame( columns= range ( 0 , 30 ) )
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
0 rows × 30 columns
pd. get_option( 'display.max_colwidth' )
50
pd. Series( index= [ 'A' ] , data= [ 't' * 70 ] )
A tttttttttttttttttttttttttttttttttttttttttttttt...
dtype: object
pd. set_option( 'display.max_colwidth' , 10 )
pd. Series( index= [ 'A' ] , data= [ 't' * 70 ] )
A tttttt...
dtype: object
显示精度
pd. get_option( 'display.precision' )
6
pd. Series( data = [ 1.2342454543534534 ] )
0 1.234245
dtype: float64
pd. set_option( 'display.precision' , 3 )
pd. Series( data= [ 1.232423424432434 ] )
0 1.232
dtype: float64
apply操作
titantic = pd. read_csv( './can.csv' )
titantic. head( )
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
1 20 1.004 0.090 -0.125 0 1 20 1.004 -0.043 -0.125 1 1 20 0.969 0.090 -0.121 2 1 20 0.973 -0.012 -0.137 3 1 20 1.000 -0.016 -0.121 4 1 20 0.961 0.082 -0.121
def hundredth_row ( columns) :
item = columns. iloc[ 0 ]
return item
hundredth_row = titantic. apply ( hundredth_row)
hundredth_row
1 1.000
20 20.000
1.004 1.004
0.090 -0.043
-0.125 -0.125
dtype: float64
计算缺失值
def not_null_cont ( columns) :
columns_null = pd. isnull( columns)
null = columns[ columns_null]
return len ( null)
columns_null_cont = titantic. apply ( not_null_cont)
columns_null_cont
1 0
20 0
1.004 0
0.090 0
-0.125 0
dtype: int64
常用操作
pd. set_option( 'display.max_rows' , 20 )
data = pd. DataFrame( { 'group' : [ 'a' , 'a' , 'a' , 'b' , 'b' , 'b' , 'c' , 'c' , 'c' ] ,
'data' : [ 4 , 3 , 2 , 1 , 12 , 3 , 4 , 5 , 7 ] } )
data
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
group data 0 a 4 1 a 3 2 a 2 3 b 1 4 b 12 5 b 3 6 c 4 7 c 5 8 c 7
去重
data = pd. DataFrame( { 'k1' : [ 'one' ] * 3 + [ 'two' ] * 4 ,
'k2' : [ 3 , 2 , 1 , 3 , 3 , 4 , 4 ] } )
data
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
k1 k2 0 one 3 1 one 2 2 one 1 3 two 3 4 two 3 5 two 4 6 two 4
data. drop_duplicates( )
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
k1 k2 0 one 3 1 one 2 2 one 1 3 two 3 5 two 4
data. drop_duplicates( subset= 'k1' )
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
pivot操作
数据透视表
example = pd. DataFrame( { 'Month' : [ 'January' , 'January' , 'January' , 'January' , 'February' , 'February' , 'February' , 'February' , 'March' , "March" , "March" , "March" ] ,
'Category' : [ "Transportation" , "Grocery" , "Household" , "Entertainment" , 'Transportation' , "Grocery" , "Household" , "Entertainment" , 'Transportation' , "Grocery" , "Household" , "Entertainment" ] ,
'Amount' : [ 74 . , 235 . , 175 . , 100 . , 115 . , 240 . , 225 . , 125 . , 90 . , 260 . , 200 . , 120 . ] } )
example
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Month Category Amount 0 January Transp... 74.0 1 January Grocery 235.0 2 January Household 175.0 3 January Entert... 100.0 4 February Transp... 115.0 5 February Grocery 240.0 6 February Household 225.0 7 February Entert... 125.0 8 March Transp... 90.0 9 March Grocery 260.0 10 March Household 200.0 11 March Entert... 120.0
example_pivot = example. pivot( index= 'Category' , columns= 'Month' , values= 'Amount' )
example_pivot
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Month February January March Category Entertainment 125.0 100.0 120.0 Grocery 240.0 235.0 260.0 Household 225.0 175.0 200.0 Transportation 115.0 74.0 90.0
df = pd. read_csv( './can.csv' )
df. head( )
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
1 20 1.004 0.090 -0.125 0 1 20 1.004 -0.043 -0.125 1 1 20 0.969 0.090 -0.121 2 1 20 0.973 -0.012 -0.137 3 1 20 1.000 -0.016 -0.121 4 1 20 0.961 0.082 -0.121
df. pivot_table( index= '20' , columns= '1' , values= '1.004' )
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
1 1 2 3 20 20 0.995 0.995 0.993 25 0.995 0.994 0.995 30 0.995 0.994 0.995 35 0.995 0.993 0.995 40 0.994 0.995 0.991 45 0.994 0.994 0.991 50 0.992 0.998 0.991 55 0.995 0.990 0.991 60 0.992 0.995 0.991 65 1.019 0.998 0.990 70 0.967 0.997 0.990 75 0.992 0.942 0.994 80 1.101 1.011 0.996 85 1.124 0.993 0.960 90 0.907 0.971 0.993 95 0.989 1.066 0.971 100 0.957 1.043 0.981
df = pd. read_csv( './xiecheng.csv' )
pd. set_option( 'display.max_columns' , 20 )
df. head( )
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
id c_id c_name h_id h_name h_url 0 1 21656 犹他州 97942 布莱斯峡谷国家公园 https:... 1 2 21656 犹他州 89380 大盐湖 https:... 2 3 21656 犹他州 89748 大提顿国家公园 https:... 3 4 21656 犹他州 10522793 拱门国家公园 https:... 4 5 21656 犹他州 89746 锡安国家公园 https:...
data = df[ pd. notnull( df[ 'c_name' ] ) ]
data
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
id c_id c_name h_id h_name h_url 0 1 21656 犹他州 97942 布莱斯峡谷国家公园 https:... 1 2 21656 犹他州 89380 大盐湖 https:... 2 3 21656 犹他州 89748 大提顿国家公园 https:... 3 4 21656 犹他州 10522793 拱门国家公园 https:... 4 5 21656 犹他州 89746 锡安国家公园 https:... ... ... ... ... ... ... ... 381362 381363 1088 威斯巴登 80362 国王大厦 https:... 381363 381364 1088 威斯巴登 15054850 尿尿少女 https:... 381364 381365 1088 威斯巴登 20903944 布鲁塞尔皇家... https:... 381365 381366 1088 威斯巴登 104145815 Besuch... https:... 381366 381367 1088 威斯巴登 50537923 Herman... https:...
381367 rows × 6 columns
grouped = data[ 'h_id' ] . groupby( by= data[ 'c_name' ] ) . mean( )
grouped
c_name
Barra de Santiago 4.725e+07
Barsana 4.587e+07
Castromediano 5.176e+07
Citluk 5.188e+07
Curtea de Arges 4.470e+07
...
齐斯泰兹 4.528e+07
龙目岛 4.230e+07
龙脊梯田 2.364e+07
龙运 3.717e+07
龙里 3.638e+07
Name: h_id, Length: 1355, dtype: float64
_x = grouped. index
_x
Index(['Barra de Santiago', 'Barsana', 'Castromediano', 'Citluk',
'Curtea de Arges', 'Darmanesti', 'Dcheira El Jihadia', 'Dofteana',
'El Sunzal', 'El Zonte',
...
'麦凯', '麦德林', '黄山风景区', '黔西', '黟县', '齐斯泰兹', '龙目岛', '龙脊梯田', '龙运', '龙里'],
dtype='object', name='c_name', length=1355)
_y = grouped. values
_y
array([47245870.39130435, 45865902.91304348, 51759147.98058253, ...,
23639994.64705882, 37166859.5875 , 36377040.8 ])
plt. figure( figsize= ( 20 , 8 ) , dpi= 80 )
plt. plot( range ( len ( _x) ) , _y)
print ( len ( _x) )
plt. xticks( list ( range ( len ( _x) ) ) [ : : 200 ] , rotation= 45 )
plt. show
1355
<function matplotlib.pyplot.show(close=None, block=None)>
df = pd. read_csv( './xiecheng.csv' )
df. head( )
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
id c_id c_name h_id h_name h_url 0 1 21656 犹他州 97942 布莱斯峡谷国家公园 https:... 1 2 21656 犹他州 89380 大盐湖 https:... 2 3 21656 犹他州 89748 大提顿国家公园 https:... 3 4 21656 犹他州 10522793 拱门国家公园 https:... 4 5 21656 犹他州 89746 锡安国家公园 https:...
data = { 'title' : [ 'EMS: BACK PAINS' , 'Fire: DJIJFIFFDS' ] }
df = pd. DataFrame( data)
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
title 0 EMS: B... 1 Fire: ...
temp = df[ 'title' ] . str . split( ':' ) . tolist( )
temp
[['EMS', ' BACK PAINS'], ['Fire', ' DJIJFIFFDS']]
添加
字符串操作
s = pd. Series( [ 'A' , 'b' , 'B' , 'gaer' , 'AGRE' , np. nan] )
s
0 A
1 b
2 B
3 gaer
4 AGRE
5 NaN
dtype: object
s. str . lower( )
0 a
1 b
2 b
3 gaer
4 agre
5 NaN
dtype: object
s. str . upper( )
0 A
1 B
2 B
3 GAER
4 AGRE
5 NaN
dtype: object
s. str . len ( )
0 1.0
1 1.0
2 1.0
3 4.0
4 4.0
5 NaN
dtype: float64
index = pd. Index( [ ' tang' , ' yu ' , 'di' ] )
index
Index([' tang', ' yu ', 'di'], dtype='object')
index. str . strip( )
Index(['tang', 'yu', 'di'], dtype='object')
index. str . lstrip( )
Index(['tang', 'yu ', 'di'], dtype='object')
index. str . rstrip( )
Index([' tang', ' yu', 'di'], dtype='object')
df = pd. DataFrame( np. random. randn( 3 , 2 ) , columns= [ 'A a' , 'B b' ] , index= range ( 3 ) )
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A a B b 0 -1.184 -1.025 1 -0.663 1.280 2 0.570 -0.043
字符串操作
s = pd. Series( [ 'A' , 'b' , 'B' , 'gaer' , 'AGRE' , np. nan] )
s
0 A
1 b
2 B
3 gaer
4 AGRE
5 NaN
dtype: object
s. str . lower( )
0 a
1 b
2 b
3 gaer
4 agre
5 NaN
dtype: object
s. str . upper( )
0 A
1 B
2 B
3 GAER
4 AGRE
5 NaN
dtype: object
s. str . len ( )
0 1.0
1 1.0
2 1.0
3 4.0
4 4.0
5 NaN
dtype: float64
index = pd. Index( [ ' tang' , ' yu ' , 'di' ] )
index
Index([' tang', ' yu ', 'di'], dtype='object')
index. str . strip( )
Index(['tang', 'yu', 'di'], dtype='object')
index. str . lstrip( )
Index(['tang', 'yu ', 'di'], dtype='object')
index. str . rstrip( )
Index([' tang', ' yu', 'di'], dtype='object')
df = pd. DataFrame( np. random. randn( 3 , 2 ) , columns= [ 'A a' , 'B b' ] , index= range ( 3 ) )
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A a B b 0 1.272 -1.511 1 -0.231 2.208 2 -1.033 0.833
df. columns = df. columns. str . replace( ' ' , '_' )
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A_a B_b 0 1.272 -1.511 1 -0.231 2.208 2 -1.033 0.833