1. 数据索引
import pandas as pd
left = pd. DataFrame( { 'key' : [ 'k1' , 'k2' , 'k3' , 'k4' ] ,
'name' : [ 'n1' , 'n2' , 'n3' , 'n4' ] ,
'id' : [ 'id1' , 'id2' , 'id3' , 'id4' ]
}
)
right = pd. DataFrame( { 'key' : [ 'k221' , 'k222' , 'k223' , 'k224' ] ,
'name' : [ 'n221' , 'n222' , 'n223' , 'n224' ] ,
'id' : [ 'id221' , 'id222' , 'id223' , 'id224' ]
}
)
print ( left)
print ( right)
id key name
0 id1 k1 n1
1 id2 k2 n2
2 id3 k3 n3
3 id4 k4 n4
id key name
0 id221 k221 n221
1 id222 k222 n222
2 id223 k223 n223
3 id224 k224 n224
A 选取行
left. loc[ 2 ]
id id3
key k3
name n3
Name: 2, dtype: object
left. loc[ [ 1 , 2 , 3 ] ]
id key name 1 id2 k2 n2 2 id3 k3 n3 3 id4 k4 n4
left. loc[ 1 : 2 ]
id key name 1 id2 k2 n2 2 id3 k3 n3
mask = left[ 'name' ] > 'n1'
mask
0 False
1 True
2 True
3 True
Name: name, dtype: bool
left. loc[ mask]
id key name 1 id2 k2 n2 2 id3 k3 n3 3 id4 k4 n4
left. iloc[ 3 ]
id id4
key k4
name n4
Name: 3, dtype: object
left. iloc[ [ 0 , 1 ] ]
id key name 0 id1 k1 n1 1 id2 k2 n2
left. iloc[ 0 : 2 ]
id key name 0 id1 k1 n1 1 id2 k2 n2
left. loc[ 0 : 2 ]
id key name 0 id1 k1 n1 1 id2 k2 n2 2 id3 k3 n3
left. set_index( 'key' )
id name key k1 id1 n1 k2 id2 n2 k3 id3 n3 k4 id4 n4
left. set_index( 'name' )
id key name n1 id1 k1 n2 id2 k2 n3 id3 k3 n4 id4 k4
left. loc[ 0 : 2 ]
id key name 0 id1 k1 n1 1 id2 k2 n2 2 id3 k3 n3
left_reset_index = left. set_index( "key" )
left_reset_index
id name key k1 id1 n1 k2 id2 n2 k3 id3 n3 k4 id4 n4
left_reset_index. loc[ [ 'k1' ] ]
left_reset_index. iloc[ [ 0 , 2 ] ]
B. 选取列
left[ 'name' ]
0 n1
1 n2
2 n3
3 n4
Name: name, dtype: object
left[ [ 'name' , 'id' ] ]
name id 0 n1 id1 1 n2 id2 2 n3 id3 3 n4 id4
left[ 0 : 2 ]
id key name 0 id1 k1 n1 1 id2 k2 n2
left[ left[ 'name' ] > 'n2' ]
id key name 2 id3 k3 n3 3 id4 k4 n4
C. 选取行和列
left
left. loc[ 0 , "key" ]
'k1'
left. loc[ [ 1 , 2 ] , [ 'name' , 'id' ] ]
left. loc[ 0 : 2 , : ]
id key name 0 id1 k1 n1 1 id2 k2 n2 2 id3 k3 n3
left. iloc[ [ 0 , 1 ] , [ 0 , 2 ] ]
left. iloc[ [ 0 , 1 ] , [ 0 , 1 ] ]
基本操作
left. columns
Index(['id', 'key', 'name'], dtype='object')
left. columns = [ "xuhao" , 'guanjianzhi' , 'mingzi' ]
left
xuhao guanjianzhi mingzi 0 id1 k1 n1 1 id2 k2 n2 2 id3 k3 n3 3 id4 k4 n4
left. xuhao
0 id1
1 id2
2 id3
3 id4
Name: xuhao, dtype: object
left. rename( columns= { "xuhao" : "ID" , "guanjianzhi" : "KEY" , "mingzi" : "NAME" } )
ID KEY NAME 0 id1 k1 n1 1 id2 k2 n2 2 id3 k3 n3 3 id4 k4 n4
left
xuhao guanjianzhi mingzi 0 id1 k1 n1 1 id2 k2 n2 2 id3 k3 n3 3 id4 k4 n4
left. rename( columns= { "xuhao" : "ID" , "guanjianzhi" : "KEY" , "mingzi" : "NAME" } , inplace= True )
left
ID KEY NAME 0 id1 k1 n1 1 id2 k2 n2 2 id3 k3 n3 3 id4 k4 n4
left. replace( { "ID" : { 'id1' : "ID1" , 'id2' : 'ID2' } } , inplace= True ) ;
left
ID KEY NAME 0 ID1 k1 n1 1 ID2 k2 n2 2 id3 k3 n3 3 id4 k4 n4
基本的排序和增删
left. ID. unique( )
array(['ID1', 'ID2', 'id3', 'id4'], dtype=object)
left. ID. value_counts( )
ID1 1
id3 1
ID2 1
id4 1
Name: ID, dtype: int64
left. sort_values( 'ID' )
ID KEY NAME 0 ID1 k1 n1 1 ID2 k2 n2 2 id3 k3 n3 3 id4 k4 n4
left. sort_values( [ 'ID' , 'NAME' ] , ascending= [ True , False ] )
ID KEY NAME 0 ID1 k1 n1 1 ID2 k2 n2 2 id3 k3 n3 3 id4 k4 n4
left. sort_values( [ 'ID' , "NAME" ] , ascending= [ False , False ] )
ID KEY NAME 3 id4 k4 n4 2 id3 k3 n3 1 ID2 k2 n2 0 ID1 k1 n1
left. replace( { "ID" : { 'id3' : 'ID3' , 'ID2' : 'id2' } } , inplace= True )
left
ID KEY NAME 0 ID1 k1 n1 1 id2 k2 n2 2 ID3 k3 n3 3 id4 k4 n4
left. sort_values( [ 'ID' ] , ascending= [ True ] )
ID KEY NAME 0 ID1 k1 n1 2 ID3 k3 n3 1 id2 k2 n2 3 id4 k4 n4
left[ 'mailema' ] = 1
left
ID KEY NAME mailema 0 ID1 k1 n1 1 1 id2 k2 n2 1 2 ID3 k3 n3 1 3 id4 k4 n4 1
left[ 'rongyuceshilie' ] = 1
left
ID KEY NAME mailema rongyuceshilie 0 ID1 k1 n1 1 1 1 id2 k2 n2 1 1 2 ID3 k3 n3 1 1 3 id4 k4 n4 1 1
del left[ 'rongyuceshilie' ]
left
ID KEY NAME mailema 0 ID1 k1 n1 1 1 id2 k2 n2 1 2 ID3 k3 n3 1 3 id4 k4 n4 1
ret = left. drop( labels= 2 )
print ( left)
print ( ret)
ID KEY NAME mailema
0 ID1 k1 n1 1
1 id2 k2 n2 1
2 ID3 k3 n3 1
3 id4 k4 n4 1
ID KEY NAME mailema
0 ID1 k1 n1 1
1 id2 k2 n2 1
3 id4 k4 n4 1
left. NAME
ret = left. NAME. map ( { 'n1' : 'NNN1' , 'n2' : "N2" , 'n3' : 'NN3' , 'n4' : "NN4" } )
print ( ret)
print ( left. NAME)
0 NNN1
1 N2
2 NN3
3 NN4
Name: NAME, dtype: object
0 n1
1 n2
2 n3
3 n4
Name: NAME, dtype: object
import numpy as np
left. NAME. map ( "hello my name is {}" . format )
test_series = left. NAME
yy = pd. Series( [ 'name_ext' , np. nan] )
print ( yy)
test_series = test_series. append( yy)
test_series = test_series. map ( "my name is {}" . format , na_action= 'ignore' )
print ( test_series)
0 name_ext
1 NaN
dtype: object
0 my name is n1
1 my name is n2
2 my name is n3
3 my name is n4
0 my name is name_ext
1 NaN
dtype: object
from collections import Counter
counter = Counter( )
counter[ 'bar' ] += 1
counter
Counter({'bar': 1})
left. min ( )
ID ID1
KEY k1
NAME n1
mailema 1
rongyuceshilie 1
dtype: object
left. max ( )
ID id4
KEY k4
NAME n4
mailema 1
rongyuceshilie 1
dtype: object
left. sum ( )
ID ID1id2ID3id4
KEY k1k2k3k4
NAME n1n2n3n4
mailema 4
rongyuceshilie 4
dtype: object
left. cumsum( )
ID KEY NAME mailema rongyuceshilie 0 ID1 k1 n1 1 1 1 ID1id2 k1k2 n1n2 2 2 2 ID1id2ID3 k1k2k3 n1n2n3 3 3 3 ID1id2ID3id4 k1k2k3k4 n1n2n3n4 4 4
pandas 常用操作矩阵运算
df = pd. DataFrame( np. random. random( ( 5 , 10 ) ) , columns = list ( 'abcdefghij' ) )
df
a b c d e f g h i j 0 0.920187 0.468342 0.251023 0.012371 0.410920 0.823567 0.644462 0.971567 0.363926 0.125066 1 0.388891 0.017500 0.400027 0.907302 0.747884 0.850026 0.871119 0.361581 0.313365 0.517795 2 0.858049 0.818444 0.429782 0.953707 0.362399 0.597594 0.583865 0.363218 0.671535 0.887737 3 0.981348 0.884835 0.333632 0.930583 0.990436 0.264681 0.349709 0.938117 0.729409 0.371021 4 0.409097 0.270208 0.093192 0.985581 0.389709 0.891675 0.551112 0.897221 0.573714 0.854950
df. idxmax( )
a 3
b 3
c 2
d 4
e 3
f 4
g 1
h 0
i 3
j 2
dtype: int64
df* 10
a b c d e f g h i j 0 9.201869 4.683421 2.510227 0.123709 4.109205 8.235670 6.444622 9.715667 3.639265 1.250659 1 3.888908 0.174995 4.000269 9.073024 7.478841 8.500256 8.711186 3.615811 3.133648 5.177951 2 8.580488 8.184435 4.297818 9.537071 3.623993 5.975941 5.838652 3.632183 6.715348 8.877372 3 9.813483 8.848354 3.336317 9.305834 9.904355 2.646808 3.497094 9.381173 7.294089 3.710212 4 4.090968 2.702080 0.931915 9.855806 3.897086 8.916751 5.511120 8.972206 5.737144 8.549502
df* df
a b c d e f g h i j 0 0.846744 0.219344 0.063012 0.000153 0.168856 0.678263 0.415332 0.943942 0.132442 0.015641 1 0.151236 0.000306 0.160022 0.823198 0.559331 0.722544 0.758848 0.130741 0.098197 0.268112 2 0.736248 0.669850 0.184712 0.909557 0.131333 0.357119 0.340899 0.131928 0.450959 0.788077 3 0.963045 0.782934 0.111310 0.865985 0.980962 0.070056 0.122297 0.880064 0.532037 0.137657 4 0.167360 0.073012 0.008685 0.971369 0.151873 0.795084 0.303724 0.805005 0.329148 0.730940
df. dot( df. T)
0 1 2 3 4 0 3.483729 2.576565 3.018221 3.486335 2.975670 1 2.576565 3.672533 3.474288 3.405148 3.571698 2 3.018221 3.474288 4.700682 4.478348 4.018175 3 3.486335 3.405148 4.478348 5.446347 3.980907 4 2.975670 3.571698 4.018175 3.980907 4.336201
df. T. dot( df)
a b c d e f g h i j a 2.864633 2.118905 1.120862 2.498977 2.111316 2.225693 2.001426 2.633968 1.983464 1.792030 b 2.118905 1.745446 0.796707 1.891951 1.483817 1.364820 1.253284 1.831142 1.525968 1.353505 c 1.120862 0.796707 0.527741 1.178257 0.924835 0.975005 0.929213 0.941231 0.802141 0.823518 d 2.498977 1.891951 1.178257 3.570262 2.335035 2.476474 2.223776 2.443766 2.173484 2.505874 e 2.111316 1.483817 0.924835 2.335035 1.992355 1.800352 1.689049 2.080087 1.573283 1.461012 f 2.225693 1.364820 0.975005 2.476474 1.800352 2.623065 2.204120 2.372892 1.672019 1.934186 g 2.001426 1.253284 0.929213 2.223776 1.689049 2.204120 1.941099 1.975726 1.470863 1.650903 h 2.633968 1.831142 0.941231 2.443766 2.080087 2.372892 1.975726 2.891679 1.909819 1.746318 i 1.983464 1.525968 0.802141 2.173484 1.573283 1.672019 1.470863 1.909819 1.542785 1.565043 j 1.792030 1.353505 0.823518 2.505874 1.461012 1.934186 1.650903 1.746318 1.565043 1.940427