pandas向量化字符串处理以及数据透视表笔记
import numpy as np
import pandas as pd
import re
# Demo frame for the groupby examples: 5 rows x 4 columns of random integers
# drawn from [2, 15). No seed is set, so values differ from run to run.
df = pd.DataFrame(
    np.random.randint(2, 15, size=(5, 4)),
    columns=['data1', 'data2', 'data3', 'data4'],
)
df
   data1  data2  data3  data4
0     10     11      5      4
1     10     13     11      7
2     10      8      7     10
3     12      7     14      6
4     12     12      6      9
# Group by (data1, data2) and average data3. The double brackets select a
# one-column DataFrame, so the result stays a DataFrame rather than a Series.
df.groupby(['data1', 'data2'])[['data3']].mean()
data3 data1 data2 10 8 7 11 5 13 11 12 7 14 12 6
# Same grouping as above, but average both data3 and data4.
df.groupby(['data1', 'data2'])[['data3', 'data4']].mean()
data3 data4 data1 data2 10 8 7 10 11 5 4 13 11 7 12 7 14 6 12 6 9
# Average every remaining column per (data1, data2) group, then pivot the
# second index level (data2) into the columns. Combinations that never
# occurred become NaN after unstack and are filled with the marker 'x'.
(
    df.groupby(['data1', 'data2'])
    .mean()
    .unstack(level=1)
    .fillna('x')
)
data3 data4 data2 7 8 11 12 13 7 8 11 12 13 data1 10 x 7 5 x 11 x 10 4 x 7 12 14 x x 6 x 6 x x 9 x
# Sample table for the pivot_table examples: three categorical key columns
# (A, B, C) and two integer value columns (D, E).
df = pd.DataFrame(
    {
        "A": ["foo"] * 5 + ["bar"] * 4,
        "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
        "C": [
            "small", "large", "large", "small", "small",
            "large", "small", "small", "large",
        ],
        "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
        "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
    }
)
df
     A    B      C  D  E
0  foo  one  small  1  2
1  foo  one  large  2  4
2  foo  one  large  2  5
3  foo  two  small  3  5
4  foo  two  small  3  6
5  bar  one  large  4  6
6  bar  one  small  5  8
7  bar  two  small  6  9
8  bar  two  large  7  9
# Pivot with A as the row index and (B, C) as hierarchical columns. No
# aggfunc is given, so pandas falls back to its default aggregation for the
# remaining value columns (D and E).
df.pivot_table(
    index='A',
    columns=['B', 'C'],
)
D E B one two one two C large small large small large small large small A bar 4.0 5.0 7.0 6.0 6.0 8.0 9.0 9.0 foo 2.0 1.0 NaN 3.0 4.5 2.0 NaN 5.5
# Pivot with a per-column aggregation: mean of D and sum of E.
# margins=True adds the subtotal row/column, labelled 'YY' via margins_name.
# The aggregations are spelled as strings instead of np.mean / np.sum:
# pandas 2.1+ emits a FutureWarning when NumPy callables are passed as
# aggfunc, and the string aliases produce the same result.
df.pivot_table(
    index='A',
    columns=['B', 'C'],
    aggfunc={'D': 'mean', 'E': 'sum'},
    margins=True,
    margins_name='YY',
)
D E B one two YY one two YY C large small large small large small large small A bar 4.000000 5.0 7.0 6.0 5.500000 6.0 8.0 9.0 9.0 32 foo 2.000000 1.0 NaN 3.0 2.200000 9.0 2.0 NaN 11.0 22 YY 2.666667 3.0 7.0 4.0 3.666667 15.0 10.0 9.0 20.0 54
向量化字符串方法
向量化字符串方法通过 .str 访问器提供,仅适用于 Series 和 Index 对象;DataFrame 本身没有 .str 访问器。
# Mixed-case sample names; .str.capitalize() upper-cases the first character
# of each element and lower-cases the rest.
data = pd.Series(['peter', 'paul', 'mary', 'gUIDO'])
data.str.capitalize()
0 Peter
1 Paul
2 Mary
3 Guido
dtype: object
# Build a translation table mapping 'p' -> 'b' and 'a' -> 'e', then apply it
# element-wise to every string in the Series.
L = str.maketrans('pa', 'be')
data.str.translate(L)
0 beter
1 beul
2 mery
3 gUIDO
dtype: object
# Series of full names used by the remaining string-method examples.
monte = pd.Series(
    [
        'Graham Chapman',
        'John Cleese',
        'Terry Gilliam',
        'Eric Idle',
        'Terry Jones',
        'Michael Palin',
    ]
)
monte
0 Graham Chapman
1 John Cleese
2 Terry Gilliam
3 Eric Idle
4 Terry Jones
5 Michael Palin
dtype: object
# Capture the first run of ASCII letters in each element (the first name).
# With a single capture group the result is a one-column DataFrame.
monte.str.extract(r'([a-zA-Z]+)')
         0
0   Graham
1     John
2    Terry
3     Eric
4    Terry
5  Michael
# Plain-`re` equivalent of the extract call above, for a single string:
# match the leading run of ASCII letters and read the whole match (group 0).
text = 'Graham Chapman'
m = re.match(r'[a-zA-Z]+', text)
m.group(0)
'Graham'
# Keep the whole string only when it does not start with an uppercase vowel
# and does not end with a lowercase vowel; non-matching rows become NaN.
monte.str.extract(
    r'(^[^AEIOU].*[^aeiou]$)'
)
                0
0  Graham Chapman
1             NaN
2   Terry Gilliam
3             NaN
4     Terry Jones
5   Michael Palin
# Two equivalent spellings of "first three characters of every element":
# Python-style slicing on the .str accessor, and the explicit slice method.
monte.str[:3]
monte.str.slice(0, 3)
0 Gra
1 Joh
2 Ter
3 Eri
4 Ter
5 Mic
dtype: object
# Re-flow each string into lines of at most 3 characters, joined by '\n'.
monte.str.wrap(3)
0 Gra\nham\nCha\npma\nn
1 Joh\nn C\nlee\nse
2 Ter\nry \nGil\nlia\nm
3 Eri\nc I\ndle
4 Ter\nry \nJon\nes
5 Mic\nhae\nl P\nali\nn
dtype: object
# Concatenate each string with itself so that it appears three times.
monte.str.repeat(3)
0 Graham ChapmanGraham ChapmanGraham Chapman
1 John CleeseJohn CleeseJohn Cleese
2 Terry GilliamTerry GilliamTerry Gilliam
3 Eric IdleEric IdleEric Idle
4 Terry JonesTerry JonesTerry Jones
5 Michael PalinMichael PalinMichael Palin
dtype: object
方法分为以下三类:
pandas向量化字符串处理与re库API的对应方法:
pandas其他字符串处理方法: