'''
【课程2.5】 Pandas数据结构Dataframe:基本概念及创建
"二维数组"Dataframe:是一个表格型的数据结构,包含一组有序的列,其列的值类型可以是数值、字符串、布尔值等。
Dataframe中的数据以一个或多个二维块存放,不是列表、字典或一维数组结构。
'''
import pandas as pd
import numpy as np
# Build a DataFrame from a dict of equal-length lists: every key becomes a
# column, and `index` supplies the row labels.
df = pd.DataFrame(
    {
        'name': ['J', 'M', 'D'],
        'age': [18, 19, 20],
        'gender': ['m', 'w', 'm'],
    },
    index=list('abc'),
)
print(df)
# Recorded output (this run listed the columns alphabetically; recent pandas
# versions keep the dict's insertion order instead):
#    age gender name
# a   18      m    J
# b   19      w    M
# c   20      m    D

# Row labels of the frame.
print(df.index)
# Index(['a', 'b', 'c'], dtype='object')

# Column labels of the frame.
print(df.columns)
# Index(['age', 'gender', 'name'], dtype='object')
# Creation method 1: dict of lists / arrays.
# Without an explicit `index`, the rows are labelled 0..n-1.
df1 = pd.DataFrame(
    {
        'name': ['a', 'b', 'c'],
        'age': [20, 21, 22],
    }
)
print(df1)
# Recorded output (columns appear alphabetically in this run; recent pandas
# versions keep the dict's insertion order instead):
#    age name
# 0   20    a
# 1   21    b
# 2   22    c

# The dict values may also be NumPy arrays; `index` must match their length.
df2 = pd.DataFrame(
    {
        'one': np.random.rand(3),
        'two': np.random.rand(3),
    },
    index=list('abc'),
)
print(df2)
# Sample output (values are random and differ on every run):
#         one       two
# a  0.154260  0.372618
# b  0.544828  0.497320
# c  0.811307  0.071709
# `columns` selects and orders the columns taken from the dict; a requested
# label that is not a dict key ('three') yields a column filled with NaN.
df2 = pd.DataFrame(
    {
        'one': np.random.rand(3),
        'two': np.random.rand(3),
    },
    index=list('abc'),
    columns=['two', 'three', 'one'],
)
print(df2)
# Sample output (values are random and differ on every run):
#         two three       one
# a  0.593346   NaN  0.772806
# b  0.556199   NaN  0.025852
# c  0.705040   NaN  0.173555
# Creation method 2: dict of Series.
# Series built without an index share the default 0..n-1 labels, so the
# resulting frame uses that same index.
df1 = pd.DataFrame(
    {
        'one': pd.Series(np.random.rand(3)),
        'two': pd.Series(np.random.rand(3)),
    }
)
print(df1)
# Sample output (values are random and differ on every run):
#         one       two
# 0  0.414850  0.665624
# 1  0.249675  0.699969
# 2  0.702770  0.900113
# Series of different lengths are aligned on their labels: the frame's index
# is the union of the Series indexes, and missing entries become NaN
# (here 'two' has no value for label 'c').
df2 = pd.DataFrame(
    {
        'one': pd.Series(np.random.rand(3), index=list('abc')),
        'two': pd.Series(np.random.rand(2), index=list('ab')),
    }
)
print(df2)
# Sample output (values are random and differ on every run):
#         one       two
# a  0.288482  0.007029
# b  0.314350  0.708146
# c  0.891189       NaN
# Creation method 3: a 2-D ndarray. `index` and `columns` label the rows and
# columns and must match the array's shape (3 x 3 here).
df1 = pd.DataFrame(
    np.arange(9).reshape((3, 3)),
    index=list('abc'),
    columns=list('xyz'),
)
print(df1)
# Output:
#    x  y  z
# a  0  1  2
# b  3  4  5
# c  6  7  8
# Creation method 4: a list of dicts. Each dict is one row; the columns are
# the union of all keys, and keys missing from a row become NaN.
df1 = pd.DataFrame(
    [
        {'one': 'a', 'two': 'b'},
        {'three': 'c', 'four': 'd'},
    ],
    index=list('ab'),
)
print(df1)
# Recorded output (columns appear alphabetically in this run; recent pandas
# versions may order them by first appearance instead):
#   four  one three  two
# a  NaN    a   NaN    b
# b    d  NaN     c  NaN
# Creation method 5: a dict of dicts. Outer keys become the columns and the
# inner keys become the row labels.
df = pd.DataFrame(
    {
        'A': {'math': 80, 'art': 90},
        'B': {'math': 80, 'art': 90},
    }
)
print(df)
# Recorded output (row labels appear alphabetically in this run; recent
# pandas versions may keep the inner dicts' insertion order instead):
#        A   B
# art   90  90
# math  80  80
'''
【课程2.6】 Pandas数据结构Dataframe:索引
Dataframe既有行索引也有列索引,可以被看做由Series组成的字典(共用一个索引)
选择列 / 选择行 / 切片 / 布尔判断
'''
# Sample frame: 3 labelled rows x 4 labelled columns of random values in
# [0, 100).
df = pd.DataFrame(
    np.random.rand(12).reshape(3, 4) * 100,
    index=['one', 'two', 'three'],
    columns=list('abcd'),
)
print(df)
# Sample output (values are random and differ on every run):
#                a          b          c          d
# one    43.412719  76.530491  16.054262   1.146777
# two    69.950579  67.093175  23.407335  43.765119
# three  70.160305  81.423366  20.290999  78.110449

# Column selection with df[...]: a single label returns a Series ...
data1 = df['a']
print(data1)
# one      43.412719
# two      69.950579
# three    70.160305
# Name: a, dtype: float64

# ... while a list of labels returns a DataFrame.
data2 = df[['a', 'c', 'd']]
print(data2)
#                a          c          d
# one    43.412719  16.054262   1.146777
# two    69.950579  23.407335  43.765119
# three  70.160305  20.290999  78.110449

# Row selection by label with .loc: a single label returns a Series ...
data1 = df.loc['one']
print(data1)
# a    43.412719
# b    76.530491
# c    16.054262
# d     1.146777
# Name: one, dtype: float64

# ... while a list of labels returns a DataFrame.
data2 = df.loc[['one', 'three']]
print(data2)
#                a          b          c          d
# one    43.412719  76.530491  16.054262   1.146777
# three  70.160305  81.423366  20.290999  78.110449

# A slice inside df[...] selects rows by position (here only the first row).
data1 = df[:1]
print(data1)
#              a          b          c         d
# one  43.412719  76.530491  16.054262  1.146777
# Sample frame with the default integer labels on both axes.
df = pd.DataFrame(
    np.random.rand(9).reshape((3, 3)),
)
print(df)
# Sample output (values are random and differ on every run):
#           0         1         2
# 0  0.373822  0.144124  0.919020
# 1  0.946216  0.037750  0.719912
# 2  0.432987  0.870418  0.060462

# .loc is label-based: 1 and 2 are looked up as row labels here, which only
# coincide with positions because the index is the default 0..n-1.
data = df.loc[[1, 2]]
print(data)
#           0         1         2
# 1  0.946216  0.037750  0.719912
# 2  0.432987  0.870418  0.060462

print(df)
#           0         1         2
# 0  0.373822  0.144124  0.919020
# 1  0.946216  0.037750  0.719912
# 2  0.432987  0.870418  0.060462

# .iloc is position-based: a single integer returns that row as a Series,
# and negative positions count from the end.
print(df.iloc[0])
# 0    0.373822
# 1    0.144124
# 2    0.919020
# Name: 0, dtype: float64
print(df.iloc[-1])
# 0    0.432987
# 1    0.870418
# 2    0.060462
# Name: 2, dtype: float64

# A list of positions returns the matching rows as a DataFrame.
print(df.iloc[[0, 2]])
#           0         1         2
# 0  0.373822  0.144124  0.919020
# 2  0.432987  0.870418  0.060462

# Position slices exclude the stop position and accept a step.
print(df.iloc[0:2])
#           0         1         2
# 0  0.373822  0.144124  0.919020
# 1  0.946216  0.037750  0.719912
print(df.iloc[::2])
#           0         1         2
# 0  0.373822  0.144124  0.919020
# 2  0.432987  0.870418  0.060462
# Sample frame of random values in [0, 100) for boolean indexing.
df = pd.DataFrame(np.random.rand(12).reshape((3, 4)) * 100)
print(df)
# Sample output (values are random and differ on every run):
#            0          1          2          3
# 0  29.892506  58.911433   8.225744  82.950538
# 1   0.025366  38.071510  63.716461  48.258320
# 2  45.380903  70.560190  30.495624   4.703547

# A comparison is applied element-wise and yields a boolean frame of the
# same shape.
print(df > 50)
#        0      1      2      3
# 0  False   True  False   True
# 1  False  False   True  False
# 2  False   True  False  False

# Indexing with that boolean frame keeps the shape: values where the mask is
# True are kept, all others are replaced by NaN.
print(df[df > 50])
#     0          1          2          3
# 0 NaN  58.911433        NaN  82.950538
# 1 NaN        NaN  63.716461        NaN
# 2 NaN  70.560190        NaN        NaN
# Sample 4 x 4 frame with labelled rows and columns.
df = pd.DataFrame(
    np.random.rand(16).reshape((4, 4)),
    index=list('abcd'),
    columns=['one', 'two', 'three', 'four'],
)
print(df)
# Sample output (values are random and differ on every run):
#         one       two     three      four
# a  0.325516  0.274562  0.876918  0.627544
# b  0.303586  0.012632  0.387609  0.237904
# c  0.826723  0.575291  0.560848  0.001186
# d  0.730478  0.284428  0.822887  0.095292

# Chained selection: pick the column first, then rows 'a' and 'c' by label.
print(df['one'].loc[['a', 'c']])
# a    0.325516
# c    0.826723
# Name: one, dtype: float64

# The same with several columns: the result is a DataFrame.
print(df[['two', 'three', 'four']].loc[['a', 'c']])
#         two     three      four
# a  0.274562  0.876918  0.627544
# c  0.575291  0.560848  0.001186
'''
【课程2.7】 Pandas数据结构Dataframe:基本技巧
数据查看、转置 / 添加、修改、删除值 / 对齐 / 排序
'''
# Sample frame: 8 rows x 2 columns of random values.
df = pd.DataFrame(np.random.rand(16).reshape((8, 2)))
print(df)
# Sample output (values are random and differ on every run):
#           0         1
# 0  0.134166  0.010255
# 1  0.316007  0.524728
# 2  0.994353  0.544601
# 3  0.926491  0.686111
# 4  0.317319  0.109836
# 5  0.871562  0.614354
# 6  0.444111  0.805243
# 7  0.256888  0.037670

# head() with no argument shows the first 5 rows.
print(df.head())
#           0         1
# 0  0.134166  0.010255
# 1  0.316007  0.524728
# 2  0.994353  0.544601
# 3  0.926491  0.686111
# 4  0.317319  0.109836

# tail() with no argument shows the last 5 rows.
print(df.tail())
#           0         1
# 3  0.926491  0.686111
# 4  0.317319  0.109836
# 5  0.871562  0.614354
# 6  0.444111  0.805243
# 7  0.256888  0.037670

# .T transposes the frame: rows become columns and vice versa. The recorded
# output is wrapped by the console because it is too wide for one line.
print(df.T)
#           0         1         2         3         4         5         6  \
# 0  0.134166  0.316007  0.994353  0.926491  0.317319  0.871562  0.444111
# 1  0.010255  0.524728  0.544601  0.686111  0.109836  0.614354  0.805243
#
#           7
# 0  0.256888
# 1  0.037670
# Sample 4 x 4 frame used to demonstrate adding, modifying and deleting.
df = pd.DataFrame(
    np.random.rand(16).reshape((4, 4)),
    columns=list('abcd'),
)
print(df)
# Sample output (values are random and differ on every run):
#           a         b         c         d
# 0  0.833651  0.422714  0.064032  0.176095
# 1  0.438021  0.878705  0.102511  0.468040
# 2  0.811842  0.080692  0.127141  0.154094
# 3  0.507186  0.920764  0.549470  0.935110

# Modify: assigning a scalar to an existing column overwrites every value.
df['d'] = 100
print(df)
#           a         b         c    d
# 0  0.833651  0.422714  0.064032  100
# 1  0.438021  0.878705  0.102511  100
# 2  0.811842  0.080692  0.127141  100
# 3  0.507186  0.920764  0.549470  100

# Modify: assigning a scalar through .loc overwrites the whole row.
df.loc[2] = 200
print(df)
#             a           b           c    d
# 0    0.833651    0.422714    0.064032  100
# 1    0.438021    0.878705    0.102511  100
# 2  200.000000  200.000000  200.000000  200
# 3    0.507186    0.920764    0.549470  100

# Add: assigning to a column label that does not exist creates the column.
df['e'] = 300
print(df)
#             a           b           c    d    e
# 0    0.833651    0.422714    0.064032  100  300
# 1    0.438021    0.878705    0.102511  100  300
# 2  200.000000  200.000000  200.000000  200  300
# 3    0.507186    0.920764    0.549470  100  300

# State of the frame right before the deletions below.
print(df)
#             a           b           c    d    e
# 0    0.833651    0.422714    0.064032  100  300
# 1    0.438021    0.878705    0.102511  100  300
# 2  200.000000  200.000000  200.000000  200  300
# 3    0.507186    0.920764    0.549470  100  300

# Delete: `del` removes the column from the frame in place.
del df['a']
print(df)
#             b           c    d    e
# 0    0.422714    0.064032  100  300
# 1    0.878705    0.102511  100  300
# 2  200.000000  200.000000  200  300
# 3    0.920764    0.549470  100  300

# drop() returns a new frame and leaves `df` untouched: by default it drops
# rows by label, with axis=1 it drops columns.
print(df.drop(0))
#             b           c    d    e
# 1    0.878705    0.102511  100  300
# 2  200.000000  200.000000  200  300
# 3    0.920764    0.549470  100  300
print(df.drop(['b'], axis=1))
#             c    d    e
# 0    0.064032  100  300
# 1    0.102511  100  300
# 2  200.000000  200  300
# 3    0.549470  100  300
# Alignment: arithmetic between two frames matches values on both row and
# column labels. Labels present in only one operand (column 'd' and rows
# 7-9 here) produce NaN in the result.
df1 = pd.DataFrame(np.random.randn(10, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=['a', 'b', 'c'])
print(df1 + df2)
# Sample output (values are random and differ on every run):
#           a         b         c   d
# 0 -0.118693 -0.587134  1.240605 NaN
# 1 -0.439958  2.960476 -0.728936 NaN
# 2  0.855115 -0.659808  1.018583 NaN
# 3 -3.006156  0.376281  0.559385 NaN
# 4 -2.946353  0.428331 -1.788409 NaN
# 5  0.820590 -1.077892 -0.506990 NaN
# 6 -0.628339  0.215887 -2.513543 NaN
# 7       NaN       NaN       NaN NaN
# 8       NaN       NaN       NaN NaN
# 9       NaN       NaN       NaN NaN
# Sample frame of random values in [0, 100) used to demonstrate sorting.
df = pd.DataFrame(
    np.random.rand(16).reshape((4, 4)) * 100,
    columns=['a', 'b', 'c', 'd'],
)
print(df)
# Sample output (values are random and differ on every run):
#            a          b          c          d
# 0  53.044153   6.949537  76.201332  72.157734
# 1   2.389659  21.536710  62.266274  86.264873
# 2  21.737004   2.606586  76.871439  52.364927
# 3  64.429832  14.861729  16.369679  72.744620

# Sort by values: ascending by default, descending with ascending=False.
# sort_values returns a new frame; `df` itself is not reordered.
print(df.sort_values(['a']))
#            a          b          c          d
# 1   2.389659  21.536710  62.266274  86.264873
# 2  21.737004   2.606586  76.871439  52.364927
# 0  53.044153   6.949537  76.201332  72.157734
# 3  64.429832  14.861729  16.369679  72.744620
print(df.sort_values(['b'], ascending=False))
#            a          b          c          d
# 1   2.389659  21.536710  62.266274  86.264873
# 3  64.429832  14.861729  16.369679  72.744620
# 0  53.044153   6.949537  76.201332  72.157734
# 2  21.737004   2.606586  76.871439  52.364927

# Sort by several columns: rows are ordered by 'a', with 'b' breaking ties.
# The result is assigned back so `df` keeps the sorted order.
df = df.sort_values(['a', 'b'])
print(df)
#            a          b          c          d
# 1   2.389659  21.536710  62.266274  86.264873
# 2  21.737004   2.606586  76.871439  52.364927
# 0  53.044153   6.949537  76.201332  72.157734
# 3  64.429832  14.861729  16.369679  72.744620

# Sort by index: restores the original row-label order.
print(df.sort_index())
#            a          b          c          d
# 0  53.044153   6.949537  76.201332  72.157734
# 1   2.389659  21.536710  62.266274  86.264873
# 2  21.737004   2.606586  76.871439  52.364927
# 3  64.429832  14.861729  16.369679  72.744620