'''
【课程2.14】 数值计算和统计基础
常用数学、统计方法
'''
import pandas as pd
import numpy as np
df= pd. DataFrame( {
'key1' : [ 4 , 5 , 3 , np. nan, 2 ] ,
'key2' : [ 1 , 2 , np. nan, 3 , 5 ] ,
'key3' : [ 1 , 2 , 3 , 'h' , 'k' ] } ,
index= list ( 'abcde' )
)
print ( df)
key1 key2 key3
a 4.0 1.0 1
b 5.0 2.0 2
c 3.0 NaN 3
d NaN 3.0 h
e 2.0 5.0 k
print ( df[ 'key1' ] )
a 4.0
b 5.0
c 3.0
d NaN
e 2.0
Name: key1, dtype: float64
m1= df. mean( )
print ( m1)
print ( df[ 'key2' ] . mean( ) )
key1 3.50
key2 2.75
dtype: float64
2.75
m2= df. mean( axis= 1 )
print ( m2)
a 2.5
b 3.5
c 3.0
d 3.0
e 3.5
dtype: float64
m3= df. mean( skipna= False )
print ( m3)
m4= df. mean( axis= 1 , skipna= False )
print ( m4)
key1 NaN
key2 NaN
dtype: float64
a 2.5
b 3.5
c NaN
d NaN
e 3.5
dtype: float64
df= pd. DataFrame( {
'key1' : np. arange( 10 ) ,
'key2' : np. random. rand( 10 ) * 10 ,
'key3' : [ 1 , 2 , 3 , 4 , 5 , 6 , 8 , 9 , 7 , np. nan]
} )
print ( df)
key1 key2 key3
0 0 7.414184 1.0
1 1 1.086788 2.0
2 2 3.489052 3.0
3 3 0.204554 4.0
4 4 8.666828 5.0
5 5 8.846589 6.0
6 6 5.601258 8.0
7 7 4.295757 9.0
8 8 0.190201 7.0
9 9 0.579211 NaN
print ( df. count( ) )
key1 10
key2 10
key3 9
dtype: int64
print ( df. min ( ) )
key1 0.000000
key2 0.190201
key3 1.000000
dtype: float64
print ( df. max ( ) )
key1 9.000000
key2 8.846589
key3 9.000000
dtype: float64
print ( df. quantile( q= 0.5 ) )
key1 4.500000
key2 3.892404
key3 NaN
dtype: float64
F:\software\Anaconda3\lib\site-packages\numpy\lib\function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
RuntimeWarning)
print ( df. sum ( ) )
key1 45.000000
key2 40.374422
key3 45.000000
dtype: float64
print ( df. median( ) )
key1 4.500000
key2 3.892404
key3 5.000000
dtype: float64
print ( df. std( ) )
key1 3.027650
key2 3.479024
key3 2.738613
dtype: float64
print ( df. var( ) )
key1 9.166667
key2 12.103610
key3 7.500000
dtype: float64
print ( df. skew( ) , '→ skew样本的偏度\n' )
print ( df. kurt( ) , '→ kurt样本的峰度\n' )
key1 0.000000
key2 0.238528
key3 0.000000
dtype: float64 → skew样本的偏度
key1 -1.200000
key2 -1.681431
key3 -1.200000
dtype: float64 → kurt样本的峰度
df[ 'key1_sum' ] = df[ 'key1' ] . cumsum( )
df[ 'key2_sum' ] = df[ 'key2' ] . cumsum( )
df[ 'key3_sum' ] = df[ 'key3' ] . cumsum( )
print ( df)
key1 key2 key3 key1_sum key2_sum key3_sum
0 0 7.414184 1.0 0 7.414184 1.0
1 1 1.086788 2.0 1 8.500972 3.0
2 2 3.489052 3.0 3 11.990024 6.0
3 3 0.204554 4.0 6 12.194578 10.0
4 4 8.666828 5.0 10 20.861406 15.0
5 5 8.846589 6.0 15 29.707995 21.0
6 6 5.601258 8.0 21 35.309254 29.0
7 7 4.295757 9.0 28 39.605010 38.0
8 8 0.190201 7.0 36 39.795211 45.0
9 9 0.579211 NaN 45 40.374422 NaN
df[ 'key1_prod' ] = df[ 'key1' ] . cumprod( )
df[ 'key2_prod' ] = df[ 'key2' ] . cumprod( )
df[ 'key3_prod' ] = df[ 'key3' ] . cumprod( )
print ( df)
key1 key2 key3 key1_sum key2_sum key3_sum key1_prod \
0 0 7.414184 1.0 0 7.414184 1.0 0
1 1 1.086788 2.0 1 8.500972 3.0 0
2 2 3.489052 3.0 3 11.990024 6.0 0
3 3 0.204554 4.0 6 12.194578 10.0 0
4 4 8.666828 5.0 10 20.861406 15.0 0
5 5 8.846589 6.0 15 29.707995 21.0 0
6 6 5.601258 8.0 21 35.309254 29.0 0
7 7 4.295757 9.0 28 39.605010 38.0 0
8 8 0.190201 7.0 36 39.795211 45.0 0
9 9 0.579211 NaN 45 40.374422 NaN 0
key2_prod key3_prod
0 7.414184 1.0
1 8.057643 2.0
2 28.113540 6.0
3 5.750740 24.0
4 49.840677 120.0
5 440.919971 720.0
6 2469.706722 5760.0
7 10609.258712 51840.0
8 2017.889816 362880.0
9 1168.783411 NaN
s= pd. Series( list ( 'ffccdbaa' ) )
print ( s)
0 f
1 f
2 c
3 c
4 d
5 b
6 a
7 a
dtype: object
print ( s. unique( ) )
sq= s. unique( )
print ( pd. Series( sq) )
['f' 'c' 'd' 'b' 'a']
0 f
1 c
2 d
3 b
4 a
dtype: object
sq. sort( )
print ( sq)
['a' 'b' 'c' 'd' 'f']
print ( s. value_counts( sort= False ) )
print ( pd. value_counts( s) )
b 1
d 1
c 2
f 2
a 2
dtype: int64
a 2
f 2
c 2
d 1
b 1
dtype: int64
s= pd. Series( np. arange( 10 , 15 ) )
df= pd. DataFrame( {
'key1' : list ( 'abcdefghi1' ) ,
'key2' : np. arange( 4 , 14 )
} )
print ( s)
print ( df)
0 10
1 11
2 12
3 13
4 14
dtype: int32
key1 key2
0 a 4
1 b 5
2 c 6
3 d 7
4 e 8
5 f 9
6 g 10
7 h 11
8 i 12
9 1 13
print ( s. isin( [ 5 , 13 , 14 ] ) )
0 False
1 False
2 False
3 True
4 True
dtype: bool
print ( df. isin( [ 'a' , 'bc' , 10 , 8 , 1 ] ) )
key1 key2
0 True False
1 False False
2 False False
3 False False
4 False True
5 False False
6 False True
7 False False
8 False False
9 False False
'''
【课程2.15】 文本数据
Pandas针对字符串配备的一套方法,使其易于对数组的每个元素进行操作
'''
s= pd. Series( [ 'A' , 'b' , 'C' , 'bbhello' , '123' , np. nan, 'hj' ] )
df = pd. DataFrame( { 'key1' : list ( 'abcdef' ) ,
'key2' : [ 'hee' , 'fv' , 'w' , 'hija' , '123' , np. nan] } )
print ( s)
print ( df)
0 A
1 b
2 C
3 bbhello
4 123
5 NaN
6 hj
dtype: object
key1 key2
0 a hee
1 b fv
2 c w
3 d hija
4 e 123
5 f NaN
print ( s. str . count( 'b' ) )
0 0.0
1 1.0
2 0.0
3 2.0
4 0.0
5 NaN
6 0.0
dtype: float64
print ( df[ 'key2' ] . str . upper( ) )
0 HEE
1 FV
2 W
3 HIJA
4 123
5 NaN
Name: key2, dtype: object
df. columns = df. columns. str . upper( )
print ( df)
KEY1 KEY2
0 a hee
1 b fv
2 c w
3 d hija
4 e 123
5 f NaN
s= pd. Series( [ 'A' , 'b' , 'AAbb' , '123' , np. nan] )
print ( s)
0 A
1 b
2 AAbb
3 123
4 NaN
dtype: object
print ( s. str . lower( ) )
0 a
1 b
2 aabb
3 123
4 NaN
dtype: object
print ( s. str . upper( ) )
0 A
1 B
2 AABB
3 123
4 NaN
dtype: object
print ( s. str . len ( ) )
0 1.0
1 1.0
2 4.0
3 3.0
4 NaN
dtype: float64
print ( s. str . startswith( 'b' ) )
0 False
1 True
2 False
3 False
4 NaN
dtype: object
print ( s. str . endswith( '3' ) )
0 False
1 False
2 False
3 True
4 NaN
dtype: object
s= pd. Series( [ ' jack' , 'jill ' , ' jesse ' , 'frank' ] )
df = pd. DataFrame( np. random. randn( 3 , 2 ) , columns= [ ' Column A ' , ' Column B ' ] ,
index= range ( 3 ) )
print ( s)
print ( df)
0 jack
1 jill
2 jesse
3 frank
dtype: object
Column A Column B
0 -0.510577 -1.234627
1 1.338442 0.466668
2 -0.776209 -0.036641
print ( s. str . strip( ) )
print ( s. str . lstrip( ) )
print ( s. str . rstrip( ) )
0 jack
1 jill
2 jesse
3 frank
dtype: object
0 jack
1 jill
2 jesse
3 frank
dtype: object
0 jack
1 jill
2 jesse
3 frank
dtype: object
df. columns= df. columns. str . strip( )
print ( df)
Column A Column B
0 -0.510577 -1.234627
1 1.338442 0.466668
2 -0.776209 -0.036641
df= pd. DataFrame( np. random. randn( 3 , 2 ) ,
index= list ( 'abc' ) , columns= [ ' Column A ' , ' Column B ' ] )
print ( df)
Column A Column B
a -1.063125 -1.096488
b 0.434409 -1.770704
c -1.728158 0.045788
df. columns= df. columns. str . replace( ' ' , '--' )
print ( df)
--Column--A-- --Column--B--
a -1.063125 -1.096488
b 0.434409 -1.770704
c -1.728158 0.045788
df. columns= df. columns. str . replace( '--' , 'aaa' , n= 1 )
print ( df)
aaaColumn--A-- aaaColumn--B--
a -1.063125 -1.096488
b 0.434409 -1.770704
c -1.728158 0.045788
s= pd. Series( [ 'a,b,c' , '1,2,3' , [ 'a,,,c' ] , np. nan] )
print ( s)
0 a,b,c
1 1,2,3
2 [a,,,c]
3 NaN
dtype: object
print ( s. str . split( ',' ) )
0 [a, b, c]
1 [1, 2, 3]
2 NaN
3 NaN
dtype: object
print ( s. str . split( ',' ) [ 0 ] )
print ( s. str . split( ',' ) [ 1 ] )
['a', 'b', 'c']
['1', '2', '3']
print ( s. str . split( ',' , expand= True ) )
print ( s. str . split( ',' , expand= True , n = 1 ) )
print ( s. str . rsplit( ',' , expand= True , n = 1 ) )
0 1 2
0 a b c
1 1 2 3
2 NaN None None
3 NaN None None
0 1
0 a b,c
1 1 2,3
2 NaN None
3 NaN None
0 1
0 a,b c
1 1,2 3
2 NaN None
3 NaN None
-----
df = pd. DataFrame( { 'key1' : [ 'a,b,c' , '1,2,3' , [ ':,., ' ] ] ,
'key2' : [ 'a-b-c' , '1-2-3' , [ ':-.- ' ] ] } )
print ( df[ 'key2' ] . str . split( '-' ) )
0 [a, b, c]
1 [1, 2, 3]
2 NaN
Name: key2, dtype: object
s = pd. Series( [ 'A' , 'b' , 'C' , 'bbhello' , '123' , np. nan, 'hj' ] )
df = pd. DataFrame( { 'key1' : list ( 'abcdef' ) ,
'key2' : [ 'hee' , 'fv' , 'w' , 'hija' , '123' , np. nan] } )
print ( s)
print ( df)
0 A
1 b
2 C
3 bbhello
4 123
5 NaN
6 hj
dtype: object
key1 key2
0 a hee
1 b fv
2 c w
3 d hija
4 e 123
5 f NaN
print ( s. str [ 0 ] )
0 A
1 b
2 C
3 b
4 1
5 NaN
6 h
dtype: object
print ( s. str [ : 2 ] )
0 A
1 b
2 C
3 bb
4 12
5 NaN
6 hj
dtype: object
print ( df[ 'key2' ] . str [ 0 ] )
0 h
1 f
2 w
3 h
4 1
5 NaN
Name: key2, dtype: object
'''
【课程2.16】 合并 merge、join
Pandas具有全功能的,高性能内存中连接操作,与SQL等关系数据库非常相似
pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
left_index=False, right_index=False, sort=True,
suffixes=('_x', '_y'), copy=True, indicator=False)
'''
df1 = pd. DataFrame( { 'key' : [ 'K0' , 'K1' , 'K2' , 'K3' ] ,
'A' : [ 'A0' , 'A1' , 'A2' , 'A3' ] ,
'B' : [ 'B0' , 'B1' , 'B2' , 'B3' ] } )
df2 = pd. DataFrame( { 'key' : [ 'K0' , 'K1' , 'K2' , 'K3' ] ,
'C' : [ 'C0' , 'C1' , 'C2' , 'C3' ] ,
'D' : [ 'D0' , 'D1' , 'D2' , 'D3' ] } )
df3 = pd. DataFrame( { 'key1' : [ 'K0' , 'K0' , 'K1' , 'K2' ] ,
'key2' : [ 'K0' , 'K1' , 'K0' , 'K1' ] ,
'A' : [ 'A0' , 'A1' , 'A2' , 'A3' ] ,
'B' : [ 'B0' , 'B1' , 'B2' , 'B3' ] } )
df4 = pd. DataFrame( { 'key1' : [ 'K0' , 'K1' , 'K1' , 'K2' ] ,
'key2' : [ 'K0' , 'K0' , 'K0' , 'K0' ] ,
'C' : [ 'C0' , 'C1' , 'C2' , 'C3' ] ,
'D' : [ 'D0' , 'D1' , 'D2' , 'D3' ] } )
print ( pd. merge( df1, df2, on= 'key' ) )
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K2 C2 D2
3 A3 B3 K3 C3 D3
print ( pd. merge( df3, df4, on= [ 'key1' , 'key2' ] ) )
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
print ( pd. merge( df3, df4, on= [ 'key1' , 'key2' ] , how= 'inner' ) )
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
print ( pd. merge( df3, df4, on= [ 'key1' , 'key2' ] , how= 'outer' ) )
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
5 NaN NaN K2 K0 C3 D3
print ( pd. merge( df3, df4, on= [ 'key1' , 'key2' ] , how = 'left' ) )
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
------
print ( pd. merge( df3, df4, on= [ 'key1' , 'key2' ] , how= 'right' ) )
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
3 NaN NaN K2 K0 C3 D3
df1 = pd. DataFrame( { 'lkey' : list ( 'bbacaab' ) ,
'data1' : range ( 7 ) } )
df2 = pd. DataFrame( { 'rkey' : list ( 'abd' ) ,
'date2' : range ( 3 ) } )
print ( pd. merge( df1, df2, left_on= 'lkey' , right_on= 'rkey' ) )
data1 lkey date2 rkey
0 0 b 1 b
1 1 b 1 b
2 6 b 1 b
3 2 a 0 a
4 4 a 0 a
5 5 a 0 a
df1 = pd. DataFrame( { 'key' : list ( 'abcdfeg' ) ,
'data1' : range ( 7 ) } )
df2 = pd. DataFrame( { 'date2' : range ( 100 , 105 ) } ,
index = list ( 'abcde' ) )
print ( pd. merge( df1, df2, left_on= 'key' , right_index= True ) )
data1 key date2
0 0 a 100
1 1 b 101
2 2 c 102
3 3 d 103
5 5 e 104
df1 = pd. DataFrame( { 'key' : list ( 'bbacaab' ) ,
'data1' : [ 1 , 3 , 2 , 4 , 5 , 9 , 7 ] } )
df2 = pd. DataFrame( { 'key' : list ( 'abd' ) ,
'date2' : [ 11 , 2 , 33 ] } )
x1 = pd. merge( df1, df2, on = 'key' , how = 'outer' )
x2 = pd. merge( df1, df2, on = 'key' , sort= True , how = 'outer' )
print ( x1)
print ( x2)
data1 key date2
0 1.0 b 2.0
1 3.0 b 2.0
2 7.0 b 2.0
3 2.0 a 11.0
4 5.0 a 11.0
5 9.0 a 11.0
6 4.0 c NaN
7 NaN d 33.0
data1 key date2
0 2.0 a 11.0
1 5.0 a 11.0
2 9.0 a 11.0
3 1.0 b 2.0
4 3.0 b 2.0
5 7.0 b 2.0
6 4.0 c NaN
7 NaN d 33.0
print ( x2. sort_values( 'data1' ) )
data1 key date2
3 1.0 b 2.0
0 2.0 a 11.0
4 3.0 b 2.0
6 4.0 c NaN
1 5.0 a 11.0
5 7.0 b 2.0
2 9.0 a 11.0
7 NaN d 33.0
left = pd. DataFrame( { 'A' : [ 'A0' , 'A1' , 'A2' ] ,
'B' : [ 'B0' , 'B1' , 'B2' ] } ,
index= [ 'K0' , 'K1' , 'K2' ] )
right = pd. DataFrame( { 'C' : [ 'C0' , 'C2' , 'C3' ] ,
'D' : [ 'D0' , 'D2' , 'D3' ] } ,
index= [ 'K0' , 'K2' , 'K3' ] )
print ( left)
print ( right)
A B
K0 A0 B0
K1 A1 B1
K2 A2 B2
C D
K0 C0 D0
K2 C2 D2
K3 C3 D3
print ( left. join( right) )
print ( left. join( right, how= 'outer' ) )
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3
df1 = pd. DataFrame( { 'key' : list ( 'bbacaab' ) ,
'data1' : [ 1 , 3 , 2 , 4 , 5 , 9 , 7 ] } )
df2 = pd. DataFrame( { 'key' : list ( 'abd' ) ,
'date2' : [ 11 , 2 , 33 ] } )
print ( df1)
print ( df2)
data1 key
0 1 b
1 3 b
2 2 a
3 4 c
4 5 a
5 9 a
6 7 b
date2 key
0 11 a
1 2 b
2 33 d
print ( pd. merge( df1, df2, left_index= True , right_index= True , suffixes= ( '_1' , '_2' ) ) )
print ( df1. join( df2[ 'date2' ] ) )
print ( '-----' )
data1 key_1 date2 key_2
0 1 b 11 a
1 3 b 2 b
2 2 a 33 d
data1 key date2
0 1 b 11.0
1 3 b 2.0
2 2 a 33.0
3 4 c NaN
4 5 a NaN
5 9 a NaN
6 7 b NaN
-----
left = pd. DataFrame( { 'A' : [ 'A0' , 'A1' , 'A2' , 'A3' ] ,
'B' : [ 'B0' , 'B1' , 'B2' , 'B3' ] ,
'key' : [ 'K0' , 'K1' , 'K0' , 'K1' ] } )
right = pd. DataFrame( { 'C' : [ 'C0' , 'C1' ] ,
'D' : [ 'D0' , 'D1' ] } ,
index= [ 'K0' , 'K1' ] )
print ( left)
print ( right)
print ( left. join( right, on = 'key' ) )
A B key
0 A0 B0 K0
1 A1 B1 K1
2 A2 B2 K0
3 A3 B3 K1
C D
K0 C0 D0
K1 C1 D1
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K0 C0 D0
3 A3 B3 K1 C1 D1
'''
【课程2.17】 连接与修补 concat、combine_first
连接 - 沿轴执行连接操作
pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
keys=None, levels=None, names=None, verify_integrity=False,
copy=True)
'''
import pandas as pd
import numpy as np
s1= pd. Series( [ 1 , 2 , 3 ] )
s2= pd. Series( [ 2 , 3 , 4 ] )
print ( pd. concat( [ s1, s2] ) )
0 1
1 2
2 3
0 2
1 3
2 4
dtype: int64
s3 = pd. Series( [ 1 , 2 , 3 ] , index = [ 'a' , 'c' , 'h' ] )
s4 = pd. Series( [ 2 , 3 , 4 ] , index = [ 'b' , 'e' , 'd' ] )
print ( pd. concat( [ s3, s4] ) . sort_index( ) )
a 1
b 2
c 2
d 4
e 3
h 3
dtype: int64
s5 = pd. Series( [ 1 , 2 , 3 ] , index = [ 'a' , 'b' , 'c' ] )
s6 = pd. Series( [ 2 , 3 , 4 ] , index = [ 'b' , 'c' , 'd' ] )
print ( pd. concat( [ s5, s6] , axis= 1 ) )
0 1
a 1.0 NaN
b 2.0 2.0
c 3.0 3.0
d NaN 4.0
print ( [ pd. concat( [ s5, s6] , axis= 1 , join= 'inner' ) ] )
[ 0 1
b 2 2
c 3 3]
print ( pd. concat( [ s5, s6] , axis= 1 , join_axes= [ [ 'a' , 'b' , 'd' ] ] ) )
0 1
a 1.0 NaN
b 2.0 2.0
d NaN 4.0
sre= pd. concat( [ s5, s6] , keys= [ 'one' , 'two' ] )
print ( sre)
one a 1
b 2
c 3
two b 2
c 3
d 4
dtype: int64
sre = pd. concat( [ s5, s6] , axis= 1 , keys = [ 'one' , 'two' ] )
print ( sre, type ( sre) )
one two
a 1.0 NaN
b 2.0 2.0
c 3.0 3.0
d NaN 4.0 <class 'pandas.core.frame.DataFrame'>
df1 = pd. DataFrame( [ [ np. nan, 3 . , 5 . ] , [ - 4.6 , np. nan, np. nan] , [ np. nan, 7 . , np. nan] ] )
df2 = pd. DataFrame( [ [ - 42.6 , np. nan, - 8.2 ] , [ - 5 . , 1.6 , 4 ] ] , index= [ 1 , 2 ] )
print ( df1)
print ( df2)
0 1 2
0 NaN 3.0 5.0
1 -4.6 NaN NaN
2 NaN 7.0 NaN
0 1 2
1 -42.6 NaN -8.2
2 -5.0 1.6 4.0
print ( df1. combine_first( df2) )
0 1 2
0 NaN 3.0 5.0
1 -4.6 NaN -8.2
2 -5.0 7.0 4.0
df1. update( df2)
print ( df1)
0 1 2
0 NaN 3.0 5.0
1 -42.6 NaN -8.2
2 -5.0 1.6 4.0
'''
【课程2.18】 去重及替换
.duplicated / .replace
'''
s = pd. Series( [ 1 , 1 , 1 , 1 , 2 , 2 , 2 , 3 , 4 , 5 , 5 , 5 , 5 ] )
print ( s)
0 1
1 1
2 1
3 1
4 2
5 2
6 2
7 3
8 4
9 5
10 5
11 5
12 5
dtype: int64
print ( s. duplicated( ) )
0 False
1 True
2 True
3 True
4 False
5 True
6 True
7 False
8 False
9 False
10 True
11 True
12 True
dtype: bool
print ( s[ s. duplicated( ) == False ] )
0 1
4 2
7 3
8 4
9 5
dtype: int64
s_re = s. drop_duplicates( )
print ( s_re)
0 1
4 2
7 3
8 4
9 5
dtype: int64
df = pd. DataFrame( { 'key1' : [ 'a' , 'a' , 3 , 4 , 5 ] ,
'key2' : [ 'a' , 'a' , 'b' , 'b' , 'c' ] } )
print ( df)
key1 key2
0 a a
1 a a
2 3 b
3 4 b
4 5 c
print ( df. duplicated( ) )
0 False
1 True
2 False
3 False
4 False
dtype: bool
print ( df. duplicated( ) )
print ( df[ 'key2' ] . duplicated( ) )
0 False
1 True
2 False
3 False
4 False
dtype: bool
0 False
1 True
2 False
3 True
4 False
Name: key2, dtype: bool
s = pd. Series( list ( 'ascaazsd' ) )
print ( s)
0 a
1 s
2 c
3 a
4 a
5 z
6 s
7 d
dtype: object
print ( s. replace( 'a' , np. nan) )
0 NaN
1 s
2 c
3 NaN
4 NaN
5 z
6 s
7 d
dtype: object
print ( s. replace( [ 'a' , 's' ] , np. nan) )
0 NaN
1 NaN
2 c
3 NaN
4 NaN
5 z
6 NaN
7 d
dtype: object
print ( s. replace( { 'a' : 'hello world' , 's' : 'aaaaaaaaa' } ) )
0 hello world
1 aaaaaaaaa
2 c
3 hello world
4 hello world
5 z
6 aaaaaaaaa
7 d
dtype: object
'''
【课程2.19】 数据分组
分组统计 - groupby功能
① 根据某些条件将数据拆分成组
② 对每个组独立应用函数
③ 将结果合并到一个数据结构中
Dataframe在行(axis=0)或列(axis=1)上进行分组,将一个函数应用到各个分组并产生一个新值,然后函数执行结果被合并到最终的结果对象中。
df.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, **kwargs)
'''
df = pd. DataFrame( { 'A' : [ 'foo' , 'bar' , 'foo' , 'bar' , 'foo' , 'bar' , 'foo' , 'foo' ] ,
'B' : [ 'one' , 'one' , 'two' , 'three' , 'two' , 'two' , 'one' , 'three' ] ,
'C' : np. random. randn( 8 ) ,
'D' : np. random. randn( 8 ) } )
print ( df)
A B C D
0 foo one -1.420779 2.205477
1 bar one 0.514083 1.440050
2 foo two -0.265449 -1.354243
3 bar three -0.011561 -0.482113
4 foo two 2.030139 0.009457
5 bar two -0.106477 -0.014610
6 foo one 0.938704 0.228842
7 foo three 0.194304 2.537552
print ( df. groupby( 'A' ) )
<pandas.core.groupby.DataFrameGroupBy object at 0x00000141C13891D0>
print ( df. groupby( 'A' ) . mean( ) )
C D
A
bar 0.132015 0.314442
foo 0.295384 0.725417
print ( df. groupby( [ 'A' , 'B' ] ) . mean( ) )
C D
A B
bar one 0.514083 1.440050
three -0.011561 -0.482113
two -0.106477 -0.014610
foo one -0.241037 1.217159
three 0.194304 2.537552
two 0.882345 -0.672393
print ( df. groupby( [ 'A' ] ) [ 'D' ] . mean( ) )
A
bar 0.314442
foo 0.725417
Name: D, dtype: float64
df = pd. DataFrame( { 'X' : [ 'A' , 'B' , 'A' , 'B' ] , 'Y' : [ 1 , 4 , 3 , 2 ] } )
print ( df)
X Y
0 A 1
1 B 4
2 A 3
3 B 2
print ( df. groupby( [ 'X' ] ) )
<pandas.core.groupby.DataFrameGroupBy object at 0x00000141C1389208>
print ( list ( df. groupby( [ 'X' ] ) ) )
[('A', X Y
0 A 1
2 A 3), ('B', X Y
1 B 4
3 B 2)]
print ( list ( df. groupby( 'X' ) ) [ 0 ] )
('A', X Y
0 A 1
2 A 3)
for n, g in df. groupby( 'X' ) :
print ( n)
print ( g)
print ( '###' )
print ( '-----' )
A
X Y
0 A 1
2 A 3
###
B
X Y
1 B 4
3 B 2
###
-----
print ( df. groupby( [ 'X' ] ) . get_group( 'A' ) )
X Y
0 A 1
2 A 3
print ( df. groupby( [ 'X' ] ) . get_group( 'B' ) )
X Y
1 B 4
3 B 2
grouped = df. groupby( [ 'X' ] )
print ( grouped. groups)
print ( grouped. groups[ 'A' ] )
{'A': [0, 2], 'B': [1, 3]}
[0, 2]
sz = grouped. size( )
print ( sz, type ( sz) )
X
A 2
B 2
dtype: int64 <class 'pandas.core.series.Series'>
df = pd. DataFrame( { 'A' : [ 'foo' , 'bar' , 'foo' , 'bar' , 'foo' , 'bar' , 'foo' , 'foo' ] ,
'B' : [ 'one' , 'one' , 'two' , 'three' , 'two' , 'two' , 'one' , 'three' ] ,
'C' : np. random. randn( 8 ) ,
'D' : np. random. randn( 8 ) } )
print ( df)
A B C D
0 foo one 0.628069 2.369622
1 bar one 0.624670 -1.729496
2 foo two 1.748096 -0.674956
3 bar three -0.154640 -0.436490
4 foo two -0.604173 -0.081001
5 bar two 0.311972 -2.065370
6 foo one 0.723137 -0.125561
7 foo three 0.988974 -1.429884
group= df. groupby( [ 'A' , 'B' ] ) . groups
print ( grouped)
<pandas.core.groupby.DataFrameGroupBy object at 0x00000141C13CB6A0>
print ( group[ ( 'foo' , 'three' ) ] )
[7]
df = pd. DataFrame( { 'data1' : np. random. rand( 2 ) ,
'data2' : np. random. rand( 2 ) ,
'key1' : [ 'a' , 'b' ] ,
'key2' : [ 'one' , 'two' ] } )
print ( df)
data1 data2 key1 key2
0 0.762760 0.661971 a one
1 0.785907 0.058554 b two
for n, p in df. groupby( df. dtypes, axis= 1 ) :
print ( n)
print ( p)
print ( '##' )
float64
data1 data2
0 0.762760 0.661971
1 0.785907 0.058554
##
object
key1 key2
0 a one
1 b two
##
df = pd. DataFrame( np. arange( 16 ) . reshape( 4 , 4 ) ,
columns = [ 'a' , 'b' , 'c' , 'd' ] )
print ( df)
a b c d
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15
mapping = { 'a' : 'one' , 'b' : 'one' , 'c' : 'two' , 'd' : 'two' , 'e' : 'three' }
by_column= df. groupby( mapping, axis= 1 )
print ( by_column. sum ( ) )
one two
0 1 5
1 9 13
2 17 21
3 25 29
s = pd. Series( mapping)
print ( s, '\n' )
print ( s. groupby( s) . count( ) )
a one
b one
c two
d two
e three
dtype: object
one 2
three 1
two 2
dtype: int64
df = pd. DataFrame( np. arange( 16 ) . reshape( 4 , 4 ) ,
columns = [ 'a' , 'b' , 'c' , 'd' ] ,
index = [ 'abc' , 'bcd' , 'aa' , 'b' ] )
print ( df)
a b c d
abc 0 1 2 3
bcd 4 5 6 7
aa 8 9 10 11
b 12 13 14 15
print ( df. groupby( len ) . sum ( ) )
a b c d
1 12 13 14 15
2 8 9 10 11
3 4 6 8 10
s = pd. Series( [ 1 , 2 , 3 , 10 , 20 , 30 ] , index = [ 1 , 2 , 3 , 1 , 2 , 3 ] )
print ( s)
1 1
2 2
3 3
1 10
2 20
3 30
dtype: int64
grouped = s. groupby( level= 0 )
print ( grouped)
<pandas.core.groupby.SeriesGroupBy object at 0x00000141C1473CF8>
print ( grouped. first( ) , '→ first:非NaN的第一个值\n' )
print ( grouped. last( ) , '→ last:非NaN的最后一个值\n' )
print ( grouped. sum ( ) , '→ sum:非NaN的和\n' )
print ( grouped. mean( ) , '→ mean:非NaN的平均值\n' )
print ( grouped. median( ) , '→ median:非NaN的算术中位数\n' )
print ( grouped. count( ) , '→ count:非NaN的值\n' )
print ( grouped. min ( ) , '→ min、max:非NaN的最小值、最大值\n' )
print ( grouped. std( ) , '→ std,var:非NaN的标准差和方差\n' )
print ( grouped. prod( ) , '→ prod:非NaN的积\n' )
1 1
2 2
3 3
dtype: int64 → first:非NaN的第一个值
1 10
2 20
3 30
dtype: int64 → last:非NaN的最后一个值
1 11
2 22
3 33
dtype: int64 → sum:非NaN的和
1 5.5
2 11.0
3 16.5
dtype: float64 → mean:非NaN的平均值
1 5.5
2 11.0
3 16.5
dtype: float64 → median:非NaN的算术中位数
1 2
2 2
3 2
dtype: int64 → count:非NaN的值
1 1
2 2
3 3
dtype: int64 → min、max:非NaN的最小值、最大值
1 6.363961
2 12.727922
3 19.091883
dtype: float64 → std,var:非NaN的标准差和方差
1 10
2 40
3 90
dtype: int64 → prod:非NaN的积
df = pd. DataFrame( { 'a' : [ 1 , 1 , 2 , 2 ] ,
'b' : np. random. rand( 4 ) ,
'c' : np. random. rand( 4 ) ,
'd' : np. random. rand( 4 ) , } )
print ( df)
a b c d
0 1 0.772478 0.417508 0.068269
1 1 0.087774 0.063274 0.991277
2 2 0.229447 0.062397 0.985243
3 2 0.025157 0.130426 0.664778
print ( df. groupby( 'a' ) . agg( [ 'mean' , np. sum ] ) )
print ( df. groupby( 'a' ) [ 'b' ] . agg( { 'result1' : np. mean,
'result2' : np. sum } ) )
b c d
mean sum mean sum mean sum
a
1 0.430126 0.860252 0.240391 0.480782 0.529773 1.059545
2 0.127302 0.254605 0.096412 0.192824 0.825010 1.650021
result2 result1
a
1 0.860252 0.430126
2 0.254605 0.127302
'''
【课程2.20】 分组转换及一般性“拆分-应用-合并”
transform / apply
'''
df = pd. DataFrame( { 'data1' : np. random. rand( 5 ) ,
'data2' : np. random. rand( 5 ) ,
'key1' : list ( 'aabba' ) ,
'key2' : [ 'one' , 'two' , 'one' , 'two' , 'one' ] } )
print ( df)
data1 data2 key1 key2
0 0.906503 0.944454 a one
1 0.608210 0.481486 a two
2 0.109676 0.276845 b one
3 0.817092 0.008300 b two
4 0.266458 0.540617 a one
k_mean= df. groupby( 'key1' ) . mean( )
print ( k_mean)
data1 data2
key1
a 0.593724 0.655519
b 0.463384 0.142573
print ( pd. merge( df, k_mean, left_on= 'key1' , right_index= True ) . add_prefix( 'mean_' ) )
mean_data1_x mean_data2_x mean_key1 mean_key2 mean_data1_y mean_data2_y
0 0.906503 0.944454 a one 0.593724 0.655519
1 0.608210 0.481486 a two 0.593724 0.655519
4 0.266458 0.540617 a one 0.593724 0.655519
2 0.109676 0.276845 b one 0.463384 0.142573
3 0.817092 0.008300 b two 0.463384 0.142573
print ( df. groupby( 'key2' ) . mean( ) )
print ( df. groupby( 'key2' ) . transform( np. mean) )
data1 data2
key2
one 0.427546 0.587305
two 0.712651 0.244893
data1 data2
0 0.427546 0.587305
1 0.712651 0.244893
2 0.427546 0.587305
3 0.712651 0.244893
4 0.427546 0.587305
df = pd. DataFrame( { 'data1' : np. random. rand( 5 ) ,
'data2' : np. random. rand( 5 ) ,
'key1' : list ( 'aabba' ) ,
'key2' : [ 'one' , 'two' , 'one' , 'two' , 'one' ] } )
print ( df)
print ( df. groupby( 'key1' ) . apply ( lambda x: x. describe( ) ) )
data1 data2 key1 key2
0 0.285345 0.497243 a one
1 0.525780 0.196928 a two
2 0.407405 0.829274 b one
3 0.329079 0.689100 b two
4 0.856305 0.436768 a one
data1 data2
key1
a count 3.000000 3.000000
mean 0.555810 0.376980
std 0.286662 0.158834
min 0.285345 0.196928
25% 0.405563 0.316848
50% 0.525780 0.436768
75% 0.691042 0.467006
max 0.856305 0.497243
b count 2.000000 2.000000
mean 0.368242 0.759187
std 0.055385 0.099118
min 0.329079 0.689100
25% 0.348660 0.724144
50% 0.368242 0.759187
75% 0.387824 0.794230
max 0.407405 0.829274
def f_df1 ( d, n) :
return ( d. sort_index( ) [ : n] )
def f_df2 ( d, k1) :
return ( d[ k1] )
print ( df. groupby( 'key1' ) . apply ( f_df1, 2 ) , '\n' )
print ( df. groupby( 'key1' ) . apply ( f_df2, 'data2' ) )
print ( type ( df. groupby( 'key1' ) . apply ( f_df2, 'data2' ) ) )
data1 data2 key1 key2
key1
a 0 0.285345 0.497243 a one
1 0.525780 0.196928 a two
b 2 0.407405 0.829274 b one
3 0.329079 0.689100 b two
key1
a 0 0.497243
1 0.196928
4 0.436768
b 2 0.829274
3 0.689100
Name: data2, dtype: float64
<class 'pandas.core.series.Series'>
'''
【课程2.21】 透视表及交叉表
类似excel数据透视 - pivot table / crosstab
'''
date = [ '2017-5-1' , '2017-5-2' , '2017-5-3' ] * 3
rng = pd. to_datetime( date)
df = pd. DataFrame( { 'date' : rng,
'key' : list ( 'abcdabcda' ) ,
'values' : np. random. rand( 9 ) * 10 } )
print ( df)
date key values
0 2017-05-01 a 4.092407
1 2017-05-02 b 7.389150
2 2017-05-03 c 9.946409
3 2017-05-01 d 5.282872
4 2017-05-02 a 8.591360
5 2017-05-03 b 7.475860
6 2017-05-01 c 5.490445
7 2017-05-02 d 5.947843
8 2017-05-03 a 2.806096
print ( pd. pivot_table( df, values = 'values' , index = 'date' , columns = 'key' , aggfunc= np. sum ) )
print ( '-----' )
print ( pd. pivot_table( df, values = 'values' , index = [ 'date' , 'key' ] , aggfunc= len ) )
print ( '-----' )
key a b c d
date
2017-05-01 4.092407 NaN 5.490445 5.282872
2017-05-02 8.591360 7.38915 NaN 5.947843
2017-05-03 2.806096 7.47586 9.946409 NaN
-----
date key
2017-05-01 a 1.0
c 1.0
d 1.0
2017-05-02 a 1.0
b 1.0
d 1.0
2017-05-03 a 1.0
b 1.0
c 1.0
Name: values, dtype: float64
-----
df = pd. DataFrame( { 'A' : [ 1 , 2 , 2 , 2 , 2 ] ,
'B' : [ 3 , 3 , 4 , 4 , 4 ] ,
'C' : [ 1 , 1 , np. nan, 1 , 1 ] } )
print ( df)
A B C
0 1 3 1.0
1 2 3 1.0
2 2 4 NaN
3 2 4 1.0
4 2 4 1.0
print ( pd. crosstab( df[ 'A' ] , df[ 'B' ] ) )
B 3 4
A
1 1 0
2 1 3
print ( pd. crosstab( df[ 'A' ] , df[ 'B' ] , normalize= True ) )
print ( '-----' )
B 3 4
A
1 0.2 0.0
2 0.2 0.6
-----
print ( pd. crosstab( df[ 'A' ] , df[ 'B' ] , values= df[ 'C' ] , aggfunc= np. sum ) )
print ( '-----' )
B 3 4
A
1 1.0 NaN
2 1.0 2.0
-----
print ( pd. crosstab( df[ 'A' ] , df[ 'B' ] , values= df[ 'C' ] , aggfunc= np. sum , margins= True ) )
print ( '-----' )
B 3 4 All
A
1 1.0 NaN 1.0
2 1.0 2.0 3.0
All 2.0 2.0 4.0
-----
'''
【课程2.22】 数据读取
核心:read_table, read_csv, read_excel
'''
import os
os. chdir( 'C:/Users/Hjx/Desktop/' )
data1 = pd. read_table( 'data1.txt' , delimiter= ',' , header = 0 , index_col= 1 )
print ( data1)
data2 = pd. read_csv( 'data2.csv' , engine = 'python' )
print ( data2. head( ) )
data3 = pd. read_excel( '地市级党委书记数据库(2000-10).xlsx' , sheetname= '中国人民共和国地市级党委书记数据库(2000-10)' , header= 0 )
print ( data3)