series和DataFrame的创建
import numpy as np
import pandas as pd
pandas中有两种基本数据结构,series 和dataframe
s = pd. Series( [ 1 , 3 , 5 , 7 , np. NaN] )
print ( s)
0 1.0
1 3.0
2 5.0
3 7.0
4 NaN
dtype: float64
运行结果中,第一列是自动生成的索引,第二列是值 下面创建一个日期序列
date = pd. date_range( '20200704' , periods = 10 )
print ( date)
DatetimeIndex(['2020-07-04', '2020-07-05', '2020-07-06', '2020-07-07',
'2020-07-08', '2020-07-09', '2020-07-10', '2020-07-11',
'2020-07-12', '2020-07-13'],
dtype='datetime64[ns]', freq='D')
下面创建一个dataframe,dataframe有三个参数,值,行索引,列索引
data = pd. DataFrame( np. random. randn( 10 , 4 )
, index= date
, columns= list ( 'ABCD' ) )
print ( data)
A B C D
2020-07-04 1.461007 0.330546 0.069813 0.753270
2020-07-05 -1.467785 0.847787 0.689302 -0.272599
2020-07-06 0.607181 -0.681571 0.480336 1.924793
2020-07-07 0.993651 -1.429534 1.063160 -0.477975
2020-07-08 0.317019 1.150236 1.235036 1.173084
2020-07-09 2.302594 0.536270 -0.806829 0.786662
2020-07-10 -0.101381 -0.477274 -0.988975 -2.492971
2020-07-11 -1.408476 -0.708248 1.382285 -1.137585
2020-07-12 0.099601 0.570422 0.183447 -0.985196
2020-07-13 2.066843 -0.036730 -0.114365 0.233777
另一个创建DataFrame的方法是利用字典来创建,如下所示
dic = { 'A' : range ( 4 ) , 'B' : list ( 'abcd' ) }
df = pd. DataFrame( dic)
print ( df)
A B
0 0 a
1 1 b
2 2 c
3 3 d
DataFrame的基本操作
查看数据
可以输入列名访问列的数据
data. A
2020-07-04 1.461007
2020-07-05 -1.467785
2020-07-06 0.607181
2020-07-07 0.993651
2020-07-08 0.317019
2020-07-09 2.302594
2020-07-10 -0.101381
2020-07-11 -1.408476
2020-07-12 0.099601
2020-07-13 2.066843
Freq: D, Name: A, dtype: float64
data. B
2020-07-04 0.330546
2020-07-05 0.847787
2020-07-06 -0.681571
2020-07-07 -1.429534
2020-07-08 1.150236
2020-07-09 0.536270
2020-07-10 -0.477274
2020-07-11 -0.708248
2020-07-12 0.570422
2020-07-13 -0.036730
Freq: D, Name: B, dtype: float64
查看列的数据类型,发现它是一个series
type ( data. A)
pandas.core.series.Series
可以查看dataframe的头部和尾部,行标签,列标签还有值
DataFrame的基本操作
查看数据
data. head( )
A B C D 2020-07-04 1.461007 0.330546 0.069813 0.753270 2020-07-05 -1.467785 0.847787 0.689302 -0.272599 2020-07-06 0.607181 -0.681571 0.480336 1.924793 2020-07-07 0.993651 -1.429534 1.063160 -0.477975 2020-07-08 0.317019 1.150236 1.235036 1.173084
data. tail( 2 )
A B C D 2020-07-12 0.099601 0.570422 0.183447 -0.985196 2020-07-13 2.066843 -0.036730 -0.114365 0.233777
df. index
RangeIndex(start=0, stop=4, step=1)
data. columns
Index(['A', 'B', 'C', 'D'], dtype='object')
data. values
array([[ 1.46100717, 0.33054584, 0.06981279, 0.75326991],
[-1.46778504, 0.84778667, 0.6893022 , -0.27259934],
[ 0.60718094, -0.68157073, 0.4803361 , 1.92479259],
[ 0.99365125, -1.42953449, 1.06315959, -0.47797516],
[ 0.31701933, 1.1502355 , 1.23503615, 1.17308395],
[ 2.30259364, 0.5362702 , -0.80682917, 0.7866624 ],
[-0.10138056, -0.47727408, -0.98897458, -2.49297128],
[-1.4084756 , -0.7082477 , 1.38228526, -1.13758454],
[ 0.0996013 , 0.57042191, 0.18344733, -0.98519631],
[ 2.06684278, -0.03672988, -0.11436519, 0.23377719]])
data. describe( )
A B C D count 10.000000 10.000000 10.000000 10.000000 mean 0.487026 0.010190 0.319321 -0.049474 std 1.288896 0.816621 0.811938 1.294423 min -1.467785 -1.429534 -0.988975 -2.492971 25% -0.051135 -0.630497 -0.068321 -0.858391 50% 0.462100 0.146908 0.331892 -0.019411 75% 1.344168 0.561884 0.969695 0.778314 max 2.302594 1.150236 1.382285 1.924793
data. T
2020-07-04 2020-07-05 2020-07-06 2020-07-07 2020-07-08 2020-07-09 2020-07-10 2020-07-11 2020-07-12 2020-07-13 A 1.461007 -1.467785 0.607181 0.993651 0.317019 2.302594 -0.101381 -1.408476 0.099601 2.066843 B 0.330546 0.847787 -0.681571 -1.429534 1.150236 0.536270 -0.477274 -0.708248 0.570422 -0.036730 C 0.069813 0.689302 0.480336 1.063160 1.235036 -0.806829 -0.988975 1.382285 0.183447 -0.114365 D 0.753270 -0.272599 1.924793 -0.477975 1.173084 0.786662 -2.492971 -1.137585 -0.985196 0.233777
data. sort_index( axis= 0 )
A B C D 2020-07-04 1.461007 0.330546 0.069813 0.753270 2020-07-05 -1.467785 0.847787 0.689302 -0.272599 2020-07-06 0.607181 -0.681571 0.480336 1.924793 2020-07-07 0.993651 -1.429534 1.063160 -0.477975 2020-07-08 0.317019 1.150236 1.235036 1.173084 2020-07-09 2.302594 0.536270 -0.806829 0.786662 2020-07-10 -0.101381 -0.477274 -0.988975 -2.492971 2020-07-11 -1.408476 -0.708248 1.382285 -1.137585 2020-07-12 0.099601 0.570422 0.183447 -0.985196 2020-07-13 2.066843 -0.036730 -0.114365 0.233777
data. sort_index( axis= 1 , ascending= False )
D C B A 2020-07-04 0.753270 0.069813 0.330546 1.461007 2020-07-05 -0.272599 0.689302 0.847787 -1.467785 2020-07-06 1.924793 0.480336 -0.681571 0.607181 2020-07-07 -0.477975 1.063160 -1.429534 0.993651 2020-07-08 1.173084 1.235036 1.150236 0.317019 2020-07-09 0.786662 -0.806829 0.536270 2.302594 2020-07-10 -2.492971 -0.988975 -0.477274 -0.101381 2020-07-11 -1.137585 1.382285 -0.708248 -1.408476 2020-07-12 -0.985196 0.183447 0.570422 0.099601 2020-07-13 0.233777 -0.114365 -0.036730 2.066843
data. sort_values( by= 'A' )
A B C D 2020-07-05 -1.467785 0.847787 0.689302 -0.272599 2020-07-11 -1.408476 -0.708248 1.382285 -1.137585 2020-07-10 -0.101381 -0.477274 -0.988975 -2.492971 2020-07-12 0.099601 0.570422 0.183447 -0.985196 2020-07-08 0.317019 1.150236 1.235036 1.173084 2020-07-06 0.607181 -0.681571 0.480336 1.924793 2020-07-07 0.993651 -1.429534 1.063160 -0.477975 2020-07-04 1.461007 0.330546 0.069813 0.753270 2020-07-13 2.066843 -0.036730 -0.114365 0.233777 2020-07-09 2.302594 0.536270 -0.806829 0.786662
数据的选择
data[ 'A' ]
2020-07-04 1.461007
2020-07-05 -1.467785
2020-07-06 0.607181
2020-07-07 0.993651
2020-07-08 0.317019
2020-07-09 2.302594
2020-07-10 -0.101381
2020-07-11 -1.408476
2020-07-12 0.099601
2020-07-13 2.066843
Freq: D, Name: A, dtype: float64
data. loc[ '20200704' : '20200713' ]
A B C D 2020-07-04 1.461007 0.330546 0.069813 0.753270 2020-07-05 -1.467785 0.847787 0.689302 -0.272599 2020-07-06 0.607181 -0.681571 0.480336 1.924793 2020-07-07 0.993651 -1.429534 1.063160 -0.477975 2020-07-08 0.317019 1.150236 1.235036 1.173084 2020-07-09 2.302594 0.536270 -0.806829 0.786662 2020-07-10 -0.101381 -0.477274 -0.988975 -2.492971 2020-07-11 -1.408476 -0.708248 1.382285 -1.137585 2020-07-12 0.099601 0.570422 0.183447 -0.985196 2020-07-13 2.066843 -0.036730 -0.114365 0.233777
data. loc[ '20200707' : '20200709' , [ 'B' , 'C' ] ]
B C 2020-07-07 -1.429534 1.063160 2020-07-08 1.150236 1.235036 2020-07-09 0.536270 -0.806829
data. loc[ '20200707' , 'C' ]
1.0631595862205687
data. iloc[ 1 : 3 ]
A B C D 2020-07-05 -1.467785 0.847787 0.689302 -0.272599 2020-07-06 0.607181 -0.681571 0.480336 1.924793
data. iloc[ 0 : 3 , 1 : 4 ]
B C D 2020-07-04 0.330546 0.069813 0.753270 2020-07-05 0.847787 0.689302 -0.272599 2020-07-06 -0.681571 0.480336 1.924793
data. at[ pd. Timestamp( '20200707' ) , 'C' ]
1.0631595862205687
data. iat[ 2 , 3 ]
1.9247925915867081
data[ data > 0 ]
A B C D 2020-07-04 1.461007 0.330546 0.069813 0.753270 2020-07-05 NaN 0.847787 0.689302 NaN 2020-07-06 0.607181 NaN 0.480336 1.924793 2020-07-07 0.993651 NaN 1.063160 NaN 2020-07-08 0.317019 1.150236 1.235036 1.173084 2020-07-09 2.302594 0.536270 NaN 0.786662 2020-07-10 NaN NaN NaN NaN 2020-07-11 NaN NaN 1.382285 NaN 2020-07-12 0.099601 0.570422 0.183447 NaN 2020-07-13 2.066843 NaN NaN 0.233777
修改数据
data2 = data. copy( )
data2
A B C D 2020-07-04 1.461007 0.330546 0.069813 0.753270 2020-07-05 -1.467785 0.847787 0.689302 -0.272599 2020-07-06 0.607181 -0.681571 0.480336 1.924793 2020-07-07 0.993651 -1.429534 1.063160 -0.477975 2020-07-08 0.317019 1.150236 1.235036 1.173084 2020-07-09 2.302594 0.536270 -0.806829 0.786662 2020-07-10 -0.101381 -0.477274 -0.988975 -2.492971 2020-07-11 -1.408476 -0.708248 1.382285 -1.137585 2020-07-12 0.099601 0.570422 0.183447 -0.985196 2020-07-13 2.066843 -0.036730 -0.114365 0.233777
new = list ( 'abcdefghij' )
data2[ 'NEW' ] = new
data2
A B C D NEW 2020-07-04 1.461007 0.330546 0.069813 0.753270 a 2020-07-05 -1.467785 0.847787 0.689302 -0.272599 b 2020-07-06 0.607181 -0.681571 0.480336 1.924793 c 2020-07-07 0.993651 -1.429534 1.063160 -0.477975 d 2020-07-08 0.317019 1.150236 1.235036 1.173084 e 2020-07-09 2.302594 0.536270 -0.806829 0.786662 f 2020-07-10 -0.101381 -0.477274 -0.988975 -2.492971 g 2020-07-11 -1.408476 -0.708248 1.382285 -1.137585 h 2020-07-12 0.099601 0.570422 0.183447 -0.985196 i 2020-07-13 2.066843 -0.036730 -0.114365 0.233777 j
data2[ data2. NEW. isin( [ 'a' , 'c' ] ) ]
A B C D NEW 2020-07-04 1.461007 0.330546 0.069813 0.753270 a 2020-07-06 0.607181 -0.681571 0.480336 1.924793 c
data2. iat[ 0 , 0 ] = 100
data2
A B C D NEW 2020-07-04 100.000000 0.330546 0.069813 0.753270 a 2020-07-05 -1.467785 0.847787 0.689302 -0.272599 b 2020-07-06 0.607181 -0.681571 0.480336 1.924793 c 2020-07-07 0.993651 -1.429534 1.063160 -0.477975 d 2020-07-08 0.317019 1.150236 1.235036 1.173084 e 2020-07-09 2.302594 0.536270 -0.806829 0.786662 f 2020-07-10 -0.101381 -0.477274 -0.988975 -2.492971 g 2020-07-11 -1.408476 -0.708248 1.382285 -1.137585 h 2020-07-12 0.099601 0.570422 0.183447 -0.985196 i 2020-07-13 2.066843 -0.036730 -0.114365 0.233777 j
data2. B = range ( 10 )
data2
A B C D NEW 2020-07-04 100.000000 0 0.069813 0.753270 a 2020-07-05 -1.467785 1 0.689302 -0.272599 b 2020-07-06 0.607181 2 0.480336 1.924793 c 2020-07-07 0.993651 3 1.063160 -0.477975 d 2020-07-08 0.317019 4 1.235036 1.173084 e 2020-07-09 2.302594 5 -0.806829 0.786662 f 2020-07-10 -0.101381 6 -0.988975 -2.492971 g 2020-07-11 -1.408476 7 1.382285 -1.137585 h 2020-07-12 0.099601 8 0.183447 -0.985196 i 2020-07-13 2.066843 9 -0.114365 0.233777 j
data2. iloc[ 8 : , 2 : 4 ] = np. arange( 4 ) . reshape( 2 , 2 )
data2
A B C D NEW 2020-07-04 100.000000 0 0.069813 0.753270 a 2020-07-05 -1.467785 1 0.689302 -0.272599 b 2020-07-06 0.607181 2 0.480336 1.924793 c 2020-07-07 0.993651 3 1.063160 -0.477975 d 2020-07-08 0.317019 4 1.235036 1.173084 e 2020-07-09 2.302594 5 -0.806829 0.786662 f 2020-07-10 -0.101381 6 -0.988975 -2.492971 g 2020-07-11 -1.408476 7 1.382285 -1.137585 h 2020-07-12 0.099601 8 0.000000 1.000000 i 2020-07-13 2.066843 9 2.000000 3.000000 j
dates = pd. date_range( '20200705' , periods= 6 )
df2 = pd. DataFrame( np. random. randn( 6 , 4 ) , index= dates, columns= list ( 'ABCD' ) )
df2
A B C D 2020-07-05 1.162305 0.467942 1.393984 -0.513032 2020-07-06 1.014034 -0.878029 0.457580 -0.326289 2020-07-07 -0.806825 -0.757913 1.265789 1.237268 2020-07-08 0.351479 0.591388 0.860177 -1.946902 2020-07-09 -0.938336 1.229066 -0.243945 1.210629 2020-07-10 -2.233276 0.243813 0.490214 0.591144
处理缺失值
df3 = df2. reindex( index= dates[ 0 : 4 ] , columns= list ( df2. columns) + [ 'E' ] )
df3
A B C D E 2020-07-05 1.162305 0.467942 1.393984 -0.513032 NaN 2020-07-06 1.014034 -0.878029 0.457580 -0.326289 NaN 2020-07-07 -0.806825 -0.757913 1.265789 1.237268 NaN 2020-07-08 0.351479 0.591388 0.860177 -1.946902 NaN
df3. loc[ dates[ 1 : 3 ] , 'E' ] = 2
df3
A B C D E 2020-07-05 1.162305 0.467942 1.393984 -0.513032 NaN 2020-07-06 1.014034 -0.878029 0.457580 -0.326289 2.0 2020-07-07 -0.806825 -0.757913 1.265789 1.237268 2.0 2020-07-08 0.351479 0.591388 0.860177 -1.946902 NaN
df3. dropna( )
A B C D E 2020-07-06 1.014034 -0.878029 0.457580 -0.326289 2.0 2020-07-07 -0.806825 -0.757913 1.265789 1.237268 2.0
df3. fillna( 5 )
A B C D E 2020-07-05 1.162305 0.467942 1.393984 -0.513032 5.0 2020-07-06 1.014034 -0.878029 0.457580 -0.326289 2.0 2020-07-07 -0.806825 -0.757913 1.265789 1.237268 2.0 2020-07-08 0.351479 0.591388 0.860177 -1.946902 5.0
df3. isnull( )
A B C D E 2020-07-05 False False False False True 2020-07-06 False False False False False 2020-07-07 False False False False False 2020-07-08 False False False False True
df3. isnull( ) . any ( ) . any ( )
True
基本统计
print ( df3)
print ( df3. mean( ) )
A B C D E
2020-07-05 1.162305 0.467942 1.393984 -0.513032 NaN
2020-07-06 1.014034 -0.878029 0.457580 -0.326289 2.0
2020-07-07 -0.806825 -0.757913 1.265789 1.237268 2.0
2020-07-08 0.351479 0.591388 0.860177 -1.946902 NaN
A 0.430248
B -0.144153
C 0.994383
D -0.387239
E 2.000000
dtype: float64
print ( df3. mean( axis= 1 ) )
2020-07-05 0.627800
2020-07-06 0.453459
2020-07-07 0.587664
2020-07-08 -0.035965
Freq: D, dtype: float64
print ( df3. cumsum( ) )
print ( df3. cumsum( axis= 1 ) )
A B C D E
2020-07-05 1.162305 0.467942 1.393984 -0.513032 NaN
2020-07-06 2.176339 -0.410087 1.851565 -0.839322 2.0
2020-07-07 1.369514 -1.167999 3.117354 0.397947 4.0
2020-07-08 1.720993 -0.576611 3.977531 -1.548955 NaN
A B C D E
2020-07-05 1.162305 1.630247 3.024232 2.511199 NaN
2020-07-06 1.014034 0.136005 0.593585 0.267296 2.267296
2020-07-07 -0.806825 -1.564737 -0.298948 0.938320 2.938320
2020-07-08 0.351479 0.942867 1.803044 -0.143858 NaN
s2 = pd. Series( [ 1 , 3 , 5 , np. nan, 6 , np. nan] , index= dates)
s2
2020-07-05 1.0
2020-07-06 3.0
2020-07-07 5.0
2020-07-08 NaN
2020-07-09 6.0
2020-07-10 NaN
Freq: D, dtype: float64
print ( df2)
df2. sub( s2, axis= 0 )
A B C D
2020-07-05 1.162305 0.467942 1.393984 -0.513032
2020-07-06 1.014034 -0.878029 0.457580 -0.326289
2020-07-07 -0.806825 -0.757913 1.265789 1.237268
2020-07-08 0.351479 0.591388 0.860177 -1.946902
2020-07-09 -0.938336 1.229066 -0.243945 1.210629
2020-07-10 -2.233276 0.243813 0.490214 0.591144
A B C D 2020-07-05 0.162305 -0.532058 0.393984 -1.513032 2020-07-06 -1.985966 -3.878029 -2.542420 -3.326289 2020-07-07 -5.806825 -5.757913 -3.734211 -3.762732 2020-07-08 NaN NaN NaN NaN 2020-07-09 -6.938336 -4.770934 -6.243945 -4.789371 2020-07-10 NaN NaN NaN NaN
df2. apply ( np. cumsum)
A B C D 2020-07-05 1.162305 0.467942 1.393984 -0.513032 2020-07-06 2.176339 -0.410087 1.851565 -0.839322 2020-07-07 1.369514 -1.167999 3.117354 0.397947 2020-07-08 1.720993 -0.576611 3.977531 -1.548955 2020-07-09 0.782657 0.652454 3.733586 -0.338326 2020-07-10 -1.450619 0.896267 4.223800 0.252817
def printtype ( x) :
return type ( x)
df2. apply ( printtype)
A <class 'pandas.core.series.Series'>
B <class 'pandas.core.series.Series'>
C <class 'pandas.core.series.Series'>
D <class 'pandas.core.series.Series'>
dtype: object
s3 = pd. Series( np. random. randint( 10 , 20 , size= 10 ) )
s3
0 15
1 15
2 12
3 18
4 12
5 13
6 18
7 15
8 18
9 18
dtype: int32
s3. value_counts( )
18 4
15 3
12 2
13 1
dtype: int64
s3. mode( )
0 18
dtype: int32
数据合并
df4 = pd. DataFrame( np. random. randn( 10 , 4 ) , columns= list ( 'ABCD' ) )
df4
A B C D 0 0.005348 0.351821 -0.708831 -0.227980 1 -0.422946 0.153891 0.418701 0.477547 2 -0.384677 0.677650 -0.741967 -0.060229 3 1.604719 -0.300600 1.207413 -0.064982 4 -0.701497 -0.862793 -1.111455 -1.332034 5 -1.265365 0.938897 0.534374 0.143901 6 -0.176843 -0.549029 -0.951518 -1.567208 7 -0.265452 -0.553400 1.133660 -0.593252 8 -2.557326 0.021411 0.444763 0.073160 9 0.794872 0.796273 -0.366471 -0.434226
print ( df4. iloc[ 0 : 5 ] )
print ( df4. iloc[ 5 : 10 ] )
df5 = pd. concat( [ df4. iloc[ 0 : 5 ] , df4. iloc[ 5 : 10 ] ] )
( df5 == df4) . all ( ) . all ( )
A B C D
0 0.005348 0.351821 -0.708831 -0.227980
1 -0.422946 0.153891 0.418701 0.477547
2 -0.384677 0.677650 -0.741967 -0.060229
3 1.604719 -0.300600 1.207413 -0.064982
4 -0.701497 -0.862793 -1.111455 -1.332034
A B C D
5 -1.265365 0.938897 0.534374 0.143901
6 -0.176843 -0.549029 -0.951518 -1.567208
7 -0.265452 -0.553400 1.133660 -0.593252
8 -2.557326 0.021411 0.444763 0.073160
9 0.794872 0.796273 -0.366471 -0.434226
True
dic2 = { 'key' : [ 'foo' , 'foo' ] , 'lval' : [ 1 , 2 ] }
dic3 = { 'key' : [ 'foo' , 'foo' ] , 'rval' : [ 4 , 5 ] }
df5 = pd. DataFrame( dic2)
df6 = pd. DataFrame( dic3)
df5
df6
pd. merge( df5, df6, on = 'key' )
key lval rval 0 foo 1 4 1 foo 1 5 2 foo 2 4 3 foo 2 5
s = pd. Series( np. random. randint( 1 , 5 , size = 5 ) )
s
0 4
1 3
2 4
3 2
4 2
dtype: int32
print ( df2)
df2. append( s, ignore_index = True )
A B C D
2020-07-05 1.162305 0.467942 1.393984 -0.513032
2020-07-06 1.014034 -0.878029 0.457580 -0.326289
2020-07-07 -0.806825 -0.757913 1.265789 1.237268
2020-07-08 0.351479 0.591388 0.860177 -1.946902
2020-07-09 -0.938336 1.229066 -0.243945 1.210629
2020-07-10 -2.233276 0.243813 0.490214 0.591144
A B C D 0 1 2 3 4 0 1.162305 0.467942 1.393984 -0.513032 NaN NaN NaN NaN NaN 1 1.014034 -0.878029 0.457580 -0.326289 NaN NaN NaN NaN NaN 2 -0.806825 -0.757913 1.265789 1.237268 NaN NaN NaN NaN NaN 3 0.351479 0.591388 0.860177 -1.946902 NaN NaN NaN NaN NaN 4 -0.938336 1.229066 -0.243945 1.210629 NaN NaN NaN NaN NaN 5 -2.233276 0.243813 0.490214 0.591144 NaN NaN NaN NaN NaN 6 NaN NaN NaN NaN 4.0 3.0 4.0 2.0 2.0
分组统计
df7 = pd. DataFrame( { 'A' : [ 'foo' , 'bar' , 'foo' , 'bar' ,
'foo' , 'bar' , 'foo' , 'foo' ] ,
'B' : [ 'one' , 'one' , 'two' , 'three' ,
'two' , 'two' , 'one' , 'three' ] ,
'C' : np. random. randn( 8 ) ,
'D' : np. random. randn( 8 ) } )
df7
A B C D 0 foo one -2.185653 0.495551 1 bar one 0.178044 1.334638 2 foo two 0.134376 1.051614 3 bar three -0.533640 -0.109897 4 foo two -0.184697 -0.930691 5 bar two -0.408308 0.667295 6 foo one -2.382098 -0.896218 7 foo three -0.114952 -1.740475
df7. groupby( 'A' ) . sum ( )
C D A bar -0.763904 1.892036 foo -4.733024 -2.020218
df7. groupby( [ 'A' , 'B' ] ) . sum ( )
C D A B bar one 0.178044 1.334638 three -0.533640 -0.109897 two -0.408308 0.667295 foo one -4.567751 -0.400666 three -0.114952 -1.740475 two -0.050321 0.120923
数据整形
tuples = list ( zip ( * [ [ 'bar' , 'bar' , 'baz' , 'baz' ,
'foo' , 'foo' , 'qux' , 'qux' ] ,
[ 'one' , 'two' , 'one' , 'two' ,
'one' , 'two' , 'one' , 'two' ] ] ) )
index = pd. MultiIndex. from_tuples( tuples, names= [ 'first' , 'second' ] )
df = pd. DataFrame( np. random. randn( 8 , 2 ) , index= index, columns= [ 'A' , 'B' ] )
df
A B first second bar one 0.580665 -1.265347 two -0.844164 -0.803420 baz one 0.144407 2.093790 two -0.738359 -0.243598 foo one -0.522004 0.542619 two 0.077826 -0.919478 qux one 0.719028 0.484342 two 0.804915 0.208629
df. loc[ 'bar' ]
A B second one 0.580665 -1.265347 two -0.844164 -0.803420
df. loc[ 'bar' ] . loc[ 'one' ]
A 0.580665
B -1.265347
Name: one, dtype: float64
df. loc[ 'bar' ] . loc[ 'one' ]
A 0.580665
B -1.265347
Name: one, dtype: float64
stacked = df. stack( )
stacked
first second
bar one A 0.580665
B -1.265347
two A -0.844164
B -0.803420
baz one A 0.144407
B 2.093790
two A -0.738359
B -0.243598
foo one A -0.522004
B 0.542619
two A 0.077826
B -0.919478
qux one A 0.719028
B 0.484342
two A 0.804915
B 0.208629
dtype: float64
stacked. index
MultiIndex([('bar', 'one', 'A'),
('bar', 'one', 'B'),
('bar', 'two', 'A'),
('bar', 'two', 'B'),
('baz', 'one', 'A'),
('baz', 'one', 'B'),
('baz', 'two', 'A'),
('baz', 'two', 'B'),
('foo', 'one', 'A'),
('foo', 'one', 'B'),
('foo', 'two', 'A'),
('foo', 'two', 'B'),
('qux', 'one', 'A'),
('qux', 'one', 'B'),
('qux', 'two', 'A'),
('qux', 'two', 'B')],
names=['first', 'second', None])
stacked. unstack( ) . unstack( )
A B second one two one two first bar 0.580665 -0.844164 -1.265347 -0.803420 baz 0.144407 -0.738359 2.093790 -0.243598 foo -0.522004 0.077826 0.542619 -0.919478 qux 0.719028 0.804915 0.484342 0.208629
stacked. unstack( 1 )
second one two first bar A 0.580665 -0.844164 B -1.265347 -0.803420 baz A 0.144407 -0.738359 B 2.093790 -0.243598 foo A -0.522004 0.077826 B 0.542619 -0.919478 qux A 0.719028 0.804915 B 0.484342 0.208629
数据透视
df = pd. DataFrame( { 'A' : [ 'one' , 'one' , 'two' , 'three' ] * 3 ,
'B' : [ 'A' , 'B' , 'C' ] * 4 ,
'C' : [ 'foo' , 'foo' , 'foo' , 'bar' , 'bar' , 'bar' ] * 2 ,
'D' : np. random. randn( 12 ) ,
'E' : np. random. randn( 12 ) } )
df
A B C D E 0 one A foo -1.115244 -1.477782 1 one B foo 0.417247 -0.389403 2 two C foo -0.534743 1.876607 3 three A bar -0.222409 1.227989 4 one B bar -0.904903 0.965748 5 one C bar -0.950457 -0.727518 6 two A foo 0.110705 0.760229 7 three B foo -0.893722 0.098459 8 one C foo 1.270220 1.366383 9 one A bar -1.050292 -0.758947 10 two B bar 1.070277 -1.388617 11 three C bar -0.752369 0.311584
pd. pivot_table( df, values= 'D' , index= [ 'A' , 'B' ] , columns= [ 'C' ] )
C bar foo A B one A -1.050292 -1.115244 B -0.904903 0.417247 C -0.950457 1.270220 three A -0.222409 NaN B NaN -0.893722 C -0.752369 NaN two A NaN 0.110705 B 1.070277 NaN C NaN -0.534743
时间序列
rng = pd. date_range( '20160301' , periods= 600 , freq= 's' )
rng
DatetimeIndex(['2016-03-01 00:00:00', '2016-03-01 00:00:01',
'2016-03-01 00:00:02', '2016-03-01 00:00:03',
'2016-03-01 00:00:04', '2016-03-01 00:00:05',
'2016-03-01 00:00:06', '2016-03-01 00:00:07',
'2016-03-01 00:00:08', '2016-03-01 00:00:09',
...
'2016-03-01 00:09:50', '2016-03-01 00:09:51',
'2016-03-01 00:09:52', '2016-03-01 00:09:53',
'2016-03-01 00:09:54', '2016-03-01 00:09:55',
'2016-03-01 00:09:56', '2016-03-01 00:09:57',
'2016-03-01 00:09:58', '2016-03-01 00:09:59'],
dtype='datetime64[ns]', length=600, freq='S')
ts = pd. Series( np. random. randint( 0 , 500 , len ( rng) ) , index= rng)
ts
2016-03-01 00:00:00 356
2016-03-01 00:00:01 169
2016-03-01 00:00:02 189
2016-03-01 00:00:03 42
2016-03-01 00:00:04 295
...
2016-03-01 00:09:55 377
2016-03-01 00:09:56 248
2016-03-01 00:09:57 70
2016-03-01 00:09:58 490
2016-03-01 00:09:59 6
Freq: S, Length: 600, dtype: int32
ts. resample( '2Min' , how= 'sum' )
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-139-e8bcfecc97a5> in <module>
----> 1 ts.resample('2Min', how='sum')
TypeError: resample() got an unexpected keyword argument 'how'
rng = pd. date_range( '20160301' , periods= 5 , freq= 'M' )
ts = pd. Series( np. random. randn( len ( rng) ) , index= rng)
ts
2016-03-31 -0.501975
2016-04-30 -1.704262
2016-05-31 -2.127817
2016-06-30 -1.115246
2016-07-31 -0.750171
Freq: M, dtype: float64
ps = ts. to_period( )
ps
2016-03 -0.501975
2016-04 -1.704262
2016-05 -2.127817
2016-06 -1.115246
2016-07 -0.750171
Freq: M, dtype: float64
ps = ts. to_period( )
ps
2016-03 -0.501975
2016-04 -1.704262
2016-05 -2.127817
2016-06 -1.115246
2016-07 -0.750171
Freq: M, dtype: float64
print ( pd. Timestamp( '20160301' ) - pd. Timestamp( '20160201' ) )
print ( pd. Timestamp( '20160301' ) + pd. Timedelta( days= 5 ) )
29 days 00:00:00
2016-03-06 00:00:00
类别数据
df = pd. DataFrame( { "id" : [ 1 , 2 , 3 , 4 , 5 , 6 ] , "raw_grade" : [ 'a' , 'b' , 'b' , 'a' , 'a' , 'e' ] } )
df
id raw_grade 0 1 a 1 2 b 2 3 b 3 4 a 4 5 a 5 6 e
df[ "grade" ] = df[ "raw_grade" ] . astype( "category" )
df
id raw_grade grade 0 1 a a 1 2 b b 2 3 b b 3 4 a a 4 5 a a 5 6 e e
df. grade
0 a
1 b
2 b
3 a
4 a
5 e
Name: grade, dtype: category
Categories (3, object): [a, b, e]
df[ "grade" ] . cat. categories
Index(['a', 'b', 'e'], dtype='object')
df[ "grade" ] . cat. categories = [ "very good" , "good" , "very bad" ]
df
id raw_grade grade 0 1 a very good 1 2 b good 2 3 b good 3 4 a very good 4 5 a very good 5 6 e very bad
df. sort_values( by= 'grade' , ascending= True )
id raw_grade grade 0 1 a very good 3 4 a very good 4 5 a very good 1 2 b good 2 3 b good 5 6 e very bad
df. groupby( "grade" ) . size( )
grade
very good 3
good 2
very bad 1
dtype: int64
画图
ts = pd. Series( np. random. randn( 1000 ) , index= pd. date_range( '20000101' , periods= 1000 ) )
ts = ts. cumsum( )
ts
2000-01-01 1.465772
2000-01-02 0.590570
2000-01-03 1.463768
2000-01-04 2.059164
2000-01-05 3.997711
...
2002-09-22 21.165409
2002-09-23 21.138746
2002-09-24 21.833248
2002-09-25 22.712132
2002-09-26 23.005704
Freq: D, Length: 1000, dtype: float64
ts. plot( )
<matplotlib.axes._subplots.AxesSubplot at 0x16bb1ec3668>
数据读写
df = pd. DataFrame( np. random. randn( 100 , 4 ) , columns= list ( 'ABCD' ) )
df
A B C D 0 -0.276452 -0.043881 0.887262 0.573018 1 0.118645 -0.812897 -2.111207 0.335636 2 1.424447 -1.279426 -0.207269 -0.290550 3 -1.023856 -0.190659 0.371154 -0.437665 4 -0.745313 -1.551667 -0.646987 0.094488 ... ... ... ... ... 95 -0.395240 0.577842 1.439321 -0.606708 96 0.373816 0.282914 1.361671 -0.080537 97 -0.852696 0.088553 0.490996 0.064302 98 -1.426436 0.619357 -1.011322 0.367762 99 0.367029 -0.756744 -0.405919 -0.834062
100 rows × 4 columns
df. to_csv( 'data.csv' )
% ls
驱动器 C 中的卷没有标签。
卷的序列号是 4240-7EF8
C:\Users\Administrator.DESKTOP-EUSQJR5 的目录
2020/07/05 07:53 <DIR> .
2020/07/05 07:53 <DIR> ..
2019/07/08 14:29 <DIR> .anaconda
2020/06/22 08:45 <DIR> .astropy
2020/07/05 02:38 <DIR> .conda
2020/07/05 02:07 130 .condarc
2020/06/22 08:49 <DIR> .config
2020/04/19 19:28 <DIR> .eclipse
2019/10/07 21:26 <DIR> .idlerc
2020/07/05 04:49 <DIR> .ipynb_checkpoints
2019/07/08 15:00 <DIR> .ipython
2020/07/05 03:49 <DIR> .jupyter
2020/02/27 17:32 <DIR> .matplotlib
2020/06/26 21:05 <DIR> .p2
2019/12/15 23:43 64 .pgAdmin4.1549456812.addr
2019/12/15 23:42 0 .pgAdmin4.1549456812.log
2019/12/15 23:43 1,597 .pgAdmin4.startup.log
2020/02/29 16:33 <DIR> .PyCharm2019.1
2020/03/04 17:42 <DIR> .pylint.d
2020/03/04 12:26 7 .python_history
2019/10/15 15:47 <DIR> .spss
2020/03/07 12:23 <DIR> .spyder-py3
2020/04/19 19:28 <DIR> .tooling
2020/02/27 17:14 <DIR> .vscode
2020/07/05 04:51 77,246 03.pandas_intro_p1.ipynb
2020/07/05 04:51 64,477 04.pandas_intro_p2.ipynb
2020/07/05 04:49 112,051 05.pandas_intro_p3.ipynb
2019/10/26 19:10 3,047,162 20191026_191017.mp4
2020/06/11 21:30 <DIR> 3D Objects
2020/03/31 16:09 3,618 config.txt
2020/06/11 21:30 <DIR> Contacts
2020/07/05 07:53 8,258 data.csv
2020/07/05 02:42 <DIR> Desktop
2019/10/26 22:12 41 doc2any.ini
2020/06/11 21:30 <DIR> Documents
2020/06/11 21:30 <DIR> Downloads
2020/06/11 21:30 <DIR> Favorites
2019/07/31 14:49 <DIR> Funshion
2020/06/11 21:30 <DIR> Links
2020/06/11 21:30 <DIR> Music
2019/08/31 22:01 <DIR> OneDrive
2020/07/05 07:53 190,020 pandas基础.ipynb
2020/03/31 16:03 426 persist
2020/06/11 21:30 <DIR> Pictures
2020/06/11 21:30 <DIR> Saved Games
2020/06/11 21:30 <DIR> Searches
2019/07/04 22:17 <DIR> UIDowner
2020/06/29 14:54 52,430 Untitled.ipynb
2020/03/04 18:32 1,475 Untitled1.ipynb
2020/06/11 21:30 <DIR> Videos
2019/10/25 19:50 <DIR> Yinxiang Biji
2019/11/26 14:40 <DIR> Zotero
16 个文件 3,559,002 字节
36 个目录 142,991,962,112 可用字节
print ( pd. read_csv( 'data.csv' ) )
print ( pd. read_csv( 'data.csv' , index_col= 0 ) )
Unnamed: 0 A B C D
0 0 -0.276452 -0.043881 0.887262 0.573018
1 1 0.118645 -0.812897 -2.111207 0.335636
2 2 1.424447 -1.279426 -0.207269 -0.290550
3 3 -1.023856 -0.190659 0.371154 -0.437665
4 4 -0.745313 -1.551667 -0.646987 0.094488
.. ... ... ... ... ...
95 95 -0.395240 0.577842 1.439321 -0.606708
96 96 0.373816 0.282914 1.361671 -0.080537
97 97 -0.852696 0.088553 0.490996 0.064302
98 98 -1.426436 0.619357 -1.011322 0.367762
99 99 0.367029 -0.756744 -0.405919 -0.834062
[100 rows x 5 columns]
A B C D
0 -0.276452 -0.043881 0.887262 0.573018
1 0.118645 -0.812897 -2.111207 0.335636
2 1.424447 -1.279426 -0.207269 -0.290550
3 -1.023856 -0.190659 0.371154 -0.437665
4 -0.745313 -1.551667 -0.646987 0.094488
.. ... ... ... ...
95 -0.395240 0.577842 1.439321 -0.606708
96 0.373816 0.282914 1.361671 -0.080537
97 -0.852696 0.088553 0.490996 0.064302
98 -1.426436 0.619357 -1.011322 0.367762
99 0.367029 -0.756744 -0.405919 -0.834062
[100 rows x 4 columns]