import numpy as np
import pandas as pd
# Demo frame: 6 rows x 4 columns of random integers drawn from [1, 24),
# indexed by six consecutive calendar days starting 1998-01-02.
d = pd.DataFrame(
    np.random.randint(1, 24, size=(6, 4)),
    columns=["a", "b", "c", "d"],
    index=pd.date_range("19980102", periods=6),
)
print(d)
a b c d
1998-01-02 15 21 23 7
1998-01-03 11 6 5 2
1998-01-04 13 4 23 23
1998-01-05 21 20 22 20
1998-01-06 5 23 19 22
1998-01-07 4 9 11 4
# Build a frame from a dict of heterogeneous column specs; scalars ("B", "F")
# are broadcast to the length of the array-like columns.
d2 = pd.DataFrame(
    {
        "A": np.random.rand(4),
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(np.arange(1, 5), index=[0, 1, 2, 3], dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
d2
A B C D E F 0 0.075741 2013-01-02 1.0 3 test foo 1 0.410090 2013-01-02 2.0 3 train foo 2 0.620653 2013-01-02 3.0 3 test foo 3 0.061835 2013-01-02 4.0 3 train foo
查看数据
查看数据类型,dtypes输出每一列数据类型
print ( d. dtypes)
print ( d2. dtypes)
a int32
b int32
c int32
d int32
dtype: object
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
查看 DataFrame 头部
print ( d. head( ) )
查看 DataFrame 尾部,可选择数量
print ( d. tail( 3 ) )
a b c d
1998-01-02 15 21 23 7
1998-01-03 11 6 5 2
1998-01-04 13 4 23 23
1998-01-05 21 20 22 20
1998-01-06 5 23 19 22
a b c d
1998-01-05 21 20 22 20
1998-01-06 5 23 19 22
1998-01-07 4 9 11 4
行索引列标签
print ( d. index)
print ( d. columns)
DatetimeIndex(['1998-01-02', '1998-01-03', '1998-01-04', '1998-01-05',
'1998-01-06', '1998-01-07'],
dtype='datetime64[ns]', freq='D')
Index(['a', 'b', 'c', 'd'], dtype='object')
输出数组对象
print ( d. to_numpy( ) )
print ( d2. to_numpy( ) )
[[15 21 23 7]
[11 6 5 2]
[13 4 23 23]
[21 20 22 20]
[ 5 23 19 22]
[ 4 9 11 4]]
[[0.07574114092524209 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
[0.41008997302466055 Timestamp('2013-01-02 00:00:00') 2.0 3 'train'
'foo']
[0.6206525618816419 Timestamp('2013-01-02 00:00:00') 3.0 3 'test' 'foo']
[0.061835078456274895 Timestamp('2013-01-02 00:00:00') 4.0 3 'train'
'foo']]
describe() 快速查看数据的统计摘要:
数字数据: DataFrame.count 计算非 NA/空观测值的数量; DataFrame.max 对象中的最大值; DataFrame.min 对象中的最小值; DataFrame.mean 值的平均值; DataFrame.std 观测值的标准差; DataFrame.select_dtypes 根据列的 dtype 包含或排除列,返回 DataFrame 的子集。
对象的数据(例如字符串或时间戳): 结果的索引将包括 count、unique、top 和 freq。其中 top 是最常见的值,freq 是最常见的值出现的频率。时间戳还包括 first 和 last 项目。
混合数据(同时包含数字列和对象列)默认只输出数字列的统计摘要
d. describe( )
a b c d count 6.000000 6.000000 6.000000 6.000000 mean 11.500000 13.833333 17.166667 13.000000 std 6.379655 8.424172 7.494442 9.674709 min 4.000000 4.000000 5.000000 2.000000 25% 6.500000 6.750000 13.000000 4.750000 50% 12.000000 14.500000 20.500000 13.500000 75% 14.500000 20.750000 22.750000 21.500000 max 21.000000 23.000000 23.000000 23.000000
25%、50%、75%为分位值。以 a 列为例,6 个数从小到大排序为 4、5、11、13、15、21;一共6个数,共5个数字间隔,每个四分位间隔为 5/4=1.25 个数字间隔。
计算25分位: 第1个四分位数为上面6个数中的第1+1.25=2.25个数,指的是第2个数加上第2个和第3个数之间0.25位置处的值,即: 5+0.25*(11-5)=6.5
计算50分位: 第2个四分位数为上面6个数中的第1+1.25*2=3.5个数,即: 11+0.5*(13-11)=12
计算75分位: 第3个四分位数为上面6个数中的第1+1.25*3=4.75个数,指的是第4个数加上第4个和第5个数之间0.75位置处的值,即: 13+0.75*(15-13)=14.5
d2. describe( )
A C D count 4.000000 4.000000 4.0 mean 0.292080 2.500000 3.0 std 0.271846 1.290994 0.0 min 0.061835 1.000000 3.0 25% 0.072265 1.750000 3.0 50% 0.242916 2.500000 3.0 75% 0.462731 3.250000 3.0 max 0.620653 4.000000 3.0
转置
d. T
1998-01-02 1998-01-03 1998-01-04 1998-01-05 1998-01-06 1998-01-07 a 15 11 13 21 5 4 b 21 6 4 20 23 9 c 23 5 23 22 19 11 d 7 2 23 20 22 4
排序
索引轴排序
d. sort_index( axis= 0 , ascending= False )
a b c d 1998-01-07 4 9 11 4 1998-01-06 5 23 19 22 1998-01-05 21 20 22 20 1998-01-04 13 4 23 23 1998-01-03 11 6 5 2 1998-01-02 15 21 23 7
d. sort_index( axis= 0 , ascending= False )
a b c d 1998-01-07 4 9 11 4 1998-01-06 5 23 19 22 1998-01-05 21 20 22 20 1998-01-04 13 4 23 23 1998-01-03 11 6 5 2 1998-01-02 15 21 23 7
值排序
d. sort_values( by= [ 'a' ] , axis= 0 )
a b c d 1998-01-07 4 9 11 4 1998-01-06 5 23 19 22 1998-01-03 11 6 5 2 1998-01-04 13 4 23 23 1998-01-02 15 21 23 7 1998-01-05 21 20 22 20
d. sort_values( by= '1998-01-02' , axis= 1 )
d a b c 1998-01-02 7 15 21 23 1998-01-03 2 11 6 5 1998-01-04 23 13 4 23 1998-01-05 20 21 20 22 1998-01-06 22 5 23 19 1998-01-07 4 4 9 11
索引和切片
[ ] 切片
print ( d. a)
print ( d[ 'a' ] )
1998-01-02 15
1998-01-03 11
1998-01-04 13
1998-01-05 21
1998-01-06 5
1998-01-07 4
Freq: D, Name: a, dtype: int32
1998-01-02 15
1998-01-03 11
1998-01-04 13
1998-01-05 21
1998-01-06 5
1998-01-07 4
Freq: D, Name: a, dtype: int32
用 [ ] 切片选择多行(按位置或按索引标签):
d[ : 3 ]
a b c d 1998-01-02 15 21 23 7 1998-01-03 11 6 5 2 1998-01-04 13 4 23 23
d[ '1998-01-03' : '1998-01-05' ]
a b c d 1998-01-03 11 6 5 2 1998-01-04 13 4 23 23 1998-01-05 21 20 22 20
按标签选择切片loc
d. loc[ '1998-01-03' : '1998-01-05' ]
a b c d 1998-01-03 11 6 5 2 1998-01-04 13 4 23 23 1998-01-05 21 20 22 20
d. loc[ '1998-01-03' : '1998-01-06' , 'b' : 'd' ]
b c d 1998-01-03 6 5 2 1998-01-04 4 23 23 1998-01-05 20 22 20 1998-01-06 23 19 22
使用布尔数组获取值:
print ( d. loc[ '1998-01-04' ] > 0 )
print ( d. loc[ : , d. loc[ '1998-01-04' ] > 0 ] )
a True
b True
c True
d True
Name: 1998-01-04 00:00:00, dtype: bool
a b c d
1998-01-02 15 21 23 7
1998-01-03 11 6 5 2
1998-01-04 13 4 23 23
1998-01-05 21 20 22 20
1998-01-06 5 23 19 22
1998-01-07 4 9 11 4
iloc属性类似NumPy 用整数切片:
d. iloc[ 3 ]
a 21
b 20
c 22
d 20
Name: 1998-01-05 00:00:00, dtype: int32
d. iloc[ 3 : 5 , 0 : 2 ]
a b 1998-01-05 21 20 1998-01-06 5 23
d. iloc[ 1 , 1 ]
6
用 isin() 插入、筛选:
d[ 'E' ] = [ 'one' , 'one' , 'two' , 'three' , 'four' , 'three' ]
d
a b c d E 1998-01-02 15 21 23 7 one 1998-01-03 11 6 5 2 one 1998-01-04 13 4 23 23 two 1998-01-05 21 20 22 20 three 1998-01-06 5 23 19 22 four 1998-01-07 4 9 11 4 three
d[ d[ 'E' ] . isin( [ 'two' , 'four' ] ) ]
a b c d E 1998-01-04 13 4 23 23 two 1998-01-06 5 23 19 22 four
d
a b c d E 1998-01-02 15 21 23 7 one 1998-01-03 11 6 5 2 one 1998-01-04 13 4 23 23 two 1998-01-05 21 20 22 20 three 1998-01-06 5 23 19 22 four 1998-01-07 4 9 11 4 three
reindex更改、添加、删除指定轴的索引,并返回数据副本,即不更改原数据。
# `dates` was referenced without ever being defined (NameError). Define it
# here as the 1998 date range that the displayed result is indexed by.
dates = pd.date_range("19980102", periods=6)

df = pd.DataFrame(
    np.random.randn(6, 4),
    index=pd.date_range("20130101", periods=6),
    columns=list("ABCD"),
)

# reindex returns a new frame aligned to the requested labels. None of the
# 1998 row labels exist in `df` (indexed by 2013 dates), and column "E" is
# new, so every cell of the result starts out as NaN.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"])

# Label-based slicing with .loc is inclusive on both ends: first two rows.
df1.loc[dates[0]:dates[1], "E"] = 1
df1
A B C D E 1998-01-02 NaN NaN NaN NaN 1.0 1998-01-03 NaN NaN NaN NaN 1.0 1998-01-04 NaN NaN NaN NaN NaN 1998-01-05 NaN NaN NaN NaN NaN
删除所有含缺失值的行:
df1. dropna( how= 'any' )
填充缺失值:
df1. fillna( value= 5 )
A B C D E 1998-01-02 5.0 5.0 5.0 5.0 1.0 1998-01-03 5.0 5.0 5.0 5.0 1.0 1998-01-04 5.0 5.0 5.0 5.0 5.0 1998-01-05 5.0 5.0 5.0 5.0 5.0
3、运算
统计mean( )
print ( df)
print ( df. mean( ) )
A B C D
2013-01-01 0.725530 0.304938 0.988725 0.749843
2013-01-02 -0.697489 -0.916037 0.019967 0.256584
2013-01-03 -0.590450 -0.261403 -0.414659 -0.344422
2013-01-04 -1.720918 -0.553150 -1.047237 0.222394
2013-01-05 -3.493505 1.665040 0.356288 0.953887
2013-01-06 0.936351 -1.692798 1.251221 1.718479
A -0.806747
B -0.242235
C 0.192384
D 0.592794
dtype: float64
df. mean( 1 )
2013-01-01 0.692259
2013-01-02 -0.334244
2013-01-03 -0.402733
2013-01-04 -0.774728
2013-01-05 -0.129572
2013-01-06 0.553313
Freq: D, dtype: float64
# Daily series shifted down by two periods: the first two slots become NaN
# and the last two original values fall off the end.
s = pd.Series(
    [1, 3, 5, np.nan, 6, 8],
    index=pd.date_range("20130101", periods=6),
).shift(2)
s
2013-01-01 NaN
2013-01-02 NaN
2013-01-03 1.0
2013-01-04 3.0
2013-01-05 5.0
2013-01-06 NaN
Freq: D, dtype: float64
df
A B C D 2013-01-01 0.725530 0.304938 0.988725 0.749843 2013-01-02 -0.697489 -0.916037 0.019967 0.256584 2013-01-03 -0.590450 -0.261403 -0.414659 -0.344422 2013-01-04 -1.720918 -0.553150 -1.047237 0.222394 2013-01-05 -3.493505 1.665040 0.356288 0.953887 2013-01-06 0.936351 -1.692798 1.251221 1.718479
df. sub( s, axis= 'index' )
A B C D 2013-01-01 NaN NaN NaN NaN 2013-01-02 NaN NaN NaN NaN 2013-01-03 -1.590450 -1.261403 -1.414659 -1.344422 2013-01-04 -4.720918 -3.553150 -4.047237 -2.777606 2013-01-05 -8.493505 -3.334960 -4.643712 -4.046113 2013-01-06 NaN NaN NaN NaN
合并(Merge):用 concat() 拼接 pandas 对象
pd. concat( [ df[ : 2 ] , df[ 4 : 5 ] ] )
A B C D 2013-01-01 0.725530 0.304938 0.988725 0.749843 2013-01-02 -0.697489 -0.916037 0.019967 0.256584 2013-01-05 -3.493505 1.665040 0.356288 0.953887
连接(join)
# Two small frames sharing a non-unique "key" column.
left = pd.DataFrame({"key": ["foo", "foo"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo", "foo"], "rval": [4, 5]})
left
right
# SQL-style inner join on "key"; duplicate keys on both sides yield the
# Cartesian product of the matching rows (2 x 2 = 4 rows).
left.merge(right, on="key")
key lval rval 0 foo 1 4 1 foo 1 5 2 foo 2 4 3 foo 2 5
追加(Append)
df = pd.DataFrame(np.random.randn(8, 4), columns=["A", "B", "C", "D"])

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat with ignore_index=True is the supported equivalent. It stacks
# the rows and renumbers the result 0..n-1.
appended = pd.concat([df, df], ignore_index=True)
appended
A B C D 0 -0.761027 0.430054 0.452784 1.122863 1 -2.276889 -0.943561 1.823242 -0.716462 2 0.430023 -0.812228 0.938351 -0.839029 3 0.169974 0.890258 -0.387269 0.510224 4 -1.647350 1.135522 -1.064858 -0.303383 5 -0.382446 0.890663 -0.052855 -0.548905 6 -0.963716 -1.307239 1.830830 0.106964 7 1.481288 -0.022846 1.371338 -0.227230 8 -0.761027 0.430054 0.452784 1.122863 9 -2.276889 -0.943561 1.823242 -0.716462 10 0.430023 -0.812228 0.938351 -0.839029 11 0.169974 0.890258 -0.387269 0.510224 12 -1.647350 1.135522 -1.064858 -0.303383 13 -0.382446 0.890663 -0.052855 -0.548905 14 -0.963716 -1.307239 1.830830 0.106964 15 1.481288 -0.022846 1.371338 -0.227230
未完待续。。。