pandas的数据结构介绍
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
Series (索引在左边,值在右边。可看作是一个定长的有序字典)
obj = Series( [ 4 , 7 , - 5 , 3 ] )
obj
0 4
1 7
2 -5
3 3
dtype: int64
obj. values
obj. index
RangeIndex(start=0, stop=4, step=1)
obj2 = Series( [ 4 , 7 , - 5 , 3 ] , index= [ 'd' , 'b' , 'a' , 'c' ] )
obj2. index
Index(['d', 'b', 'a', 'c'], dtype='object')
obj2[ obj2 > 0 ]
obj2 * 2
np. exp( obj2)
'b' in obj2
'e' in obj2
False
sdata = { 'Ohio' : 35000 , 'Texas' : 71000 , 'Oregon' : 16000 , 'Utah' : 5000 }
obj3 = Series( sdata)
states = [ 'California' , 'Ohio' , 'Oregon' , 'Texas' ]
obj4 = Series( sdata, index= states)
obj4
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
dtype: float64
pd. isnull( obj4)
pd. notnull( obj4)
California False
Ohio True
Oregon True
Texas True
dtype: bool
obj3 + obj4
California NaN
Ohio 70000.0
Oregon 32000.0
Texas 142000.0
Utah NaN
dtype: float64
obj4. name = 'population'
obj4. index. name = 'state'
obj4
state
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
Name: population, dtype: float64
obj. index = [ 'Bob' , 'Steve' , 'Jeff' , 'Ryan' ]
obj
Bob 4
Steve 7
Jeff -5
Ryan 3
dtype: int64
DataFrame
data = { 'state' : [ 'Ohio' , 'Ohio' , 'Ohio' , 'Nevada' , 'Nevada' ] ,
'year' : [ 2000 , 2001 , 2002 , 2001 , 2002 ] ,
'pop' : [ 1.5 , 1.7 , 3.6 , 2.4 , 2.9 ] }
frame = DataFrame( data)
frame
state year pop 0 Ohio 2000 1.5 1 Ohio 2001 1.7 2 Ohio 2002 3.6 3 Nevada 2001 2.4 4 Nevada 2002 2.9
DataFrame( data, columns= [ 'year' , 'state' , 'pop' ] )
year state pop 0 2000 Ohio 1.5 1 2001 Ohio 1.7 2 2002 Ohio 3.6 3 2001 Nevada 2.4 4 2002 Nevada 2.9
frame2 = DataFrame( data, columns= [ 'year' , 'state' , 'pop' , 'debt' ] , index= [ 'one' , 'two' , 'three' , 'four' , 'five' ] )
frame2
year state pop debt one 2000 Ohio 1.5 NaN two 2001 Ohio 1.7 NaN three 2002 Ohio 3.6 NaN four 2001 Nevada 2.4 NaN five 2002 Nevada 2.9 NaN
frame2[ 'state' ]
frame2. year
frame2. loc[ 'three' ]
year 2002
state Ohio
pop 3.6
debt NaN
Name: three, dtype: object
frame2[ 'debt' ] = 16.5
frame2[ 'debt' ] = np. arange( 5 . )
frame2
year state pop debt one 2000 Ohio 1.5 0.0 two 2001 Ohio 1.7 1.0 three 2002 Ohio 3.6 2.0 four 2001 Nevada 2.4 3.0 five 2002 Nevada 2.9 4.0
val = Series( [ - 1.2 , - 1.5 , - 1.7 ] , index= [ 'two' , 'four' , 'five' ] )
frame2[ 'debt' ] = val
frame2
year state pop debt one 2000 Ohio 1.5 NaN two 2001 Ohio 1.7 -1.2 three 2002 Ohio 3.6 NaN four 2001 Nevada 2.4 -1.5 five 2002 Nevada 2.9 -1.7
frame2[ 'eastern' ] = frame2. state == 'Ohio'
frame2
year state pop debt eastern one 2000 Ohio 1.5 NaN True two 2001 Ohio 1.7 -1.2 True three 2002 Ohio 3.6 NaN True four 2001 Nevada 2.4 -1.5 False five 2002 Nevada 2.9 -1.7 False
del frame2[ 'eastern' ]
frame2. columns
Index(['year', 'state', 'pop', 'debt'], dtype='object')
pop = { 'Nevada' : { 2001 : 2.4 , 2002 : 2.9 } , 'Ohio' : { 2000 : 1.5 , 2001 : 1.7 , 2002 : 3.6 } }
frame3 = DataFrame( pop)
frame3. T
2001 2002 2000 Nevada 2.4 2.9 NaN Ohio 1.7 3.6 1.5
#有个知识点
frame3. index. name = 'year' ; frame3. columns. name = 'state'
frame3. values
frame2. values
array([[2000, 'Ohio', 1.5, nan],
[2001, 'Ohio', 1.7, -1.2],
[2002, 'Ohio', 3.6, nan],
[2001, 'Nevada', 2.4, -1.5],
[2002, 'Nevada', 2.9, -1.7]], dtype=object)
索引对象
Index对象是不可修改的
obj = Series( range ( 3 ) , index= [ 'a' , 'b' , 'c' ] )
index = obj. index
index[ 1 : ]
Index(['b', 'c'], dtype='object')
index = pd. Index( np. arange( 3 ) )
obj2 = Series( [ 1.5 , - 2.5 , 0 ] , index= index)
obj2. index is index
True
#又有个知识点
'Ohio' in frame3. columns
2003 in frame3. index
False
#有个知识点
重新索引
obj = Series( [ 4.5 , 7.2 , - 5.3 , 3.6 ] , index= [ 'd' , 'b' , 'a' , 'c' ] )
obj2 = obj. reindex( [ 'a' , 'b' , 'c' , 'd' , 'e' ] )
obj2
a -5.3
b 7.2
c 3.6
d 4.5
e NaN
dtype: float64
obj. reindex( [ 'a' , 'b' , 'c' , 'd' , 'e' ] , fill_value= 0 )
a -5.3
b 7.2
c 3.6
d 4.5
e 0.0
dtype: float64
obj3 = Series( [ 'blue' , 'purple' , 'yellow' ] , index= [ 0 , 2 , 4 ] )
obj3. reindex( range ( 6 ) , method= 'ffill' )
0 blue
1 blue
2 purple
3 purple
4 yellow
5 yellow
dtype: object
frame = DataFrame( np. arange( 9 ) . reshape( ( 3 , 3 ) ) , index= [ 'a' , 'c' , 'd' ] , columns= [ 'Ohio' , 'Texas' , 'California' ] )
frame2 = frame. reindex( [ 'a' , 'b' , 'c' , 'd' ] )
frame2
Ohio Texas California a 0.0 1.0 2.0 b NaN NaN NaN c 3.0 4.0 5.0 d 6.0 7.0 8.0
states = [ 'Texas' , 'Utah' , 'California' ]
frame. reindex( columns= states)
Texas Utah California a 1 NaN 2 c 4 NaN 5 d 7 NaN 8
frame. reindex( index= [ 'a' , 'b' , 'c' , 'd' ] , columns= states) . ffill( )
Texas Utah California a 1.0 NaN 2.0 b 1.0 NaN 2.0 c 4.0 NaN 5.0 d 7.0 NaN 8.0
frame = frame. reindex( [ 'a' , 'b' , 'c' , 'd' ] )
frame = frame. reindex( columns= states)
frame. loc[ [ 'a' , 'b' , 'c' , 'd' ] , states]
Texas Utah California a 1.0 NaN 2.0 b NaN NaN NaN c 4.0 NaN 5.0 d 7.0 NaN 8.0
#又又又有个知识点
丢弃指定轴上的项
obj = Series( np. arange( 5 . ) , index= [ 'a' , 'b' , 'c' , 'd' , 'e' ] )
new_obj = obj. drop( 'c' )
new_obj
a 0.0
b 1.0
d 3.0
e 4.0
dtype: float64
data = DataFrame( np. arange( 16 ) . reshape( ( 4 , 4 ) ) , index= [ 'Ohio' , 'Colorado' , 'Utah' , 'New York' ] , columns= [ 'one' , 'two' , 'three' , 'four' ] )
data. drop( [ 'Colorado' , 'Ohio' ] )
one two three four Utah 8 9 10 11 New York 12 13 14 15
data. drop( 'two' , axis= 1 )
one three four Ohio 0 2 3 Colorado 4 6 7 Utah 8 10 11 New York 12 14 15
索引、选取和过滤
obj = Series( np. arange( 4 . ) , index= [ 'a' , 'b' , 'c' , 'd' ] )
obj[ 'b' ]
obj[ 1 ]
1.0
obj[ obj< 2 ]
a 0.0
b 1.0
dtype: float64
obj[ 'b' : 'c' ]
b 1.0
c 2.0
dtype: float64
data = DataFrame( np. arange( 16 ) . reshape( ( 4 , 4 ) ) , index= [ 'Ohio' , 'Colorado' , 'Utah' , 'New York' ] , columns= [ 'one' , 'two' , 'three' , 'four' ] )
data[ [ 'three' , 'one' ] ]
three one Ohio 2 0 Colorado 6 4 Utah 10 8 New York 14 12
data[ : 2 ]
one two three four Ohio 0 1 2 3 Colorado 4 5 6 7
data[ data[ 'three' ] > 5 ]
one two three four Colorado 4 5 6 7 Utah 8 9 10 11 New York 12 13 14 15
data[ data< 5 ] = 0
data
one two three four Ohio 0 0 0 0 Colorado 0 5 6 7 Utah 8 9 10 11 New York 12 13 14 15
data. loc[ 'Colorado' , [ 'two' , 'three' ] ]
two 5
three 6
Name: Colorado, dtype: int32
data. loc[ data. three> 5 , : 'three' ]
one two three Colorado 0 5 6 Utah 8 9 10 New York 12 13 14
#又又又又有个知识点
算术运算和数据对齐
s1 = Series( [ 7.3 , - 2.5 , 3.4 , 1.5 ] , index= [ 'a' , 'c' , 'd' , 'e' ] )
s2 = Series( [ - 2.1 , 3.6 , - 1.5 , 4 , 3.1 ] , index= [ 'a' , 'c' , 'e' , 'f' , 'g' ] )
s1+ s2
a 5.2
c 1.1
d NaN
e 0.0
f NaN
g NaN
dtype: float64
df1 = DataFrame( np. arange( 9 . ) . reshape( ( 3 , 3 ) ) , columns= list ( 'bcd' ) , index= [ 'Ohio' , 'Texas' , 'Colorado' ] )
df2 = DataFrame( np. arange( 12 . ) . reshape( ( 4 , 3 ) ) , columns= list ( 'bde' ) , index= [ 'Utah' , 'Ohio' , 'Texas' , 'Oregon' ] )
df1+ df2
b c d e Colorado NaN NaN NaN NaN Ohio 3.0 NaN 6.0 NaN Oregon NaN NaN NaN NaN Texas 9.0 NaN 12.0 NaN Utah NaN NaN NaN NaN
在算术方法中填充值
add 加法 sub 减法 div 除法 mul 乘法
df1 = DataFrame( np. arange( 12 . ) . reshape( ( 3 , 4 ) ) , columns= list ( 'abcd' ) )
df2 = DataFrame( np. arange( 20 . ) . reshape( ( 4 , 5 ) ) , columns= list ( 'abcde' ) )
df1. add( df2, fill_value= 0 )
df1. reindex( columns= df2. columns, fill_value= 0 )
a b c d e 0 0.0 1.0 2.0 3.0 0 1 4.0 5.0 6.0 7.0 0 2 8.0 9.0 10.0 11.0 0
DataFrame和Series之间的运算
arr = np. arange( 12 . ) . reshape( ( 3 , 4 ) )
arr- arr[ 0 ]
array([[0., 0., 0., 0.],
[4., 4., 4., 4.],
[8., 8., 8., 8.]])
frame = DataFrame( np. arange( 12 . ) . reshape( ( 4 , 3 ) ) , columns= list ( 'bde' ) , index= [ 'Utah' , 'Ohio' , 'Texas' , 'Oregon' ] )
series = frame. loc[ 'Utah' ]
frame- series
b d e Utah 0.0 0.0 0.0 Ohio 3.0 3.0 3.0 Texas 6.0 6.0 6.0 Oregon 9.0 9.0 9.0
series2 = Series( range ( 3 ) , index= [ 'b' , 'e' , 'f' ] )
frame+ series2
b d e f Utah 0.0 NaN 3.0 NaN Ohio 3.0 NaN 6.0 NaN Texas 6.0 NaN 9.0 NaN Oregon 9.0 NaN 12.0 NaN
series3 = frame[ 'd' ]
frame. sub( series3, axis= 0 )
b d e Utah -1.0 0.0 1.0 Ohio -1.0 0.0 1.0 Texas -1.0 0.0 1.0 Oregon -1.0 0.0 1.0
函数应用和映射
frame = DataFrame( np. random. randn( 4 , 3 ) , columns= list ( 'bde' ) , index= [ 'Utah' , 'Ohio' , 'Texas' , 'Oregon' ] )
np. abs ( frame)
f = lambda x: x. max ( ) - x. min ( )
frame. apply ( f)
frame. apply ( f, axis= 1 )
Utah 2.341364
Ohio 1.889609
Texas 1.479165
Oregon 1.122892
dtype: float64
def f ( x) :
return Series( [ x. min ( ) , x. max ( ) ] , index= [ 'min' , 'max' ] )
frame. apply ( f)
b d e min -2.121313 -0.231704 -0.309807 max 1.255592 0.670467 1.786023
format = lambda x: '%.2f' % x
frame. applymap( format )
frame[ 'e' ] . map ( format )
Utah 1.79
Ohio -0.31
Texas 1.42
Oregon 0.13
Name: e, dtype: object
排序和排名
obj = Series( range ( 4 ) , index= [ 'd' , 'a' , 'b' , 'c' ] )
obj. sort_index( )
a 1
b 2
c 3
d 0
dtype: int64
frame = DataFrame( np. arange( 8 ) . reshape( ( 2 , 4 ) ) , index= [ 'three' , 'one' ] , columns= [ 'd' , 'a' , 'b' , 'c' ] )
frame. sort_index( )
frame. sort_index( axis= 1 )
frame. sort_index( axis= 1 , ascending= False )
obj = Series( [ 4 , 7 , - 3 , 2 ] )
obj. sort_values( )
2 -3
3 2
0 4
1 7
dtype: int64
obj = Series( [ 4 , np. nan, 7 , np. nan, - 3 , 2 ] )
obj. sort_values( )
4 -3.0
5 2.0
0 4.0
2 7.0
1 NaN
3 NaN
dtype: float64
frame = DataFrame( { 'b' : [ 4 , 7 , - 3 , 2 ] , 'a' : [ 0 , 1 , 0 , 1 ] } )
frame. sort_values( by= 'b' )
frame. sort_values( by= [ 'a' , 'b' ] )
obj = Series( [ 7 , - 5 , 7 , 4 , 2 , 0 , 4 ] )
obj. rank( )
0 6.5
1 1.0
2 6.5
3 4.5
4 3.0
5 2.0
6 4.5
dtype: float64
obj. rank( method= 'first' )
0 6.0
1 1.0
2 7.0
3 4.0
4 3.0
5 2.0
6 5.0
dtype: float64
obj. rank( ascending= False , method= 'max' )
0 2.0
1 7.0
2 2.0
3 4.0
4 5.0
5 6.0
6 4.0
dtype: float64
#又又又又又有个知识点
带有重复值的轴索引
obj = Series( range ( 5 ) , index= [ 'a' , 'a' , 'b' , 'b' , 'c' ] )
obj. index. is_unique
False
df = DataFrame( np. random. randn( 4 , 3 ) , index= [ 'a' , 'a' , 'b' , 'b' ] )
df. loc[ 'b' ]
0 1 2 b -0.994941 0.304769 0.930754 b 0.918218 0.577393 2.664499
汇总和计算描述统计
df = DataFrame( [ [ 1.4 , np. nan] , [ 7.1 , - 4.5 ] , [ np. nan, np. nan] , [ 0.75 , - 1.3 ] ] , index= [ 'a' , 'b' , 'c' , 'd' ] , columns= [ 'one' , 'two' ] )
df. sum ( )
df. sum ( axis= 1 )
a NaN
b 2.60
c NaN
d -0.55
dtype: float64
df. mean( axis= 1 , skipna= False )
a NaN
b 1.300
c NaN
d -0.275
dtype: float64
#又又又又又又有个知识点
df. idxmax( )
one b
two d
dtype: object
df. cumsum( )
one two a 1.40 NaN b 8.50 -4.5 c NaN NaN d 9.25 -5.8
df. describe( )
one two count 3.000000 2.000000 mean 3.083333 -2.900000 std 3.493685 2.262742 min 0.750000 -4.500000 25% 1.075000 -3.700000 50% 1.400000 -2.900000 75% 4.250000 -2.100000 max 7.100000 -1.300000
obj = Series( [ 'a' , 'a' , 'b' , 'c' ] * 4 )
obj. describe( )
count 16
unique 3
top a
freq 8
dtype: object
#又又又又又又又有个知识点
相关系数与协方差
import pandas_datareader. data as web
all_data = { }
for ticker in [ 'AAPL' , 'IBM' , 'MSFT' , 'GOOG' ] :
all_data[ ticker] = web. get_data_yahoo( ticker, '1/1/2000' , '1/1/2010' )
price = DataFrame( { tic: data[ 'Adj Close' ] for tic, data in all_data. iteritems( ) } )
volume = DataFrame( { tic: data[ 'Volume' ] for tic, data in all_data. iteritems( ) } )
报错。
price = pd. read_pickle( 'E:/python_study_files/python/pydata-notebook-master/examples/yahoo_price.pkl' )
volume = pd. read_pickle( 'E:/python_study_files/python/pydata-notebook-master/examples/yahoo_volume.pkl' )
price. head( )
AAPL GOOG IBM MSFT Date 2010-01-04 27.990226 313.062468 113.304536 25.884104 2010-01-05 28.038618 311.683844 111.935822 25.892466 2010-01-06 27.592626 303.826685 111.208683 25.733566 2010-01-07 27.541619 296.753749 110.823732 25.465944 2010-01-08 27.724725 300.709808 111.935822 25.641571
volume. head( )
AAPL GOOG IBM MSFT Date 2010-01-04 123432400 3927000 6155300 38409100 2010-01-05 150476200 6031900 6841400 49749600 2010-01-06 138040000 7987100 5605300 58182400 2010-01-07 119282800 12876600 5840600 50559700 2010-01-08 111902700 9483900 4197200 51197400
returns = price. pct_change( )
returns. tail( )
AAPL GOOG IBM MSFT Date 2016-10-17 -0.000680 0.001837 0.002072 -0.003483 2016-10-18 -0.000681 0.019616 -0.026168 0.007690 2016-10-19 -0.002979 0.007846 0.003583 -0.002255 2016-10-20 -0.000512 -0.005652 0.001719 -0.004867 2016-10-21 -0.003930 0.003011 -0.012474 0.042096
returns. MSFT. corr( returns. IBM)
0.49976361144151155
returns. MSFT. cov( returns. IBM)
8.870655479703546e-05
returns. corr( )
AAPL GOOG IBM MSFT AAPL 1.000000 0.407919 0.386817 0.389695 GOOG 0.407919 1.000000 0.405099 0.465919 IBM 0.386817 0.405099 1.000000 0.499764 MSFT 0.389695 0.465919 0.499764 1.000000
returns. cov( )
AAPL GOOG IBM MSFT AAPL 0.000277 0.000107 0.000078 0.000095 GOOG 0.000107 0.000251 0.000078 0.000108 IBM 0.000078 0.000078 0.000146 0.000089 MSFT 0.000095 0.000108 0.000089 0.000215
returns. corrwith( returns. IBM)
AAPL 0.386817
GOOG 0.405099
IBM 1.000000
MSFT 0.499764
dtype: float64
returns. corrwith( volume)
AAPL -0.075565
GOOG -0.007067
IBM -0.204849
MSFT -0.092950
dtype: float64
唯一值、值计数以及成员资格
isin 计算一个表示“Series各值是否包含于传入的值序列中”的布尔型数组; unique 计算Series中的唯一值数组,按发现的顺序返回; value_counts 返回一个Series,其索引为唯一值,其值为频率,按计数值降序排列
obj = Series( [ 'c' , 'a' , 'd' , 'a' , 'a' , 'b' , 'b' , 'c' , 'c' ] )
uniques = obj. unique( )
uniques
array(['c', 'a', 'd', 'b'], dtype=object)
obj. value_counts( )
c 3
a 3
b 2
d 1
dtype: int64
pd. value_counts( obj. values, sort= False )
c 3
a 3
d 1
b 2
dtype: int64
mask = obj. isin( [ 'b' , 'c' ] )
obj[ mask]
0 c
5 b
6 b
7 c
8 c
dtype: object
data = DataFrame( { 'Qu1' : [ 1 , 3 , 4 , 3 , 4 ] , 'Qu2' : [ 2 , 3 , 1 , 2 , 3 ] , 'Qu3' : [ 1 , 5 , 2 , 4 , 4 ] } )
result = data. apply ( pd. value_counts) . fillna( 0 )
result
Qu1 Qu2 Qu3 1 1.0 1.0 1.0 2 0.0 2.0 1.0 3 2.0 2.0 0.0 4 2.0 0.0 2.0 5 0.0 0.0 1.0
处理缺失数据
pandas对象上的所有描述统计都排除了缺失数据
string_data = Series( [ 'aardvark' , 'artichoke' , np. nan, 'avocado' ] )
string_data. isnull( )
0 False
1 False
2 True
3 False
dtype: bool
string_data[ 0 ] = None
string_data. isnull( )
0 True
1 False
2 True
3 False
dtype: bool
#又又又又又又又又有个知识点
滤除缺失值
from numpy import nan as NA
data = Series( [ 1 , NA, 3.5 , NA, 7 ] )
data. dropna( )
0 1.0
2 3.5
4 7.0
dtype: float64
data[ data. notnull( ) ]
0 1.0
2 3.5
4 7.0
dtype: float64
data = DataFrame( [ [ 1 . , 6.5 , 3 . ] , [ 1 . , NA, NA] , [ NA, NA, NA] , [ NA, 6.5 , 3 . ] ] )
cleaned = data. dropna( )
cleaned
data. dropna( how= 'all' )
0 1 2 0 1.0 6.5 3.0 1 1.0 NaN NaN 3 NaN 6.5 3.0
data[ 4 ] = NA
data. dropna( axis= 1 , how= 'all' )
0 1 2 0 1.0 6.5 3.0 1 1.0 NaN NaN 2 NaN NaN NaN 3 NaN 6.5 3.0
df = DataFrame( np. random. randn( 7 , 3 ) )
df. loc[ : 4 , 1 ] = NA; df. loc[ : 2 , 2 ] = NA
df. dropna( thresh= 3 )
0 1 2 5 -1.991372 -1.644575 0.675400 6 1.718451 0.312742 -1.484959
填充缺失数据
df. fillna( 0 )
0 1 2 0 -0.292230 0.000000 0.000000 1 0.129826 0.000000 0.000000 2 -0.075307 0.000000 0.000000 3 0.280476 0.000000 -1.259970 4 -1.171738 0.000000 0.206481 5 -1.991372 -1.644575 0.675400 6 1.718451 0.312742 -1.484959
df. fillna( { 1 : 0.5 , 3 : - 1 } )
0 1 2 0 -0.292230 0.500000 NaN 1 0.129826 0.500000 NaN 2 -0.075307 0.500000 NaN 3 0.280476 0.500000 -1.259970 4 -1.171738 0.500000 0.206481 5 -1.991372 -1.644575 0.675400 6 1.718451 0.312742 -1.484959
_ = df. fillna( 0 , inplace= True )
df
0 1 2 0 -0.292230 0.000000 0.000000 1 0.129826 0.000000 0.000000 2 -0.075307 0.000000 0.000000 3 0.280476 0.000000 -1.259970 4 -1.171738 0.000000 0.206481 5 -1.991372 -1.644575 0.675400 6 1.718451 0.312742 -1.484959
df = DataFrame( np. random. randn( 6 , 3 ) )
df. loc[ 2 : , 1 ] = NA; df. loc[ 4 : , 2 ] = NA
df. fillna( method= 'ffill' , limit= 2 )
0 1 2 0 -0.814015 -1.672914 -0.437364 1 0.294209 0.038563 -0.141332 2 -0.337091 0.038563 -0.041438 3 0.698458 0.038563 -0.750640 4 -0.369432 NaN -0.750640 5 -0.437763 NaN -0.750640
data = Series( [ 1 . , NA, 3.5 , NA, 7 ] )
data. fillna( data. mean( ) )
0 1.000000
1 3.833333
2 3.500000
3 3.833333
4 7.000000
dtype: float64
#又又又又又又又又又有个知识点
层次化索引
能使一个轴上拥有多个索引级别,能以低维度形式处理高维度数据
data = Series( np. random. randn( 10 ) , index= [ [ 'a' , 'a' , 'a' , 'b' , 'b' , 'b' , 'c' , 'c' , 'd' , 'd' ] , [ 1 , 2 , 3 , 1 , 2 , 3 , 1 , 2 , 2 , 3 ] ] )
data
a 1 0.861096
2 0.613551
3 1.130427
b 1 -0.210724
2 0.962846
3 0.393051
c 1 -0.774183
2 0.456655
d 2 -0.824490
3 0.908530
dtype: float64
data. index
MultiIndex([('a', 1),
('a', 2),
('a', 3),
('b', 1),
('b', 2),
('b', 3),
('c', 1),
('c', 2),
('d', 2),
('d', 3)],
)
data[ 'b' ]
1 -0.210724
2 0.962846
3 0.393051
dtype: float64
data. loc[ [ 'b' , 'd' ] ]
b 1 -0.210724
2 0.962846
3 0.393051
d 2 -0.824490
3 0.908530
dtype: float64
data[ : , 2 ]
a 0.613551
b 0.962846
c 0.456655
d -0.824490
dtype: float64
data. unstack( )
1 2 3 a 0.861096 0.613551 1.130427 b -0.210724 0.962846 0.393051 c -0.774183 0.456655 NaN d NaN -0.824490 0.908530
data. unstack( ) . stack( )
a 1 0.861096
2 0.613551
3 1.130427
b 1 -0.210724
2 0.962846
3 0.393051
c 1 -0.774183
2 0.456655
d 2 -0.824490
3 0.908530
dtype: float64
frame = DataFrame( np. arange( 12 ) . reshape( ( 4 , 3 ) ) , index= [ [ 'a' , 'a' , 'b' , 'b' ] , [ 1 , 2 , 1 , 2 ] ] , columns= [ [ 'Ohio' , 'Ohio' , 'Colorado' ] , [ 'Green' , 'Red' , 'Green' ] ] )
frame. index. names = [ 'key1' , 'key2' ]
frame. columns. names = [ 'state' , 'color' ]
frame
state Ohio Colorado color Green Red Green key1 key2 a 1 0 1 2 2 3 4 5 b 1 6 7 8 2 9 10 11
frame[ 'Ohio' ]
color Green Red key1 key2 a 1 0 1 2 3 4 b 1 6 7 2 9 10
MultiIndex. from_arrays( [ [ 'Ohio' , 'Ohio' , 'Colorado' ] , [ 'Green' , 'Red' , 'Green' ] ] , names= [ 'state' , 'color' ] )
大概这个意思
重排分级顺序
frame. swaplevel( 'key1' , 'key2' )
state Ohio Colorado color Green Red Green key2 key1 1 a 0 1 2 2 a 3 4 5 1 b 6 7 8 2 b 9 10 11
frame. sort_values( axis= 0 , by= 'key2' )
state Ohio Colorado color Green Red Green key1 key2 a 1 0 1 2 b 1 6 7 8 a 2 3 4 5 b 2 9 10 11
frame. swaplevel( 0 , 1 ) . sort_index( )
state Ohio Colorado color Green Red Green key2 key1 1 a 0 1 2 b 6 7 8 2 a 3 4 5 b 9 10 11
在层次化索引的对象上,如果索引是按字典方式从外到内排序(即调用sort_index),数据选取操作的性能要好很多。
根据级别汇总统计
frame. groupby( level= 'key2' ) . sum ( )
state Ohio Colorado color Green Red Green key2 1 6 8 10 2 12 14 16
frame. groupby( level= 'color' , axis= 1 ) . sum ( )
color Green Red key1 key2 a 1 2 1 2 8 4 b 1 14 7 2 20 10
使用DataFrame的列
frame = DataFrame( { 'a' : range ( 7 ) , 'b' : range ( 7 , 0 , - 1 ) , 'c' : [ 'one' , 'one' , 'one' , 'two' , 'two' , 'two' , 'two' ] , 'd' : [ 0 , 1 , 2 , 0 , 1 , 2 , 3 ] } )
frame2 = frame. set_index( [ 'c' , 'd' ] )
frame2
a b c d one 0 0 7 1 1 6 2 2 5 two 0 3 4 1 4 3 2 5 2 3 6 1
frame. set_index( [ 'c' , 'd' ] , drop= False )
frame2. reset_index( )
c d a b 0 one 0 0 7 1 one 1 1 6 2 one 2 2 5 3 two 0 3 4 4 two 1 4 3 5 two 2 5 2 6 two 3 6 1
其他有关pandas的话题
整数索引
ser = Series( np. arange( 3 . ) )
ser2 = Series( np. arange( 3 . ) , index= [ 'a' , 'b' , 'c' ] )
ser2[ - 1 ]
2.0
ser. loc[ : 1 ]
0 0.0
1 1.0
dtype: float64
ser3 = Series( range ( 3 ) , index= [ - 5 , 1 , 3 ] )
ser3. iat[ 2 ]
2
frame = DataFrame( np. arange( 6 ) . reshape( 3 , 2 ) , index= [ 2 , 0 , 1 ] )
frame. iloc[ 0 ]
frame. iloc[ 0 , : ]
0 0
1 1
Name: 2, dtype: int32
面板数据
可以用一个由DataFrame对象组成的字典或一个三维ndarray来创建Panel对象
price = pd. read_pickle( 'E:/python_study_files/python/pydata-notebook-master/examples/yahoo_price.pkl' )
type ( price)
pandas.core.frame.DataFrame
price. loc[ '6/1/2012' ]
AAPL 73.371509
GOOG 285.205295
IBM 168.989059
MSFT 25.262972
Name: 2012-06-01 00:00:00, dtype: float64
stacked= price. loc[ '6/1/2012' ] . to_frame( )
stacked
2012-06-01 AAPL 73.371509 GOOG 285.205295 IBM 168.989059 MSFT 25.262972
pd. Panel( stacked)
AttributeError: module 'pandas' has no attribute 'Panel'