pandas入门
源码点这里
from pandas import Series,DataFrame
import pandas as pd
pandas的两个主要数据结构:Series和DataFrame
Series 是一中类似于一维数组的对象,它由一组数据以及一组与之相关的数据标签(即索引)组成。
obj=Series([4 ,7 ,-5 ,3 ])
obj
0 4 1 7 2 -5 3 3 dtype: int64
obj.values
obj.index
RangeIndex(start=0, stop=4, step=1)
obj2=Series([4 ,7 ,-5 ,3 ],index=['d' ,'b' ,'a' ,'c' ])
obj2
d 4 b 7 a -5 c 3 dtype: int64
obj2['a' ]
-5
obj2['d' ]
4
obj2[['c' ,'a' ,'d' ]]
c 3 a -5 d 4 dtype: int64
obj2[obj2>0 ]
d 4 b 7 c 3 dtype: int64
obj2*2
d 8 b 14 a -10 c 6 dtype: int64
import numpy as np
np.exp(obj2)
d 54.598150 b 1096.633158 a 0.006738 c 20.085537 dtype: float64
sdata={'0hio' :35000 ,'Texta' :79300 ,'Ohjsodf' :16000 ,'Jsdf' :5000 }
obj3=Series(sdata)
obj3
0hio 35000 Jsdf 5000 Ohjsodf 16000 Texta 79300 dtype: int64
states=['California' ,'0hio' ,'Texta' ,'Ohjsodf' ]
obj4=Series(sdata,index=states)
obj4
California NaN 0hio 35000.0 Texta 79300.0 Ohjsodf 16000.0 dtype: float64
pd.isnull(obj4)
California True 0hio False Texta False Ohjsodf False dtype: bool
pd.notnull(obj4)
California False 0hio True Texta True Ohjsodf True dtype: bool
obj4.isnull()
California True 0hio False Texta False Ohjsodf False dtype: bool **下面重点关注如何处理缺失数据**
obj3
0hio 35000 Jsdf 5000 Ohjsodf 16000 Texta 79300 dtype: int64
obj4
California NaN 0hio 35000.0 Texta 79300.0 Ohjsodf 16000.0 dtype: float64
obj3+obj4
0hio 70000.0 California NaN Jsdf NaN Ohjsodf 32000.0 Texta 158600.0 dtype: float64
obj4.name='population'
obj4.index.name='state'
obj4
state California NaN 0hio 35000.0 Texta 79300.0 Ohjsodf 16000.0 Name: population, dtype: float64
obj
0 4 1 7 2 -5 3 3 dtype: int64
obj.index=['Bob' ,'Steve' ,'Jeff' ,'Ryan' ]
obj
Bob 4 Steve 7 Jeff -5 Ryan 3 dtype: int64
DataFrame ####DataFrame是一个表格型的数据结构,它含有一组有序的列,每列可以是不同的值类型(数值、字符串、布尔值等) ####DataFrame既又行索引也有列索引,它可以被看做Series组成的字典(共同用一个索引)
data={'state' :['0hi0' ,'0hio' ,'0hio' ,'Nevada' ,'Nevada' ],
'year' :[2000 ,2001 ,2002 ,2001 ,2002 ],
'pop' :[1.5 ,1.7 ,3.6 ,2.4 ,2.9 ]
}
frame=DataFrame(data)
frame
pop state year 0 1.5 0hi0 2000 1 1.7 0hio 2001 2 3.6 0hio 2002 3 2.4 Nevada 2001 4 2.9 Nevada 2002
DataFrame(data,columns=['year' ,'state' ,'pop' ])
year state pop 0 2000 0hi0 1.5 1 2001 0hio 1.7 2 2002 0hio 3.6 3 2001 Nevada 2.4 4 2002 Nevada 2.9
frame2=DataFrame(data,columns=['year' ,'state' ,'pop' ,'debt' ],index=['one' ,'two' ,'three' ,'four' ,'five' ])
frame2
year state pop debt one 2000 0hi0 1.5 NaN two 2001 0hio 1.7 NaN three 2002 0hio 3.6 NaN four 2001 Nevada 2.4 NaN five 2002 Nevada 2.9 NaN
frame2.columns
Index([‘year’, ‘state’, ‘pop’, ‘debt’], dtype=’object’)
frame2['state' ]
one 0hi0 two 0hio three 0hio four Nevada five Nevada Name: state, dtype: object
frame2.year
one 2000 two 2001 three 2002 four 2001 five 2002 Name: year, dtype: int64
frame2.ix['three' ]
year 2002 state 0hio pop 3.6 debt NaN Name: three, dtype: object
frame2['debt' ]=16.
frame2
frame2['debt' ]=np.arange(5. )
frame2
year state pop debt one 2000 0hi0 1.5 0.0 two 2001 0hio 1.7 1.0 three 2002 0hio 3.6 2.0 four 2001 Nevada 2.4 3.0 five 2002 Nevada 2.9 4.0
val=Series([-1.2 ,-1.5 ,-1.7 ],index=['two' ,'four' ,'five' ])
frame2['debt' ]=val
frame2
year state pop debt one 2000 0hi0 1.5 NaN two 2001 0hio 1.7 -1.2 three 2002 0hio 3.6 NaN four 2001 Nevada 2.4 -1.5 five 2002 Nevada 2.9 -1.7
frame2['eastern' ]=frame2.state=='0hi0'
frame2
year state pop debt eastern one 2000 0hi0 1.5 NaN True two 2001 0hio 1.7 -1.2 False three 2002 0hio 3.6 NaN False four 2001 Nevada 2.4 -1.5 False five 2002 Nevada 2.9 -1.7 False
del frame2['eastern' ]
frame2.columns
Index([‘year’, ‘state’, ‘pop’, ‘debt’], dtype=’object’)
pop={'Nevada' :{2001 :2.4 ,2002 :2.9 },'0hio' :{2000 :1.5 ,2001 :1.7 ,2002 :3.6 }}
frame3=DataFrame(pop)
frame3
0hio Nevada 2000 1.5 NaN 2001 1.7 2.4 2002 3.6 2.9
frame3.T
2000 2001 2002 0hio 1.5 1.7 3.6 Nevada NaN 2.4 2.9
DataFrame(pop,index=[2001 ,2002 ,2003 ])
0hio Nevada 2001 1.7 2.4 2002 3.6 2.9 2003 NaN NaN
frame3.index.name='year'
frame3.columns.name='state'
frame3
state 0hio Nevada year 2000 1.5 NaN 2001 1.7 2.4 2002 3.6 2.9
frame3.values
array([[ 1.5, nan], [ 1.7, 2.4], [ 3.6, 2.9]])
frame2.values
array([[2000, ‘0hi0’, 1.5, nan], [2001, ‘0hio’, 1.7, -1.2], [2002, ‘0hio’, 3.6, nan], [2001, ‘Nevada’, 2.4, -1.5], [2002, ‘Nevada’, 2.9, -1.7]], dtype=object)
索引对象
obj=Series(range(3 ),index=['a' ,'b' ,'c' ])
index=obj.index
index
Index([‘a’, ‘b’, ‘c’], dtype=’object’)
index[1 :]
Index([‘b’, ‘c’], dtype=’object’)
index[1 ]='d'
————————————————————————— TypeError Traceback (most recent call last) in () 1 #Index对象是不能修改的(immutable),因此用户不能对其进行修改 —-> 2 index[1]=’d’ C:\Users\ZJL\AppData\Local\Programs\Python\Python35\lib\site-packages\pandas\indexes\base.py in __setitem__(self, key, value) 1402 1403 def __setitem__(self, key, value): -> 1404 raise TypeError(“Index does not support mutable operations”) 1405 1406 def __getitem__(self, key): TypeError: Index does not support mutable operations
index=pd.Index(np.arange(3 ))
obj2=Series([1.5 ,-2.5 ,0 ],index=index)
obj2.index is index
True
frame3
'0hio' in frame3.columns
True
2002 in frame3.index
True ##重新索引
obj=Series([4.5 ,7.2 ,-5.3 ,3.6 ],index=['d' ,'b' ,'a' ,'c' ])
obj
d 4.5 b 7.2 a -5.3 c 3.6 dtype: float64
obj2=obj.reindex(['a' ,'b' ,'c' ,'d' ,'e' ])
obj2
a -5.3 b 7.2 c 3.6 d 4.5 e NaN dtype: float64
obj.reindex(['a' ,'b' ,'c' ,'d' ,'e' ],fill_value=0 )
a -5.3 b 7.2 c 3.6 d 4.5 e 0.0 dtype: float64
obj3=Series(['blue' ,'purple' ,'yellow' ],index=[0 ,2 ,4 ])
obj3.reindex(range(6 ),method='ffill' )
0 blue 1 blue 2 purple 3 purple 4 yellow 5 yellow dtype: object
frame=DataFrame(np.arange(9 ).reshape((3 ,3 )),index=['a' ,'b' ,'c' ],columns=['0hio' ,'Texas' ,'California' ])
frame
0hio Texas California a 0 1 2 b 3 4 5 c 6 7 8
frame2=frame.reindex(['a' ,'b' ,'c' ,'d' ])
frame2
0hio Texas California a 0.0 1.0 2.0 b 3.0 4.0 5.0 c 6.0 7.0 8.0 d NaN NaN NaN
states=['Texas' ,'Utah' ,'California' ]
frame.reindex(columns=states)
Texas Utah California a 1 NaN 2 b 4 NaN 5 c 7 NaN 8
frame.reindex(index=['a' ,'b' ,'c' ,'d' ],method='ffill' ,columns=states)
Texas Utah California a 1 NaN 2 b 4 NaN 5 c 7 NaN 8 d 7 NaN 8
frame.ix[['a' ,'b' ,'c' ,'d' ],states]
Texas Utah California a 1.0 NaN 2.0 b 4.0 NaN 5.0 c 7.0 NaN 8.0 d NaN NaN NaN
丢弃指定轴上的项
drop方法返回一个在指定轴上删除了指定值的新对象
obj=Series(np.arange(5.0 ),index=['a' ,'b' ,'c' ,'d' ,'e' ])
new_obj=obj.drop('c' )
new_obj
a 0.0 b 1.0 d 3.0 e 4.0 dtype: float64
obj.drop(['d' ,'c' ])
a 0.0 b 1.0 e 4.0 dtype: float64
data=DataFrame(np.arange(16 ).reshape((4 ,4 )),index=['0hio' ,'Colorado' ,'Utah' ,'New York' ],columns=['one' ,'two' ,'three' ,'four' ])
data
one two three four 0hio 0 1 2 3 Colorado 4 5 6 7 Utah 8 9 10 11 New York 12 13 14 15
data.drop(['Colorado' ,'0hio' ])
data.drop('two' ,axis=1 )
one three four 0hio 0 2 3 Colorado 4 6 7 Utah 8 10 11 New York 12 14 15
data.drop(['two' ,'four' ],axis=1 )
one three 0hio 0 2 Colorado 4 6 Utah 8 10 New York 12 14
索引、选取和过滤
obj=Series(np.arange(4. ),index=['a' ,'b' ,'c' ,'d' ])
obj['b' ]
1.0
obj[1 ]
1.0
obj[2 :4 ]
c 2.0 d 3.0 dtype: float64
obj[['b' ,'a' ,'d' ]]
b 1.0 a 0.0 d 3.0 dtype: float64
obj[[1 ,3 ]]
b 1.0 d 3.0 dtype: float64
obj[obj<2 ]
a 0.0 b 1.0 dtype: float64
obj['b' :'c' ]
b 1.0 c 2.0 dtype: float64
obj['b' :'c' ]=5
obj
a 0.0 b 5.0 c 5.0 d 3.0 dtype: float64
data=DataFrame(np.arange(16 ).reshape((4 ,4 )),index=['0hio' ,'Colorado' ,'Utah' ,'New York' ],columns=['one' ,'two' ,'three' ,'four' ])
data
one two three four 0hio 0 1 2 3 Colorado 4 5 6 7 Utah 8 9 10 11 New York 12 13 14 15
data['two' ]
0hio 1 Colorado 5 Utah 9 New York 13 Name: two, dtype: int32
data[['two' ,'three' ]]
two three 0hio 1 2 Colorado 5 6 Utah 9 10 New York 13 14
data[:2 ]
one two three four 0hio 0 1 2 3 Colorado 4 5 6 7
data[data['three' ]>5 ]
one two three four Colorado 4 5 6 7 Utah 8 9 10 11 New York 12 13 14 15
data<5
one two three four 0hio True True True True Colorado True False False False Utah False False False False New York False False False False
data[data<5 ]=0
data
one two three four 0hio 0 0 0 0 Colorado 0 5 6 7 Utah 8 9 10 11 New York 12 13 14 15
data.ix['Colorado' ,['two' ,'three' ]]
two 5 three 6 Name: Colorado, dtype: int32
data.ix[['Colorado' ,'Utah' ],[3 ,0 ,1 ]]
four one two Colorado 7 0 5 Utah 11 8 9
data.ix[data.three>5 ,:3 ]
one two three Colorado 0 5 6 Utah 8 9 10 New York 12 13 14
算术运算和数据对齐
s1=Series([7.3 ,-2.5 ,3.4 ,1.5 ],index=['a' ,'c' ,'d' ,'e' ])
s2=Series([-2.1 ,3.6 ,-1.5 ,4 ,3.1 ],index=['a' ,'c' ,'e' ,'f' ,'g' ])
s1
a 7.3 c -2.5 d 3.4 e 1.5 dtype: float64
s2
a -2.1 c 3.6 e -1.5 f 4.0 g 3.1 dtype: float64
s1+s2
a 5.2 c 1.1 d NaN e 0.0 f NaN g NaN dtype: float64
df1=DataFrame(np.arange(9. ).reshape((3 ,3 )),columns=list('bcd' ),index=['0hio' ,'Texas' ,'Colorado' ])
df2=DataFrame(np.arange(12. ).reshape((4 ,3 )),columns=list('bde' ),index=['Utah' ,'0hio' ,'Texas' ,'Oregon' ])
df1
b c d 0hio 0.0 1.0 2.0 Texas 3.0 4.0 5.0 Colorado 6.0 7.0 8.0
df2
b d e Utah 0.0 1.0 2.0 0hio 3.0 4.0 5.0 Texas 6.0 7.0 8.0 Oregon 9.0 10.0 11.0
df1+df2
b c d e 0hio 3.0 NaN 6.0 NaN Colorado NaN NaN NaN NaN Oregon NaN NaN NaN NaN Texas 9.0 NaN 12.0 NaN Utah NaN NaN NaN NaN
df1=DataFrame(np.arange(12. ).reshape((3 ,4 )),columns=list('abcd' ))
df2=DataFrame(np.arange(20. ).reshape((4 ,5 )),columns=list('abcde' ))
df1
a b c d 0 0.0 1.0 2.0 3.0 1 4.0 5.0 6.0 7.0 2 8.0 9.0 10.0 11.0
df2
a b c d e 0 0.0 1.0 2.0 3.0 4.0 1 5.0 6.0 7.0 8.0 9.0 2 10.0 11.0 12.0 13.0 14.0 3 15.0 16.0 17.0 18.0 19.0
df1+df2
a b c d e 0 0.0 2.0 4.0 6.0 NaN 1 9.0 11.0 13.0 15.0 NaN 2 18.0 20.0 22.0 24.0 NaN 3 NaN NaN NaN NaN NaN
df1.add(df2,fill_value=0 )
a b c d e 0 0.0 2.0 4.0 6.0 4.0 1 9.0 11.0 13.0 15.0 9.0 2 18.0 20.0 22.0 24.0 14.0 3 15.0 16.0 17.0 18.0 19.0
df1.reindex(columns=df2.columns,fill_value=0 )
a b c d e 0 0.0 1.0 2.0 3.0 0 1 4.0 5.0 6.0 7.0 0 2 8.0 9.0 10.0 11.0 0
DataFrame和Series之间的运算
arr=np.arange(12. ).reshape((3 ,4 ))
arr
array([[ 0., 1., 2., 3.], [ 4., 5., 6., 7.], [ 8., 9., 10., 11.]])
arr[0 ]
array([ 0., 1., 2., 3.])
arr-arr[0 ]
array([[ 0., 0., 0., 0.], [ 4., 4., 4., 4.], [ 8., 8., 8., 8.]])
frame=DataFrame(np.arange(12. ).reshape((4 ,3 )),columns=list('bde' ),index=['Utah' ,'0hio' ,'Texas' ,'Oregon' ])
frame
b d e Utah 0.0 1.0 2.0 0hio 3.0 4.0 5.0 Texas 6.0 7.0 8.0 Oregon 9.0 10.0 11.0
series=frame.ix[0 ]
series
b 0.0 d 1.0 e 2.0 Name: Utah, dtype: float64
frame-series
b d e Utah 0.0 0.0 0.0 0hio 3.0 3.0 3.0 Texas 6.0 6.0 6.0 Oregon 9.0 9.0 9.0
series2=Series(range(3 ),index=['b' ,'e' ,'f' ])
frame+series2
b d e f Utah 0.0 NaN 3.0 NaN 0hio 3.0 NaN 6.0 NaN Texas 6.0 NaN 9.0 NaN Oregon 9.0 NaN 12.0 NaN
series3=frame['d' ]
frame
b d e Utah 0.0 1.0 2.0 0hio 3.0 4.0 5.0 Texas 6.0 7.0 8.0 Oregon 9.0 10.0 11.0
series3
Utah 1.0 0hio 4.0 Texas 7.0 Oregon 10.0 Name: d, dtype: float64
frame.sub(series3,axis=0 )
b d e Utah -1.0 0.0 1.0 0hio -1.0 0.0 1.0 Texas -1.0 0.0 1.0 Oregon -1.0 0.0 1.0
函数应用和映射
frame=DataFrame(np.random.randn(4 ,3 ),columns=list('bde' ),index=['Utah' ,'0hio' ,'Texas' ,'Oregon' ])
frame
b d e Utah -0.191031 -0.004688 -0.329970 0hio 0.708249 0.265398 -2.346897 Texas 1.064349 -1.811846 -0.899921 Oregon 0.334061 -1.058506 0.655632
np.abs(frame)
b d e Utah 0.191031 0.004688 0.329970 0hio 0.708249 0.265398 2.346897 Texas 1.064349 1.811846 0.899921 Oregon 0.334061 1.058506 0.655632
f=lambda x:x.max()-x.min()
frame.apply(f)
b 1.255380 d 2.077245 e 3.002529 dtype: float64
frame.apply(f,axis=1 )
Utah 0.325281 0hio 3.055145 Texas 2.876195 Oregon 1.714138 dtype: float64
def f (x) :
return Series([x.min(),x.max()],index=['min' ,'max' ])
frame.apply(f)
b d e min -0.191031 -1.811846 -2.346897 max 1.064349 0.265398 0.655632
format=lambda x:'%.2f' %x
frame.applymap(format)
b d e Utah -0.19 -0.00 -0.33 0hio 0.71 0.27 -2.35 Texas 1.06 -1.81 -0.90 Oregon 0.33 -1.06 0.66
frame['e' ].map(format)
Utah -0.33 0hio -2.35 Texas -0.90 Oregon 0.66 Name: e, dtype: object ###排序和排名
obj=Series(range(4 ),index=['d' ,'a' ,'b' ,'c' ])
obj.sort_index()
a 1 b 2 c 3 d 0 dtype: int32
obj=Series(range(4 ),index=list('bacd' ))
obj.sort_index()
a 1 b 0 c 2 d 3 dtype: int32
frame=DataFrame(np.arange(8 ).reshape((2 ,4 )),index=['three' ,'one' ],columns=['d' ,'a' ,'b' ,'c' ])
frame.sort_index()
frame.sort_index(axis=1 )
frame.sort_index(axis=1 ,ascending=False )
obj=Series([4 ,7 ,-3 ,2 ])
obj.sort_values()
2 -3 3 2 0 4 1 7 dtype: int64
obj=Series([4 ,np.nan,7 ,np.nan,-3 ,2 ])
obj.sort_values()
4 -3.0 5 2.0 0 4.0 2 7.0 1 NaN 3 NaN dtype: float64
frame=DataFrame({'b' :[4 ,7 ,-3 ,2 ],'a' :[0 ,1 ,0 ,1 ]})
frame
frame.sort_values(by='b' )
frame.sort_values(by=['a' ,'b' ])
obj=Series([7 ,-5 ,7 ,4 ,2 ,0 ,4 ])
obj.rank()
0 6.5 1 1.0 2 6.5 3 4.5 4 3.0 5 2.0 6 4.5 dtype: float64
obj.rank(method='first' )
0 6.0 1 1.0 2 7.0 3 4.0 4 3.0 5 2.0 6 5.0 dtype: float64
obj.rank(ascending=False ,method='max' )
0 2.0 1 7.0 2 2.0 3 4.0 4 5.0 5 6.0 6 4.0 dtype: float64
frame=DataFrame({'b' :[4.3 ,7 ,-3 ,2 ],'a' :[0 ,1 ,0 ,1 ],'c' :[-2 ,5 ,8 ,-2.5 ]})
frame
a b c 0 0 4.3 -2.0 1 1 7.0 5.0 2 0 -3.0 8.0 3 1 2.0 -2.5
frame.rank(axis=1 )
a b c 0 2.0 3.0 1.0 1 1.0 3.0 2.0 2 2.0 1.0 3.0 3 2.0 3.0 1.0
带有重复值的轴索引
obj=Series(range(5 ),index=['a' ,'a' ,'b' ,'b' ,'c' ])
obj
a 0 a 1 b 2 b 3 c 4 dtype: int32
obj.index.is_unique
False
obj['a' ]
a 0 a 1 dtype: int32
obj['c' ]
4
df=DataFrame(np.random.randn(4 ,3 ),index=['a' ,'a' ,'b' ,'b' ])
df
0 1 2 a -0.524361 -0.145395 -1.322196 a -0.666326 -0.496612 1.486401 b -0.395841 -0.921194 0.260437 b -0.187285 -0.456014 1.434571
df.ix['b' ]
0 1 2 b -0.395841 -0.921194 0.260437 b -0.187285 -0.456014 1.434571
汇总和计算描述统计
df=DataFrame([[1.4 ,np.nan],[7.1 ,-4.5 ],[np.nan,np.nan],[0.75 ,-1.3 ]],index=['a' ,'b' ,'c' ,'d' ],columns=['one' ,'two' ])
df
one two a 1.40 NaN b 7.10 -4.5 c NaN NaN d 0.75 -1.3
df.sum()
one 9.25 two -5.80 dtype: float64
df.sum(axis=1 )
a 1.40 b 2.60 c NaN d -0.55 dtype: float64
df.mean(axis=1 ,skipna=False )
a NaN b 1.300 c NaN d -0.275 dtype: float64
df.idxmax()
one b two d dtype: object
df.cumsum()
one two a 1.40 NaN b 8.50 -4.5 c NaN NaN d 9.25 -5.8
df.describe()
one two count 3.000000 2.000000 mean 3.083333 -2.900000 std 3.493685 2.262742 min 0.750000 -4.500000 25% 1.075000 -3.700000 50% 1.400000 -2.900000 75% 4.250000 -2.100000 max 7.100000 -1.300000
obj=Series(['a' ,'a' ,'b' ,'c' ]*4 )
obj.describe()
count 16 unique 3 top a freq 8 dtype: object ##相关系数与协方差
import pandas_datareader.data as web
all_data={}
for ticker in ['AAPL' ,'IBM' ,'MSFT' ,'GOOG' ]:
all_data[ticker]=web.get_data_yahoo(ticker,'1/1/2000' ,'1/1/2010' )
price=DataFrame({tic:data['Adj Close' ] for tic,data in all_data.items()})
volume=DataFrame({tic:data['Volume' ] for tic,data in all_data.items()})
returns=price.pct_change()
returns.tail()
AAPL GOOG IBM MSFT Date 2009-12-24 0.034339 0.011117 0.004385 0.002587 2009-12-28 0.012294 0.007098 0.013326 0.005484 2009-12-29 -0.011861 -0.005571 -0.003477 0.007058 2009-12-30 0.012147 0.005376 0.005461 -0.013699 2009-12-31 -0.004300 -0.004416 -0.012597 -0.015504
returns.MSFT.corr(returns.IBM)
0.49597963862836764
returns.corr()
AAPL GOOG IBM MSFT AAPL 1.000000 0.470676 0.410011 0.424305 GOOG 0.470676 1.000000 0.390689 0.443587 IBM 0.410011 0.390689 1.000000 0.495980 MSFT 0.424305 0.443587 0.495980 1.000000
returns.cov()
AAPL GOOG IBM MSFT AAPL 0.001027 0.000303 0.000252 0.000309 GOOG 0.000303 0.000580 0.000142 0.000205 IBM 0.000252 0.000142 0.000367 0.000216 MSFT 0.000309 0.000205 0.000216 0.000516
returns.corrwith(returns.IBM)
AAPL 0.410011 GOOG 0.390689 IBM 1.000000 MSFT 0.495980 dtype: float64
returns.corrwith(volume)
————————————————————————— NameError Traceback (most recent call last) in () 1 #传入一个DataFrame则会计算按列名配对的相关系数。这里,计算百分比变化与成交量的相关系数 —-> 2 returns.corrwith(volume) NameError: name ‘returns’ is not defined
唯一值、值计数以及成员资格
obj=Series(['c' ,'a' ,'d' ,'a' ,'a' ,'b' ,'b' ,'c' ,'c' ])
uniques=obj.unique()
uniques
array([‘c’, ‘a’, ‘d’, ‘b’], dtype=object)
obj.value_counts()
c 3 a 3 b 2 d 1 dtype: int64
pd.value_counts(obj.values,sort=False )
d 1 b 2 a 3 c 3 dtype: int64
mask=obj.isin(['b' ,'c' ])
mask
0 True 1 False 2 False 3 False 4 False 5 True 6 True 7 True 8 True dtype: bool
obj[mask]
0 c 5 b 6 b 7 c 8 c dtype: object
data=DataFrame({'Qu1' :[1 ,3 ,4 ,3 ,4 ],'Qu2' :[2 ,3 ,1 ,2 ,3 ],'Qu3' :[1 ,5 ,2 ,4 ,4 ]})
data
Qu1 Qu2 Qu3 0 1 2 1 1 3 3 5 2 4 1 2 3 3 2 4 4 4 3 4
result=data.apply(pd.value_counts).fillna(0 )
result
Qu1 Qu2 Qu3 1 1.0 1.0 1.0 2 0.0 2.0 1.0 3 2.0 2.0 0.0 4 2.0 0.0 2.0 5 0.0 0.0 1.0
处理缺失数据
string_data=Series(['aaedvark' ,'artichoke' ,np.nan,'avocado' ])
string_data
0 aaedvark 1 artichoke 2 NaN 3 avocado dtype: object
string_data.isnull()
0 False 1 False 2 True 3 False dtype: bool
string_data[0 ]=None
string_data.isnull()
0 True 1 False 2 True 3 False dtype: bool ###滤除缺失数据
from numpy import nan as NA
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
data=Series([1 ,NA,3.5 ,NA,7 ])
data.dropna()
0 1.0 2 3.5 4 7.0 dtype: float64
data[data.notnull()]
0 1.0 2 3.5 4 7.0 dtype: float64
data=DataFrame([[1. ,6.5 ,3. ],[1. ,NA,NA],[NA,NA,NA],[NA,6.5 ,3. ]])
clearned=data.dropna()
data
0 1 2 0 1.0 6.5 3.0 1 1.0 NaN NaN 2 NaN NaN NaN 3 NaN 6.5 3.0
clearned
data.dropna(how='all' )
0 1 2 0 1.0 6.5 3.0 1 1.0 NaN NaN 3 NaN 6.5 3.0
data[4 ]=NA
data
0 1 2 4 0 1.0 6.5 3.0 NaN 1 1.0 NaN NaN NaN 2 NaN NaN NaN NaN 3 NaN 6.5 3.0 NaN
data.dropna(axis=1 ,how='all' )
0 1 2 0 1.0 6.5 3.0 1 1.0 NaN NaN 2 NaN NaN NaN 3 NaN 6.5 3.0
df=DataFrame(np.random.randn(7 ,3 ))
df.ix[:4 ,1 ]=NA
df.ix[:2 ,2 ]=NA
df
0 1 2 0 -1.637463 NaN NaN 1 -1.259674 NaN NaN 2 -0.284635 NaN NaN 3 0.818905 NaN -1.878244 4 -2.402401 NaN -0.533942 5 -0.623351 -1.472599 -0.860614 6 -0.194565 -1.757851 -1.251312
df.dropna(thresh=3 )
0 1 2 5 -0.623351 -1.472599 -0.860614 6 -0.194565 -1.757851 -1.251312
填充缺失数据
df.fillna(0 )
0 1 2 0 -1.637463 0.000000 0.000000 1 -1.259674 0.000000 0.000000 2 -0.284635 0.000000 0.000000 3 0.818905 0.000000 -1.878244 4 -2.402401 0.000000 -0.533942 5 -0.623351 -1.472599 -0.860614 6 -0.194565 -1.757851 -1.251312
df.fillna({1 :0.5 ,2 :-1 })
0 1 2 0 -1.637463 0.500000 -1.000000 1 -1.259674 0.500000 -1.000000 2 -0.284635 0.500000 -1.000000 3 0.818905 0.500000 -1.878244 4 -2.402401 0.500000 -0.533942 5 -0.623351 -1.472599 -0.860614 6 -0.194565 -1.757851 -1.251312
_=df.fillna(0 ,inplace=True )
df
0 1 2 0 -1.637463 0.000000 0.000000 1 -1.259674 0.000000 0.000000 2 -0.284635 0.000000 0.000000 3 0.818905 0.000000 -1.878244 4 -2.402401 0.000000 -0.533942 5 -0.623351 -1.472599 -0.860614 6 -0.194565 -1.757851 -1.251312
df=DataFrame(np.random.randn(6 ,3 ))
df.ix[2 :,1 ]=NA
df.ix[4 :,2 ]=NA
df
0 1 2 0 0.173799 0.267422 0.480141 1 1.303258 -0.429756 -0.790661 2 -0.110613 NaN 0.878062 3 1.188953 NaN -0.125561 4 -0.512800 NaN NaN 5 -0.383978 NaN NaN
df.fillna(method='ffill' )
0 1 2 0 0.173799 0.267422 0.480141 1 1.303258 -0.429756 -0.790661 2 -0.110613 -0.429756 0.878062 3 1.188953 -0.429756 -0.125561 4 -0.512800 -0.429756 -0.125561 5 -0.383978 -0.429756 -0.125561
df.fillna(method='ffill' ,limit=2 )
0 1 2 0 0.173799 0.267422 0.480141 1 1.303258 -0.429756 -0.790661 2 -0.110613 -0.429756 0.878062 3 1.188953 -0.429756 -0.125561 4 -0.512800 NaN -0.125561 5 -0.383978 NaN -0.125561
data=Series([1. ,NA,3.5 ,NA,7 ])
data.fillna(data.mean())
0 1.000000 1 3.833333 2 3.500000 3 3.833333 4 7.000000 dtype: float64
层次化索引
data=Series(np.random.randn(10 ),index=[['a' ,'a' ,'a' ,'b' ,'b' ,'b' ,'c' ,'c' ,'d' ,'d' ],[1 ,2 ,3 ,1 ,2 ,3 ,1 ,2 ,2 ,3 ]])
data
a 1 -2.059265 2 0.276982 3 -1.771092 b 1 0.501535 2 1.547647 3 -0.038850 c 1 1.963156 2 -0.905470 d 2 -1.697117 3 -0.659792 dtype: float64
data.index
MultiIndex(levels=[[‘a’, ‘b’, ‘c’, ‘d’], [1, 2, 3]], labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])
data['b' ]
1 0.501535 2 1.547647 3 -0.038850 dtype: float64
data['b' :'c' ]
b 1 0.501535 2 1.547647 3 -0.038850 c 1 1.963156 2 -0.905470 dtype: float64
data.ix[['b' ,'c' ]]
b 1 0.501535 2 1.547647 3 -0.038850 c 1 1.963156 2 -0.905470 dtype: float64
data[:,2 ]
a 0.276982 b 1.547647 c -0.905470 d -1.697117 dtype: float64
data.unstack()
1 2 3 a -2.059265 0.276982 -1.771092 b 0.501535 1.547647 -0.038850 c 1.963156 -0.905470 NaN d NaN -1.697117 -0.659792
data.unstack().stack()
a 1 -2.059265 2 0.276982 3 -1.771092 b 1 0.501535 2 1.547647 3 -0.038850 c 1 1.963156 2 -0.905470 d 2 -1.697117 3 -0.659792 dtype: float64
frame=DataFrame(np.arange(12 ).reshape((4 ,3 )),index=[['a' ,'a' ,'b' ,'b' ],[1 ,2 ,1 ,2 ]],
columns=[['Ohio' ,'Ohio' ,'Colorado' ],['Green' ,'Red' ,'Green' ]])
frame
Ohio Colorado Green Red Green a 1 0 1 2 2 3 4 5 b 1 6 7 8 2 9 10 11
frame.index.names=['key1' ,'key2' ]
frame.columns.names=['state' ,'color' ]
frame
state Ohio Colorado color Green Red Green key1 key2 a 1 0 1 2 2 3 4 5 b 1 6 7 8 2 9 10 11
frame['Ohio' ]
color Green Red key1 key2 a 1 0 1 2 3 4 b 1 6 7 2 9 10
重排分级顺序
frame.swaplevel('key1' ,'key2' )
state Ohio Colorado color Green Red Green key2 key1 1 a 0 1 2 2 a 3 4 5 1 b 6 7 8 2 b 9 10 11
frame.sortlevel(1 )
state Ohio Colorado color Green Red Green key1 key2 a 1 0 1 2 b 1 6 7 8 a 2 3 4 5 b 2 9 10 11
frame.swaplevel(0 ,1 ).sortlevel(0 )
state Ohio Colorado color Green Red Green key2 key1 1 a 0 1 2 b 6 7 8 2 a 3 4 5 b 9 10 11
根据级别汇总统计
frame.sum(level='key2' )
state Ohio Colorado color Green Red Green key2 1 6 8 10 2 12 14 16
frame.sum(level='color' ,axis=1 )
color Green Red key1 key2 a 1 2 1 2 8 4 b 1 14 7 2 20 10
使用DataFrame的列
frame=DataFrame({'a' :range(7 ),'b' :range(7 ,0 ,-1 ),'c' :['one' ,'one' ,'one' ,'two' ,'two' ,'two' ,'two' ],
'd' :[0 ,1 ,2 ,0 ,1 ,2 ,3 ]})
frame
a b c d 0 0 7 one 0 1 1 6 one 1 2 2 5 one 2 3 3 4 two 0 4 4 3 two 1 5 5 2 two 2 6 6 1 two 3
frame2=frame.set_index(['c' ,'d' ])
frame2
a b c d one 0 0 7 1 1 6 2 2 5 two 0 3 4 1 4 3 2 5 2 3 6 1
frame.set_index(['c' ,'d' ],drop=False )
a b c d c d one 0 0 7 one 0 1 1 6 one 1 2 2 5 one 2 two 0 3 4 two 0 1 4 3 two 1 2 5 2 two 2 3 6 1 two 3
frame2.reset_index()
c d a b 0 one 0 0 7 1 one 1 1 6 2 one 2 2 5 3 two 0 3 4 4 two 1 4 3 5 two 2 5 2 6 two 3 6 1
其他有关pandas的话题
整数索引
ser=Series(np.arange(3. ))
ser
0 0.0
1 1.0
2 2.0
dtype: float64
ser[1 ]
1.0
ser
ser2=Series(np.arange(3. ),index=['a' ,'b' ,'c' ])
ser2[-1 ]
2.0
ser.ix[:1 ]
0 0.0
1 1.0
dtype: float64
ser3=Series(range(3 ),index=[-5 ,1 ,3 ])
ser3.iloc[2 ]
2
frame=DataFrame(np.arange(6 ).reshape(3 ,2 ),index=[2 ,0 ,1 ])
frame.iloc[0 ]
0 0
1 1
Name: 2, dtype: int32
面板数据
from pandas_datareader import data as web
pdata=pd.Panel(dict((stk,web.get_data_yahoo(stk,'1/1/2009' ,'6/1/2012' ) ) for stk in ['AAPL' ,'GOOG' ,'MSFT' ,'DELL' ]))
pdata