本篇博文主要介绍数据分析包 pandas 的使用,主要参考资料来自 pandas 官网。掌握以下内容可以帮助 data scientist 快速理解 pandas 日常的数据分析操作;读者可以自己定义一些数据,跟着练习一下。pandas 确实很强大,比自己一点点地用 numpy 去写要省事许多,可以为大家省下不少时间和精力,把工作重点放在对算法或业务的深入理解上。内容 so young so naive,但是仔细读来也可以 have some fun!
In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [5]:
s=pd.Series([1,3,5,np.nan,6,8]) #产生一个序列
In [6]:
s
Out[6]:
In [7]:
dates=pd.date_range('20160501',periods=6)
In [8]:
dates
Out[8]:
In [10]:
df=pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
In [11]:
df
Out[11]:
In [12]:
# Build a small frame in which every column has a different dtype. The scalar
# entries ('A', 'B', 'F') are repeated for each of the four rows defined by
# the array-like columns.
df2 = pd.DataFrame(
    {
        'A': 1,
        'B': pd.Timestamp('20160501'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(["test", "train"] * 2),
        'F': 'foo',
    }
)
In [13]:
df2
Out[13]:
In [14]:
df2.dtypes
Out[14]:
In [16]:
df.head()
Out[16]:
In [17]:
df.tail(3)
Out[17]:
In [18]:
df.index
Out[18]:
In [21]:
df.columns
Out[21]:
In [22]:
df.values
Out[22]:
In [23]:
df.describe()
Out[23]:
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
dates=pd.date_range('20160501',periods=6)
df=pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
In [3]:
df
Out[3]:
In [4]:
df.T
Out[4]:
In [5]:
df
Out[5]:
In [6]:
df.sort_index(axis=1,ascending=False) #以索引降序排列 横向
Out[6]:
In [12]:
df['A']
Out[12]:
In [13]:
df[0:3] #获取前三行012
Out[13]:
In [15]:
df['20160502':'20160504']
Out[15]:
In [16]:
df.loc[dates[1]] #按日期截取
Out[16]:
In [17]:
df.loc[:,['A','B']]
Out[17]:
In [18]:
df.loc['20160502':'20160505',['A','B']] #使用loc指定截取
Out[18]:
In [19]:
df.loc[dates[1],'A']
Out[19]:
In [20]:
df.iloc[5] #根据行数直接选定
Out[20]:
In [24]:
df.iloc[4:6,0:2] # iloc直接确定行列数
Out[24]:
In [25]:
df.iloc[1:3,:]
Out[25]:
In [26]:
df[df.A>0]
Out[26]:
In [27]:
df[df>0] #只显示大于零的
Out[27]:
In [28]:
df2=df.copy()
In [29]:
df2['E']=['one','one','two','three','four','three']
In [30]:
df2
Out[30]:
In [31]:
df2[df2['E'].isin(['two','four'])]
Out[31]:
In [32]:
s1=pd.Series([1,2,3,4,5,6],index=pd.date_range('20160501',periods=6))
In [33]:
s1
Out[33]:
In [34]:
df['F']=s1
In [35]:
df
Out[35]:
In [36]:
df.at[dates[0],'A']=0
In [37]:
df
Out[37]:
In [38]:
df.loc[:,'D']=np.array([5]*len(df)) #把D列替换掉为5
In [39]:
df
Out[39]:
In [40]:
df.iat[0,1]=0 #iat直接确定行列
In [41]:
df
Out[41]:
In [43]:
del df['F']
In [44]:
df
Out[44]:
In [46]:
s1=pd.Series([1,2,3,4,5,6],index=pd.date_range('20160502',periods=6))
In [48]:
s1 # 注意1号是缺失的
Out[48]:
In [49]:
df['F']=s1
In [50]:
df
Out[50]:
In [51]:
df2=df.copy()
df2[df2>0]=-df2 #全改为负的
df2
Out[51]:
In [52]:
df1=df.reindex(index=dates[0:4],columns=list(df.columns)+['E'])
df1.loc[dates[0]:dates[1],'E']=1 #只是对E前两个数进行了赋值,后面的未定
df1
Out[52]:
In [53]:
df1.dropna(how='any') #把含有不确定值的样本去掉
Out[53]:
In [54]:
df1.fillna(value=5) #填补不确定的NaN数值为5
Out[54]:
In [55]:
pd.isnull(df1)
Out[55]:
In [56]:
df
Out[56]:
In [57]:
df.mean() #每一列的均值
Out[57]:
In [58]:
df.mean(1) #每一行的均值
Out[58]:
In [59]:
s=pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2) #右移两位
s
Out[59]:
In [60]:
df.sub(s,axis='index') # 对应位置相减去
Out[60]:
In [61]:
df.apply(np.cumsum) #一列一列累计求和
Out[61]:
In [62]:
df
Out[62]:
In [63]:
df.apply(lambda x: x.max()-x.min()) # 应用函数 按列计算
Out[63]:
In [64]:
s=pd.Series(['A','B','C','Aaba','Baca',np.nan,'CABA','dog','cat'])
In [65]:
s.str.lower()
Out[65]:
In [66]:
df=pd.DataFrame(np.random.randn(10,4))
In [67]:
df
Out[67]:
In [68]:
pieces=[df[:3],df[3:7],df[7:]] #按行切片 貌似“数字”就是表示行
pd.concat(pieces)
Out[68]:
In [69]:
# Two frames sharing the join column 'key'. The value columns are named
# 'lval' / 'rval' for the left and right side respectively (the left column
# was previously misspelled 'Ival' with a capital I).
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
In [70]:
left
Out[70]:
In [71]:
right
Out[71]:
In [72]:
pd.merge(left,right,on='key')
Out[72]:
In [73]:
df=pd.DataFrame(np.random.randn(8,4),columns=['A','B','C','D'])
df
Out[73]:
In [74]:
s = df.iloc[3]  # iloc selects by integer position: this is the fourth row
# DataFrame.append was removed in pandas 2.0. Turn the row back into a
# one-row frame and concatenate it instead; ignore_index renumbers the rows.
pd.concat([df, s.to_frame().T], ignore_index=True)
Out[74]:
In [77]:
df=pd.DataFrame({'A':['foo','bar','foo','bar',
'foo','bar','foo','foo'],
'B':['one','one','two','three',
'two','two','one','three'],
'C':np.random.randn(8),
'D':np.random.randn(8)})
df
Out[77]:
In [78]:
df.groupby('A').sum()
Out[78]:
In [79]:
df.groupby(['A','B']).sum() #AB都是标签索引
Out[79]:
In [80]:
# Pair every first-level label with its second-level label.
tuples = list(zip(
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two'] * 4,
))
# Two-level (hierarchical) index built from the label pairs.
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index
Out[80]:
In [81]:
df=pd.DataFrame(np.random.randn(8,2),index=index,columns=['A','B']) #以index索引指标
df
Out[81]:
In [82]:
df2=df[:4] #截取前四行
df2
Out[82]:
In [83]:
stacked=df2.stack()
stacked
Out[83]:
In [84]:
stacked.unstack()
Out[84]:
In [85]:
stacked.unstack(1)
Out[85]:
In [86]:
stacked.unstack(0)
Out[86]:
In [87]:
df=pd.DataFrame({'A':['one','one','two','three']*3,
'B':['A','B','C']*4,
'C':['foo','foo','foo','bar','bar','bar']*2,
'D':np.random.randn(12),
'E':np.random.randn(12)})
df
Out[87]:
In [89]:
pd.pivot_table(df,values='D',index=['A','B'],columns=['C']) #透视表
Out[89]:
In [93]:
# 100 timestamps spaced one second apart. The lowercase alias 's' is used
# because the uppercase 'S' alias is deprecated in recent pandas releases.
rng = pd.date_range('1/1/2016', periods=100, freq='s')
rng
Out[93]:
In [94]:
ts=pd.Series(np.random.randint(0,500,len(rng)),index=rng) #产生对应序列
In [95]:
ts
Out[95]:
In [102]:
ts.resample('20s')  # group into 20-second bins; this only builds the Resampler - chain .sum() to aggregate
Out[102]:
In [103]:
ts.resample('5Min') #以五分钟间隔重新采样
Out[103]:
In [104]:
ts.resample('5Min').sum()
Out[104]:
In [106]:
rng=pd.date_range('6/1/2016 00:00',periods=5,freq='D')
ts=pd.Series(np.random.randn(len(rng)),rng)
ts
Out[106]:
In [107]:
ts_utc=ts.tz_localize('UTC')
ts_utc
Out[107]:
In [108]:
ts_utc.tz_convert('US/Eastern')
Out[108]:
In [110]:
rng=pd.date_range('1/1/2016',periods=5,freq='M')
rng
Out[110]:
In [112]:
ts=pd.Series(np.random.randn(len(rng)),index=rng)
ts
Out[112]:
In [114]:
ps=ts.to_period()
ps
Out[114]:
In [115]:
ps.to_timestamp()
Out[115]:
In [119]:
df=pd.DataFrame({"id":[1,2,3,4,5,6],"raw_grade":['a','b','b','a','a','e']}) # DataFrame在pandas里真的好常用啊!!
df
Out[119]:
In [136]:
# Plot a random walk: the cumulative sum of 1000 standard-normal draws,
# indexed by 1000 consecutive dates starting 2016-01-01.
ts=pd.Series(np.random.randn(1000),index=pd.date_range('1/1/2016',periods=1000))
ts=ts.cumsum()
ts.plot()
import pylab
pylab.show() # a single line is drawn here
In [133]:
# Four cumulative random walks that share the time index of `ts`.
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])
df = df.cumsum()
plt.figure()
df.plot()
plt.legend(loc='best')
# show() takes no positional arguments; the previous call passed the pyplot
# module itself, which current matplotlib rejects.
plt.show()  # four lines are drawn here, one per column
In [134]:
df.to_csv('foo.csv')
In [135]:
pd.read_csv('foo.csv')
Out[135]:
In []:
从 pandas 0.15 开始,DataFrame 支持 categorical(分类)类型的数据。
上面 raw_grade 一列的取值共有三个类别:a、b、e。
参考资料:http://pandas.pydata.org/pandas-docs/stable/10min.html