#python常用基础包
import matplotlib.pyplot as plt
import pylab as py
import math as m
import scipy.stats as stats
import numpy as np
import pandas as pd
# pandas one-dimensional data type: Series.
# Built from a plain list, so the labels are the default integer index.
a = pd.Series([9, 8, 7, 6])
# Bare expression: displayed by a notebook/REPL, no effect when run as a script.
a
# Series with a user-defined index: one label per value.
b = pd.Series([4, 5, 7, 9], index=['a', 'b', 'c', 'd'])
# Bare expression: displayed by a notebook/REPL, no effect when run as a script.
b
# pandas multi-dimensional data type: DataFrame.
# Built from a 2x5 ndarray holding the integers 0..9.
d = pd.DataFrame(np.arange(10).reshape(2, 5))
# Bare expression: displayed by a notebook/REPL, no effect when run as a script.
d
# Create a DataFrame from a dict of Series.
# The dict keys become column names. The two Series have different indexes
# ('one' has no label 'd'), so the missing cell is filled with NaN.
dt = {
    'one': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
    'two': pd.Series([9, 8, 7, 6], index=['a', 'b', 'c', 'd']),
}
d = pd.DataFrame(dt)
# Bare expressions: displayed by a notebook/REPL, no effect when run as a script.
d
# Passing `index` keeps only the requested row labels.
pd.DataFrame(dt, index=['b', 'c'])
# DataFrame from a dict of equal-length lists, with explicit row labels.
d1 = {
    "one": [1, 2, 3, 4],
    "two": [9, 8, 7, 6],
}
d = pd.DataFrame(data=d1, index=list("abcd"))
# Bare expression: displayed by a notebook/REPL, no effect when run as a script.
d
# Sample table: one row per city, labelled c1..c5, with three index columns.
dl = {
    '城市': ['北京', '上海', '广州', '深圳', '沈阳'],
    '环比': [101.5, 101.2, 101.3, 102.0, 100.1],
    '同比': [120.7, 127.3, 119.4, 140.9, 101.4],
    '定基': [121.4, 127.8, 120.0, 145.5, 101.6],
}
d = pd.DataFrame(data=dl, index=[f'c{k}' for k in range(1, 6)])
# Bare expression: displayed by a notebook/REPL, no effect when run as a script.
d
# Accessing the data held in a DataFrame.
# Every line below is a bare expression: a notebook/REPL displays its value,
# but it has no effect when the file is run as a script.
d.index                # row labels
d.loc['c1']            # row labelled 'c1' (`.ix` was removed in pandas 1.0)
d['城市']               # column '城市' as a Series
d.values               # cell values as a NumPy array
d.loc['c1', '城市']     # single cell, looked up by row label and column label
# Put the columns in an explicit order.
d = d.reindex(columns=['城市', '环比', '同比', '定基'])
# Bare expression: displayed by a notebook/REPL, no effect when run as a script.
d
# Inserting into and deleting from Index objects.
# Index.insert / Index.delete return new Index objects; `d` is only changed
# where the result of reindex() is assigned back.

# Add a column label at position 4; the new column is filled with 20.
newc = d.columns.insert(4, '新增')
dd = d.reindex(columns=newc, fill_value=20)
# Bare expression: displayed by a notebook/REPL, no effect when run as a script.
dd

# Drop the column at position 2; add row label 'c0' at position 5, then drop
# the row label at position 2.
nc = d.columns.delete(2)
ni = d.index.insert(5, 'c0').delete(2)
# Select the columns without a fill method, then back-fill only along the
# rows. pandas documents method-based filling as applicable only to
# monotonically increasing/decreasing axes, and the column labels are not
# sorted, so `method` must not be applied to the column axis.
nd = d.reindex(columns=nc).reindex(index=ni, method='bfill')
nd
# drop() removes labels directly (axis defaults to 0, i.e. rows).
# Both calls return a new DataFrame; the results are not assigned, so `nd`
# itself is left unchanged (bare expressions, shown only in a notebook/REPL).
nd.drop('c1', axis=0)    # drop the row labelled 'c1'
nd.drop('城市', axis=1)   # drop the column labelled '城市'
# Arithmetic on pandas data types.
# Result lines are bare expressions: displayed by a notebook/REPL, no effect
# when run as a script.

# Arithmetic between two DataFrames whose indexes differ (3x4 vs 4x5).
a = pd.DataFrame(np.arange(12).reshape(3, 4))
b = pd.DataFrame(np.arange(20).reshape(4, 5))
# Labels are aligned first; cells missing from either operand become NaN.
a + b
# Method form: a cell missing from one operand is replaced by 1 before
# multiplying.
a.mul(b, fill_value=1)

# Arithmetic between a DataFrame and a Series.
c = pd.Series(np.arange(4))
# Operator form matches the Series index against the DataFrame columns.
b + c
# axis=0 matches the Series index against the DataFrame rows instead.
b.sub(c, axis=0)
# Comparison operations (only between DataFrames with identical index
# labels, or via broadcasting).
bb = b + c
# Bare expression: element-wise comparison, displayed only in a notebook/REPL.
b >= bb
# Sorting data.
# Result lines are bare expressions: displayed by a notebook/REPL, no effect
# when run as a script. Both sort calls return new objects; `b` keeps its
# original order.

# sort_index(): sort by labels.
b = pd.DataFrame(np.arange(20).reshape(4, 5), index=['c', 'a', 'd', 'b'])
b
# axis=1 sorts the column labels; descending order.
b.sort_index(axis=1, ascending=False)

# sort_values(): sort by the values in a column.
# Order the rows by the values of column 3, largest first.
b.sort_values(3, axis=0, ascending=False)
# Basic statistical analysis with describe().
# Result lines are bare expressions: displayed by a notebook/REPL, no effect
# when run as a script.
c = pd.Series([9, 8, 7, 6], index=['a', 'b', 'c', 'd'])
c.describe()                 # summary statistics of the Series
c.describe()['mean']         # a single statistic, selected by its label
b.describe()                 # per-column summary statistics of the DataFrame
b.describe()[0]['max']       # the 'max' statistic of the column labelled 0
# Cumulative statistical analysis.
# Bare expressions: displayed by a notebook/REPL, no effect when run as a script.
b.cumsum(axis=1)       # running sum across the columns of each row
b.rolling(3).sum()     # sum over a rolling window of 3 rows
# Correlation example: growth of house prices vs. growth of M2.
# Both Series share the same year labels, 2008-2012.
hprice = pd.Series([3.04, 22.93, 12.75, 22.6, 12.33],
                   index=['2008', '2009', '2010', '2011', '2012'])
m2 = pd.Series([8.18, 18.38, 9.13, 7.82, 6.69],
               index=['2008', '2009', '2010', '2011', '2012'])
# Draw both series on the same axes, with different markers.
hprice.plot(marker='o')
m2.plot(marker='*')
# 'SimHei' is requested as the font for the Chinese axis labels.
plt.xlabel('年份', fontproperties='SimHei', fontsize=15)
plt.ylabel(r'增幅比例%', fontproperties='SimHei', fontsize=15)
plt.show()
# Correlation coefficient of the two series (Series.corr defaults to Pearson).
# Bare expression: displayed by a notebook/REPL, no effect when run as a script.
hprice.corr(m2)
# Series with string labels; the second positional argument of pd.Series is
# the index, spelled out here by keyword.
a = pd.Series(data=[9, 8, 7, 6], index=list('abcd'))
# dtype of the index labels (bare expression: displayed only in a notebook/REPL).
a.index.dtype