#数据读取
df.iloc[:,0:2]#选取第一列和第二列
df.iloc[[1,3],:]#选取第一行和第四行的全部列
df.iloc[[1,3],0:2]#选取第二行和第四行的第一列到第二列
df.iloc[[1,3],[0,3]]#选取第二行和第四列的第一列和第四列
df.iat[2,2]#只能是某行某列的一个数
df
df.loc[[[10,30],['A','C']]#读取10,30这两行的A列、C列
df.loc[10:50,['A','C']]#读取从10到50的每一行的A列、C列
#开始画图
from datetime import datetime
import matplotlib.pyplot as plt
import pandas as pd
fig=plt.figure()
ax=fig.add_subplot(1,1,1)
data=pd.read_csv('E:/spx.csv',index_col=0,parse_dates=True)
spx=data['SPX']
spx.plot(ax=ax,style='k-')
#fig.show()
crisis_data=[
(datetime(2007,10,11),'Peak of bull market'),
(datetime(2008,3,12),'Bear Stearns Fails'),
(datetime(2008,9,15),'Lehman Bankruptcy')
]
for date,label in crisis_data:
ax.annotate(label,xy=(date,spx.asof(date)+50),
xytext=(date,spx.asof(date)+50),arrowprops=dict(facecolor='black'),horizontalalignment='left',
verticalalignment='top')
#放大到2007-2010
ax.set_xlim(['1/1/2007','1/1/2011'])
ax.set_ylim([600,1800])
ax.set_title('Import dates in 2008-2009 financial crisis')
>>> import numpy as np
>>> import pandas as pd
>>> s=pd.Series(np.random.randn(10).cumsum(),index=np.arange(0,100,10))
>>> s.plot()
<matplotlib.axes._subplots.AxesSubplot object at 0x040C41D0>#显示不出来图
>>> import matplotlib.pyplot as plt
>>> fig=plt.figure()
>>> ax=fig.add_subplot(1,1,1)
>>> s=pd.Series(np.random.randn(10).cumsum(),index=np.arange(0,100,10))
>>> s.plot()
<matplotlib.axes._subplots.AxesSubplot object at 0x03FE4750>
>>> fig.show()
该Series对象索引index会被传给matplotlib,并用以绘制x轴。可通过use_index=False禁用该功能。
X轴的刻度和界限可通过xticks和xlim选项调节
1、线图
import pandas as pd
import numpy as np
from pandas import DataFrame
df=DataFrame(np.random.randn(10,4).cumsum(0),columns=['A','B','C','D'],index=np.arange(0,100,10))
import matplotlib.pyplot as plt
fig=plt.figure()
ax=fig.add_subplot(1,1,1)
df.plot()
2、柱状图
fig,axes=plt.subplots(2,1,figsize=(12,10))
data=pd.Series(np.random.randn(16),index=list('abcdefghijklmnop'))
data.plot(kind='bar',ax=axes[0],color='k',alpha=0.7)
data.plot(kind='barh',ax=axes[1],color='k',alpha=0.7)
data.value_counts().plot(kind='bar'),value_counts()函数计算频率。
3、直方图
tips=pd.read_csv('e:/tips.csv')
tips['tip_pct']=tips['tip']/tips['total_bill']
tips['tip_pct'].hist(bins=50)
4、密度图
import scipy
tips['tip_pct'].plot(kind='kde')
5、散点图
macro=pd.read_csv('e:/macrodata.csv')
data=macro.iloc[:,7:11]
trans_data=np.log(data).dropna()
trans_data[-5:]
plt.scatter(trans_data[[1]],trans_data[[3]])
plt.title('Changes in log %s vs. log %s' % ('m1','uemp'))
pd.scatter_matrix(trans_data,diagonal='kde',c='k',alpha=.3)
画地图的例子:
data=pd.read_csv('e:/haiti.csv')
#CATEGORY含有一组以逗号分隔的代码,这些代码表示消息的类型
data[['INCIDENT DATE','LATITUDE','LONGITUDE']][:10]
Out[172]:
INCIDENT DATE LATITUDE LONGITUDE
0 05/07/2010 17:26 18.233333 -72.533333
1 28/06/2010 23:06 50.226029 5.729886
2 24/06/2010 16:21 22.278381 114.174287
3 20/06/2010 21:59 44.407062 8.933989
4 18/05/2010 16:26 18.571084 -72.334671
5 26/04/2010 13:14 18.593707 -72.310079
6 26/04/2010 14:19 18.482800 -73.638800
7 26/04/2010 14:27 18.415000 -73.195000
8 15/03/2010 10:58 18.517443 -72.236841
9 15/03/2010 11:00 18.547790 -72.410010
data['CATEGORY'][:6]
Out[173]:
0 1. Urgences | Emergency, 3. Public Health,
1 1. Urgences | Emergency, 2. Urgences logistiqu...
2 2. Urgences logistiques | Vital Lines, 8. Autr...
3 1. Urgences | Emergency,
4 1. Urgences | Emergency,
5 5e. Communication lines down,
Name: CATEGORY, dtype: object
#调用describe发现数据中一些异常的地理位置
data.describe()
Out[175]:
Serial LATITUDE LONGITUDE
count 3593.000000 3593.000000 3593.000000
mean 2080.277484 18.611495 -72.322680
std 1171.100360 0.738572 3.650776
min 4.000000 18.041313 -74.452757
25% 1074.000000 18.524070 -72.417500
50% 2163.000000 18.539269 -72.335000
75% 3088.000000 18.561820 -72.293570
max 4052.000000 50.226029 114.174287
In [176]: data=data[(data.LATITUDE>18)&(data.LATITUDE<20)&(data.LONGITUDE>-75)&(data.LONGITUDE<-70)&data.CATEGORY.notnull()]#编写两个函数,一个用于所有分类的列表获取,另一个用于将各个分类信息拆分为编码和英语
In [177]: def to_cat_list(catstr):
...: stripped=(x.strip() for x in catstr.split(','))
...: return [x for x in stripped if x]
...:
...: def get_all_categories(cat_series):
...: cat_sets=(set(to_cat_list(x)) for x in cat_series)
...: return sorted(set.union(*cat_sets))
...:
...: def get_english(cat):
...: code,names=cat.split('.')
...: if '|' in names:
...: names=names.split('|')[1]
...: return code,names.strip()
...:
...:
#测试一下get_english函数工作是否正常
In [178]: get_english('2.Urgences logistiqued|Vital Lines')
Out[178]: ('2', 'Vital Lines')
#将编码和名称映射起来的字典
In [185]: all_cats=get_all_categories(data.CATEGORY)
In [186]: #生成器表达式
In [187]: english_mapping=dict(get_english(x) for x in all_cats)
In [188]: english_mapping['2a']
Out[188]: 'Food Shortage'In [189]: english_mapping['6c']
Out[189]: 'Earthquake and aftershocks'
In [190]: english_mapping['2b']
Out[190]: 'Water shortage'
#分类选取记录的方式有很多,其一是添加指标列,每个分类一列。首先抽取出唯一的分类编码,并构造一个全0DataFrame
def get_code(seq):
return [x.split('.')[0] for x in seq if x]
all_codes=get_code(all_cats)
code_index=pd.Index(np.unique(all_codes))
dummy_frame=DataFrame(np.zeros((len(data),len(code_index))),index=data.index,columns=code_index)
dummy_frame.ix[:,:6]
Out[191]:
1 1a 1b 1c 1d 2
0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0
5 0.0 0.0 0.0 0.0 0.0 0.0
6 0.0 0.0 0.0 0.0 0.0 0.0
7 0.0 0.0 0.0 0.0 0.0 0.0
8 0.0 0.0 0.0 0.0 0.0 0.0
9 0.0 0.0 0.0 0.0 0.0 0.0
10 0.0 0.0 0.0 0.0 0.0 0.0
11 0.0 0.0 0.0 0.0 0.0 0.0
12 0.0 0.0 0.0 0.0 0.0 0.0
13 0.0 0.0 0.0 0.0 0.0 0.0
14 0.0 0.0 0.0 0.0 0.0 0.0
15 0.0 0.0 0.0 0.0 0.0 0.0
16 0.0 0.0 0.0 0.0 0.0 0.0
17 0.0 0.0 0.0 0.0 0.0 0.0
18 0.0 0.0 0.0 0.0 0.0 0.0
19 0.0 0.0 0.0 0.0 0.0 0.0
20 0.0 0.0 0.0 0.0 0.0 0.0
21 0.0 0.0 0.0 0.0 0.0 0.0
22 0.0 0.0 0.0 0.0 0.0 0.0
23 0.0 0.0 0.0 0.0 0.0 0.0
24 0.0 0.0 0.0 0.0 0.0 0.0
25 0.0 0.0 0.0 0.0 0.0 0.0
26 0.0 0.0 0.0 0.0 0.0 0.0
27 0.0 0.0 0.0 0.0 0.0 0.0
28 0.0 0.0 0.0 0.0 0.0 0.0
29 0.0 0.0 0.0 0.0 0.0 0.0
30 0.0 0.0 0.0 0.0 0.0 0.0
31 0.0 0.0 0.0 0.0 0.0 0.0
32 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ...
3563 0.0 0.0 0.0 0.0 0.0 0.0
3564 0.0 0.0 0.0 0.0 0.0 0.0
3565 0.0 0.0 0.0 0.0 0.0 0.0
3566 0.0 0.0 0.0 0.0 0.0 0.0
3567 0.0 0.0 0.0 0.0 0.0 0.0
3568 0.0 0.0 0.0 0.0 0.0 0.0
3569 0.0 0.0 0.0 0.0 0.0 0.0
3570 0.0 0.0 0.0 0.0 0.0 0.0
3571 0.0 0.0 0.0 0.0 0.0 0.0
3572 0.0 0.0 0.0 0.0 0.0 0.0
3573 0.0 0.0 0.0 0.0 0.0 0.0
3574 0.0 0.0 0.0 0.0 0.0 0.0
3575 0.0 0.0 0.0 0.0 0.0 0.0
3576 0.0 0.0 0.0 0.0 0.0 0.0
3577 0.0 0.0 0.0 0.0 0.0 0.0
3578 0.0 0.0 0.0 0.0 0.0 0.0
3579 0.0 0.0 0.0 0.0 0.0 0.0
3580 0.0 0.0 0.0 0.0 0.0 0.0
3581 0.0 0.0 0.0 0.0 0.0 0.0
3582 0.0 0.0 0.0 0.0 0.0 0.0
3583 0.0 0.0 0.0 0.0 0.0 0.0
3584 0.0 0.0 0.0 0.0 0.0 0.0
3585 0.0 0.0 0.0 0.0 0.0 0.0
3586 0.0 0.0 0.0 0.0 0.0 0.0
3587 0.0 0.0 0.0 0.0 0.0 0.0
3588 0.0 0.0 0.0 0.0 0.0 0.0
3589 0.0 0.0 0.0 0.0 0.0 0.0
3590 0.0 0.0 0.0 0.0 0.0 0.0
3591 0.0 0.0 0.0 0.0 0.0 0.0
3592 0.0 0.0 0.0 0.0 0.0 0.0
[3569 rows x 6 columns]
#将各行中适当的项设置为1,再与data进行连接
for row,cat in zip(data.index,data.CATEGORY):
codes=get_code(to_cat_list(cat))
dummy_frame.ix[row,codes]=1
data=data.join(dummy_frame.add_prefix('category_'))
data.ix[:,10:15]
category_1 category_1a category_1b category_1c category_1d0 1.0 0.0 0.0 0.0 0.0
4 1.0 0.0 0.0 0.0 0.0
5 0.0 0.0 0.0 0.0 0.0
6 0.0 0.0 0.0 0.0 0.0
7 0.0 0.0 0.0 0.0 0.0
8 0.0 0.0 0.0 0.0 0.0
9 0.0 0.0 0.0 0.0 0.0
10 0.0 1.0 0.0 0.0 0.0
11 0.0 0.0 0.0 0.0 0.0
12 0.0 0.0 0.0 0.0 0.0
13 0.0 0.0 0.0 0.0 0.0
14 0.0 0.0 0.0 0.0 0.0
15 0.0 0.0 0.0 0.0 0.0
16 1.0 0.0 0.0 0.0 0.0
17 0.0 0.0 0.0 0.0 0.0
18 0.0 0.0 0.0 0.0 0.0
19 1.0 0.0 0.0 0.0 0.0
20 1.0 0.0 0.0 0.0 0.0
21 0.0 0.0 0.0 0.0 0.0
22 0.0 0.0 0.0 0.0 0.0
23 0.0 0.0 0.0 0.0 0.0
24 0.0 0.0 0.0 0.0 0.0
25 0.0 0.0 1.0 0.0 0.0
26 0.0 0.0 0.0 0.0 0.0
27 0.0 0.0 0.0 0.0 0.0
28 0.0 0.0 0.0 0.0 0.0
29 0.0 0.0 0.0 0.0 0.0
30 0.0 0.0 0.0 0.0 0.0
31 0.0 0.0 0.0 0.0 0.0
32 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ...
3563 0.0 0.0 0.0 0.0 0.0
3564 0.0 0.0 0.0 0.0 0.0
3565 1.0 0.0 0.0 0.0 0.0
3566 1.0 0.0 0.0 0.0 0.0
3567 0.0 0.0 0.0 0.0 0.0
3568 0.0 0.0 0.0 0.0 0.0
3569 0.0 0.0 0.0 1.0 0.0
3570 0.0 0.0 0.0 0.0 0.0
3571 0.0 0.0 0.0 0.0 0.0
3572 0.0 0.0 0.0 0.0 0.0
3573 0.0 0.0 0.0 0.0 0.0
3574 0.0 0.0 0.0 1.0 0.0
3575 0.0 0.0 0.0 0.0 0.0
3576 0.0 0.0 0.0 0.0 0.0
3577 0.0 0.0 0.0 0.0 0.0
3578 0.0 0.0 0.0 0.0 0.0
3579 0.0 0.0 0.0 0.0 0.0
3580 0.0 0.0 0.0 0.0 0.0
3581 0.0 0.0 0.0 1.0 0.0
3582 0.0 0.0 0.0 0.0 0.0
3583 0.0 0.0 0.0 0.0 0.0
3584 0.0 0.0 0.0 1.0 0.0
3585 0.0 0.0 0.0 0.0 0.0
3586 1.0 0.0 0.0 0.0 0.0
3587 1.0 0.0 0.0 0.0 0.0
3588 0.0 0.0 0.0 0.0 0.0
3589 0.0 0.0 0.0 1.0 0.0
3590 0.0 0.0 0.0 0.0 0.0
3591 0.0 0.0 0.0 0.0 0.0
3592 0.0 0.0 0.0 0.0 0.0
[3569 rows x 5 columns]