05_python数据统计和可视化
01_4-1 python基本数据统计
01_1-便捷数据获取
# Getting data with Python: local data (opening, reading/writing and closing files);
# network data (fetching web pages and parsing their content: urllib, urllib2, httplib, httplib2).
# Convenient network data: pull historical quotes for one ticker and show them as a DataFrame.
# NOTE(review): matplotlib.finance is absent from recent matplotlib releases —
# confirm the installed version still provides it before running this snippet.
from matplotlib.finance import quotes_historical_yahoo_ochl
from datetime import date
import pandas as pd
today = date.today()
print (today)
# Start of the query window: same month/day one year earlier, as a (year, month, day) tuple.
start = (today.year-1,today.month,today.day)
# Alternative ticker kept for reference: quotes_historical_yahoo_ochl('CCE', start, today)
quotes = quotes_historical_yahoo_ochl('AXP', start, today)
# Wrap the returned records in a DataFrame; no column labels are supplied here.
df = pd.DataFrame(quotes)
print (df)
# Natural Language Toolkit (NLTK): Gutenberg corpus, Brown corpus, Reuters corpus, web and chat text, ...
from nltk.corpus import gutenberg
import nltk
# List the file identifiers available in the Gutenberg corpus.
print (gutenberg.fileids())
02_2数据准备
# Data wrangling: fetch one year of quotes for a ticker and label the columns.
# NOTE(review): matplotlib.finance is absent from recent matplotlib releases —
# confirm the installed version still provides it before running this snippet.
from matplotlib.finance import quotes_historical_yahoo_ochl
from datetime import date
import pandas as pd
today = date.today()
print(today)
# Start of the query window: same month/day one year earlier, as a (year, month, day) tuple.
start = (today.year-1, today.month, today.day)
# Alternative ticker kept for reference: quotes_historical_yahoo_ochl('CCE', start, today)
quotes = quotes_historical_yahoo_ochl('AMX', start, today)
# Column labels for the quote records. The list was previously bound to the
# misspelled name `fileds`, so the DataFrame call below raised NameError.
# Presumably the order matches the per-row layout returned by
# quotes_historical_yahoo_ochl — TODO confirm.
fields = ['date', 'open', 'close', 'high', 'low', 'volume']
df = pd.DataFrame(quotes, columns=fields)
print(df)
# Data wrangling: turn a day ordinal into a calendar date.
from datetime import date
# Ordinal 735190 corresponds to 2013-11-18.
a = date.fromordinal(735190)
print (a)
# 输出:2013-11-18
# Build a time series: a seven-day daily index starting 2014-10-01,
# then a DataFrame of random values indexed by those days.
import numpy as np
import pandas as pd

dates = pd.date_range(start='20141001', periods=7)
print(dates)

# `dates` is rebound here: from the DatetimeIndex above to a 7x3 frame
# (columns A, B, C) of random samples that uses that index.
dates = pd.DataFrame(
    data=np.random.randn(7, 3),
    index=dates,
    columns=list('ABC'),
)
print(dates)
# 输出:
# DatetimeIndex(['2014-10-01', '2014-10-02', '2014-10-03', '2014-10-04',
#                '2014-10-05', '2014-10-06', '2014-10-07'],
#               dtype='datetime64[ns]', freq='D')
#                    A         B         C
# 2014-10-01 -0.671403 -0.218888  0.238439
# 2014-10-02 -0.305552 -2.049908  1.416983
# 2014-10-03  0.438262 -0.892854  0.121548
# 2014-10-04 -0.966557 -0.784305 -1.017663
# 2014-10-05  0.882889  1.418623 -0.927695
# 2014-10-06 -1.236913  0.638172 -0.379498
# 2014-10-07  1.077244 -0.664185 -0.372371
03_3 数据显示
# 数据显示
# 显示方式:显示索引,显示列名,显示数据的值,显示数据描述
# a.index a.columns a.values a.describe()
# 索引的格式:quotesdf.index
# 显示行:专用方式;切片
# df.head(5) 等价于 df[:5]
# df.tail(5) 等价于 df[-5:](数据共 30 行时即 df[25:])
04_4数据选择
# 选择方式:选择行,选择列,选择区域,筛选(条件选择)
# 选择行:切片,索引,quotesdf[u'2013-12-02':u'2013-12-06']
# 选择列:列名,djidf['code'],djidf.code
# 选择方式:行、列 标签label(loc) djidf.loc[1:5] djidf.loc[:,['code','lasttrade']]
# 行和列的区域:标签label(loc);单个值(at)djidf.loc[1:5,['code','lasttrade']] djidf.loc[1,'lasttrade'] djidf.at[1,'lasttrade']
# 行、列和区域:用iloc(位置);取某个值(iat) djidf.iloc[1:6,[0,2]] djidf.iloc[1,2] djidf.iat[1,2]
# 条件筛选 quotesdf[quotesdf.index>=u'2014-01-01'] quotesdf[(quotesdf.index>=u'2014-01-01')&(quotesdf.close>=95)]
05_5简单统计与筛选
# 最近一次成交价的平均值 djidf.lasttrade.mean()
# 最近一次成交价大于等于120的公司名 djidf[djidf.lasttrade >= 120].name
# 统计股票涨和跌的天数 len(quotesdf[quotesdf.close > quotesdf.open])
# 统计相邻两天收盘价的涨跌情况 s = np.sign(np.diff(quotesdf.close)) s[np.where(s == 1)].size s[np.where(s == -1)].size
# 排序 djidf.sort_values(by = 'lasttrade') djidf.sort_values(by = 'lasttrade')[27:].name
# 计数统计 统计2014年1月份的