连接SPY数据
导入pandas库
import pandas as pd
执行函数
- 默认join向左合并
- index_col设置dfSPY的索引为Date(原来默认是第一列添加的序号)
- parse_dates为真时,把DataFrame的索引解析为datetime格式
- usecols选择只展示的列名,日期和调整收盘价
- na_values告诉read_csv,CSV中的字符串'nan'应被解析为缺失值NaN(不是一个数),而不是当作普通字符串
- dropna()删除所有SPY值为NAN的行
def test_run():
    """Print SPY adjusted close prices for 2010-01-22 through 2010-01-26.

    Builds an empty frame indexed by every calendar day in the range,
    left-joins the SPY data read from ``data/SPY.csv`` onto it, and drops
    the days for which SPY has no value.
    """
    start_date = '2010-01-22'
    end_date = '2010-01-26'
    # date_range (not "data_range") yields one entry per calendar day
    dates = pd.date_range(start_date, end_date)
    df1 = pd.DataFrame(index=dates)

    # Only the date and adjusted close are needed; the literal string 'nan'
    # in the CSV is read as a missing value
    dfSPY = pd.read_csv("data/SPY.csv", index_col="Date",
                        parse_dates=True,
                        usecols=['Date', 'Adj Close'],
                        na_values=['nan'])

    # join() defaults to a left join, so every day in `dates` is kept and
    # days without SPY data come out as NaN ...
    df1 = df1.join(dfSPY)
    # ... which are removed here
    df1 = df1.dropna()
    # Parenthesised form prints identically on Python 2 and Python 3
    print(df1)
最后删除空行也可以用join的how参数,参考
http://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.join.html
- how的参数:inner内连接,outer外连接,left左连接(默认),right右连接
# Inner join: keep only the dates that are present in both frames
df1 = df1.join(dfSPY, how='inner')
连接更多股票
因为任何一支股票的调整收盘价都是Adj Close,列名重复造成clash(冲突),所以分别给列重命名
# Every CSV names its price column 'Adj Close'; rename it to the ticker so
# the columns do not clash when several stocks are joined.
dfSPY = dfSPY.rename(columns={'Adj Close': 'SPY'})
# Inner join keeps only the dates on which SPY has data
df1 = df1.join(dfSPY, how='inner')

# 'GOOG' is the ticker used by the get_data() examples in these notes
symbols = ['GOOG', 'IBM', 'GLD']
for symbol in symbols:
    df_temp = pd.read_csv("data/{}.csv".format(symbol), index_col="Date",
                          parse_dates=True, usecols=['Date', 'Adj Close'],
                          na_values=['nan'])
    df_temp = df_temp.rename(columns={'Adj Close': symbol})
    # Accumulate into df1 itself so each stock's column is kept
    df1 = df1.join(df_temp)
# Parenthesised form prints identically on Python 2 and Python 3
print(df1)
简化函数
import os
import pandas as pd
def symbol_to_path(symbol, base_dir="data"):
    """Return the path of the CSV file for ticker `symbol` under `base_dir`."""
    file_name = "{}.csv".format(str(symbol))
    return os.path.join(base_dir, file_name)
def get_data(symbols, dates):
    """Read adjusted close prices for the given symbols from CSV files.

    Args:
        symbols: iterable of ticker symbols; 'SPY' is put first if absent.
            The caller's object is left untouched.
        dates: index (e.g. from ``pd.date_range``) the result is aligned to.

    Returns:
        A DataFrame indexed by `dates` with one column per symbol, limited
        to the dates for which SPY has a value.
    """
    # Copy first so that adding SPY does not modify the caller's list
    symbols = list(symbols)
    if 'SPY' not in symbols:
        symbols.insert(0, 'SPY')

    df = pd.DataFrame(index=dates)
    for symbol in symbols:
        df_temp = pd.read_csv(symbol_to_path(symbol), index_col="Date",
                              parse_dates=True, usecols=['Date', 'Adj Close'],
                              na_values=['nan'])
        # Name the price column after the ticker to avoid column clashes
        df_temp = df_temp.rename(columns={'Adj Close': symbol})
        df = df.join(df_temp)
        if symbol == 'SPY':
            # Drop the rows where SPY has no value
            df = df.dropna(subset=["SPY"])
    return df
def test_run():
    """Load a few days of prices for several stocks and print the table."""
    # Define a date range
    dates = pd.date_range('2010-01-22', '2010-01-26')
    # Choose stock symbols to read
    symbols = ['GOOG', 'IBM', 'GLD']
    # Get stock data
    df = get_data(symbols, dates)
    # Parenthesised form prints identically on Python 2 and Python 3
    print(df)


if __name__ == "__main__":
    test_run()
数据切片
- sd、ed:在第一列(日期索引)上选择的起始行和结束行
- 引号'__'里是从表头(第一行)中选择的列名,各列不必相邻
# Selecting rows and columns together needs the label-based .loc indexer;
# plain df1[rows, cols] is not valid DataFrame indexing.
# 'GOOG' is the ticker used by the get_data() examples in these notes.
df2 = df1.loc[sd:ed, ['GOOG', 'GLD']]
- 日期倒过来就会为空
- 对行切片时不写ix也会得到相同结果,但一般认为显式写出索引器更pythonic、更稳健,所以选择写上(注意:.ix自pandas 0.20起已弃用、1.0起已移除,新版本请改用.loc)
# .loc is the label-based indexer; it stands in for the .ix accessor these
# notes were written with (.ix was deprecated in pandas 0.20, removed in 1.0).
# Row slice by date labels
print(df.loc['2010-01-01':'2010-01-31'])
# A single column ('GOOG' is the ticker loaded by the get_data() examples)
print(df['GOOG'])
# Several columns must be given as a list, hence the double brackets
print(df[['IBM', 'GLD']])
# Rows and columns in one step
print(df.loc['2010-01-01':'2010-01-31', ['IBM', 'GLD']])
绘制多股票的图形
在“简化函数”中加
import matplotlib.pyplot as plt
- ax 是坐标轴,标题通过参数传入
def plot_data(df, title="Stock prices"):
    """Plot the columns of `df` with a title and labelled axes, then show it.

    Args:
        df: DataFrame to plot via ``df.plot``.
        title: text placed above the chart.
    """
    # df.plot returns the axes object, which is used to label the chart
    ax = df.plot(title=title, fontsize=12)
    ax.set_xlabel("Date")
    ax.set_ylabel("Price")
    plt.show()
绘制两个股票的图形
import os
import pandas as pd
import matplotlib.pyplot as plt
def plot_selected(df, columns, start_index, end_index):
    """Plot the desired columns over index values in the given range.

    Args:
        df: DataFrame to take the data from.
        columns: list of column names to plot.
        start_index: first index label of the slice.
        end_index: last index label of the slice.
    """
    # .loc does the label-based row/column selection (.ix, used by older
    # pandas code, was removed in pandas 1.0)
    selected = df.loc[start_index:end_index, columns]
    plot_data(selected, title="Selected data")
def symbol_to_path(symbol, base_dir="data"):
    """Build the CSV file path for a ticker symbol inside `base_dir`."""
    csv_name = "{}.csv".format(str(symbol))
    csv_path = os.path.join(base_dir, csv_name)
    return csv_path
def get_data(symbols, dates):
    """Read stock data (adjusted close) for given symbols from CSV files.

    Args:
        symbols: iterable of ticker symbols; 'SPY' is put first if absent.
            The caller's object is left untouched.
        dates: index (e.g. from ``pd.date_range``) the result is aligned to.

    Returns:
        A DataFrame indexed by `dates` with one column per symbol, limited
        to the dates SPY traded.
    """
    # Copy first so that adding SPY does not modify the caller's list
    symbols = list(symbols)
    if 'SPY' not in symbols:  # add SPY for reference, if absent
        symbols.insert(0, 'SPY')

    df = pd.DataFrame(index=dates)
    for symbol in symbols:
        df_temp = pd.read_csv(symbol_to_path(symbol), index_col='Date',
                              parse_dates=True, usecols=['Date', 'Adj Close'],
                              na_values=['nan'])
        # Name the price column after the ticker to avoid column clashes
        df_temp = df_temp.rename(columns={'Adj Close': symbol})
        df = df.join(df_temp)
        if symbol == 'SPY':  # drop dates SPY did not trade
            df = df.dropna(subset=["SPY"])
    return df
def plot_data(df, title="Stock prices"):
    """Draw `df` as a chart with the given title and Date/Price axis labels."""
    axes = df.plot(fontsize=12, title=title)
    axes.set_ylabel("Price")
    axes.set_xlabel("Date")
    plt.show()
def test_run():
    """Load 2010 prices for a few tickers and plot SPY against IBM."""
    # SPY is not listed here; get_data() adds it on its own
    tickers = ['GOOG', 'IBM', 'GLD']
    whole_year = pd.date_range('2010-01-01', '2010-12-31')
    prices = get_data(tickers, whole_year)

    # Plot only two of the columns, for the month of March
    plot_selected(prices, ['SPY', 'IBM'], '2010-03-01', '2010-04-01')


if __name__ == "__main__":
    test_run()
图形标准化
- 提取第一行,让所有数据都除以第一组数据,则所有数据都是从1开始
def normalize_data(df):
    """Scale every column so that its first row equals 1.

    Args:
        df: DataFrame of numeric values.

    Returns:
        A new DataFrame in which each row has been divided, column by
        column, by the first row. A frame without rows is returned as a
        copy, since there is no first row to divide by.
    """
    if df.empty:
        return df.copy()
    # iloc[0] is the first row by position; dividing a frame by that Series
    # aligns it on the column labels
    return df / df.iloc[0]