01-06 直方图和散点图
在处理金融领域的数据时,需要进行一些统计量的统计。
常见的就是对日收益回报进行可视化分析,
1.可将日收益回报绘制成直方图(histogram),并计算其平均值、标准差和峰度;还可以绘制散点图、求拟合直线,并分析不同股票之间的相关性。
主要任务有:
1)针对现有的一支股票的数据,进行价格曲线的可视化、日收益回报曲线的刻画,以及相应直方图、均值和均值左右两侧标准差范围的可视化,并计算峰度
2)两支股票甚至多支股票的价格曲线的可视化、相应日收益回报的直方图可视化在同一坐标轴中
3)以SPY股票作为基准股票数据,比较其和XOM和GLD两支股票的情况:可视化关系的散点图,利用np.polyfit函数进行拟合直线的求解,得到相应的系数项和常数项,可视化相应的拟合直线;直接根据原始的日收益回报数据得到相应的相关系数值。
"""Slice and plot"""
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
def symbol_to_path(symbol, base_dir="data"):
    """Return the CSV file path for a ticker symbol.

    Args:
        symbol: Ticker symbol; converted with ``str`` before formatting.
        base_dir: Directory expected to hold the CSV files
            (default ``"data"``).

    Returns:
        ``base_dir`` joined with ``"<symbol>.csv"`` via ``os.path.join``.
    """
    return os.path.join(base_dir, "{}.csv".format(str(symbol)))
def get_data(symbols, dates):
    """Read stock data (adjusted close) for given symbols from CSV files.

    Args:
        symbols: Iterable of ticker symbols. ``'SPY'`` is used as the
            reference and is read first when it is not already listed.
            The caller's object is left unmodified.
        dates: Index for the resulting frame (the callers in this file
            pass a ``pd.date_range``).

    Returns:
        DataFrame indexed by ``dates`` with one column per symbol holding
        that symbol's ``'Adj Close'`` values; rows where SPY has no value
        are dropped.
    """
    df = pd.DataFrame(index=dates)

    # Work on a copy so the caller's list is not mutated as a side effect.
    symbols = list(symbols)
    if 'SPY' not in symbols:  # add SPY for reference, if absent
        symbols.insert(0, 'SPY')

    for symbol in symbols:
        df_temp = pd.read_csv(symbol_to_path(symbol), index_col='Date',
                              parse_dates=True, usecols=['Date', 'Adj Close'],
                              na_values=['nan'])
        # Name the single price column after its ticker so joins don't clash.
        df_temp = df_temp.rename(columns={'Adj Close': symbol})
        df = df.join(df_temp)
        if symbol == 'SPY':  # drop dates SPY did not trade
            df = df.dropna(subset=["SPY"])
    return df
def plot_data(df, title="Stock prices", ylabel="Price", xlabel="Date"):
    """Plot ``df`` with a custom title and axis labels, then show the figure.

    ``ylabel`` and ``xlabel`` carry defaults because a parameter without a
    default may not follow one that has a default (``title``); without them
    the ``def`` line is a SyntaxError.

    Args:
        df: Object whose ``plot`` method returns the axes to label
            (the callers in this file pass a DataFrame).
        title: Title passed to ``df.plot``.
        ylabel: Label for the y axis.
        xlabel: Label for the x axis.
    """
    ax = df.plot(title=title, fontsize=12)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    plt.show()
def compute_daily_returns(df):
    """Compute and return the daily return values.

    Each row holds ``value / previous_row_value - 1``; the first row, which
    has no predecessor, is set to 0.

    Args:
        df: DataFrame (or Series) of prices, one row per period.

    Returns:
        A new object with the same index and columns as ``df`` containing
        the period-over-period returns. ``df`` itself is not modified.
    """
    # shift(1) aligns every row with its predecessor; the division yields a
    # fresh float result, so the input is never written to.
    daily_returns = df / df.shift(1) - 1
    # The first row has nothing to compare against (NaN after the shift).
    # Positional .iloc replaces the .ix indexer removed in pandas 1.0.
    if len(daily_returns) > 0:
        daily_returns.iloc[0] = 0
    return daily_returns
# Single-stock demo: plot the price curve and the daily returns, then draw
# the daily-return histogram annotated with its mean and standard deviation.
def test_run1():
    """Plot SPY prices, daily returns and the daily-return histogram.

    Prints the mean, standard deviation and kurtosis of the daily returns,
    and marks the mean and the one-standard-deviation band on the histogram.
    """
    dates = pd.date_range('2010-01-01', '2010-12-31')

    # Choose stock symbols to read
    symbols = ['SPY']  # SPY is already listed; get_data() adds it otherwise

    # Get stock data
    df = get_data(symbols, dates)
    plot_data(df, title='prince', ylabel='price', xlabel='Date')

    # Compute daily returns
    daily_returns = compute_daily_returns(df)
    plot_data(daily_returns, title='Daily Returns', ylabel='Daily returns',
              xlabel='Date')

    # Plot a histogram split into 20 bins
    daily_returns.hist(bins=20)

    # Get mean and standard deviation
    mean = daily_returns['SPY'].mean()
    print('mean:', mean)
    std = daily_returns['SPY'].std()
    print('std:', std)

    # Mark the mean and the band one standard deviation either side of it.
    # The band is centred on the mean, not on zero.
    plt.axvline(mean, color='w', linestyle='dashed', linewidth=2)
    plt.axvline(mean + std, color='r', linestyle='dashed', linewidth=2)
    plt.axvline(mean - std, color='r', linestyle='dashed', linewidth=2)
    plt.show()

    # Get kurtosis
    kurtosis = daily_returns.kurtosis()
    print('kurtosis:', kurtosis)
# Multi-stock demo: draw the daily-return histograms of several stocks on
# one chart.
def test_run2():
    """Overlay the SPY and XOM daily-return histograms on the same axes."""
    # Define a date range
    dates = pd.date_range('2010-01-01', '2010-12-31')

    # Choose stock symbols to read
    symbols = ['SPY', 'XOM']  # SPY is already listed; get_data() adds it otherwise

    # Get stock data
    df = get_data(symbols, dates)

    # Compute daily returns
    daily_returns = compute_daily_returns(df)

    # Compute and plot both histograms on the same chart; the labels feed
    # the legend below.
    daily_returns['SPY'].hist(bins=20, label='SPY')
    daily_returns['XOM'].hist(bins=20, label='XOM')
    plt.legend(loc='upper right')
    plt.show()
def _scatter_with_fit(daily_returns, benchmark, symbol):
    """Scatter ``symbol`` against ``benchmark`` and overlay a degree-1 fit.

    Prints the fitted slope (``beta_<symbol>``) and intercept
    (``alpha_<symbol>``) returned by ``np.polyfit`` before showing the plot.
    """
    daily_returns.plot(kind='scatter', x=benchmark, y=symbol)
    # A degree-1 polyfit returns the coefficients highest power first:
    # slope, then intercept.
    beta, alpha = np.polyfit(daily_returns[benchmark], daily_returns[symbol], 1)
    print('beta_{}:'.format(symbol), beta)
    print('alpha_{}:'.format(symbol), alpha)
    plt.plot(daily_returns[benchmark], beta * daily_returns[benchmark] + alpha,
             '-', color='r')
    plt.show()


# Using SPY as the benchmark, draw the SPY-vs-XOM and SPY-vs-GLD scatter
# plots of daily returns, fit a straight line to each and plot that line.
# Finally print the correlation between the stocks' daily returns.
def test_run3():
    """Scatter XOM and GLD daily returns against SPY with fitted lines.

    Prints the slope and intercept of each fit and the Pearson correlation
    matrix of the daily returns.
    """
    # Define a date range
    dates = pd.date_range('2010-01-01', '2010-12-31')

    # Choose stock symbols to read
    symbols = ['SPY', 'XOM', 'GLD']  # SPY is already listed; get_data() adds it otherwise

    # Get stock data
    df = get_data(symbols, dates)

    # Compute daily returns
    daily_returns = compute_daily_returns(df)

    # Scatter SPY vs XOM, then SPY vs GLD, each with its fitted line
    _scatter_with_fit(daily_returns, 'SPY', 'XOM')
    _scatter_with_fit(daily_returns, 'SPY', 'GLD')

    print(daily_returns.corr(method='pearson'))
if __name__ == "__main__":
    # Run the three demonstrations in order when executed as a script.
    test_run1()
    test_run2()
    test_run3()