import datetime
import pprint
import numpy as np
import pandas as pd
from pandas.io.data import DataReader
import pylab as plt
import sklearn
from sklearn.cross_validation import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from pandas.io.data import DataReader
def create_lagged_series(symbol, start_date, end_date, lags=5):
    """
    Build a DataFrame of daily percentage returns plus lagged returns.

    Price data for ``symbol`` is downloaded from Yahoo Finance through
    ``DataReader``. The download starts 365 calendar days before
    ``start_date`` so that the shift/pct_change calls below have earlier
    prices to work from; rows before ``start_date`` are dropped again
    before returning.

    Parameters
    ----------
    symbol : str
        Ticker symbol handed to ``DataReader``.
    start_date : datetime.datetime
        First date kept in the result (inclusive).
    end_date : datetime.datetime
        Last date requested from the data source.
    lags : int, optional
        Number of lagged-return columns to create (default 5).

    Returns
    -------
    pandas.DataFrame
        Indexed like the downloaded data, restricted to
        ``index >= start_date``, with columns in this order:
        "Volume", "Today", "Lag1" ... "Lag<lags>", "Direction".
        "Today" is the percentage change of "Adj Close", "Lag<k>" is the
        same quantity computed on the price series shifted by k rows, and
        "Direction" is the sign of "Today".
    """
    # Obtain stock information from Yahoo Finance, including one extra
    # year of history ahead of the requested window.
    ts = DataReader(
        symbol, "yahoo", start_date - datetime.timedelta(days=365), end_date
    )
    adj_close = ts["Adj Close"]

    # Create the returns DataFrame
    tsret = pd.DataFrame(index=ts.index)
    tsret["Volume"] = ts["Volume"]
    tsret["Today"] = adj_close.pct_change() * 100.0

    # Replace returns whose magnitude is below 0.0001 with 0.0001 (stops
    # issues with the QDA model in scikit-learn). A single boolean-mask
    # .loc assignment writes into the frame itself; the previous
    # tsret["Today"][i] = ... form was chained indexing with a positional
    # key and could end up modifying a temporary copy. NaN entries compare
    # False and are therefore left alone, exactly as before.
    near_zero = tsret["Today"].abs() < 0.0001
    tsret.loc[near_zero, "Today"] = 0.0001

    # Create the lagged percentage returns columns
    for lag in range(1, lags + 1):
        tsret["Lag%s" % lag] = adj_close.shift(lag).pct_change() * 100.0

    # "Direction" is computed after the clamp above, so a clamped return
    # always counts as an up day (+1).
    tsret["Direction"] = np.sign(tsret["Today"])

    # Drop the warm-up history that precedes the requested start date.
    return tsret[tsret.index >= start_date]
def k_fold_cross_val_poly(folds, degrees, X, y):
    """
    Cross-validate polynomial regressions of increasing degree.

    For every fold produced by ``KFold`` and every degree from 1 to
    ``degrees``, a ``PolynomialFeatures`` + ``LinearRegression`` pipeline is
    fitted on the training rows and its mean squared error on the held-out
    rows is recorded. Progress is printed for each fold and degree.

    Parameters
    ----------
    folds : int
        Number of cross-validation folds.
    degrees : int
        Highest polynomial degree to evaluate (degrees 1..``degrees``).
    X : pandas.DataFrame
        Predictor columns; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Response values aligned row-for-row with ``X``.

    Returns
    -------
    dict
        Keys "fold_1" ... "fold_<folds>" map to NumPy arrays of length
        ``degrees`` holding the test MSE per degree for that fold; key
        "avg" holds the element-wise mean of those arrays.
    """
    n = len(X)
    kf = KFold(n, n_folds=folds)
    kf_dict = {"fold_%s" % i: [] for i in range(1, folds + 1)}

    for fold, (train_index, test_index) in enumerate(kf, 1):
        # Parenthesised single-argument print emits the same text on
        # Python 2 and is also valid Python 3.
        print("Fold: %s" % fold)

        # KFold yields positional row numbers, so select strictly by
        # position. The former .ix indexer tries labels first and would
        # pick the wrong rows (or fail) on integer-labelled data.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        fold_key = "fold_%s" % fold
        for d in range(1, degrees + 1):
            print("Degree: %s" % d)
            polynomial_features = PolynomialFeatures(degree=d, include_bias=False)
            linear_regression = LinearRegression()
            model = Pipeline([("polynomial_features", polynomial_features),
                              ("liear_regression", linear_regression)])
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            test_mse = mean_squared_error(y_test, y_pred)
            kf_dict[fold_key].append(test_mse)

        # Freeze this fold's per-degree errors as an array so the folds can
        # be summed element-wise below.
        kf_dict[fold_key] = np.array(kf_dict[fold_key])

    # Average the per-degree test MSE across all folds.
    kf_dict["avg"] = np.zeros(degrees)
    for i in range(1, folds + 1):
        kf_dict["avg"] += kf_dict["fold_%s" % i]
    kf_dict["avg"] /= float(folds)
    return kf_dict
if __name__ == '__main__':
    # Script entry point: download 2004 FTSE data, build the lagged-return
    # predictors and cross-validate polynomial regressions on them.
    symbol = "^FTSE"
    start_date = datetime.datetime(2004, 1, 1)
    end_date = datetime.datetime(2004, 12, 31)

    # Single source of truth for the number of lagged predictors: it drives
    # both the columns created by create_lagged_series and the columns
    # selected into X, so the two can no longer drift apart.
    lags = 5
    ftse_lags = create_lagged_series(symbol, start_date, end_date, lags=lags)

    # Predictors are the lagged returns "Lag1" ... "Lag<lags>"; the response
    # is the same-day return.
    X = ftse_lags[["Lag%s" % i for i in range(1, lags + 1)]]
    y = ftse_lags["Today"]

    degrees = 3
    folds = 10
    # NOTE(review): within this section the result is only stored, not
    # printed or plotted - confirm whether later code consumes kf_dict.
    kf_dict = k_fold_cross_val_poly(folds, degrees, X, y)