K-Fold Cross Validation

import datetime
import pprint

import numpy as np
import pandas as pd
from pandas.io.data import DataReader
import pylab as plt
import sklearn
from sklearn.cross_validation import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

from pandas.io.data import DataReader


def create_lagged_series(symbol, start_date, end_date, lags=5):
    """
    This creates a pandas DataFrame that stores the percentage returns of the 
    adjusted closing value of a stock obtained from Yahoo Finance, along with 
    a number of lagged returns from the prior trading days (lags defaults to 5 days).
    Trading volume, as well as the Direction from the previous day, are also included.
    """

    # Obtain stock information from Yahoo Finance
    ts = DataReader(symbol, "yahoo", start_date-datetime.timedelta(days=365), end_date)

    # Create the new lagged DataFrame
    tslag = pd.DataFrame(index=ts.index)
    tslag["Today"] = ts["Adj Close"]
    tslag["Volume"] = ts["Volume"]

    # Create the shifted lag series of prior trading period close values
    for i in xrange(0,lags):
        tslag["Lag%s" % str(i+1)] = ts["Adj Close"].shift(i+1)

    # Create the returns DataFrame
    tsret = pd.DataFrame(index=tslag.index)
    tsret["Volume"] = tslag["Volume"]
    tsret["Today"] = tslag["Today"].pct_change()*100.0

    # If any of the values of percentage returns equal zero, set them to
    # a small number (stops issues with QDA model in scikit-learn)
    for i,x in enumerate(tsret["Today"]):
        if (abs(x) < 0.0001):
            tsret["Today"][i] = 0.0001

    # Create the lagged percentage returns columns
    for i in xrange(0,lags):
        tsret["Lag%s" % str(i+1)] = tslag["Lag%s" % str(i+1)].pct_change()*100.0

    # Create the "Direction" column (+1 or -1) indicating an up/down day
    tsret["Direction"] = np.sign(tsret["Today"])
    tsret = tsret[tsret.index >= start_date]

    return tsret
def k_fold_cross_val_poly(folds, degrees, X, y):
    n = len(X)
    kf = KFold(n, n_folds=folds)
    kf_dict=dict([("fold_%s"%i, []) for i in range(1, folds+1)])
    fold = 0
    
    for train_index, test_index in kf:
        fold += 1
        print "Fold: %s" % fold
        X_train, X_test = X.ix[train_index], X.ix[test_index]
        y_train, y_test = y.ix[train_index], y.ix[test_index]
        
        for d in range(1, degrees+1):
            print "Degree: %s" % d
            polynomial_features = PolynomialFeatures(degree=d, include_bias=False)
            linear_regression = LinearRegression()
            
            model = Pipeline([("polynomial_features", polynomial_features),
                              ("liear_regression", linear_regression)])
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            test_mse = mean_squared_error(y_test, y_pred)
            kf_dict["fold_%s" % fold].append(test_mse)
        kf_dict["fold_%s" % fold] = np.array(kf_dict["fold_%s" % fold])
    kf_dict["avg"] = np.zeros(degrees)
    for i in range(1, folds+1):
        kf_dict["avg"] += kf_dict["fold_%s" % i]
    kf_dict["avg"] /= float(folds)
    return kf_dict
    
if __name__ == '__main__':
    symbol = "^FTSE"
    start_date = datetime.datetime(2004, 1, 1)
    end_date = datetime.datetime(2004, 12, 31)
    ftse_lags = create_lagged_series(symbol, start_date, end_date, lags=5)
    
    X = ftse_lags[[
        "Lag1", "Lag2", "Lag3", "Lag4", "Lag5",
        #"Lag6", "Lag7", "Lag8", "Lag9", "Lag10",
        #"Lag11", "Lag12", "Lag13", "Lag14", "Lag15",
        #"Lag16", "Lag17", "Lag18", "Lag19", "Lag20"
    ]]
    y = ftse_lags["Today"]
    degrees = 3
    
    folds = 10
    kf_dict = k_fold_cross_val_poly(folds, degrees, X, y)




  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值