Module One : Feature Selection
Import Module
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
Part One : Data Preparation
Step one : Import data
# Step-1 inputs: macro indicators and the credit-spread series, read from
# Excel files in the working directory (sheet layout not visible here).
data_Macro = pd.read_excel('data_Macro.xlsx')
data_spread = pd.read_excel('data_spread.xlsx')
Step two : Merge dataframe and delete useless columns
# Inner-join the two frames on their common column(s) (pd.merge default),
# drop rows with any missing value, and snapshot the merged set to disk.
data_set = pd.merge(data_Macro, data_spread)
data_set = data_set.dropna()
data_set.to_excel('data_merge.xlsx')
# Drop the raw yield columns and the date, keeping 'Spread' (target) and
# the remaining features.  NOTE(review): passing both columns= and axis=1
# is redundant — columns= already implies axis=1.
data_set = data_set.drop(columns = ['BAMLC0A1CAAAEY', 'DGS10', 'Date'], axis = 1)
Step three : Split Training set and Test set
def Split_data(dataset):
    """Split a DataFrame 50/50 into a train half and a test half.

    Parameters
    ----------
    dataset : pd.DataFrame ordered chronologically.

    Returns
    -------
    (Train_data, Test_data) : the first half and the second half, with
    no shared rows.
    """
    num_data = dataset.shape[0]
    split_number = int(num_data * 0.5)
    # Bug fix: the original used label-based .loc[:n] / .loc[n:], which
    # (a) put the boundary row in BOTH halves (loc slices are inclusive)
    # and (b) returned an empty train set whenever the index did not start
    # at 0 (e.g. After_data keeps its original 1153+ labels).  iloc is
    # position-based and correct for any index.
    Train_data = dataset.iloc[:split_number]
    Test_data = dataset.iloc[split_number:]
    return Train_data, Test_data
Split Dataset into two parts : Before 2008 & After 2008
# Row 1153 is treated as the pre-/post-2008 boundary.
# NOTE(review): .loc label slicing is inclusive at both ends, so row 1153
# appears in BOTH Before_data and After_data — confirm this is intended.
Before_data = data_set.loc[:1153]
After_data = data_set.loc[1153:]
Step four : Data transformation
Data normalization
def normalize(X_train, X_test, string_list):
    """Z-score the listed columns using TRAIN statistics only.

    The test set is standardized with the train mean/std so no test
    information leaks into the transformation.

    Parameters
    ----------
    X_train, X_test : pd.DataFrame; neither is modified in place.
    string_list     : column names to standardize.

    Returns
    -------
    (result_train, result_test) : copies with the listed columns scaled.
    """
    # Bug fix: the original re-copied the frames on EVERY loop iteration
    # (discarding previously scaled columns) and then returned the
    # untouched X_train/X_test — the function was a no-op.  Copy once,
    # scale each column, and return the modified copies.
    result_train = X_train.copy()
    result_test = X_test.copy()
    for col in string_list:
        mean = X_train[col].mean()
        std = X_train[col].std()
        result_train[col] = (X_train[col] - mean) / std
        result_test[col] = (X_test[col] - mean) / std
    return result_train, result_test
Create lags
def Lag_data(X_train, X_test, lag_list):
    """Add lagged-difference features for every column except 'Spread'.

    For each feature column c and each lag k, a new column 'c-lagk'
    holding the k-period difference c_t - c_{t-k} is appended.  Rows made
    NaN by differencing (the first max(lag_list) rows) are dropped.

    Parameters
    ----------
    X_train, X_test : pd.DataFrame containing a 'Spread' column.
    lag_list        : iterable of integer lag horizons.

    Returns
    -------
    (Train_result, Test_result) : augmented copies, NaN rows removed.

    NOTE(review): .diff(k) is a k-period *difference*, not a shifted lag
    (x_{t-k}); confirm the difference is what the model should consume.
    """
    feature_cols = list(X_train)
    feature_cols.remove('Spread')
    Train_result = X_train.copy()
    Test_result = X_test.copy()
    # Iterate the lists directly instead of range(len(...)) indexing.
    for col in feature_cols:
        for lag in lag_list:
            name = col + '-lag' + str(lag)
            Train_result[name] = X_train[col].diff(lag)
            Test_result[name] = X_test[col].diff(lag)
    Train_result = Train_result.dropna()
    Test_result = Test_result.dropna()
    return Train_result, Test_result
Step Six : Split X Y
def Split_XY(dataset):
    """Separate the target column 'Spread' from the feature columns.

    Returns (X, Y): X is the frame without 'Spread', Y is the 'Spread'
    series.
    """
    target = dataset['Spread']
    features = dataset.drop(columns = ['Spread'])
    return features, target
Step five : Combine all the steps and get the data
def Data_pipline(dataset, string_list, lag_list):
    """Run the full preparation pipeline on one dataset.

    Steps: 50/50 train/test split, z-score the columns in string_list
    (train statistics only), append lagged-difference features for
    lag_list, then separate features from the 'Spread' target.

    Returns (X_train, X_test, y_train, y_test).
    """
    train_part, test_part = Split_data(dataset)
    train_part, test_part = normalize(train_part, test_part, string_list)
    train_part, test_part = Lag_data(train_part, test_part, lag_list)
    X_train, y_train = Split_XY(train_part)
    X_test, y_test = Split_XY(test_part)
    return X_train, X_test, y_train, y_test
def Data_ndarray(X_train, X_test, y_train, y_test):
    """Convert the four pandas objects to numpy arrays, order preserved."""
    return (
        np.array(X_train),
        np.array(X_test),
        np.array(y_train),
        np.array(y_test),
    )
Step six : Get the data
# Columns to z-score and the lag horizons (in observations) for the
# difference features.
string_list = ['Dollar Index', 'S&P 500']
lag_list = [1, 2, 5, 10, 20]
# Run the pipeline separately on the pre-2008 (B) and post-2008 (A) sets,
# keeping both the pandas versions (for column names) and ndarray versions.
X_train_pd_B, X_test_pd_B, y_train_pd_B, y_test_pd_B = Data_pipline(Before_data, string_list, lag_list)
X_train_B, X_test_B, y_train_B, y_test_B = Data_ndarray(X_train_pd_B, X_test_pd_B, y_train_pd_B, y_test_pd_B)
X_train_pd_A, X_test_pd_A, y_train_pd_A, y_test_pd_A = Data_pipline(After_data, string_list, lag_list)
X_train_A, X_test_A, y_train_A, y_test_A = Data_ndarray(X_train_pd_A, X_test_pd_A, y_train_pd_A, y_test_pd_A)
Part Two : Lasso Regression
from sklearn import linear_model
Set a grid of lambda values to observe how the coefficients change as lambda changes
# Grid of Lasso penalties: 0.10, 0.11, ..., 4.99.
lambda_list = np.arange(0.1, 5, 0.01)
Calculate Loss
def Compute_loss(coef, X_test, y_test):
    """Mean squared error of the linear prediction X_test @ coef.

    NOTE(review): no intercept term enters the prediction — only the
    coefficient vector is applied.
    """
    residual = y_test - X_test @ coef
    return residual @ residual / len(y_test)
Calculate the coefficients for each value of lambda
def Para_Selection(X_train, X_test, y_train, y_test, lambda_list):
    """Sweep the Lasso penalty grid, recording coefficients and test MSE.

    Parameters
    ----------
    X_train, y_train : arrays used to fit each Lasso.
    X_test, y_test   : held-out arrays used to score each fit.
    lambda_list      : iterable of alpha (penalty) values to try.

    Returns
    -------
    (coef_list, loss_list) : ndarray (n_lambda, n_features) of fitted
    coefficients, and ndarray (n_lambda,) of test-set mean squared error.
    """
    coef_list = []
    loss_list = []
    for lam in lambda_list:
        model = linear_model.Lasso(alpha = lam)
        model.fit(X_train, y_train)
        coef_list.append(model.coef_)
        # Bug fix: score with model.predict so the fitted intercept is
        # included; Compute_loss(model.coef_, ...) silently dropped it
        # (Lasso fits an intercept by default), biasing every loss.
        residual = y_test - model.predict(X_test)
        loss_list.append(np.dot(residual, residual) / len(y_test))
    coef_list = np.array(coef_list)
    loss_list = np.array(loss_list)
    return coef_list, loss_list
View the coef_list in Dataframe
# Sweep the lambda grid on both periods, then save the coefficient paths
# (rows = lambda values, columns = feature names) for inspection in Excel.
coef_list_B, loss_list_B = Para_Selection(X_train_B, X_test_B, y_train_B, y_test_B, lambda_list)
coef_list_A, loss_list_A = Para_Selection(X_train_A, X_test_A, y_train_A, y_test_A, lambda_list)
coef_list_B = pd.DataFrame(coef_list_B,columns = list(X_train_pd_B), index = lambda_list)
coef_list_A = pd.DataFrame(coef_list_A,columns = list(X_train_pd_A), index = lambda_list)
coef_list_B.to_excel('parameter_selectionB.xlsx')
coef_list_A.to_excel('parameter_selectionA.xlsx')
Module Two : OLS Regression
Part One : Create new dataset & Do the same work as before
# Restrict to the three selected features plus the target, reset to a
# clean 0..n-1 integer index, and redo the pre-/post-2008 split.
new_data = data_set[['TED', 'S&P 500', 'Dollar Index','Spread']]
new_data = new_data.set_index(np.arange(0,new_data.shape[0],1))
Before_data = new_data.loc[:1153]
After_data = new_data.loc[1153:]
# Re-run the full pipeline and the Lasso lambda sweep on the reduced
# feature set, exporting the new coefficient paths.
X_train_pd_B, X_test_pd_B, y_train_pd_B, y_test_pd_B = Data_pipline(Before_data, string_list, lag_list)
X_train_B, X_test_B, y_train_B, y_test_B = Data_ndarray(X_train_pd_B, X_test_pd_B, y_train_pd_B, y_test_pd_B)
X_train_pd_A, X_test_pd_A, y_train_pd_A, y_test_pd_A = Data_pipline(After_data, string_list, lag_list)
X_train_A, X_test_A, y_train_A, y_test_A = Data_ndarray(X_train_pd_A, X_test_pd_A, y_train_pd_A, y_test_pd_A)
coef_list_B, loss_list_B = Para_Selection(X_train_B, X_test_B, y_train_B, y_test_B, lambda_list)
coef_list_A, loss_list_A = Para_Selection(X_train_A, X_test_A, y_train_A, y_test_A, lambda_list)
coef_list_B = pd.DataFrame(coef_list_B,columns = list(X_train_pd_B), index = lambda_list)
coef_list_A = pd.DataFrame(coef_list_A,columns = list(X_train_pd_A), index = lambda_list)
coef_list_B.to_excel('parameter_selectionB_new.xlsx')
coef_list_A.to_excel('parameter_selectionA_new.xlsx')
Part Two : Determine the parameter and do linear regression
# Final OLS fit on the reduced feature set (no lag features): split,
# normalize, then regress Spread on the three z-scored columns.
Train_data, Test_data = Split_data(new_data)
Train_data, Test_data = normalize(Train_data, Test_data, ['TED', 'S&P 500', 'Dollar Index'])
Train_X, Train_y = Split_XY(Train_data)
Test_X, Test_y = Split_XY(Test_data)
# NOTE(review): arguments are passed as (X, y, X, y) although the function
# signature names them (X_train, X_test, y_train, y_test); since it only
# converts each object independently, positional order is preserved and
# the unpacking is still correct.
Train_X, Train_y, Test_X, Test_y = Data_ndarray(Train_X, Train_y, Test_X, Test_y)
from sklearn.metrics import r2_score
Model = linear_model.LinearRegression()
Model.fit(Train_X, Train_y)
y_pred_test = Model.predict( Test_X )
y_pred_train = Model.predict( Train_X )
# In-sample vs out-of-sample R^2.
R2_BA_test = r2_score(Test_y, y_pred_test )
R2_BA_train = r2_score(Train_y, y_pred_train)
# Notebook-style echo of (train R^2, test R^2).
R2_BA_train, R2_BA_test
(0.6511875551955599, -4.046149620527698)
# Plot the in-sample residual series to eyeball autocorrelation structure.
residual = Train_y - y_pred_train
x = np.arange(0,len(residual),1)
plt.plot(x, residual)
plt.show()
Module Three : ARMA
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.ar_model import AR
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.api as sm
Part 1 : Test the ACF & PACF of Residual
# ACF (top panel) and PACF (bottom panel) of the OLS residuals, 40 lags.
fig = plt.figure(figsize=(12,8))
ax1=fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(residual,lags=40,ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(residual,lags=40,ax=ax2)
plt.show()
From the graphs above, we conclude that an AR model would be suitable in this case.
Part 2 : Build AR Model
# Fit an AR model to the residuals, holding out the last 8 points.
# NOTE(review): residual[1:len-7] covers positions 1..len-8 and
# residual[len-8:] starts at position len-8, so the two slices share one
# element — confirm the train/test split is intended.
train, test = residual[1:len(residual)-7], residual[len(residual)-8:]
model = AR(train)
model_fit = model.fit()
#print('Lag: %s' % model_fit.k_ar)
#print('Coefficients: %s' % model_fit.params)
# make predictions
# dynamic=True: multi-step forecast feeding predictions back as inputs.
predictions = model_fit.predict(start=len(train), end=len(train)+len(test)-1, dynamic=True)
#for i in range(len(predictions)):
# print('predicted=%f, expected=%f' % (predictions[i], test[i]))
plt.plot(test)
plt.plot(predictions, color='red')
plt.show()
predictions
1705 0.094650
1706 0.060665
1707 0.055057
1708 0.052883
1709 0.060204
1710 0.052916
1711 0.060637
1712 0.039208
dtype: float64
test
1705 0.075248
1706 0.053296
1707 0.044266
1708 -0.103648
1709 -0.061762
1710 -0.019262
1711 -0.031760
1712 -0.024399
Name: Spread, dtype: float64
# Forecast errors of the AR model over the held-out window (echoed below).
new_residual = predictions - test
new_residual
1705 0.019402
1706 0.007369
1707 0.010791
1708 0.156530
1709 0.121966
1710 0.072178
1711 0.092397
1712 0.063606
dtype: float64
Module Four : Determine the Final Model
Part 1 : Re-estimate the OLS parameters each day (walk-forward)
# Walk-forward evaluation: refit OLS each step on all data seen so far,
# predict the next observation, then append that observation to the
# training set before the next refit.
residual_list = []
y_pred_list = []
x_train_new = Train_X
y_train_new = Train_y
for i in range(Test_X.shape[0]):
    Model = linear_model.LinearRegression()
    result = Model.fit(x_train_new, y_train_new)
    y_pred = result.predict(Test_X[i].reshape(1,-1))
    # One-step-ahead error.  NOTE(review): this rebinds the module-level
    # name `residual` used by the ARMA section above — cell order matters.
    residual = y_pred - Test_y[i]
    residual_list.append(residual)
    y_pred_list.append(y_pred)
    # Grow the training set with the observation just predicted.
    X_add = Test_X[i].reshape(1,-1)
    Y_add = Test_y[i]
    x_train_new = np.concatenate((x_train_new, X_add), axis = 0)
    y_train_new = np.append(y_train_new, Y_add)
y_pred_new = np.array(y_pred_list).flatten()
Part 2 : Calculate IC
# Information coefficient over an EXPANDING window: correlation between
# predictions and realized values for the first 100 points, then 101, ...
corr_list = []
wrong_list = []
for i in range(len(Test_y) - 100) :
    A_array = y_pred_new[:i+100]
    B_array = Test_y[:i+100]
    corr = np.corrcoef(A_array, B_array)[0][1]
    # NOTE(review): a true correlation can never exceed 1, so this branch
    # can only catch numerical anomalies (and NaN compares as False).
    if corr > 1 :
        wrong_list.append(i)
    corr_list.append(corr)
corr_array = np.array(corr_list).flatten()
Part Three : Plot the result
# Predicted (red) vs realized spread over the walk-forward test window.
x = np.arange(0, len(Test_y), 1)
plt.plot(x, y_pred_new, color = 'red')
plt.plot(x, Test_y)
plt.show()
# Expanding-window correlation series from the loop above.
x_corr = np.arange(0,len(corr_array),1)
plt.plot(x_corr, corr_array)
plt.show()
# Rolling 500-observation correlation between predictions and realized
# values, plotted to see how predictive power moves through time.
corr_list_new = []
for i in range(len(Test_y) - 500) :
    A_array = y_pred_new[i:i+500]
    B_array = Test_y[i:i+500]
    corr = np.corrcoef(A_array, B_array)[0][1]
    corr_list_new.append(corr)
corr_array_new = np.array(corr_list_new).flatten()
x_corr_new = np.arange(0,len(corr_array_new),1)
plt.plot(x_corr_new, corr_array_new)
plt.show()