import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
# Input files and experiment settings, gathered in one place.
FEATURES_CSV = './Q4_backup.csv'
BTC_CSV = './Q2_backup.csv'
START_DATE = '2020-05-01'
END_DATE = '2021-05-31'
TARGET_COL = 'Adj Close'
MAX_FEATURES = 20
N_SPLITS = 5
RANDOM_STATE = 5206


def filter_date_range(frame, column, start=START_DATE, end=END_DATE):
    """Return a copy of the rows where ``start <= frame[column] <= end``.

    The comparison is done on the column values as stored.
    NOTE(review): with string dates this is a lexicographic comparison, which
    only matches chronological order for ISO 'YYYY-MM-DD' text -- confirm the
    CSV date format.

    A copy is returned so callers can add columns without writing into a
    slice of the original frame.
    """
    in_window = (frame[column] >= start) & (frame[column] <= end)
    return frame[in_window].copy()


def build_merged_frame(features, prices):
    """Inner-join the weekly feature rows with the price rows on ``Date``.

    Both inputs are restricted to the START_DATE..END_DATE window first. The
    feature frame's ``Date_of_Monday`` column is duplicated as ``Date`` to
    provide the join key; neither input frame is modified.
    """
    windowed_features = filter_date_range(features, 'Date_of_Monday')
    windowed_features['Date'] = windowed_features['Date_of_Monday']
    windowed_prices = filter_date_range(prices, 'Date')
    return pd.merge(windowed_features, windowed_prices, on='Date', how='inner')


def cross_validated_rmse(X, y, kf):
    """Return the mean per-fold RMSE of a LinearRegression over ``kf``'s splits."""
    fold_rmse = []
    for train_index, test_index in kf.split(X):
        model = LinearRegression()
        model.fit(X[train_index], y[train_index])
        predictions = model.predict(X[test_index])
        # mean_squared_error expects (y_true, y_pred).
        fold_rmse.append(np.sqrt(mean_squared_error(y[test_index], predictions)))
    # Average over the folds actually produced instead of a hard-coded 5.
    return np.mean(fold_rmse)


def select_best_prefix(frame, feature_cols, target=TARGET_COL,
                       max_features=MAX_FEATURES):
    """Pick how many leading feature columns give the lowest CV RMSE.

    For p = 1..max_features (capped at the number of available columns) a
    linear model on the first p columns of ``feature_cols`` is scored with
    shuffled K-fold cross-validation (N_SPLITS folds, fixed RANDOM_STATE).

    Returns:
        ``(best_p, best_rmse)``; ties keep the smaller p.

    Raises:
        ValueError: if no candidate produced a finite RMSE below infinity
            (for example, when there are no feature columns).
    """
    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    y = np.array(frame[target])
    best_p, best_rmse = 0, float('inf')
    for p in range(1, min(max_features, len(feature_cols)) + 1):
        X = np.array(frame[feature_cols[:p]])
        rmse = cross_validated_rmse(X, y, kf)
        if rmse < best_rmse:
            best_p, best_rmse = p, rmse
    if best_p == 0:
        raise ValueError('no feature subset produced a usable RMSE')
    return best_p, best_rmse


def main():
    """Load the CSVs, select the best feature count, refit and report."""
    X_ord = pd.read_csv(FEATURES_CSV)
    btc_weekly = pd.read_csv(BTC_CSV)
    merged_df = build_merged_frame(X_ord, btc_weekly)

    # Every column after the first is a candidate feature.
    # NOTE(review): presumably column 0 is the date column -- confirm.
    feature_cols = X_ord.columns[1:]

    best_p, best_5_fold_rmse = select_best_prefix(merged_df, feature_cols)

    # Refit on all merged rows using the selected leading columns.
    model = LinearRegression()
    X = np.array(merged_df[feature_cols[:best_p]])
    y = np.array(merged_df[TARGET_COL])
    model.fit(X, y)

    print(f'best p: {best_p}, best 5 fold rmse: {best_5_fold_rmse}')
    print(f'model coef: {model.coef_}')


if __name__ == '__main__':
    main()