# Implements ridge regression with V-fold cross validation.
import numpy as np
import sys
import math
def load_data(filename):
    """Load a whitespace-separated data file.

    Each non-empty line holds the feature values followed by the label in
    the last column.  A constant 1 is prepended to every feature row as the
    bias term (see the module-level note on `dim`).

    Returns (x, y) as numpy arrays of shapes (n, d+1) and (n, 1), or None
    when the file cannot be opened (preserved from the original contract —
    callers that unpack the result will fail loudly on a missing file).
    """
    x = []
    y = []
    try:
        fin = open(filename, 'r')
    except IOError:
        # Original behavior: signal failure with None rather than raising.
        return
    with fin:  # guarantees the handle is closed even if parsing raises
        for line in fin:
            # split() with no argument is robust to \r\n endings and to
            # repeated spaces (the original split(" ") crashed on those).
            items = line.split()
            if not items:
                continue  # tolerate blank lines
            y.append([float(items[-1])])
            x.append([1] + [float(v) for v in items[:-1]])
    return np.array(x), np.array(y)
# Regularization strength for ridge regression; overwritten by each pro* driver.
lamb = 10
# Held-out test set (features include the prepended bias column).
test_x = []
test_y = []
# Training set; pro16/pro17 fill it with the first 120 samples only.
train_x = []
train_y = []
# Validation split used by pro16/pro17 (samples 120..end) and pro19 (CV folds).
valid_x = []
valid_y = []
dim = 3 # add one more bias dimension manually
def _zero_one_error(x, y, w):
    """Fraction of rows where sign(x.w) disagrees with the +/-1 labels y.

    A score of exactly 0 maps to +1, matching the original `>= 0` threshold.
    """
    scores = np.asarray(x, dtype=float).dot(w).ravel()
    predicted = np.where(scores >= 0, 1.0, -1.0)
    actual = np.asarray(y, dtype=float).ravel()
    return 1.0 * np.count_nonzero(predicted != actual) / len(actual)


def ridge_regression():
    """Fit ridge regression on the training set and report 0/1 errors.

    Solves the closed form w = (lambda*I + X^T X)^-1 X^T y using the
    module-level globals lamb, train_x/train_y, then prints and returns the
    classification error on the train, test, and validation sets.

    Returns (Ein, Eout, Evalid).
    """
    global lamb, test_x, test_y, train_x, train_y, valid_x, valid_y
    # np.asarray replaces the deprecated np.matrix; math is unchanged.
    x = np.asarray(train_x, dtype=float)
    y = np.asarray(train_y, dtype=float)
    # Identity size follows the actual feature count instead of the
    # hard-coded global `dim` — identical when dim matches, and correct
    # for any other input width.
    n_features = x.shape[1]
    w_lin = np.linalg.inv(lamb * np.eye(n_features) + x.T.dot(x)).dot(x.T).dot(y)
    error = _zero_one_error(x, train_y, w_lin)
    print("Ein", error)
    error2 = _zero_one_error(test_x, test_y, w_lin)
    print("Eout", error2)
    error3 = _zero_one_error(valid_x, valid_y, w_lin)
    print("Evalid", error3)
    return error, error2, error3
def pro13():
    """Problem 13: ridge regression with a fixed lambda of 10."""
    global lamb, test_x, test_y, train_x, train_y
    train_x, train_y = load_data("hw4_train.dat")
    test_x, test_y = load_data("hw4_test.dat")
    lamb = 10
    ridge_regression()
def pro14():
    """Problem 14: sweep lambda over 10^-10 .. 10^2 and pick the one
    minimizing the in-sample error Ein; print the winner with its errors."""
    global lamb, test_x, test_y, train_x, train_y
    train_x, train_y = load_data("hw4_train.dat")
    test_x, test_y = load_data("hw4_test.dat")
    best_lamb = -10
    tmp_Ein = float('inf')  # sys.maxint was removed in Python 3
    tmp_Eout = 0
    for i in range(-10, 3):
        lamb = 10 ** i
        print("lambda", lamb, "log10 lamb", i)
        Ein, Eout, _ = ridge_regression()
        # >= keeps the largest lambda among ties, as in the original.
        if tmp_Ein >= Ein:
            best_lamb = lamb
            tmp_Ein = Ein
            tmp_Eout = Eout
    print("best lambda", best_lamb, tmp_Ein, tmp_Eout)
def pro15():
    """Problem 15: sweep lambda over 10^-10 .. 10^2 and pick the one
    minimizing the out-of-sample error Eout; print the winner."""
    global lamb, test_x, test_y, train_x, train_y
    train_x, train_y = load_data("hw4_train.dat")
    test_x, test_y = load_data("hw4_test.dat")
    best_lamb = -10
    tmp_Ein = float('inf')  # sys.maxint was removed in Python 3
    tmp_Eout = float('inf')
    for i in range(-10, 3):
        lamb = 10 ** i
        print("lambda", lamb, "log10 lamb", i)
        Ein, Eout, _ = ridge_regression()
        # >= keeps the largest lambda among ties, as in the original.
        if tmp_Eout >= Eout:
            best_lamb = lamb
            tmp_Ein = Ein
            tmp_Eout = Eout
    print("best lambda", best_lamb, tmp_Ein, tmp_Eout)
def pro16():
    """Problem 16: split training data 120/rest into train/validation,
    sweep lambda, and select by training error Ein."""
    global lamb, test_x, test_y, train_x, train_y, valid_x, valid_y
    x, y = load_data("hw4_train.dat")
    # First 120 samples train, the remainder validates (extend preserves
    # the original append-loop behavior on the module-level lists).
    train_x.extend(x[:120])
    train_y.extend(y[:120])
    valid_x.extend(x[120:])
    valid_y.extend(y[120:])
    test_x, test_y = load_data("hw4_test.dat")
    best_lamb = -10
    tmp_Ein = float('inf')  # sys.maxint was removed in Python 3
    tmp_Eout = float('inf')
    tmp_Eval = float('inf')
    for i in range(-10, 3):
        lamb = 10 ** i
        print("lambda", lamb, "log10 lamb", i)
        Ein, Eout, Eval = ridge_regression()
        # >= keeps the largest lambda among ties, as in the original.
        if tmp_Ein >= Ein:
            best_lamb = lamb
            tmp_Ein = Ein
            tmp_Eout = Eout
            tmp_Eval = Eval
    print("best lambda", best_lamb, tmp_Ein, tmp_Eval, tmp_Eout)
def pro17():
    """Problem 17: split training data 120/rest into train/validation,
    sweep lambda, and select by validation error Eval."""
    global lamb, test_x, test_y, train_x, train_y, valid_x, valid_y
    x, y = load_data("hw4_train.dat")
    # First 120 samples train, the remainder validates (extend preserves
    # the original append-loop behavior on the module-level lists).
    train_x.extend(x[:120])
    train_y.extend(y[:120])
    valid_x.extend(x[120:])
    valid_y.extend(y[120:])
    test_x, test_y = load_data("hw4_test.dat")
    best_lamb = -10
    tmp_Ein = float('inf')  # sys.maxint was removed in Python 3
    tmp_Eout = float('inf')
    tmp_Eval = float('inf')
    for i in range(-10, 3):
        lamb = 10 ** i
        print("lambda", lamb, "log10 lamb", i)
        Ein, Eout, Eval = ridge_regression()
        # >= keeps the largest lambda among ties, as in the original.
        if tmp_Eval >= Eval:
            best_lamb = lamb
            tmp_Ein = Ein
            tmp_Eout = Eout
            tmp_Eval = Eval
    print("best lambda", best_lamb, tmp_Ein, tmp_Eval, tmp_Eout)
def pro18():
    """Problem 18: ridge regression with a fixed lambda of 1."""
    global lamb, test_x, test_y, train_x, train_y
    train_x, train_y = load_data("hw4_train.dat")
    test_x, test_y = load_data("hw4_test.dat")
    lamb = 1
    ridge_regression()
def pro19():
    """Problem 19: 5-fold cross-validation (folds of 40 over 200 training
    samples) to choose lambda by average validation error Ecv."""
    global lamb, test_x, test_y, train_x, train_y, valid_x, valid_y
    x, y = load_data("hw4_train.dat")
    test_x, test_y = load_data("hw4_test.dat")
    best_lamb = -10
    tmp_Ecv = float('inf')  # sys.maxint was removed in Python 3
    fold = 40  # 200 samples / 5 folds
    for i in range(-10, 3):
        lamb = 10 ** i
        print("lambda", lamb, "log10 lamb", i)
        Ecv = 0
        for v in range(0, 200, fold):
            # Hold fold [v, v+fold) out for validation; train on the rest.
            train_x = np.concatenate([x[0:v], x[v + fold:200]])
            train_y = np.concatenate([y[0:v], y[v + fold:200]])
            valid_x = x[v:v + fold]
            valid_y = y[v:v + fold]
            _, _, Eval = ridge_regression()
            Ecv += Eval
        Ecv /= 5.0
        print("Ecv", Ecv)
        # >= keeps the largest lambda among ties, as in the original.
        if tmp_Ecv >= Ecv:
            best_lamb = lamb
            tmp_Ecv = Ecv
    print("best lambda", best_lamb, "Ecv", tmp_Ecv)
def pro20():
    """Run ridge regression with a tiny fixed lambda of 1e-8."""
    global lamb, test_x, test_y, train_x, train_y
    train_x, train_y = load_data("hw4_train.dat")
    test_x, test_y = load_data("hw4_test.dat")
    lamb = 1e-8
    ridge_regression()
if __name__ == '__main__':
    # Each pro* function answers one homework problem; swap the call below
    # to run a different one.  (Commented-out dead code removed.)
    pro20()