# Implements ridge regression with V-fold cross validation.
import numpy as np
import sys
import math
def load_data(filename):
    """Load a whitespace-separated data file.

    Each non-empty line holds the feature values followed by the label in
    the last column.  A constant 1 is prepended to every feature row as the
    bias term (see the module-level note on `dim`).

    Returns (x, y) as numpy arrays of shapes (n, d+1) and (n, 1), or None
    when the file cannot be opened (preserved from the original contract —
    callers that unpack the result will fail loudly on a missing file).
    """
    x = []
    y = []
    try:
        fin = open(filename, 'r')
    except IOError:
        # Original behavior: signal failure with None rather than raising.
        return
    with fin:  # guarantees the handle is closed even if parsing raises
        for line in fin:
            # split() with no argument is robust to \r\n endings and to
            # repeated spaces (the original split(" ") crashed on those).
            items = line.split()
            if not items:
                continue  # tolerate blank lines
            y.append([float(items[-1])])
            x.append([1] + [float(v) for v in items[:-1]])
    return np.array(x), np.array(y)
# Regularization strength for ridge regression; overwritten by each pro* driver.
lamb = 10
# Held-out test set (features include the prepended bias column).
test_x = []
test_y = []
# Training set; pro16/pro17 fill it with the first 120 samples only.
train_x = []
train_y = []
# Validation split used by pro16/pro17 (samples 120..end) and pro19 (CV folds).
valid_x = []
valid_y = []
dim = 3 # add one more bias dimension manually
def _zero_one_error(x, y, w):
    """Fraction of rows where sign(x.w) disagrees with the +/-1 labels y.

    A score of exactly 0 maps to +1, matching the original `>= 0` threshold.
    """
    scores = np.asarray(x, dtype=float).dot(w).ravel()
    predicted = np.where(scores >= 0, 1.0, -1.0)
    actual = np.asarray(y, dtype=float).ravel()
    return 1.0 * np.count_nonzero(predicted != actual) / len(actual)


def ridge_regression():
    """Fit ridge regression on the training set and report 0/1 errors.

    Solves the closed form w = (lambda*I + X^T X)^-1 X^T y using the
    module-level globals lamb, train_x/train_y, then prints and returns the
    classification error on the train, test, and validation sets.

    Returns (Ein, Eout, Evalid).
    """
    global lamb, test_x, test_y, train_x, train_y, valid_x, valid_y
    # np.asarray replaces the deprecated np.matrix; math is unchanged.
    x = np.asarray(train_x, dtype=float)
    y = np.asarray(train_y, dtype=float)
    # Identity size follows the actual feature count instead of the
    # hard-coded global `dim` — identical when dim matches, and correct
    # for any other input width.
    n_features = x.shape[1]
    w_lin = np.linalg.inv(lamb * np.eye(n_features) + x.T.dot(x)).dot(x.T).dot(y)
    error = _zero_one_error(x, train_y, w_lin)
    print("Ein", error)
    error2 = _zero_one_error(test_x, test_y, w_lin)
    print("Eout", error2)
    error3 = _zero_one_error(valid_x, valid_y, w_lin)
    print("Evalid", error3)
    return error, error2, error3
def pro13():
    """Problem 13: ridge regression with a fixed lambda of 10."""
    global lamb, test_x, test_y, train_x, train_y
    train_x, train_y = load_data("hw4_train.dat")
    test_x, test_y = load_data("hw4_test.dat")
    lamb = 10
    ridge_regression()
def pro14():
    """Problem 14: sweep lambda over 10^-10 .. 10^2 and pick the one
    minimizing the in-sample error Ein; print the winner with its errors."""
    global lamb, test_x, test_y, train_x, train_y
    train_x, train_y = load_data("hw4_train.dat")
    test_x, test_y = load_data("hw4_test.dat")
    best_lamb = -10
    tmp_Ein = float('inf')  # sys.maxint was removed in Python 3
    tmp_Eout = 0
    for i in range(-10, 3):
        lamb = 10 ** i
        print("lambda", lamb, "log10 lamb", i)
        Ein, Eout, _ = ridge_regression()
        # >= keeps the largest lambda among ties, as in the original.
        if tmp_Ein >= Ein:
            best_lamb = lamb
            tmp_Ein = Ein
            tmp_Eout = Eout
    print("best lambda", best_lamb, tmp_Ein, tmp_Eout)
def pro15():
    """Problem 15: sweep lambda over 10^-10 .. 10^2 and pick the one
    minimizing the out-of-sample error Eout; print the winner."""
    global lamb, test_x, test_y, train_x, train_y
    train_x, train_y = load_data("hw4_train.dat")
    test_x, test_y = load_data("hw4_test.dat")
    best_lamb = -10
    tmp_Ein = float('inf')  # sys.maxint was removed in Python 3
    tmp_Eout = float('inf')
    for i in range(-10, 3):
        lamb = 10 ** i
        print("lambda", lamb, "log10 lamb", i)
        Ein, Eout, _ = ridge_regression()
        # >= keeps the largest lambda among ties, as in the original.
        if tmp_Eout >= Eout:
            best_lamb = lamb
            tmp_Ein = Ein
            tmp_Eout = Eout
    print("best lambda", best_lamb, tmp_Ein, tmp_Eout)
def pro16():
    """Problem 16: split training data 120/rest into train/validation,
    sweep lambda, and select by training error Ein."""
    global lamb, test_x, test_y, train_x, train_y, valid_x, valid_y
    x, y = load_data("hw4_train.dat")
    # First 120 samples train, the remainder validates (extend preserves
    # the original append-loop behavior on the module-level lists).
    train_x.extend(x[:120])
    train_y.extend(y[:120])
    valid_x.extend(x[120:])
    valid_y.extend(y[120:])
    test_x, test_y = load_data("hw4_test.dat")
    best_lamb = -10
    tmp_Ein = float('inf')  # sys.maxint was removed in Python 3
    tmp_Eout = float('inf')
    tmp_Eval = float('inf')
    for i in range(-10, 3):
        lamb = 10 ** i
        print("lambda", lamb, "log10 lamb", i)
        Ein, Eout, Eval = ridge_regression()
        # >= keeps the largest lambda among ties, as in the original.
        if tmp_Ein >= Ein:
            best_lamb = lamb
            tmp_Ein = Ein
            tmp_Eout = Eout
            tmp_Eval = Eval
    print("best lambda", best_lamb, tmp_Ein, tmp_Eval, tmp_Eout)
def pro17():
    """Problem 17: split training data 120/rest into train/validation,
    sweep lambda, and select by validation error Eval."""
    global lamb, test_x, test_y, train_x, train_y, valid_x, valid_y
    x, y = load_data("hw4_train.dat")
    # First 120 samples train, the remainder validates (extend preserves
    # the original append-loop behavior on the module-level lists).
    train_x.extend(x[:120])
    train_y.extend(y[:120])
    valid_x.extend(x[120:])
    valid_y.extend(y[120:])
    test_x, test_y = load_data("hw4_test.dat")
    best_lamb = -10
    tmp_Ein = float('inf')  # sys.maxint was removed in Python 3
    tmp_Eout = float('inf')
    tmp_Eval = float('inf')
    for i in range(-10, 3):
        lamb = 10 ** i
        print("lambda", lamb, "log10 lamb", i)
        Ein, Eout, Eval = ridge_regression()
        # >= keeps the largest lambda among ties, as in the original.
        if tmp_Eval >= Eval:
            best_lamb = lamb
            tmp_Ein = Ein
            tmp_Eout = Eout
            tmp_Eval = Eval
    print("best lambda", best_lamb, tmp_Ein, tmp_Eval, tmp_Eout)
def pro18():
    """Problem 18: ridge regression with a fixed lambda of 1."""
    global lamb, test_x, test_y, train_x, train_y
    train_x, train_y = load_data("hw4_train.dat")
    test_x, test_y = load_data("hw4_test.dat")
    lamb = 1
    ridge_regression()
def pro19():
    """Problem 19: 5-fold cross-validation (folds of 40 over 200 training
    samples) to choose lambda by average validation error Ecv."""
    global lamb, test_x, test_y, train_x, train_y, valid_x, valid_y
    x, y = load_data("hw4_train.dat")
    test_x, test_y = load_data("hw4_test.dat")
    best_lamb = -10
    tmp_Ecv = float('inf')  # sys.maxint was removed in Python 3
    fold = 40  # 200 samples / 5 folds
    for i in range(-10, 3):
        lamb = 10 ** i
        print("lambda", lamb, "log10 lamb", i)
        Ecv = 0
        for v in range(0, 200, fold):
            # Hold fold [v, v+fold) out for validation; train on the rest.
            train_x = np.concatenate([x[0:v], x[v + fold:200]])
            train_y = np.concatenate([y[0:v], y[v + fold:200]])
            valid_x = x[v:v + fold]
            valid_y = y[v:v + fold]
            _, _, Eval = ridge_regression()
            Ecv += Eval
        Ecv /= 5.0
        print("Ecv", Ecv)
        # >= keeps the largest lambda among ties, as in the original.
        if tmp_Ecv >= Ecv:
            best_lamb = lamb
            tmp_Ecv = Ecv
    print("best lambda", best_lamb, "Ecv", tmp_Ecv)
def pro20():
    """Run ridge regression with a tiny fixed lambda of 1e-8."""
    global lamb, test_x, test_y, train_x, train_y
    train_x, train_y = load_data("hw4_train.dat")
    test_x, test_y = load_data("hw4_test.dat")
    lamb = 1e-8
    ridge_regression()
if __name__ == '__main__':
    # Each pro* function answers one homework problem; swap the call below
    # to run a different one.  (Commented-out dead code removed.)
    pro20()