林轩田《机器学习基石》作业一-Python实现

15.

import pandas as pd
import numpy as np


def get_training_set(path='F:\\Kaggle\\hw1_15_train.dat'):
    """Load the training file and return it as an augmented design matrix.

    The file is parsed as whitespace-separated text with five columns
    (named a, b, c, d, y). The first four columns are used as features and
    the fifth as the label.

    Args:
        path: Location of the data file. Defaults to the path that used to
            be hard-coded here.

    Returns:
        A tuple ``(X_train, y_train, data_size)`` where ``X_train`` has shape
        ``(data_size, 5)`` with a leading column of ones, ``y_train`` has
        shape ``(data_size, 1)`` and ``data_size`` is the number of rows.
    """
    # Supplying `names` makes pandas treat the first line as data rather than
    # as a header. The separator is a raw string because '\s' is not a valid
    # str escape sequence.
    data = pd.read_csv(path, sep=r'\s+', names=['a', 'b', 'c', 'd', 'y'])
    # Feature columns.
    X_train = np.array(data.iloc[:, 0:4])
    data_size = X_train.shape[0]
    # Prepend the constant column x0 = 1 so the bias is part of w.
    X_train = np.hstack((np.ones((data_size, 1)), X_train))
    # Labels, kept two-dimensional: shape (data_size, 1).
    y_train = np.array(data.iloc[:, 4:5])

    return X_train, y_train, data_size


def PLA(X_train=None, y_train=None):
    """Run the cyclic perceptron learning algorithm from w = 0.

    The samples are visited in order, and every sample with
    ``y * (w . x) <= 0`` triggers the correction ``w += y * x``. The loop only
    exits after a complete pass that makes no correction, so the function
    does not return if such a pass never happens.

    Args:
        X_train: Optional ``(n, d)`` sample matrix, bias column included.
        y_train: Optional labels, ``n`` values (any shape that reshapes to
            ``(n, 1)``). When either argument is omitted, the data is loaded
            with ``get_training_set()``.

    Returns:
        A tuple ``(num_iters, w)``: the number of corrections made and the
        final ``(d, 1)`` weight vector.
    """
    if X_train is None or y_train is None:
        X_train, y_train, _ = get_training_set()
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float).reshape(-1, 1)

    # Size the weights from the data instead of assuming five columns.
    dim = X_train.shape[1]
    w = np.zeros((dim, 1))
    num_iters = 0
    while True:
        updated = False
        for x_i, y_i in zip(X_train, y_train):
            # A score of exactly zero counts as a mistake, otherwise w = 0
            # would never be corrected.
            if np.dot(x_i, w) * y_i <= 0:
                w += y_i * x_i.reshape(dim, 1)
                updated = True
                num_iters += 1
        if not updated:
            break
    return num_iters, w


if __name__ == '__main__':
    # Only the number of corrections is reported. PLA() loads the data
    # itself, so the training set is not read a second time here.
    num, w = PLA()
    print(num)

结果为

 

16.

在15题的基础上进行修改:使用np.random.permutation()打乱数据的顺序。该函数返回一个新的随机排列,不会改变原始数据序列。

import pandas as pd
import numpy as np

def get_training_set(path='F:\\Kaggle\\hw1_15_train.dat'):
    """Load the training file and return it shuffled, with a bias column.

    The file is parsed as whitespace-separated text with five columns
    (named a, b, c, d, y). The first four columns are used as features and
    the fifth as the label. Rows are reordered by a fresh random permutation
    on every call.

    Args:
        path: Location of the data file. Defaults to the path that used to
            be hard-coded here.

    Returns:
        A tuple ``(X_train, y_train, data_size)`` where ``X_train`` has shape
        ``(data_size, 5)`` with a leading column of ones, ``y_train`` has
        shape ``(data_size, 1)`` and ``data_size`` is the number of rows.
    """
    # Supplying `names` makes pandas treat the first line as data rather than
    # as a header. The separator is a raw string because '\s' is not a valid
    # str escape sequence.
    data = pd.read_csv(path, sep=r'\s+', names=['a', 'b', 'c', 'd', 'y'])
    X_train = np.array(data.iloc[:, 0:4])
    data_size = X_train.shape[0]
    # Prepend the constant column x0 = 1 so the bias is part of w.
    X_train = np.hstack((np.ones((data_size, 1)), X_train))
    y_train = np.array(data.iloc[:, 4:5])
    # Apply one permutation to both arrays so samples and labels stay paired.
    order = np.random.permutation(data_size)
    X_train = X_train[order]
    y_train = y_train[order]

    return X_train, y_train, data_size

def PLA(X_train=None, y_train=None):
    """Run the cyclic perceptron learning algorithm and count corrections.

    Starting from w = 0, the samples are visited in the order given. Every
    sample with ``y * (w . x) <= 0`` triggers ``w += y * x``. The loop only
    exits after a complete pass that makes no correction, so the function
    does not return if such a pass never happens.

    Args:
        X_train: Optional ``(n, d)`` sample matrix, bias column included.
        y_train: Optional labels, ``n`` values (any shape that reshapes to
            ``(n, 1)``). When either argument is omitted, the data is loaded
            with ``get_training_set()``.

    Returns:
        The number of corrections made before a mistake-free pass.
    """
    if X_train is None or y_train is None:
        X_train, y_train, _ = get_training_set()
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float).reshape(-1, 1)

    # Size the weights from the data instead of assuming five columns.
    dim = X_train.shape[1]
    w = np.zeros((dim, 1))
    num_iters = 0
    while True:
        updated = False
        for x_i, y_i in zip(X_train, y_train):
            # A score of exactly zero counts as a mistake, otherwise w = 0
            # would never be corrected.
            if np.dot(x_i, w) * y_i <= 0:
                w += y_i * x_i.reshape(dim, 1)
                updated = True
                num_iters += 1
        if not updated:
            break
    return num_iters

if __name__ == '__main__':
    # Average the number of PLA corrections over repeated runs.
    # The accumulator is not called `sum`, which would hide the builtin.
    num_runs = 2000
    total_updates = 0
    for _ in range(num_runs):
        total_updates += PLA()
    ave_sum = total_updates / num_runs
    print(ave_sum)

结果为

 

17.

在16题的基础上引入学习速率alpha(代码中取0.5):修改PLA()函数,更新w时把修正量乘以alpha。

import pandas as pd
import numpy as np


def get_training_set(path='F:\\Kaggle\\hw1_15_train.dat'):
    """Load the training file and return it shuffled, with a bias column.

    The file is parsed as whitespace-separated text with five columns
    (named a, b, c, d, y). The first four columns are used as features and
    the fifth as the label. Rows are reordered by a fresh random permutation
    on every call.

    Args:
        path: Location of the data file. Defaults to the path that used to
            be hard-coded here.

    Returns:
        A tuple ``(X_train, y_train, data_size)`` where ``X_train`` has shape
        ``(data_size, 5)`` with a leading column of ones, ``y_train`` has
        shape ``(data_size, 1)`` and ``data_size`` is the number of rows.
    """
    # Supplying `names` makes pandas treat the first line as data rather than
    # as a header. The separator is a raw string because '\s' is not a valid
    # str escape sequence.
    data = pd.read_csv(path, sep=r'\s+', names=['a', 'b', 'c', 'd', 'y'])
    X_train = np.array(data.iloc[:, 0:4])
    data_size = X_train.shape[0]
    # Prepend the constant column x0 = 1 so the bias is part of w.
    X_train = np.hstack((np.ones((data_size, 1)), X_train))
    y_train = np.array(data.iloc[:, 4:5])
    # Apply one permutation to both arrays so samples and labels stay paired.
    order = np.random.permutation(data_size)
    X_train = X_train[order]
    y_train = y_train[order]

    return X_train, y_train, data_size


def PLA(X_train=None, y_train=None, alpha=0.5):
    """Run the cyclic perceptron learning algorithm with a learning rate.

    Starting from w = 0, the samples are visited in the order given. Every
    sample with ``y * (w . x) <= 0`` triggers ``w += alpha * y * x``. The
    loop only exits after a complete pass that makes no correction, so the
    function does not return if such a pass never happens.

    Args:
        X_train: Optional ``(n, d)`` sample matrix, bias column included.
        y_train: Optional labels, ``n`` values (any shape that reshapes to
            ``(n, 1)``). When either argument is omitted, the data is loaded
            with ``get_training_set()``.
        alpha: Step size applied to every correction. Defaults to 0.5, the
            value that used to be hard-coded here.

    Returns:
        The number of corrections made before a mistake-free pass.
    """
    if X_train is None or y_train is None:
        X_train, y_train, _ = get_training_set()
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float).reshape(-1, 1)

    # Size the weights from the data instead of assuming five columns.
    dim = X_train.shape[1]
    w = np.zeros((dim, 1))
    num_iters = 0
    while True:
        updated = False
        for x_i, y_i in zip(X_train, y_train):
            # A score of exactly zero counts as a mistake, otherwise w = 0
            # would never be corrected.
            if np.dot(x_i, w) * y_i <= 0:
                # Scale the correction by the learning rate.
                w += alpha * y_i * x_i.reshape(dim, 1)
                updated = True
                num_iters += 1
        if not updated:
            break
    return num_iters


if __name__ == '__main__':
    # Average the number of PLA corrections over repeated runs.
    # The accumulator is not called `sum`, which would hide the builtin.
    num_runs = 2000
    total_updates = 0
    for _ in range(num_runs):
        total_updates += PLA()
    ave_sum = total_updates / num_runs
    print(ave_sum)

 运行结果为

 

18.

import pandas as pd
import numpy as np
import copy as cp


def get_training_set(path='F:\\Kaggle\\hw1_18_train.dat'):
    """Load the training file and return it shuffled, with a bias column.

    The file is parsed as whitespace-separated text with five columns
    (named a, b, c, d, y). The first four columns are used as features and
    the fifth as the label. Rows are reordered by a fresh random permutation
    on every call.

    Args:
        path: Location of the data file. Defaults to the path that used to
            be hard-coded here.

    Returns:
        A tuple ``(X_train, y_train, data_size)`` where ``X_train`` has shape
        ``(data_size, 5)`` with a leading column of ones, ``y_train`` has
        shape ``(data_size, 1)`` and ``data_size`` is the number of rows.
    """
    # Supplying `names` makes pandas treat the first line as data rather than
    # as a header. The separator is a raw string because '\s' is not a valid
    # str escape sequence.
    data = pd.read_csv(path, sep=r'\s+', names=['a', 'b', 'c', 'd', 'y'])
    X_train = np.array(data.iloc[:, 0:4])
    data_size = X_train.shape[0]
    # Prepend the constant column x0 = 1 so the bias is part of w.
    X_train = np.hstack((np.ones((data_size, 1)), X_train))
    y_train = np.array(data.iloc[:, 4:5])
    # Apply one permutation to both arrays so samples and labels stay paired.
    order = np.random.permutation(data_size)
    X_train = X_train[order]
    y_train = y_train[order]

    return X_train, y_train, data_size


def get_test_set(path='F:\\Kaggle\\hw1_18_test.dat'):
    """Load the test file and split it into samples and labels.

    The file is parsed as whitespace-separated text with five columns
    (named a, b, c, d, y). The row order is kept as read.

    Args:
        path: Location of the data file. Defaults to the path that used to
            be hard-coded here.

    Returns:
        A tuple ``(X_val, y_val, test_size)`` where ``X_val`` has shape
        ``(test_size, 5)`` with a leading column of ones, ``y_val`` has shape
        ``(test_size, 1)`` and ``test_size`` is the number of rows.
    """
    # Raw string: '\s' is a regex class, not a valid str escape sequence.
    data = pd.read_csv(path, sep=r'\s+', names=['a', 'b', 'c', 'd', 'y'])
    test_set = np.array(data)
    test_size = test_set.shape[0]
    # The first four columns are features; prepend the constant x0 = 1.
    X_val = test_set[:, 0:4]
    X_val = np.hstack((np.ones((test_size, 1)), X_val))
    # The last column is the label, kept two-dimensional.
    y_val = test_set[:, 4:5]

    return X_val, y_val, test_size


def Pocket(X_train=None, y_train=None, max_updates=50, alpha=0.5):
    """Run the pocket algorithm and return the best weights seen.

    Starting from w = 0, the samples are visited in the order given. Every
    sample with ``y * (w . x) <= 0`` triggers ``w += alpha * y * x``. After
    each correction the mistakes on the whole training set are counted, and
    the weights are kept "in the pocket" if they make strictly fewer mistakes
    than the best seen so far.

    The data is cycled until ``max_updates`` corrections have been made or a
    full pass makes none. A single pass is not enough, because it can end
    before the update budget is used up.

    Args:
        X_train: Optional ``(n, d)`` sample matrix, bias column included.
        y_train: Optional labels, ``n`` values (any shape that reshapes to
            ``(n, 1)``). When either argument is omitted, the data is loaded
            with ``get_training_set()``.
        max_updates: Maximum number of corrections. Defaults to 50.
        alpha: Step size applied to every correction. Defaults to 0.5.

    Returns:
        The ``(d, 1)`` pocket weight vector. It is all zeros if no correction
        ever lowered the mistake count below the number of samples.
    """
    if X_train is None or y_train is None:
        X_train, y_train, _ = get_training_set()
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float).reshape(-1, 1)

    data_size, dim = X_train.shape
    w = np.zeros((dim, 1))
    w_pocket = np.zeros((dim, 1))
    # Fewest training mistakes seen so far. The zero vector misclassifies
    # every sample under the `<= 0` rule, hence the starting value.
    best_errors = data_size
    count = 0

    while count < max_updates:
        corrected = False
        for x_i, y_i in zip(X_train, y_train):
            if np.dot(x_i, w) * y_i <= 0:
                w += alpha * y_i * x_i.reshape(dim, 1)
                count += 1
                corrected = True
                # Count the training mistakes of the new w in one shot.
                errors = int(np.count_nonzero(np.dot(X_train, w) * y_train <= 0))
                if errors < best_errors:
                    best_errors = errors
                    w_pocket = w.copy()
                if count == max_updates:
                    break
        if not corrected:
            # A clean pass: w separates the data, nothing left to fix.
            break

    return w_pocket


def Pocket_Error(w_pocket, X_val=None, y_val=None):
    """Return the fraction of test samples misclassified by ``w_pocket``.

    A sample counts as an error when ``y * (w . x) <= 0``, so a score of
    exactly zero is an error.

    Args:
        w_pocket: Weight vector with one entry per column of ``X_val``.
        X_val: Optional ``(n, d)`` sample matrix, bias column included.
        y_val: Optional labels, ``n`` values. When either argument is
            omitted, the data is loaded with ``get_test_set()``. Passing
            preloaded data avoids re-reading the file on every call.

    Returns:
        The number of errors divided by the number of samples.

    Raises:
        ZeroDivisionError: If the test set has no rows.
    """
    if X_val is None or y_val is None:
        X_val, y_val, _ = get_test_set()
    X_val = np.asarray(X_val, dtype=float)
    # Force column shapes so the product below is element-wise per sample.
    y_val = np.asarray(y_val, dtype=float).reshape(-1, 1)
    weights = np.asarray(w_pocket, dtype=float).reshape(-1, 1)

    test_size = X_val.shape[0]
    error_num = int(np.count_nonzero(np.dot(X_val, weights) * y_val <= 0))
    error_ratio = error_num / test_size

    return error_ratio


if __name__ == '__main__':
    # Train a pocket classifier, score it on the test set, repeat, and
    # report the mean test error rate.
    n_runs = 2000
    total_ratio = 0
    for _ in range(n_runs):
        total_ratio += Pocket_Error(Pocket())
    ave_ratio = total_ratio / n_runs
    print(ave_ratio)

 计算结果如下。注意:这道题我一开始的结果一直不对,排查了一晚上才发现是误用了前面题目的数据(hw1_15_train.dat)来计算本题;本题应使用hw1_18_train.dat和hw1_18_test.dat。

 

19.

import pandas as pd
import numpy as np
import copy as cp


def get_training_set(path='F:\\Kaggle\\hw1_18_train.dat'):
    """Load the training file and return it shuffled, with a bias column.

    The file is parsed as whitespace-separated text with five columns
    (named a, b, c, d, y). The first four columns are used as features and
    the fifth as the label. Rows are reordered by a fresh random permutation
    on every call.

    Args:
        path: Location of the data file. Defaults to the path that used to
            be hard-coded here.

    Returns:
        A tuple ``(X_train, y_train, data_size)`` where ``X_train`` has shape
        ``(data_size, 5)`` with a leading column of ones, ``y_train`` has
        shape ``(data_size, 1)`` and ``data_size`` is the number of rows.
    """
    # Supplying `names` makes pandas treat the first line as data rather than
    # as a header. The separator is a raw string because '\s' is not a valid
    # str escape sequence.
    data = pd.read_csv(path, sep=r'\s+', names=['a', 'b', 'c', 'd', 'y'])
    X_train = np.array(data.iloc[:, 0:4])
    data_size = X_train.shape[0]
    # Prepend the constant column x0 = 1 so the bias is part of w.
    X_train = np.hstack((np.ones((data_size, 1)), X_train))
    y_train = np.array(data.iloc[:, 4:5])
    # Apply one permutation to both arrays so samples and labels stay paired.
    order = np.random.permutation(data_size)
    X_train = X_train[order]
    y_train = y_train[order]

    return X_train, y_train, data_size


def get_test_set(path='F:\\Kaggle\\hw1_18_test.dat'):
    """Load the test file and split it into samples and labels.

    The file is parsed as whitespace-separated text with five columns
    (named a, b, c, d, y). The row order is kept as read.

    Args:
        path: Location of the data file. Defaults to the path that used to
            be hard-coded here.

    Returns:
        A tuple ``(X_val, y_val, test_size)`` where ``X_val`` has shape
        ``(test_size, 5)`` with a leading column of ones, ``y_val`` has shape
        ``(test_size, 1)`` and ``test_size`` is the number of rows.
    """
    # Raw string: '\s' is a regex class, not a valid str escape sequence.
    data = pd.read_csv(path, sep=r'\s+', names=['a', 'b', 'c', 'd', 'y'])
    test_set = np.array(data)
    test_size = test_set.shape[0]
    # The first four columns are features; prepend the constant x0 = 1.
    X_val = test_set[:, 0:4]
    X_val = np.hstack((np.ones((test_size, 1)), X_val))
    # The last column is the label, kept two-dimensional.
    y_val = test_set[:, 4:5]

    return X_val, y_val, test_size


def Pocket(X_train=None, y_train=None, max_updates=50, alpha=0.5):
    """Run a fixed number of perceptron updates and return the last weights.

    Despite the name, no pocket is kept: the weights after the final
    correction are returned, whether or not they are the best seen.

    Starting from w = 0, the samples are visited in the order given. Every
    sample with ``y * (w . x) <= 0`` triggers ``w += alpha * y * x``. The
    data is cycled until ``max_updates`` corrections have been made or a full
    pass makes none. A single pass is not enough, because it can end before
    the update budget is used up.

    Args:
        X_train: Optional ``(n, d)`` sample matrix, bias column included.
        y_train: Optional labels, ``n`` values (any shape that reshapes to
            ``(n, 1)``). When either argument is omitted, the data is loaded
            with ``get_training_set()``.
        max_updates: Maximum number of corrections. Defaults to 50.
        alpha: Step size applied to every correction. Defaults to 0.5.

    Returns:
        The ``(d, 1)`` weight vector after the last correction.
    """
    if X_train is None or y_train is None:
        X_train, y_train, _ = get_training_set()
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float).reshape(-1, 1)

    dim = X_train.shape[1]
    w = np.zeros((dim, 1))
    count = 0

    while count < max_updates:
        corrected = False
        for x_i, y_i in zip(X_train, y_train):
            if np.dot(x_i, w) * y_i <= 0:
                w += alpha * y_i * x_i.reshape(dim, 1)
                count += 1
                corrected = True
                if count == max_updates:
                    break
        if not corrected:
            # A clean pass: w separates the data, nothing left to fix.
            break

    return w


def Pocket_Error(w_pocket, X_val=None, y_val=None):
    """Return the fraction of test samples misclassified by ``w_pocket``.

    A sample counts as an error when ``y * (w . x) <= 0``, so a score of
    exactly zero is an error.

    Args:
        w_pocket: Weight vector with one entry per column of ``X_val``.
        X_val: Optional ``(n, d)`` sample matrix, bias column included.
        y_val: Optional labels, ``n`` values. When either argument is
            omitted, the data is loaded with ``get_test_set()``. Passing
            preloaded data avoids re-reading the file on every call.

    Returns:
        The number of errors divided by the number of samples.

    Raises:
        ZeroDivisionError: If the test set has no rows.
    """
    if X_val is None or y_val is None:
        X_val, y_val, _ = get_test_set()
    X_val = np.asarray(X_val, dtype=float)
    # Force column shapes so the product below is element-wise per sample.
    y_val = np.asarray(y_val, dtype=float).reshape(-1, 1)
    weights = np.asarray(w_pocket, dtype=float).reshape(-1, 1)

    test_size = X_val.shape[0]
    error_num = int(np.count_nonzero(np.dot(X_val, weights) * y_val <= 0))
    error_ratio = error_num / test_size

    return error_ratio


if __name__ == '__main__':
    # Train, score on the test set, repeat, and report the mean test
    # error rate.
    n_runs = 2000
    total_ratio = 0
    for _ in range(n_runs):
        total_ratio += Pocket_Error(Pocket())
    ave_ratio = total_ratio / n_runs
    print(ave_ratio)

运行结果为

 

20.

import pandas as pd
import numpy as np
import copy as cp


def get_training_set(path='F:\\Kaggle\\hw1_18_train.dat'):
    """Load the training file and return it shuffled, with a bias column.

    The file is parsed as whitespace-separated text with five columns
    (named a, b, c, d, y). The first four columns are used as features and
    the fifth as the label. Rows are reordered by a fresh random permutation
    on every call.

    Args:
        path: Location of the data file. Defaults to the path that used to
            be hard-coded here.

    Returns:
        A tuple ``(X_train, y_train, data_size)`` where ``X_train`` has shape
        ``(data_size, 5)`` with a leading column of ones, ``y_train`` has
        shape ``(data_size, 1)`` and ``data_size`` is the number of rows.
    """
    # Supplying `names` makes pandas treat the first line as data rather than
    # as a header. The separator is a raw string because '\s' is not a valid
    # str escape sequence.
    data = pd.read_csv(path, sep=r'\s+', names=['a', 'b', 'c', 'd', 'y'])
    X_train = np.array(data.iloc[:, 0:4])
    data_size = X_train.shape[0]
    # Prepend the constant column x0 = 1 so the bias is part of w.
    X_train = np.hstack((np.ones((data_size, 1)), X_train))
    y_train = np.array(data.iloc[:, 4:5])
    # Apply one permutation to both arrays so samples and labels stay paired.
    order = np.random.permutation(data_size)
    X_train = X_train[order]
    y_train = y_train[order]

    return X_train, y_train, data_size


def get_test_set(path='F:\\Kaggle\\hw1_18_test.dat'):
    """Load the test file and split it into samples and labels.

    The file is parsed as whitespace-separated text with five columns
    (named a, b, c, d, y). The row order is kept as read.

    Args:
        path: Location of the data file. Defaults to the path that used to
            be hard-coded here.

    Returns:
        A tuple ``(X_val, y_val, test_size)`` where ``X_val`` has shape
        ``(test_size, 5)`` with a leading column of ones, ``y_val`` has shape
        ``(test_size, 1)`` and ``test_size`` is the number of rows.
    """
    # Raw string: '\s' is a regex class, not a valid str escape sequence.
    data = pd.read_csv(path, sep=r'\s+', names=['a', 'b', 'c', 'd', 'y'])
    test_set = np.array(data)
    test_size = test_set.shape[0]
    # The first four columns are features; prepend the constant x0 = 1.
    X_val = test_set[:, 0:4]
    X_val = np.hstack((np.ones((test_size, 1)), X_val))
    # The last column is the label, kept two-dimensional.
    y_val = test_set[:, 4:5]

    return X_val, y_val, test_size


def Pocket(X_train=None, y_train=None, max_updates=100, alpha=0.5):
    """Run the pocket algorithm and return the best weights seen.

    Starting from w = 0, the samples are visited in the order given. Every
    sample with ``y * (w . x) <= 0`` triggers ``w += alpha * y * x``. After
    each correction the mistakes on the whole training set are counted, and
    the weights are kept "in the pocket" if they make strictly fewer mistakes
    than the best seen so far.

    The data is cycled until ``max_updates`` corrections have been made or a
    full pass makes none. A single pass is not enough, because it can end
    before the update budget is used up.

    Args:
        X_train: Optional ``(n, d)`` sample matrix, bias column included.
        y_train: Optional labels, ``n`` values (any shape that reshapes to
            ``(n, 1)``). When either argument is omitted, the data is loaded
            with ``get_training_set()``.
        max_updates: Maximum number of corrections. Defaults to 100.
        alpha: Step size applied to every correction. Defaults to 0.5.

    Returns:
        The ``(d, 1)`` pocket weight vector. It is all zeros if no correction
        ever lowered the mistake count below the number of samples.
    """
    if X_train is None or y_train is None:
        X_train, y_train, _ = get_training_set()
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float).reshape(-1, 1)

    data_size, dim = X_train.shape
    w = np.zeros((dim, 1))
    w_pocket = np.zeros((dim, 1))
    # Fewest training mistakes seen so far. The zero vector misclassifies
    # every sample under the `<= 0` rule, hence the starting value.
    best_errors = data_size
    count = 0

    while count < max_updates:
        corrected = False
        for x_i, y_i in zip(X_train, y_train):
            if np.dot(x_i, w) * y_i <= 0:
                w += alpha * y_i * x_i.reshape(dim, 1)
                count += 1
                corrected = True
                # Count the training mistakes of the new w in one shot.
                errors = int(np.count_nonzero(np.dot(X_train, w) * y_train <= 0))
                if errors < best_errors:
                    best_errors = errors
                    w_pocket = w.copy()
                # Stop once the update budget is used up.
                if count == max_updates:
                    break
        if not corrected:
            # A clean pass: w separates the data, nothing left to fix.
            break

    return w_pocket


def Pocket_Error(w_pocket, X_val=None, y_val=None):
    """Return the fraction of test samples misclassified by ``w_pocket``.

    A sample counts as an error when ``y * (w . x) <= 0``, so a score of
    exactly zero is an error.

    Args:
        w_pocket: Weight vector with one entry per column of ``X_val``.
        X_val: Optional ``(n, d)`` sample matrix, bias column included.
        y_val: Optional labels, ``n`` values. When either argument is
            omitted, the data is loaded with ``get_test_set()``. Passing
            preloaded data avoids re-reading the file on every call.

    Returns:
        The number of errors divided by the number of samples.

    Raises:
        ZeroDivisionError: If the test set has no rows.
    """
    if X_val is None or y_val is None:
        X_val, y_val, _ = get_test_set()
    X_val = np.asarray(X_val, dtype=float)
    # Force column shapes so the product below is element-wise per sample.
    y_val = np.asarray(y_val, dtype=float).reshape(-1, 1)
    weights = np.asarray(w_pocket, dtype=float).reshape(-1, 1)

    test_size = X_val.shape[0]
    error_num = int(np.count_nonzero(np.dot(X_val, weights) * y_val <= 0))
    error_ratio = error_num / test_size

    return error_ratio


if __name__ == '__main__':
    # Train a pocket classifier, score it on the test set, repeat, and
    # report the mean test error rate.
    n_runs = 2000
    total_ratio = 0
    for _ in range(n_runs):
        total_ratio += Pocket_Error(Pocket())
    ave_ratio = total_ratio / n_runs
    print(ave_ratio)

与第18题相比,只需把Pocket中count的判断值由50改为100即可,结果为

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值