Machine Learning, Exercise 3.4: Personal Notes

3.4 Choose two UCI datasets and compare the error rates of logistic regression as estimated by 10-fold cross-validation and by the leave-one-out method.

This is similar to Exercise 3.3, except that here the product of beta and x is fed through an exponential, which on some datasets easily triggers RuntimeWarning: overflow encountered in exp; in those cases I had to give up and leave it at that. Given a pre-split dataset, both 10-fold cross-validation and leave-one-out can directly call the logit_regession(train_data, test_data, gap=0.1) function below. (I used the iris dataset from UCI, and the error above appeared when separating the second and third classes.)
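
The overflow comes from evaluating np.exp(-v) when v is a large negative number. A numerically stable sigmoid sidesteps it by branching on the sign of the argument; below is a minimal sketch (the stable_sigmoid helper is my own and is not wired into the code that follows). Inside cal_l further down, the same idea could replace the direct 1 / (1 + exp(-v)) evaluation.

import numpy as np

def stable_sigmoid(v):
    # 1 / (1 + exp(-v)) without overflow:
    # for v >= 0, exp(-v) <= 1, so the direct form is safe;
    # for v < 0, rewrite as exp(v) / (1 + exp(v)) so the exponent stays non-positive.
    if v >= 0:
        return 1.0 / (1.0 + np.exp(-v))
    ev = np.exp(v)
    return ev / (1.0 + ev)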

# -*- coding: utf-8 -*-
import numpy as np

iris_0 = [
    [7.0,3.2,4.7,1.4,1],[6.4,3.2,4.5,1.5,1],[6.9,3.1,4.9,1.5,1],[5.5,2.3,4.0,1.3,1],
    [6.5,2.8,4.6,1.5,1],[5.7,2.8,4.5,1.3,1],[6.3,3.3,4.7,1.6,1],[4.9,2.4,3.3,1.0,1],
    [6.6,2.9,4.6,1.3,1],[5.2,2.7,3.9,1.4,1],[5.0,2.0,3.5,1.0,1],[5.9,3.0,4.2,1.5,1],
    [6.0,2.2,4.0,1.0,1],[6.1,2.9,4.7,1.4,1],[5.6,2.9,3.6,1.3,1],[6.7,3.1,4.4,1.4,1],
    [5.6,3.0,4.5,1.5,1],[5.8,2.7,4.1,1.0,1],[6.2,2.2,4.5,1.5,1],[5.6,2.5,3.9,1.1,1],
    [5.9,3.2,4.8,1.8,1],[6.1,2.8,4.0,1.3,1],[6.3,2.5,4.9,1.5,1],[6.1,2.8,4.7,1.2,1],
    [6.4,2.9,4.3,1.3,1],[6.6,3.0,4.4,1.4,1],[6.8,2.8,4.8,1.4,1],[6.7,3.0,5.0,1.7,1],
    [6.0,2.9,4.5,1.5,1],[5.7,2.6,3.5,1.0,1],[5.5,2.4,3.8,1.1,1],[5.5,2.4,3.7,1.0,1],
    [5.8,2.7,3.9,1.2,1],[6.0,2.7,5.1,1.6,1],[5.4,3.0,4.5,1.5,1],[6.0,3.4,4.5,1.6,1],
    [6.7,3.1,4.7,1.5,1],[6.3,2.3,4.4,1.3,1],[5.6,3.0,4.1,1.3,1],[5.5,2.5,4.0,1.3,1],
    [5.5,2.6,4.4,1.2,1],[6.1,3.0,4.6,1.4,1],[5.8,2.6,4.0,1.2,1],[5.0,2.3,3.3,1.0,1],
    [5.6,2.7,4.2,1.3,1],[5.7,3.0,4.2,1.2,1],[5.7,2.9,4.2,1.3,1],[6.2,2.9,4.3,1.3,1],
    [5.1,2.5,3.0,1.1,1],[5.7,2.8,4.1,1.3,1],[6.3,3.3,6.0,2.5,0],[5.8,2.7,5.1,1.9,0],
    [7.1,3.0,5.9,2.1,0],[6.3,2.9,5.6,1.8,0],[6.5,3.0,5.8,2.2,0],[7.6,3.0,6.6,2.1,0],
    [4.9,2.5,4.5,1.7,0],[7.3,2.9,6.3,1.8,0],[6.7,2.5,5.8,1.8,0],[7.2,3.6,6.1,2.5,0],
    [6.5,3.2,5.1,2.0,0],[6.4,2.7,5.3,1.9,0],[6.8,3.0,5.5,2.1,0],[5.7,2.5,5.0,2.0,0],
    [5.8,2.8,5.1,2.4,0],[6.4,3.2,5.3,2.3,0],[6.5,3.0,5.5,1.8,0],[7.7,3.8,6.7,2.2,0],
    [7.7,2.6,6.9,2.3,0],[6.0,2.2,5.0,1.5,0],[6.9,3.2,5.7,2.3,0],[5.6,2.8,4.9,2.0,0],
    [7.7,2.8,6.7,2.0,0],[6.3,2.7,4.9,1.8,0],[6.7,3.3,5.7,2.1,0],[7.2,3.2,6.0,1.8,0],
    [6.2,2.8,4.8,1.8,0],[6.1,3.0,4.9,1.8,0],[6.4,2.8,5.6,2.1,0],[7.2,3.0,5.8,1.6,0],
    [7.4,2.8,6.1,1.9,0],[7.9,3.8,6.4,2.0,0],[6.4,2.8,5.6,2.2,0],[6.3,2.8,5.1,1.5,0],
    [6.1,2.6,5.6,1.4,0],[7.7,3.0,6.1,2.3,0],[6.3,3.4,5.6,2.4,0],[6.4,3.1,5.5,1.8,0],
    [6.0,3.0,4.8,1.8,0],[6.9,3.1,5.4,2.1,0],[6.7,3.1,5.6,2.4,0],[6.9,3.1,5.1,2.3,0],
    [5.8,2.7,5.1,1.9,0],[6.8,3.2,5.9,2.3,0],[6.7,3.3,5.7,2.5,0],[6.7,3.0,5.2,2.3,0],
    [6.3,2.5,5.0,1.9,0],[6.5,3.0,5.2,2.0,0],[6.2,3.4,5.4,2.3,0],[5.9,3.0,5.1,1.8,0]]
# iris classes 2 and 3 (versicolor, virginica), relabelled as 1 and 0 respectively


def logit_regession(train_data, test_data, gap=0.1):
    # use the training set to solve for the column vector beta via Newton's method
    train_data = np.mat(train_data)
    test_data = np.mat(test_data)
    k = test_data.shape[0]   # k is the number of test instances
    m = train_data.shape[0]  # m is the number of training instances
    n = train_data.shape[1]  # n is the number of feature columns plus one (the label column)
    beta = np.matrix(np.zeros((n, 1)))  # initial beta is the column vector [w1, ..., w4, b]'
    label = train_data[:, -1]  # the last column is the label vector; one class is 1, the other 0
    x = train_data.T[0:n-1, :] / 5  # crude feature scaling to keep beta'x small and tame the exponent
    x = np.r_[x, np.ones((1, m))]  # each x[:, i] is the column vector [x1, ..., x4, 1]'
    dist = 1  # distance between new_beta and beta, i.e. the size of the Newton step
    def cal_l(beta, x, label):
        # first and second derivatives of the negative log-likelihood:
        # l1 = sum_i x_i * (p1(x_i; beta) - y_i),  l2 = sum_i x_i x_i' * p1 * (1 - p1)
        l1, l2 = 0, 0
        for i in range(m):
            v = beta.T * x[:, i]
            ev = np.exp(np.longdouble(-v[0, 0]))  # extended precision delays (but does not eliminate) overflow
            p1 = 1 / (1 + ev)
            l1 += x[:, i] * (p1 - label[i])
            l2 += x[:, i] * x[:, i].T * (p1 * (1 - p1))
        return [l1, l2]
    while dist >= gap:
        # gap is the tolerance on delta(beta): stop once the Newton step is small enough
        l1, l2 = cal_l(beta, x, label)
        new_beta = beta - l2.I * l1  # Newton update: beta <- beta - l''(beta)^{-1} * l'(beta)
        dist = np.linalg.norm(new_beta - beta)
        beta = new_beta

    y = test_data.T[0:n-1, :] / 5  # apply the same scaling as the training features
    y = np.r_[y, np.ones((1, k))]  # each y[:, i] is the column vector [x1, ..., x4, 1]'
    result = []  # logistic regression output p1 for each test instance
    counter = 0  # number of prediction errors
    true_label = test_data[:, -1]
    for i in range(k):
        v = (beta.T * y[:, i])[0, 0]
        result.append(1 / (1 + np.exp(np.longdouble(-v))))
        if (result[i] - 0.5) * (true_label[i] - 0.5) < 0:
            # prediction and true label fall on opposite sides of 0.5
            counter += 1
    return beta, counter

# 10-fold cross-validation: split the dataset into ten train/test pairs
# for subset in [iris_2, iris_1, iris_0]:  # leftover outer loop over the other class pairings (iris_1, iris_2 not included in this snippet)
test_set,train_set = [], []  # test_set[i] corresponds to train_set[i].
for i in range(10):
    item0   = iris_0[i*5:i*5+5]
    item1   = iris_0[50+i*5:55+i*5]
    remain0 = iris_0[:i*5]
    remain1 = iris_0[i*5+5:50+i*5]
    remain2 = iris_0[55+i*5:]
    test_set.append(item0+item1)
    train_set.append(remain0+remain1+remain2)
total_err = 0
for i in range(10):
    beta, err = logit_regession(train_set[i], test_set[i], 1)  # gap=1: a looser tolerance than the default
    print(beta, err)
    total_err += err
print('10-fold CV error rate:', total_err / 100)  # 100 test instances over the ten folds
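
For leave-one-out, the same function applies with each single instance held out in turn; a minimal sketch along the same lines as the 10-fold code above:

# leave-one-out: each of the 100 instances serves as the test set once
loo_err = 0
for i in range(len(iris_0)):
    beta, err = logit_regession(iris_0[:i] + iris_0[i+1:], [iris_0[i]], 1)
    loo_err += err
print('LOO error rate:', loo_err / len(iris_0))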

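As an independent cross-check (not part of the notes above, and assuming scikit-learn is installed), scikit-learn can produce both estimates in a few lines; cross_val_score reports accuracy, so the error rate is its complement:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneOut, cross_val_score
import numpy as np

data = np.array(iris_0)
X, y = data[:, :4], data[:, 4]
clf = LogisticRegression(max_iter=1000)
print('10-fold error rate:', 1 - cross_val_score(clf, X, y, cv=10).mean())
print('LOO error rate:', 1 - cross_val_score(clf, X, y, cv=LeaveOneOut()).mean())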