3.4 选择2个UCI数据集,比较10折交叉验证法和留一法所估计出的对率回归的错误率。
与3.3题类似,不过此处需要对 beta 与 x 的乘积做指数运算,在某些数据集中容易出现 RuntimeWarning: overflow encountered in exp。该溢出并非不可避免:采用数值稳定的 sigmoid 写法(按乘积的正负选用两种等价形式计算)或对特征做缩放即可消除。10折交叉验证法和留一法均可以在划分好数据集的情况下直接调用下面的logit_regession(train_data,test_data,gap=0.1)函数。(我用的是UCI上面的iris数据集,在区分第二和第三类时曾出现上述警告)
# -*- coding: utf-8 -*-
import numpy as np
# 100 samples from the UCI iris dataset. Each row holds four feature values
# followed by a binary label: the first 50 rows carry label 1 and the last
# 50 carry label 0 (presumably iris classes 2 and 3 remapped to {1, 0} —
# TODO confirm against the original UCI file).
iris_0 = [
[7.0,3.2,4.7,1.4,1],[6.4,3.2,4.5,1.5,1],[6.9,3.1,4.9,1.5,1],[5.5,2.3,4.0,1.3,1],
[6.5,2.8,4.6,1.5,1],[5.7,2.8,4.5,1.3,1],[6.3,3.3,4.7,1.6,1],[4.9,2.4,3.3,1.0,1],
[6.6,2.9,4.6,1.3,1],[5.2,2.7,3.9,1.4,1],[5.0,2.0,3.5,1.0,1],[5.9,3.0,4.2,1.5,1],
[6.0,2.2,4.0,1.0,1],[6.1,2.9,4.7,1.4,1],[5.6,2.9,3.6,1.3,1],[6.7,3.1,4.4,1.4,1],
[5.6,3.0,4.5,1.5,1],[5.8,2.7,4.1,1.0,1],[6.2,2.2,4.5,1.5,1],[5.6,2.5,3.9,1.1,1],
[5.9,3.2,4.8,1.8,1],[6.1,2.8,4.0,1.3,1],[6.3,2.5,4.9,1.5,1],[6.1,2.8,4.7,1.2,1],
[6.4,2.9,4.3,1.3,1],[6.6,3.0,4.4,1.4,1],[6.8,2.8,4.8,1.4,1],[6.7,3.0,5.0,1.7,1],
[6.0,2.9,4.5,1.5,1],[5.7,2.6,3.5,1.0,1],[5.5,2.4,3.8,1.1,1],[5.5,2.4,3.7,1.0,1],
[5.8,2.7,3.9,1.2,1],[6.0,2.7,5.1,1.6,1],[5.4,3.0,4.5,1.5,1],[6.0,3.4,4.5,1.6,1],
[6.7,3.1,4.7,1.5,1],[6.3,2.3,4.4,1.3,1],[5.6,3.0,4.1,1.3,1],[5.5,2.5,4.0,1.3,1],
[5.5,2.6,4.4,1.2,1],[6.1,3.0,4.6,1.4,1],[5.8,2.6,4.0,1.2,1],[5.0,2.3,3.3,1.0,1],
[5.6,2.7,4.2,1.3,1],[5.7,3.0,4.2,1.2,1],[5.7,2.9,4.2,1.3,1],[6.2,2.9,4.3,1.3,1],
[5.1,2.5,3.0,1.1,1],[5.7,2.8,4.1,1.3,1],[6.3,3.3,6.0,2.5,0],[5.8,2.7,5.1,1.9,0],
[7.1,3.0,5.9,2.1,0],[6.3,2.9,5.6,1.8,0],[6.5,3.0,5.8,2.2,0],[7.6,3.0,6.6,2.1,0],
[4.9,2.5,4.5,1.7,0],[7.3,2.9,6.3,1.8,0],[6.7,2.5,5.8,1.8,0],[7.2,3.6,6.1,2.5,0],
[6.5,3.2,5.1,2.0,0],[6.4,2.7,5.3,1.9,0],[6.8,3.0,5.5,2.1,0],[5.7,2.5,5.0,2.0,0],
[5.8,2.8,5.1,2.4,0],[6.4,3.2,5.3,2.3,0],[6.5,3.0,5.5,1.8,0],[7.7,3.8,6.7,2.2,0],
[7.7,2.6,6.9,2.3,0],[6.0,2.2,5.0,1.5,0],[6.9,3.2,5.7,2.3,0],[5.6,2.8,4.9,2.0,0],
[7.7,2.8,6.7,2.0,0],[6.3,2.7,4.9,1.8,0],[6.7,3.3,5.7,2.1,0],[7.2,3.2,6.0,1.8,0],
[6.2,2.8,4.8,1.8,0],[6.1,3.0,4.9,1.8,0],[6.4,2.8,5.6,2.1,0],[7.2,3.0,5.8,1.6,0],
[7.4,2.8,6.1,1.9,0],[7.9,3.8,6.4,2.0,0],[6.4,2.8,5.6,2.2,0],[6.3,2.8,5.1,1.5,0],
[6.1,2.6,5.6,1.4,0],[7.7,3.0,6.1,2.3,0],[6.3,3.4,5.6,2.4,0],[6.4,3.1,5.5,1.8,0],
[6.0,3.0,4.8,1.8,0],[6.9,3.1,5.4,2.1,0],[6.7,3.1,5.6,2.4,0],[6.9,3.1,5.1,2.3,0],
[5.8,2.7,5.1,1.9,0],[6.8,3.2,5.9,2.3,0],[6.7,3.3,5.7,2.5,0],[6.7,3.0,5.2,2.3,0],
[6.3,2.5,5.0,1.9,0],[6.5,3.0,5.2,2.0,0],[6.2,3.4,5.4,2.3,0],[5.9,3.0,5.1,1.8,0]]
def logit_regession(train_data, test_data, gap=0.1, max_iter=100):
    """Fit a logistic (logit) regression by Newton's method, then score it.

    Each row of train_data / test_data is [x1, ..., xd, label] with the
    label in {0, 1}.  Features are scaled by 1/5 and augmented with a
    constant 1, so beta is the column vector [w1, ..., wd, b]'.

    Parameters
    ----------
    train_data, test_data : sequence of rows as described above.
    gap : float
        Stop the Newton iteration once ||new_beta - beta|| < gap.
    max_iter : int
        Safety cap on Newton steps (new, backward-compatible default);
        prevents an endless loop when the data is linearly separable and
        beta diverges.

    Returns
    -------
    (beta, counter)
        The fitted column vector (np.matrix of shape (d+1, 1)) and the
        number of misclassified test instances.
    """
    train_data = np.asmatrix(train_data)
    test_data = np.asmatrix(test_data)
    k = test_data.shape[0]   # number of test instances
    m = train_data.shape[0]  # number of training instances
    n = train_data.shape[1]  # number of features plus one (label column)
    beta = np.matrix(np.zeros((n, 1)))  # initial beta: [w1, ..., wd, b]' = 0
    label = train_data[:, -1]           # last column holds the {0,1} labels
    x = train_data.T[0:n - 1, :] / 5    # scale features to tame exp()
    x = np.r_[x, np.ones((1, m))]       # append constant-1 row for the bias b

    def sigmoid(v):
        # Numerically stable logistic function: exp() is only ever called
        # on a non-positive argument, so the "overflow encountered in exp"
        # RuntimeWarning of the naive 1/(1+exp(-v)) cannot occur.
        if v >= 0:
            return 1.0 / (1.0 + np.exp(-v))
        e = np.exp(v)
        return e / (1.0 + e)

    def cal_l(beta):
        # One pass over the training set accumulating the first derivative
        # l'(beta) and the Hessian l''(beta) of the log-likelihood.
        l1, l2 = 0, 0
        for i in range(m):
            v = (beta.T * x[:, i])[0, 0]
            p = sigmoid(v)                       # predicted P(label=1)
            l1 += x[:, i] * (p - label[i])
            l2 += x[:, i] * x[:, i].T * (p * (1.0 - p))
        return l1, l2

    dist = 1     # distance between successive beta estimates
    steps = 0
    while dist >= gap and steps < max_iter:
        # Newton step: beta <- beta - l''(beta)^{-1} * l'(beta).
        # Both derivatives come from a single cal_l pass (the original
        # called it twice per step, doubling the work).
        l1, l2 = cal_l(beta)
        new_beta = beta - l2.I * l1
        dist = np.linalg.norm(new_beta - beta)
        beta = new_beta
        steps += 1

    # Score the test set with the same scaling / bias augmentation.
    y = test_data.T[0:n - 1, :] / 5
    y = np.r_[y, np.ones((1, k))]
    result = []       # predicted P(label=1) for each test instance
    counter = 0       # number of misclassified test instances
    true_label = test_data[:, -1]
    for i in range(k):
        result.append(sigmoid((beta.T * y[:, i])[0, 0]))
        # Misclassified iff prediction and truth sit on opposite sides of 0.5.
        if (result[i] - 0.5) * (true_label[i] - 0.5) < 0:
            counter += 1
    return beta, counter
# 10-fold cross validation: each class contributes 50 rows (rows 0-49 are
# label 1, rows 50-99 are label 0); split each class into 10 chunks of 5,
# pair chunk i of both classes as test fold i and train on the remaining 90.
test_set, train_set = [], []  # test_set[i] corresponds to train_set[i]
for i in range(10):
    item0 = iris_0[i*5:i*5+5]        # fold i of the label-1 class
    item1 = iris_0[50+i*5:55+i*5]    # fold i of the label-0 class
    remain0 = iris_0[:i*5]
    remain1 = iris_0[i*5+5:50+i*5]
    remain2 = iris_0[55+i*5:]
    test_set.append(item0 + item1)
    train_set.append(remain0 + remain1 + remain2)
total_err = 0  # accumulate misclassifications across the 10 folds
for i in range(10):
    beta, err_i = logit_regession(train_set[i], test_set[i], 1)
    print(beta, err_i)
    total_err += err_i
# The exercise asks for the estimated error rate; the original printed only
# per-fold counts and never aggregated them. 100 test instances in total.
print('10-fold CV error rate:', total_err / 100)