3.4 选择2个UCI数据集,比较10折交叉验证法和留一法所估计出的对率回归的错误率。
与3.3题类似,不过此处需要对 beta 与 x 的乘积做指数运算,在某些数据集中容易出现 RuntimeWarning: overflow encountered in exp。该溢出并非不可避免:采用数值稳定的 sigmoid 写法(按乘积的正负选用两种等价形式计算)或对特征做缩放即可消除。10折交叉验证法和留一法均可以在划分好数据集的情况下直接调用下面的logit_regession(train_data,test_data,gap=0.1)函数。(我用的是UCI上面的iris数据集,在区分第二和第三类时曾出现上述警告)
# -*- coding: utf-8 -*-
import numpy as np
# 100 samples from the UCI iris dataset. Each row holds four feature values
# followed by a binary label: the first 50 rows carry label 1 and the last
# 50 carry label 0 (presumably iris classes 2 and 3 remapped to {1, 0} —
# TODO confirm against the original UCI file).
iris_0 = [
[7.0,3.2,4.7,1.4,1],[6.4,3.2,4.5,1.5,1],[6.9,3.1,4.9,1.5,1],[5.5,2.3,4.0,1.3,1],
[6.5,2.8,4.6,1.5,1],[5.7,2.8,4.5,1.3,1],[6.3,3.3,4.7,1.6,1],[4.9,2.4,3.3,1.0,1],
[6.6,2.9,4.6,1.3,1],[5.2,2.7,3.9,1.4,1],[5.0,2.0,3.5,1.0,1],[5.9,3.0,4.2,1.5,1],
[6.0,2.2,4.0,1.0,1],[6.1,2.9,4.7,1.4,1],[5.6,2.9,3.6,1.3,1],[6.7,3.1,4.4,1.4,1],
[5.6,3.0,4.5,1.5,1],[5.8,2.7,4.1,1.0,1],[6.2,2.2,4.5,1.5,1],[5.6,2.5,3.9,1.1,1],
[5.9,3.2,4.8,1.8,1],[6.1,2.8,4.0,1.3,1],[6.3,2.5,4.9,1.5,1],[6.1,2.8,4.7,1.2,1],
[6.4,2.9,4.3,1.3,1],[6.6,3.0,4.4,1.4,1],[6.8,2.8,4.8,1.4,1],[6.7,3.0,5.0,1.7,1],
[6.0,2.9,4.5,1.5,1],[5.7,2.6,3.5,1.0,1],[5.5,2.4,3.8,1.1,1],[5.5,2.4,3.7,1.0,1],
[5.8,2.7,3.9,1.2,1],[6.0,2.7,5.1,1.6,1],[5.4,3.0,4.5,1.5,1],[6.0,3.4,4.5,1.6,1],
[6.7,3.1,4.7,1.5,1],[6.3,2.3,4.4,1.3,1],[5.6,3.0,4.1,1.3,1],[5.5,2.5,4.0,1.3,1],
[5.5,2.6,4.4,1.2,1],[6.1,3.0,4.6,1.4,1],[5.8,2.6,4.0,1.2,1],[5.0,2.3,3.3,1.0,1],
[5.6,2.7,4.2,1.3,1],[5.7,3.0,4.2,1.2,1],[5.7,2.9,4.2,1.3,1],[6.2,2.9,4.3,1.3,1],
[5.1,2.5,3.0,1.1,1],[5.7,2.8,4.1,1.3,1],[6.3,3.3,6.0,2.5,0],[5.8,2.7,5.1,1.9,0],
[7.1,3.0,5.9,2.1,0],[6.3,2.9,5.6,1.8,0],[6.5,3.0,5.8,2.2,0],[7.6,3.0,6.6,2.1,0],
[4.9,2.5,4.5,1.7,0],[7.3,2.9,6.3,1.8,0],[6.7,2.5,5.8,1.8,0],[7.2,3.6,6.1,2.5,0],
[6.5,3.2,5.1,2.0,0],[6.4,2.7,5.3,1.9,0],[6.8,3.0,5.5,2.1,0],[5.7,2.5,5.0,2.0,0],
[5.8,2.8,5.1,2.4,0],[6.4,3.2,5.3,2.3,0],[6.5,3.0,5.5,1.8,0],[7.7,3.8,6.7,2.2,0],
[7.7,2.6,6.9,2.3,0],[6.0,2.2,5.0,1.5,0],[6.9,3.2,5.7,2.3,0],[5.6,2.8,4.9,2.0,0],
[7.7,2.8,6.7,2.0,0],[6.3,2.7,4.9,1.8,0],[6.7,3.3,5.7,2.1,0],[7.2,3.2,6.0,1.8,0],
[6.2,2.8,4.8,1.8,0],[6.1,3.0,4.9,1.8,0],[6.4,2.8,5.6,2.1,0],[7.2,3.0,5.8,1.6,0],
[7.4,2.8,6.1,1.9,0],[7.9,3.8,6.4,2.0,0],[6.4,2.8,5.6,2.2,0],[6.3,2.8,5.1,1.5,0],
[6.1,2.6,5.6,1.4,0],[7.7,3.0,6.1,2.3,0],[6.3,3.4,5.6,2.4,0],[6.4,3.1,5.5,1.8,0],
[6.0,3.0,4.8,1.8,0],[6.9,3.1,5.4,2.1,0],[6.7,3.1,5.6,2.4,0],[6.9,3.1,5.1,2.3,0],
[5.8,2.7,5.1,1.9,0],[6.8,3.2,5.9,2.3,0],[6.7,3.3,5.7,2.5,0],[6.7,3.0,5.2,2.3,0],
[6.3,2.5,5.0,1.9,0],[6.5,3.0,5.2,2.0,0],[6.2,3.4,5.4,2.3,0],[5.9,3.0,5.1,1.8,0]]
def logit_regession(train_data, test_data, gap=0.1, max_iter=100):
    """Fit a logistic (logit) regression by Newton's method, then score it.

    Each row of train_data / test_data is [x1, ..., xd, label] with the
    label in {0, 1}.  Features are scaled by 1/5 and augmented with a
    constant 1, so beta is the column vector [w1, ..., wd, b]'.

    Parameters
    ----------
    train_data, test_data : sequence of rows as described above.
    gap : float
        Stop the Newton iteration once ||new_beta - beta|| < gap.
    max_iter : int
        Safety cap on Newton steps (new, backward-compatible default);
        prevents an endless loop when the data is linearly separable and
        beta diverges.

    Returns
    -------
    (beta, counter)
        The fitted column vector (np.matrix of shape (d+1, 1)) and the
        number of misclassified test instances.
    """
    train_data = np.asmatrix(train_data)
    test_data = np.asmatrix(test_data)
    k = test_data.shape[0]   # number of test instances
    m = train_data.shape[0]  # number of training instances
    n = train_data.shape[1]  # number of features plus one (label column)
    beta = np.matrix(np.zeros((n, 1)))  # initial beta: [w1, ..., wd, b]' = 0
    label = train_data[:, -1]           # last column holds the {0,1} labels
    x = train_data.T[0:n - 1, :] / 5    # scale features to tame exp()
    x = np.r_[x, np.ones((1, m))]       # append constant-1 row for the bias b

    def sigmoid(v):
        # Numerically stable logistic function: exp() is only ever called
        # on a non-positive argument, so the "overflow encountered in exp"
        # RuntimeWarning of the naive 1/(1+exp(-v)) cannot occur.
        if v >= 0:
            return 1.0 / (1.0 + np.exp(-v))
        e = np.exp(v)
        return e / (1.0 + e)

    def cal_l(beta):
        # One pass over the training set accumulating the first derivative
        # l'(beta) and the Hessian l''(beta) of the log-likelihood.
        l1, l2 = 0, 0
        for i in range(m):
            v = (beta.T * x[:, i])[0, 0]
            p = sigmoid(v)                       # predicted P(label=1)
            l1 += x[:, i] * (p - label[i])
            l2 += x[:, i] * x[:, i].T * (p * (1.0 - p))
        return l1, l2

    dist = 1     # distance between successive beta estimates
    steps = 0
    while dist >= gap and steps < max_iter:
        # Newton step: beta <- beta - l''(beta)^{-1} * l'(beta).
        # Both derivatives come from a single cal_l pass (the original
        # called it twice per step, doubling the work).
        l1, l2 = cal_l(beta)
        new_beta = beta - l2.I * l1
        dist = np.linalg.norm(new_beta - beta)
        beta = new_beta
        steps += 1

    # Score the test set with the same scaling / bias augmentation.
    y = test_data.T[0:n - 1, :] / 5
    y = np.r_[y, np.ones((1, k))]
    result = []       # predicted P(label=1) for each test instance
    counter = 0       # number of misclassified test instances
    true_label = test_data[:, -1]
    for i in range(k):
        result.append(sigmoid((beta.T * y[:, i])[0, 0]))
        # Misclassified iff prediction and truth sit on opposite sides of 0.5.
        if (result[i] - 0.5) * (true_label[i] - 0.5) < 0:
            counter += 1
    return beta, counter
# 10-fold cross validation: each class contributes 50 rows (rows 0-49 are
# label 1, rows 50-99 are label 0); split each class into 10 chunks of 5,
# pair chunk i of both classes as test fold i and train on the remaining 90.
test_set, train_set = [], []  # test_set[i] corresponds to train_set[i]
for i in range(10):
    item0 = iris_0[i*5:i*5+5]        # fold i of the label-1 class
    item1 = iris_0[50+i*5:55+i*5]    # fold i of the label-0 class
    remain0 = iris_0[:i*5]
    remain1 = iris_0[i*5+5:50+i*5]
    remain2 = iris_0[55+i*5:]
    test_set.append(item0 + item1)
    train_set.append(remain0 + remain1 + remain2)
total_err = 0  # accumulate misclassifications across the 10 folds
for i in range(10):
    beta, err_i = logit_regession(train_set[i], test_set[i], 1)
    print(beta, err_i)
    total_err += err_i
# The exercise asks for the estimated error rate; the original printed only
# per-fold counts and never aggregated them. 100 test instances in total.
print('10-fold CV error rate:', total_err / 100)