一维直线上有N个点分成两类,hypothesis set是positive or negative的射线,共有2N种不同的分类结果(dichotomy):其中N个点中间的interval任选一个作为射线的起点,总共2(N-1)个,全是o或者x的分类方法有两个。(注意最右边的点x>max(x[i])作为起点,positive ray和negative ray分类的效果相同,所以算作一类,最左边的点x<min(x[i])同理。)
Question 17-18 要求枚举所有的2N种射线(实现时我是将射线起点设在两个相邻的点的中点),找出Ein最小的作为分类器。
Question 19-20 扩展到多维的点,当x[i]维度是d时,选取其中一维d使得Ein最小,只用这一维做出prediction,其他维度的feature忽略。
import numpy as np
import math
import sys
def pro3():
epsilon = 0.05
delta = 0.05
dvc = 10
N = [420000, 440000, 460000, 480000, 500000]
for i in N:
res = 4 * ((2 * i) ** dvc) * math.exp(-0.125 * (epsilon ** 2) * i)
print i, res
def quadratic(a, b, c):
    """Solve a*x^2 + b*x + c = 0 over the reals.

    Returns:
        A 2-tuple ((-b - sqrt(disc))/(2a), (-b + sqrt(disc))/(2a)) when
        a != 0 and the discriminant is non-negative; a single float root
        when a == 0 (linear equation); None when there is no real root
        (negative discriminant) or nothing to solve (a == b == 0).
    """
    disc = b * b - 4 * a * c
    if disc < 0:
        return None
    if a == 0:
        if b == 0:
            # Degenerate: no x term at all -- avoid ZeroDivisionError.
            return None
        # Linear case; force float division (the file targets Python 2,
        # where int / int truncates toward negative infinity).
        return -float(c) / b
    root = math.sqrt(disc)
    m = (-b - root) / (2.0 * a)
    n = (-b + root) / (2.0 * a)
    return m, n
def pro4():
    """Homework problem 4: compare several generalization-error bounds
    (original VC, Rademacher penalty, variant VC, Parrondo/Van den Broek,
    Devroye) at dvc = 50 for a small sample size N.

    The last two bounds are implicit (epsilon appears on both sides), so
    they are rearranged into quadratics in epsilon and solved with
    quadratic(); the printed tuple holds both quadratic roots.
    """
    delta = 0.05
    dvc = 50
    N = 5#10000
    def mh(n):
        # Growth-function surrogate: m_H(n) approximated by a polynomial
        # of degree dvc.
        # NOTE(review): this returns (2*n)^dvc and several callers below
        # additionally pass 2*N -- the factor of 2 may be applied twice;
        # confirm against the intended bound formulas.
        return (2 * n) ** dvc
    # original vc bound
    epsilon = math.sqrt(8.0 / N * math.log(4.0 * mh(2 * N) / delta))
    print epsilon
    # Rademacher Penalty Bound
    epsilon = math.sqrt(2.0 / N * math.log(2 * N * (mh(N)))) + math.sqrt(2.0 / N * math.log(1.0 / delta)) + 1.0 / N
    print epsilon
    # Variant VC bound:
    epsilon = math.sqrt(16.0 / N * math.log(2 * (mh(N)) / math.sqrt(delta)))
    print epsilon
    # ep = 7.04877656418#5.17714785419#0.331308785962 # current min
    # Parrondo and Van den Broek
    # print math.log(6.0 * mh(2 * N) / delta)
    # epsilon = math.sqrt(1.0 / N * (2 * ep + math.log(6.0 * mh(2 * N) / delta)))
    # Implicit bound eps <= sqrt((2*eps + ln(6*m_H(2N)/delta)) / N),
    # rearranged to N*eps^2 - 2*eps - ln(...) = 0 and solved directly.
    print quadratic(N, - 2.0, - math.log(6.0 * mh(2 * N) / delta))
    # print epsilon
    # Devroye
    # epsilon = math.sqrt(0.5 / N * (4 * ep *(1 + ep) + math.log(4) + dvc * math.log(2 * N ** 2) - math.log(delta))) #math.log(4 * N ** (2 * dvc) / delta)))
    # Devroye bound rearranged the same way into a quadratic in epsilon.
    print quadratic(2 * N - 4, - 4, - math.log(4) - dvc * math.log(2 * N ** 2) + math.log(delta))
# pro 16, 17
def h(x, theta, s):
    """Decision stump: predict s on the side of x above theta, -s below.

    Note that np.sign(0) is 0, so a point exactly at theta gets
    prediction 0 regardless of s.
    """
    return s * np.sign(x - theta)
def decison_stump():
min_s = 0
min_theta = 0
N = 20
ave_Ein = []
ave_Eout = []
repeat = 5000
np.random.seed(1234)
noise_prob = 0.2
for rep in range(0, repeat):
train_x = np.random.uniform(-1, 1, N)
train_y = np.zeros(train_x.shape)
train_xy = []
min_ein = 1.0
for i in range(0, len(train_x)):
if train_x[i] > 0:
sgn = 1
else:
sgn = -1
if np.random.random() < noise_prob:
# print "noise", train_x[i]
train_y[i] = - sgn
else:
train_y[i] = sgn
train_xy.append((train_x[i], train_y[i]))
train_xy.sort(key=lambda z: z[0])
x = np.array([train_xy[i][0] for i in range(0, len(train_xy))])
y = np.array([train_xy[i][1] for i in range(0, len(train_xy))])
# print train_xy
for s in [-1, 1]:
for i in range(0, len(train_xy)):
if i == len(train_xy) - 1:
theta = -2 # all is -1 when s = -1, all is +1 when s = 1
else:
theta = (train_xy[i][0] + train_xy[i + 1][0]) / 2.0
predicted_y = h(x, theta, s)
diff = [predicted_y[i] != y[i] for i in range(0, len(y))]
error = 1.0 * np.count_nonzero(diff) / N
# print error, min_ein
if error < min_ein:
min_ein = error
min_s = s
min_theta = theta
# all is -1 or all is +1
ave_Ein.append(min_ein)
eout = 0.5 + 0.3 * min_s * (np.abs(min_theta) - 1)
ave_Eout.append(eout)
print "ave Ein", np.average(ave_Ein)
print "ave Eout", np.average(ave_Eout)
# 0.1729
# 0.259461300445
# pro 19
def load_data(filename):
    """Read a whitespace-separated data file.

    Each non-empty line holds the feature values followed by the label
    as the last field.

    Returns:
        (x, y) where x is a list of per-sample feature lists (floats)
        and y is the list of float labels, or None when the file cannot
        be opened (preserving the original best-effort behavior).
    """
    x = []
    y = []
    try:
        fin = open(filename, 'r')
    except IOError:
        return None
    with fin:
        for data in fin:
            # split() with no argument swallows \r\n endings and runs of
            # spaces/tabs; the old split(" ") crashed on double spaces
            # via float('').
            items = data.split()
            if not items:
                continue  # skip blank lines
            y.append(float(items[-1]))
            x.append([float(v) for v in items[:-1]])
    return x, y
# def multi_dim_h(x, theta, s):
# return np.sign(x - theta) * s
def multi_dim_decison_stump():
    """Homework problems 19-20: multi-dimensional decision stump.

    For each feature dimension d, learn the best 1-D stump
    h(x) = s * sign(x[d] - theta) on the training data; then keep only
    the dimension with the smallest Ein and evaluate that single stump
    on the test set.  Ties in Ein keep the first (s, theta) found, so
    the result depends on the iteration order below.
    """
    min_s = 0      # best sign found so far within the current dimension
    min_theta = 0  # matching threshold
    train_file = "hw2_train.dat"
    test_file = "hw2_test.dat"
    train_x, train_y = load_data(train_file)
    test_x, test_y = load_data(test_file)
    dim = len(train_x[0])
    np.random.seed(1234)
    final_s = np.zeros(dim)      # best s per dimension
    final_theta = np.zeros(dim)  # best theta per dimension
    final_ein = np.zeros(dim) # E in in each dimension
    for d in range(0, dim): # try each dimension
        train_xy = []
        min_ein = 1.0 # best of the best
        for i in range(0, len(train_x)):
            # Project the data onto dimension d only.
            train_xy.append((train_x[i][d], train_y[i]))
        train_xy.sort(key=lambda z: z[0])
        x = np.array([train_xy[i][0] for i in range(0, len(train_xy))])
        y = np.array([train_xy[i][1] for i in range(0, len(train_xy))])
        # print train_xy
        for s in [-1, 1]:
            for i in range(0, len(train_xy)):
                if i == len(train_xy) - 1:
                    # NOTE(review): -100 is assumed to lie below every
                    # feature value so this acts as the constant
                    # hypothesis -- confirm against the data files.
                    theta = -100 # all is -1 when s = -1, all is +1 when s = 1
                else:
                    # Candidate thresholds: midpoints of adjacent samples.
                    theta = (train_xy[i][0] + train_xy[i + 1][0]) / 2.0
                predicted_y = h(x, theta, s)
                diff = [predicted_y[i] != y[i] for i in range(0, len(y))]
                error = 1.0 * np.count_nonzero(diff) / len(train_x)
                # print error, min_ein
                if error < min_ein:
                    min_ein = error
                    min_s = s
                    min_theta = theta
        # all is -1 or all is +1
        final_s[d] = min_s
        final_theta[d] = min_theta
        final_ein[d] = min_ein
    # choose one dimension only as the input to predict
    idx = np.argmin(final_ein)
    print "best Ein", np.average(final_ein[idx])
    x = np.array([test_x[i][idx] for i in range(0, len(test_x))])
    y = np.array([test_y[i] for i in range(0, len(test_y))])
    predicted_y = h(x, final_theta[idx], final_s[idx])
    diff = [predicted_y[i] != y[i] for i in range(0, len(y))]
    error = 1.0 * np.count_nonzero(diff) / len(test_x)
    print "best Eout", np.average(error)
if __name__ == '__main__':
    # Entry point: uncomment the problem to run; by default only the
    # multi-dimensional stump (problems 19-20) executes.
    # pro4()
    # pro3()
    # decison_stump()
    multi_dim_decison_stump()