Machine Learning Techniques Homework 3
1 Computing the Gini index. The two class fractions sum to 1, so substitute one for the other and find the extremum. The result is 0.5.
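Concretely, with positive-class fraction $\mu$ the binary Gini impurity is
$1-\mu^{2}-(1-\mu)^{2}=2\mu(1-\mu)\le\dfrac{1}{2}$, with the maximum attained at $\mu=\dfrac{1}{2}$.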
2 Continue with the same substitution; the first term turns out to have the same shape as the original Gini index.
3 $\lim_{N\rightarrow \infty}(1-1/N)^{pN}=e^{-p}$
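A quick numeric check of the limit (a sketch; p = 0.4 is an arbitrary choice):

import numpy as np

# check that (1 - 1/N)^(pN) approaches e^(-p) as N grows; p = 0.4 is arbitrary
p = 0.4
for N in (10, 100, 10000, 1000000):
    print(N, (1 - 1 / N) ** (p * N), np.exp(-p))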
4 In the best case the error sets of the three g's do not overlap, so every example is misclassified by at most one g, the majority vote overrules it, and $E_{out}(G)$ is 0. The worst case makes full use of every erroneous example: every error of each g contributes to an error of G. A Venn diagram of the error sets makes this clear: at worst, $g_{3}$'s error set contains 0.2 that coincides with $g_{2}$'s errors and 0.1 that coincides with $g_{1}$'s errors, so the upper bound is 0.3. The answer is [0, 0.3]; the options seem to have a mistake, and it should be the last one.
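A counting argument gives the same upper bound: G is a majority vote of three hypotheses, so G errs only where at least two of the g's err, and every unit of G's error mass is counted at least twice in the sum of the individual errors:
$E_{out}(G)\le\dfrac{E_{out}(g_{1})+E_{out}(g_{2})+E_{out}(g_{3})}{2}=\dfrac{0.1+0.2+0.3}{2}=0.3$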
5 This is the general form of the previous problem. It asks for an upper bound on $E_{out}(G)$, i.e. something that may be larger but can never be smaller than the truth. A single counterexample rules out all options except the first: take three functions g, each with error rate 0.3, arranged so that every erroneous example is misclassified by exactly two of the g's; then G's overall error rate is 0.45, which eliminates the other options and leaves the first. Again a Venn diagram shows it (I don't know the rigorous derivation, though): draw two circles for the error sets of $g_{1}$ and $g_{2}$, overlapping in a region of mass 0.15; the two non-overlapping parts together represent $g_{3}$'s error set, also of mass 0.3. Then $G$'s error rate is 0.45.
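A quick script makes the construction concrete (a sketch; the region names and masses are the ones described above):

# assign probability mass to the regions where exactly two of the three g's err
mass = {"g1&g2": 0.15, "g1&g3": 0.15, "g2&g3": 0.15, "none": 0.55}
errs = {g: sum(m for r, m in mass.items() if g in r) for g in ("g1", "g2", "g3")}
print(errs)  # every g has error rate 0.3
# the majority vote G errs wherever at least two g's err:
print(sum(m for r, m in mass.items() if r.count("g") >= 2))  # 0.45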
6 An attempted derivation. Let the current sum of example weights be $U_{t}$, and suppose the misclassified examples account for weight $x$, so the correctly classified ones account for $U_{t}-x$. After the next update the two parts should be equal, i.e. $x\cdot\sqrt{\dfrac{1-\epsilon_{t}}{\epsilon_{t}}}=(U_{t}-x)\cdot\sqrt{\dfrac{\epsilon_{t}}{1-\epsilon_{t}}}$, which solves to $x=\epsilon_{t}U_{t}$, and therefore $U_{t+1}=2U_{t}\sqrt{\epsilon_{t}(1-\epsilon_{t})}$, so the answer is the last option.
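The update can be sanity-checked numerically (a sketch with made-up weights and an arbitrary fixed set of "misclassified" examples):

import numpy as np

# wrong examples are scaled by sqrt((1-eps)/eps), right ones divided by it
rng = np.random.default_rng(1)
u = rng.uniform(0.1, 1.0, 20)          # current example weights; their sum is U_t
wrong = np.arange(20) % 3 == 0         # pretend every third example is misclassified
U_t = u.sum()
eps = u[wrong].sum() / U_t             # weighted error rate of this round
d = np.sqrt((1 - eps) / eps)           # the AdaBoost scaling factor
u_next = np.where(wrong, u * d, u / d)
print(u_next.sum(), 2 * U_t * np.sqrt(eps * (1 - eps)))   # these two match
print(u_next[wrong].sum(), u_next[~wrong].sum())          # and the two halves are equal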
7 In GBDT every update makes $s_{n}$ a regression that is closer to the sample labels, so the result is the last option.
8 Following the previous result, substituting $s_{n}$ gives the second option.
9 Substituting directly shows the result is the first option.
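For intuition about why each round pulls $s_{n}$ toward the labels, here is a minimal least-squares gradient-boosting sketch; the 1-D data and the regression-stump base learner are assumptions of mine, not the homework's exact setup:

import numpy as np

rng = np.random.default_rng(0)
x = np.sort(rng.uniform(-1, 1, 50))
y = np.sin(np.pi * x)

def fit_stump(x, y):
    best = None
    for theta in (x[1:] + x[:-1]) / 2:       # candidate split points
        left, right = y[x < theta], y[x >= theta]
        pred = np.where(x < theta, left.mean(), right.mean())
        err = np.sum((y - pred) ** 2)
        if best is None or err < best[0]:
            best = (err, theta, left.mean(), right.mean())
    _, theta, lv, rv = best
    return lambda z: np.where(z < theta, lv, rv)

s = np.zeros_like(y)
for t in range(5):
    residual = y - s                          # negative gradient of squared error
    g = fit_stump(x, residual)
    gx = g(x)
    alpha = (residual @ gx) / (gx @ gx)       # least-squares optimal step size
    s = s + alpha * gx
    print(t, np.mean((y - s) ** 2))           # training error keeps shrinking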
10 Write the 5-input XOR as the OR of five AND results, so D is at least 5 (this argument may not be fully rigorous).
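The count can at least be checked constructively: the brute-force script below verifies one concrete 5-hidden-unit network for the 5-input XOR (this particular construction, with thresholds $\theta_{k}$ and alternating output weights, is mine, not necessarily the intended OR-of-ANDs decomposition):

import itertools
import numpy as np

# hidden unit k outputs +1 iff at least k of the inputs are +1, so the hidden
# layer is a unary code of the number of +1 inputs; an alternating-sign vote
# of that code is exactly the parity (XOR) of the inputs.
thetas = np.array([2 * k - 6 for k in range(1, 6)])   # -4, -2, 0, 2, 4
w_out = np.array([1, -1, 1, -1, 1])

for bits in itertools.product([-1, 1], repeat=5):
    x = np.array(bits)
    h = np.sign(x.sum() - thetas)
    out = np.sign(w_out @ h)
    parity = 1 if (x == 1).sum() % 2 == 1 else -1
    assert out == parity
print('5 hidden units realize the 5-input XOR')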
11 Choose the last option, because computing a gradient requires multiplying by the value of the corresponding neuron in the next layer; apart from the one named in the last option, all the gradients are 0.
12 Choose the second option, because every part of the corresponding gradient is equal: the input x is the same, and the corresponding $\delta_{j}^{l}$ are also equal, since all the weights are 1.
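A toy numeric-gradient check is consistent with the reasoning in 11; the 2-2-1 architecture, bias units, tanh hidden activation, and squared error here are assumptions of mine:

import numpy as np

# with all weights 0 the tanh hidden outputs are 0, so every partial
# derivative vanishes except the output weight attached to the bias unit
x, y = np.array([1.0, 0.5, -0.3]), 1.0   # x[0] = 1 is the bias input
W1 = np.zeros((2, 3))                    # input -> hidden weights
w2 = np.zeros(3)                         # (bias + 2 hidden) -> output weights

def loss():
    h = np.concatenate(([1.0], np.tanh(W1 @ x)))   # hidden layer with bias
    return (y - w2 @ h) ** 2

eps = 1e-6
for W, name in ((W1, 'W1'), (w2, 'w2')):
    for i in np.ndindex(W.shape):        # central-difference numeric gradient
        W[i] += eps; up = loss()
        W[i] -= 2 * eps; down = loss()
        W[i] += eps                      # restore the weight
        print(name, i, (up - down) / (2 * eps))   # only w2[0] is nonzero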
13 Implement a CART; the code is at the end. Recording every branching as it happens, the result is 10 branchings.
14 0.0
15 At first I got a result around 0.95; later I saw that I had ignored the requirement on $\theta$ and needed to take the midpoint of each segment. With that fix the result is around 0.13, so choose 0.15.
16 0.05
17 0.0; the options seem a bit off.
18 around 0.75
19 around 0.11
20 around 0.15
The code is as follows:
import numpy as np
import random

def getData(file):
    with open(file) as f:
        data = f.readlines()
    xs = []
    ys = []
    for line in data:
        d = line.split()
        x = np.array([float(d[0]), float(d[1])])
        y = float(d[2])
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)
class CART:
    def __init__(self, x, y, prune=False):
        self.x = x
        self.y = y
        # print('build cart...')
        self.branches = self.buildTree(x, y, prune)

    def sign(self, v):
        if v < 0:
            return -1
        else:
            return 1

    def hFunc(self, x, s, theta):
        if s:
            return self.sign(x - theta)
        else:
            return -self.sign(x - theta)

    def buildTree(self, x, y, prune=False):
        branches = []
        if prune:  # pruned: only one layer of branching (a decision stump)
            branch, left, right = self.branchStump(x, y)
            branches.append(branch)
            if sum(left[1]) >= 0:
                branches.append([1])
            else:
                branches.append([-1])
            if sum(right[1]) >= 0:
                branches.append([1])
            else:
                branches.append([-1])
            return branches
        else:  # fully grown
            if abs(sum(y)) == len(y):  # all labels identical: make a leaf
                branches.append(y[0])
                return branches
            else:
                branch, left, right = self.branchStump(x, y)
                branches.append(branch)
                branches.append(self.buildTree(left[0], left[1]))
                branches.append(self.buildTree(right[0], right[1]))
                return branches
    def branchStump(self, x, y):
        # print('13. branch tree once...')
        dimensions = len(x[0])
        best_s = True
        best_theta = 0
        best_dim = 0
        best_gini = 100
        for dim in range(dimensions):
            thetas = np.sort(x[:, dim])  # sorted values of x serve as candidate thetas
            ss = [True, False]  # try both orientations of the stump
            for i, theta in enumerate(thetas):
                if i > 0:
                    theta = (theta + thetas[i - 1]) / 2  # take the midpoint of each segment as theta
                else:
                    theta = theta / 2
                for s in ss:
                    gini = self.computeDivideGini(x, y, s, theta, dim)
                    if gini < best_gini:
                        best_gini = gini
                        best_s = s
                        best_theta = theta
                        best_dim = dim
        branch = [best_s, best_theta, best_dim]
        left, right = self.divideData(x, y, best_s, best_theta, best_dim)
        return branch, left, right
    def divideData(self, x, y, s, theta, dim):
        left_x = []
        left_y = []
        right_x = []
        right_y = []
        for i in range(len(y)):
            if self.hFunc(x[i][dim], s, theta) == -1:
                left_x.append(x[i])
                left_y.append(y[i])
            else:
                right_x.append(x[i])
                right_y.append(y[i])
        # print('left data:', len(left_y), 'right data:', len(right_y))
        return [np.array(left_x), np.array(left_y)], [np.array(right_x), np.array(right_y)]

    def computeDivideGini(self, x, y, s, theta, dim):
        left, right = self.divideData(x, y, s, theta, dim)
        left_gini = self.computeGini(left[0], left[1])
        right_gini = self.computeGini(right[0], right[1])
        return len(left[1]) * left_gini + len(right[1]) * right_gini

    def computeGini(self, x, y):
        sums = sum(y)
        lens = len(y)
        if lens == 0:
            return 0
        pos_num = (sums + lens) / 2  # labels are +/-1, so recover the class counts
        neg_num = lens - pos_num
        return 1 - (pos_num / lens) ** 2 - (neg_num / lens) ** 2
    def fit(self, x, branch):
        if len(branch) == 3:  # internal node: [split, left subtree, right subtree]
            dim = branch[0][2]
            y = self.hFunc(x[dim], branch[0][0], branch[0][1])
            if y == -1:
                return self.fit(x, branch[1])
            else:
                return self.fit(x, branch[2])
        else:  # leaf node: [label]
            return branch[0]

    def predict(self, x):
        res = []
        for i in range(len(x)):
            res.append(self.fit(x[i], self.branches))
        return np.array(res)
class RandomForest(object):
    """Bagging of fully grown (or pruned) CART trees."""
    def __init__(self, x, y, T, prune=False):
        self.x = x
        self.y = y
        self.T = T
        self.trees = self.buildRF(x, y, T, prune)

    def bootstrap(self, x, y, N):
        # sample N examples with replacement
        indexs = [random.randint(0, N - 1) for _ in range(N)]
        return x[indexs], y[indexs]

    def buildRF(self, x, y, T, prune):
        trees = []
        for i in range(T):
            tx, ty = self.bootstrap(x, y, len(y))
            trees.append(CART(tx, ty, prune))
        return trees

    def fit(self, x):
        res = []
        for i in range(self.T):
            res.append(self.trees[i].fit(x, self.trees[i].branches))
        if sum(res) >= 0:  # uniform vote over the T trees
            return 1
        else:
            return -1

    def predict(self, x):
        res = []
        for i in range(len(x)):
            res.append(self.fit(x[i]))
        return np.array(res)
if __name__ == '__main__':
    print('load data...')
    train_x, train_y = getData('hw3_train.dat')
    test_x, test_y = getData('hw3_test.dat')

    cart = CART(train_x, train_y)
    train_predict = cart.predict(train_x)
    E_in = 1 - sum(train_predict == train_y) / len(train_y)
    print('14. E_in:', E_in)
    predicted = cart.predict(test_x)
    print('15. E_out:', 1 - sum(predicted == test_y) / len(test_y))

    times = 10
    E_in = 0
    E_out = 0
    E_g_in = 0
    T = 300
    for i in range(times):
        rf = RandomForest(train_x, train_y, T)
        for tree in rf.trees:
            train_predict = tree.predict(train_x)
            g_in = 1 - sum(train_predict == train_y) / len(train_y)
            E_g_in += g_in
        train_predict = rf.predict(train_x)
        e_in = 1 - sum(train_predict == train_y) / len(train_y)
        E_in += e_in
        predicted = rf.predict(test_x)
        e_out = 1 - sum(predicted == test_y) / len(test_y)
        E_out += e_out
        print('random forest:', i, 'e_in:', e_in, 'e_out:', e_out)
    print('16. E_g_in:', E_g_in / T / times)
    print('17. E_in:', E_in / times)
    print('18. E_out:', E_out / times)

    E_in = 0
    E_out = 0
    for i in range(times):
        rf = RandomForest(train_x, train_y, T, True)
        train_predict = rf.predict(train_x)
        e_in = 1 - sum(train_predict == train_y) / len(train_y)
        E_in += e_in
        predicted = rf.predict(test_x)
        e_out = 1 - sum(predicted == test_y) / len(test_y)
        E_out += e_out
        print('pruned random forest:', i, 'e_in:', e_in, 'e_out:', e_out)
    print('19. E_in:', E_in / times)
    print('20. E_out:', E_out / times)