机器学习基石作业一(NTU-林轩田) Python代码部分
Q15
这题是写PLA演算法
import numpy as np
import pandas as pd
def PLA(data):
    """Train a perceptron with the naive cyclic perceptron learning algorithm.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of shape (n, m). The first ``m - 1`` columns are the inputs
        (callers in this file prepend a constant column of ones) and the last
        column is the label.

    Returns
    -------
    list
        ``[w, counts]``: ``w`` is the ``(m - 1, 1)`` weight column vector and
        ``counts`` is the number of weight updates that were made.

    Notes
    -----
    The function returns only after a complete pass over the rows in which no
    row was misclassified, so it keeps looping for as long as every pass
    contains at least one mistake.
    """
    n, m = data.shape
    w = np.zeros([m - 1, 1])
    counts = 0
    while True:
        # Number of rows classified correctly during the current pass.
        k = 0
        for i in range(n):
            x = data[i, :m - 1]
            target = data[i, m - 1]
            score = x.dot(w).item()
            # A score of exactly zero counts as a mistake only for a +1
            # label; otherwise a mistake is a score whose sign disagrees
            # with the label.
            if (score == 0 and target == 1) or score * target < 0:
                w += target * x.reshape(m - 1, 1)
                counts += 1
            else:
                k += 1
        if k == n:
            return [w, counts]
调用函数、读取数据和结果部分:
# Labels for the four feature columns and the label column (kept for
# reference; they are not passed to read_csv).
colname = ['x1','x2','x3','x4','target']
# The separator is a raw-string regex so that "\s" is not parsed as an
# (invalid) string escape; "+" lets any run of whitespace act as one separator.
PLA_data = pd.read_csv(
    '/Downloads/ntumlone_hw1_hw1_15_train.dat.txt',
    header=None,
    sep=r'\s+',
    engine='python',
)
# Prepend the constant column of ones; its length follows the loaded data
# instead of a hard-coded row count. The trailing [1] selects the update count.
PLA(np.column_stack((np.ones([len(PLA_data), 1]), np.array(PLA_data))))[1]
45次
Q16
在15题的基础上增加了随机的访问顺序(每次实验开始前先随机打乱样本的遍历顺序),之后重复2000次实验取平均值,可以近似为期望。
代码:
函数
import random
def random_PLA(data):
    """Run the perceptron learning algorithm over a random, fixed visiting order.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of shape (n, m). The first ``m - 1`` columns are the inputs
        and the last column is the label.

    Returns
    -------
    list
        ``[w, counts]``: ``w`` is the ``(m - 1, 1)`` weight column vector and
        ``counts`` is the number of weight updates that were made.

    Notes
    -----
    The row order is shuffled once with the ``random`` module and the same
    order is reused for every pass. The function returns only after a complete
    pass with no misclassified row.
    """
    n, m = data.shape
    w = np.zeros([m - 1, 1])
    counts = 0
    # One random permutation of the row indices, reused on every pass.
    ran = random.sample(range(n), n)
    while True:
        # Number of rows classified correctly during the current pass.
        k = 0
        for i in ran:
            x = data[i, :m - 1]
            target = data[i, m - 1]
            score = x.dot(w).item()
            # A zero score is a mistake only for a +1 label; otherwise a
            # mistake is a score whose sign disagrees with the label.
            if (score == 0 and target == 1) or score * target < 0:
                w += target * x.reshape(m - 1, 1)
                counts += 1
            else:
                k += 1
        if k == n:
            return [w, counts]
调用:
# The bias-augmented training matrix is the same for every run, so build it
# once; the number of ones follows the loaded data instead of a fixed 400.
PLA_train = np.column_stack((np.ones([len(PLA_data), 1]), np.array(PLA_data)))
mean = 0
for _ in range(2000):
    # Element [1] of the result is the number of updates made in this run.
    mean = random_PLA(PLA_train)[1] + mean
mean = mean/2000
mean
结果为40.1395 次
Q17
在16题的基础上再加了一个学习率,也就是增加了一个超参数。
代码
函数
import random
def rate_PLA(data, rate=0.5):
    """Run the randomly ordered perceptron learning algorithm with a step size.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of shape (n, m). The first ``m - 1`` columns are the inputs
        and the last column is the label.
    rate : float, optional
        Factor applied to every weight update. Defaults to 0.5.

    Returns
    -------
    list
        ``[w, counts]``: ``w`` is the ``(m - 1, 1)`` weight column vector and
        ``counts`` is the number of weight updates that were made.

    Notes
    -----
    The row order is shuffled once with the ``random`` module and the same
    order is reused for every pass. The function returns only after a complete
    pass with no misclassified row.
    """
    n, m = data.shape
    w = np.zeros([m - 1, 1])
    counts = 0
    # One random permutation of the row indices, reused on every pass.
    ran = random.sample(range(n), n)
    while True:
        # Number of rows classified correctly during the current pass.
        k = 0
        for i in ran:
            x = data[i, :m - 1]
            target = data[i, m - 1]
            score = x.dot(w).item()
            # A zero score is a mistake only for a +1 label; otherwise a
            # mistake is a score whose sign disagrees with the label.
            if (score == 0 and target == 1) or score * target < 0:
                w += rate * target * x.reshape(m - 1, 1)
                counts += 1
            else:
                k += 1
        if k == n:
            return [w, counts]
调用与运行结果:
# The bias-augmented training matrix is the same for every run, so build it
# once; the number of ones follows the loaded data instead of a fixed 400.
PLA_train = np.column_stack((np.ones([len(PLA_data), 1]), np.array(PLA_data)))
mean = 0
for _ in range(2000):
    # Element [1] of the result is the number of updates made in this run.
    mean = rate_PLA(PLA_train)[1] + mean
mean = mean/2000
mean
结果为40.239
Q18
接下来是对于同一个模型使用另外的算法,这个算法算是补充了前一个算法要求数据必须是线性可分的缺点。
代码
函数
import numpy
def acc_rate(data,w):
    """Return the fraction of rows of ``data`` that ``w`` classifies correctly.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of shape (n, m). The first ``m - 1`` columns are the inputs
        and the last column is the label.
    w : numpy.ndarray
        Weight column vector of shape ``(m - 1, 1)``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1,)`` holding the accuracy. A row counts as correct
        only when ``score * label`` is strictly positive, so a score of zero
        is always counted as wrong.
    """
    n, m = data.shape
    target = data[:, m - 1].reshape(n, 1)
    x = data[:, 0:m - 1]
    correct = (x.dot(w) * target) > 0
    # Sum down the rows in NumPy rather than with the built-in sum(), which
    # would loop over the n rows in Python; the result keeps shape (1,).
    return correct.sum(axis=0) / n
def Pocket_PLA(data, max_updates=50):
    """Run the pocket variant of the perceptron learning algorithm.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of shape (n, m). The first ``m - 1`` columns are the inputs
        and the last column is the label.
    max_updates : int, optional
        Number of weight updates after which the search stops.

    Returns
    -------
    list
        ``[w_best, acc]``: the weight vector with the highest ``acc_rate`` on
        ``data`` seen so far and that accuracy.

    Notes
    -----
    The default cap is 50 because the write-up for the final question states
    that it only changes the number of updates from 50 to 100; the listing
    for this question had the cap hard-coded as 100, making it identical to
    the final question's code.
    """
    step = 0
    n, m = data.shape
    w = np.zeros([m - 1, 1])
    w_best = w
    # One random permutation of the row indices, reused on every pass.
    ran = random.sample(range(n), n)
    acc = acc_rate(data, w)
    while True:
        updated = False
        for i in ran:
            x = data[i, :m - 1]
            target = data[i, m - 1]
            if x.dot(w).item() * target <= 0:
                step += 1
                updated = True
                # Build a new array so that w_best never aliases a vector
                # that is modified later.
                w = w + target * x.reshape(m - 1, 1)
                # Evaluate the new weights once and reuse the value.
                current = acc_rate(data, w)
                if current > acc:
                    w_best = w
                    acc = current
                if step >= max_updates or current == 1:
                    return [w_best, acc]
        if not updated:
            # A whole pass without any update cannot change anything on the
            # next pass either (e.g. no rows); stop instead of looping forever.
            return [w_best, acc]
第一个函数用于计算正确率(用 1 减去它即为错误率),第二个就是pocket演算法。
数据读取与函数调用和结果
import pandas
# Labels for the four feature columns and the label column (kept for
# reference; they are not passed to read_csv).
colname = ['x1','x2','x3','x4','target']
# The separator is a raw-string regex so that "\s" is not parsed as an
# (invalid) string escape; "+" lets any run of whitespace act as one separator.
PLA_data = pd.read_csv(
    '/Users/zhangdi/Downloads/ntumlone_hw1_hw1_18_train.dat.txt',
    header=None,
    sep=r'\s+',
    engine='python',
)
mean = 0
PLA_test = pd.read_csv(
    '/Users/zhangdi/Downloads/ntumlone_hw1_hw1_18_test.dat.txt',
    header=None,
    sep=r'\s+',
    engine='python',
)
# Both bias-augmented matrices are identical for every run, so build them
# once; the number of ones follows the loaded data instead of a fixed 500.
pocket_train = np.column_stack((np.ones([len(PLA_data), 1]), np.array(PLA_data)))
pocket_test = np.column_stack((np.ones([len(PLA_test), 1]), np.array(PLA_test)))
for _ in range(2000):
    # Train on the training set, then score the returned weights on the test set.
    mean = acc_rate(pocket_test, Pocket_PLA(pocket_train)[0]) + mean
mean = mean/2000
mean
可以得到结果,正确率为0.868,错误率为0.132
Q19
意思应该是作一个对比,如果不用pocket,而是采取和前一种算法一样的方式,只是增加了一个最大次数作为限制,可以发现,2000次的平均比pocket差特别大
def Q19_PLA(data, max_updates=50):
    """Run the randomly ordered perceptron for a capped number of updates.

    Unlike the pocket variant, no best-so-far weights are kept: the weights
    in effect when the search stops are returned.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of shape (n, m). The first ``m - 1`` columns are the inputs
        and the last column is the label.
    max_updates : int, optional
        Number of weight updates after which the search stops. Defaults to 50.

    Returns
    -------
    list
        ``[w, acc]``: the final weight vector and its ``acc_rate`` on ``data``.
    """
    step = 0
    n, m = data.shape
    w = np.zeros([m - 1, 1])
    # One random permutation of the row indices, reused on every pass.
    ran = random.sample(range(n), n)
    while True:
        updated = False
        for i in ran:
            x = data[i, :m - 1]
            target = data[i, m - 1]
            if x.dot(w).item() * target <= 0:
                step += 1
                updated = True
                w = w + target * x.reshape(m - 1, 1)
                # Evaluate the new weights once and reuse the value.
                current = acc_rate(data, w)
                if step >= max_updates or current == 1:
                    return [w, current]
        if not updated:
            # A whole pass without any update cannot change anything on the
            # next pass either (e.g. no rows); stop instead of looping forever.
            return [w, acc_rate(data, w)]
# Start the running total from zero; otherwise the average left in `mean` by
# the previous experiment would be added into this one.
mean = 0
# Both bias-augmented matrices are identical for every run, so build them
# once; the number of ones follows the loaded data instead of a fixed 500.
q19_train = np.column_stack((np.ones([len(PLA_data), 1]), np.array(PLA_data)))
q19_test = np.column_stack((np.ones([len(PLA_test), 1]), np.array(PLA_test)))
for i in range(2000):
    print(i)
    # Train on the training set, then score the returned weights on the test set.
    mean = acc_rate(q19_test, Q19_PLA(q19_train)[0]) + mean
mean = mean/2000
mean
结果为0.63479
Q20
最后一题,不过是将50次改为了100次,可以看得出增加迭代次数有助于提高在测试集上的正确率(前提是测试集与训练集同分布)。
代码
函数
def acc_rate(data,w):
    """Return the fraction of rows of ``data`` that ``w`` classifies correctly.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of shape (n, m). The first ``m - 1`` columns are the inputs
        and the last column is the label.
    w : numpy.ndarray
        Weight column vector of shape ``(m - 1, 1)``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1,)`` holding the accuracy. A row counts as correct
        only when ``score * label`` is strictly positive, so a score of zero
        is always counted as wrong.
    """
    n, m = data.shape
    target = data[:, m - 1].reshape(n, 1)
    x = data[:, 0:m - 1]
    correct = (x.dot(w) * target) > 0
    # Sum down the rows in NumPy rather than with the built-in sum(), which
    # would loop over the n rows in Python; the result keeps shape (1,).
    return correct.sum(axis=0) / n
def Pocket_PLA(data, max_updates=100):
    """Run the pocket variant of the perceptron learning algorithm.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of shape (n, m). The first ``m - 1`` columns are the inputs
        and the last column is the label.
    max_updates : int, optional
        Number of weight updates after which the search stops. Defaults to
        100.

    Returns
    -------
    list
        ``[w_best, acc]``: the weight vector with the highest ``acc_rate`` on
        ``data`` seen so far and that accuracy.
    """
    step = 0
    n, m = data.shape
    w = np.zeros([m - 1, 1])
    w_best = w
    # One random permutation of the row indices, reused on every pass.
    ran = random.sample(range(n), n)
    acc = acc_rate(data, w)
    while True:
        updated = False
        for i in ran:
            x = data[i, :m - 1]
            target = data[i, m - 1]
            if x.dot(w).item() * target <= 0:
                step += 1
                updated = True
                # Build a new array so that w_best never aliases a vector
                # that is modified later.
                w = w + target * x.reshape(m - 1, 1)
                # Evaluate the new weights once and reuse the value.
                current = acc_rate(data, w)
                if current > acc:
                    w_best = w
                    acc = current
                if step >= max_updates or current == 1:
                    return [w_best, acc]
        if not updated:
            # A whole pass without any update cannot change anything on the
            # next pass either (e.g. no rows); stop instead of looping forever.
            return [w_best, acc]
mean = 0
# The separator is a raw-string regex so that "\s" is not parsed as an
# (invalid) string escape; "+" lets any run of whitespace act as one separator.
PLA_test = pd.read_csv(
    '/Downloads/ntumlone_hw1_hw1_18_test.dat.txt',
    header=None,
    sep=r'\s+',
    engine='python',
)
# Both bias-augmented matrices are identical for every run, so build them
# once; the number of ones follows the loaded data instead of a fixed 500.
q20_train = np.column_stack((np.ones([len(PLA_data), 1]), np.array(PLA_data)))
q20_test = np.column_stack((np.ones([len(PLA_test), 1]), np.array(PLA_test)))
for i in range(2000):
    print(i)
    # Train on the training set, then score the returned weights on the test set.
    mean = acc_rate(q20_test, Pocket_PLA(q20_train)[0]) + mean
mean = mean/2000
mean
结果为0.113424。