机器学习基石作业一
import numpy as np
def train_matrix(path="hw1_15_train.dat.txt"):
    """Load a whitespace-separated training file into NumPy arrays.

    Each non-blank line holds the feature values followed by an integer
    label. The values may be separated by spaces, tabs, or a mix of both.
    A constant 1 is placed in front of every feature vector as the bias
    input.

    Args:
        path: File to read. Defaults to the file name the original script
            used.

    Returns:
        A tuple ``(X, Y, dataNum, dataDim)`` where ``X`` has shape
        ``(dataNum, dataDim)`` (bias column first), ``Y`` has shape
        ``(dataNum, 1)``, ``dataNum`` is the number of samples and
        ``dataDim`` is the number of features plus one.

    Raises:
        ValueError: If the file has no data rows, if rows differ in length,
            or if a field cannot be converted to a number.
    """
    rows = []
    with open(path, "r") as f:
        for line in f:
            # split() with no argument handles spaces and tabs alike, so the
            # label is always its own field however it was separated.
            fields = line.split()
            if fields:
                rows.append(fields)
    if not rows:
        raise ValueError("no samples found in %r" % (path,))

    dataNum = len(rows)
    # N features + 1 label on a line -> N features + 1 bias in the matrix.
    dataDim = len(rows[0])
    X = np.ones((dataNum, dataDim))
    Y = np.zeros((dataNum, 1))
    for idx, fields in enumerate(rows):
        if len(fields) != dataDim:
            raise ValueError(
                "row %d of %r has %d fields, expected %d"
                % (idx + 1, path, len(fields), dataDim)
            )
        # Column 0 keeps the bias value 1; the features fill the rest.
        X[idx, 1:] = [float(value) for value in fields[:-1]]
        Y[idx, 0] = int(fields[-1])
    return X, Y, dataNum, dataDim
def PLA(x_train=None, y_train=None):
    """Run the cyclic perceptron learning algorithm and count its updates.

    Starting from a zero weight vector, the samples are visited in order.
    Whenever ``sign(w . x)`` disagrees with the label (a score of exactly 0
    counts as a disagreement), ``y * x`` is added to ``w``. Passes over the
    data repeat until one full pass makes no update, so the loop only ends
    once every sample is classified correctly.

    Args:
        x_train: Optional ``(n, d)`` array of inputs, bias column included.
        y_train: Optional array of ``n`` labels (+1 / -1).
            When either argument is omitted the data is loaded with
            ``train_matrix()``.

    Returns:
        The total number of weight updates performed.
    """
    if x_train is None or y_train is None:
        x_train, y_train = train_matrix()[:2]
    x_train = np.asarray(x_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float).reshape(-1, 1)
    # Take the sizes from the data itself instead of assuming 5 dimensions.
    dataNum, dataDim = x_train.shape

    w = np.zeros((dataDim, 1))
    count = 0
    while True:
        updated = False
        for i in range(dataNum):
            if np.dot(x_train[i, :], w)[0] * y_train[i, 0] <= 0:
                w += y_train[i, 0] * x_train[i, :].reshape(-1, 1)
                updated = True
                count += 1
        if not updated:
            break
    return count
if __name__ == '__main__':
    # Run the perceptron once and report how many weight updates it made.
    update_total = PLA()
    print(update_total)
import numpy as np
def train_matrix(path="hw1_15_train.dat.txt"):
    """Load a whitespace-separated training file and return it shuffled.

    Each non-blank line holds the feature values followed by an integer
    label. The values may be separated by spaces, tabs, or a mix of both.
    A constant 1 is placed in front of every feature vector as the bias
    input. The rows are then reordered with a random permutation; inputs
    and labels are permuted together so each sample keeps its own label.

    Args:
        path: File to read. Defaults to the file name the original script
            used.

    Returns:
        A tuple ``(X, Y, dataNum, dataDim)`` where ``X`` has shape
        ``(dataNum, dataDim)`` (bias column first), ``Y`` has shape
        ``(dataNum, 1)``, ``dataNum`` is the number of samples and
        ``dataDim`` is the number of features plus one.

    Raises:
        ValueError: If the file has no data rows, if rows differ in length,
            or if a field cannot be converted to a number.
    """
    rows = []
    with open(path, "r") as f:
        for line in f:
            # split() with no argument handles spaces and tabs alike, so the
            # label is always its own field however it was separated.
            fields = line.split()
            if fields:
                rows.append(fields)
    if not rows:
        raise ValueError("no samples found in %r" % (path,))

    dataNum = len(rows)
    # N features + 1 label on a line -> N features + 1 bias in the matrix.
    dataDim = len(rows[0])
    X = np.ones((dataNum, dataDim))
    Y = np.zeros((dataNum, 1))
    for idx, fields in enumerate(rows):
        if len(fields) != dataDim:
            raise ValueError(
                "row %d of %r has %d fields, expected %d"
                % (idx + 1, path, len(fields), dataDim)
            )
        # Column 0 keeps the bias value 1; the features fill the rest.
        X[idx, 1:] = [float(value) for value in fields[:-1]]
        Y[idx, 0] = int(fields[-1])

    # Shuffle the dataset; the same permutation indexes inputs and labels.
    permutation = np.random.permutation(dataNum)
    return X[permutation], Y[permutation], dataNum, dataDim
def PLA(x_train=None, y_train=None):
    """Run the cyclic perceptron learning algorithm and count its updates.

    Starting from a zero weight vector, the samples are visited in the order
    given. Whenever ``sign(w . x)`` disagrees with the label (a score of
    exactly 0 counts as a disagreement), ``y * x`` is added to ``w``. Passes
    over the data repeat until one full pass makes no update, so the loop
    only ends once every sample is classified correctly.

    Args:
        x_train: Optional ``(n, d)`` array of inputs, bias column included.
        y_train: Optional array of ``n`` labels (+1 / -1).
            When either argument is omitted the data is loaded with
            ``train_matrix()``.

    Returns:
        The total number of weight updates performed.
    """
    if x_train is None or y_train is None:
        x_train, y_train = train_matrix()[:2]
    x_train = np.asarray(x_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float).reshape(-1, 1)
    # Take the sizes from the data itself instead of assuming 5 dimensions.
    dataNum, dataDim = x_train.shape

    w = np.zeros((dataDim, 1))
    count = 0
    while True:
        updated = False
        for i in range(dataNum):
            if np.dot(x_train[i, :], w)[0] * y_train[i, 0] <= 0:
                w += y_train[i, 0] * x_train[i, :].reshape(-1, 1)
                updated = True
                count += 1
        if not updated:
            break
    return count
if __name__ == '__main__':
    # Average the number of perceptron updates over repeated runs. Each call
    # to PLA() reloads the data, so every run sees whatever order
    # train_matrix() returns for it.
    runs = 2000
    # Named total_updates rather than ``sum`` so the builtin stays usable.
    total_updates = 0
    for _ in range(runs):
        total_updates += PLA()
    print(total_updates / runs)
只要在更新 w 时把修正量再乘以 0.5 即可,即 w += 0.5 * y * x。
import numpy
import copy
class Pocket(object):
    """Pocket variant of the perceptron learning algorithm.

    The training data is shuffled and visited once. After every weight
    update the new weights are scored on the whole training set, and the
    weights with the fewest training mistakes seen so far are kept ("in the
    pocket"). Training stops at the end of the pass or after a fixed number
    of updates, whichever comes first.
    """

    def __init__(self, dimension, train_count, test_count):
        """Store the problem sizes.

        Args:
            dimension: Length of each input vector including the leading
                bias value 1 (number of features + 1).
            train_count: Number of samples expected in the training file.
            test_count: Number of samples expected in the test file.
        """
        self.__dimension = dimension
        self.__train_count = train_count
        self.__test_count = test_count

    def _read_matrix(self, path, expected_count):
        """Parse a data file into ``(X, Y)`` arrays, in file order.

        Each non-blank line holds the feature values followed by an integer
        label, separated by spaces and/or tabs. A constant 1 is put in front
        of every feature vector.

        Raises:
            ValueError: If a row has the wrong number of fields, if the
                number of rows differs from ``expected_count``, or if a
                field is not numeric.
        """
        samples = []
        labels = []
        # The context manager closes the file even if parsing fails.
        with open(path, "r") as f:
            for line_no, line in enumerate(f, 1):
                fields = line.split()
                if not fields:
                    continue
                # (dimension - 1) features + 1 label == dimension fields.
                if len(fields) != self.__dimension:
                    raise ValueError(
                        "%s:%d: expected %d fields, found %d"
                        % (path, line_no, self.__dimension, len(fields))
                    )
                samples.append([1.0] + [float(value) for value in fields[:-1]])
                labels.append(int(fields[-1]))
        if len(samples) != expected_count:
            raise ValueError(
                "%s: expected %d samples, found %d"
                % (path, expected_count, len(samples))
            )
        X = numpy.array(samples, dtype=float).reshape(expected_count, self.__dimension)
        Y = numpy.array(labels, dtype=float).reshape(expected_count, 1)
        return X, Y

    def train_matrix(self, path):
        """Load the training file and return ``(X, Y)`` in random order.

        ``X`` has shape ``(train_count, dimension)`` and ``Y`` has shape
        ``(train_count, 1)``. Inputs and labels are permuted together.
        """
        X, Y = self._read_matrix(path, self.__train_count)
        # Shuffle the dataset; the same permutation indexes inputs and labels.
        permutation = numpy.random.permutation(Y.shape[0])
        return X[permutation], Y[permutation]

    def iteration(self, path, max_updates=50):
        """Train on the file at ``path`` and return the pocket weights.

        Args:
            path: Training data file.
            max_updates: Stop after this many weight updates.

        Returns:
            A ``(dimension, 1)`` array: the weights that made the fewest
            mistakes on the training set among all weights tried. If no
            update ever improves on "every sample wrong", the zero vector
            is returned.
        """
        x_train, y_train = self.train_matrix(path)
        w = numpy.zeros((self.__dimension, 1))
        best_w = w.copy()
        best_count = self.__train_count
        count = 0
        if max_updates <= 0:
            return best_w
        # Pocket algorithm: adjust the line (at most max_updates times) and
        # after each adjustment check on the training set whether it is the
        # best line so far.
        for i in range(x_train.shape[0]):
            if numpy.dot(x_train[i, :], w)[0] * y_train[i, 0] <= 0:
                # Step of 0.5 * y * x; sizes follow the data, not a fixed 5.
                w += 0.5 * y_train[i, 0] * x_train[i, :].reshape(-1, 1)
                count += 1
                # Score the new weights: a sample is a mistake when its
                # score and label do not have the same strict sign.
                num = int(numpy.count_nonzero(numpy.dot(x_train, w) * y_train <= 0))
                if num < best_count:
                    best_count = num
                    best_w = w.copy()
                if count >= max_updates:
                    break
        return best_w

    def test_matrix(self, test_path):
        """Load the test file and return ``(x_test, y_test)`` in file order.

        ``x_test`` has shape ``(test_count, dimension)`` and ``y_test`` has
        shape ``(test_count, 1)``.
        """
        return self._read_matrix(test_path, self.__test_count)

    # Validation
    def test_error(self, train_path, test_path, max_updates=50):
        """Train on ``train_path`` and return the error rate on ``test_path``.

        Args:
            train_path: Training data file.
            test_path: Test data file.
            max_updates: Update cap passed on to ``iteration``.

        Returns:
            The fraction of test samples whose score and label do not have
            the same strict sign, as a float in ``[0, 1]``.

        Raises:
            ZeroDivisionError: If ``test_count`` is 0.
        """
        w = self.iteration(train_path, max_updates)
        x_test, y_test = self.test_matrix(test_path)
        wrong = numpy.count_nonzero(numpy.dot(x_test, w) * y_test <= 0)
        return float(wrong) / self.__test_count
if __name__ == '__main__':
    # Train and evaluate 2000 times, each time with a fresh model (and hence
    # a fresh shuffle of the training data), then print the mean test error.
    error_total = 0
    for _ in range(2000):
        model = Pocket(5, 500, 500)
        error_total = error_total + model.test_error('hw1_18_train.dat.txt', 'hw1_18_test.dat.txt')
    print(error_total / 2000.0)
import numpy
import copy
class Pocket(object):
    """Perceptron trained with a capped number of updates.

    Despite the class name, this version keeps no "pocket": the training
    data is shuffled and visited once, the weights are updated on every
    mistake, and whatever weights exist when the pass ends or the update
    cap is reached are returned.
    """

    def __init__(self, dimension, train_count, test_count):
        """Store the problem sizes.

        Args:
            dimension: Length of each input vector including the leading
                bias value 1 (number of features + 1).
            train_count: Number of samples expected in the training file.
            test_count: Number of samples expected in the test file.
        """
        self.__dimension = dimension
        self.__train_count = train_count
        self.__test_count = test_count

    def _read_matrix(self, path, expected_count):
        """Parse a data file into ``(X, Y)`` arrays, in file order.

        Each non-blank line holds the feature values followed by an integer
        label, separated by spaces and/or tabs. A constant 1 is put in front
        of every feature vector.

        Raises:
            ValueError: If a row has the wrong number of fields, if the
                number of rows differs from ``expected_count``, or if a
                field is not numeric.
        """
        samples = []
        labels = []
        # The context manager closes the file even if parsing fails.
        with open(path, "r") as f:
            for line_no, line in enumerate(f, 1):
                fields = line.split()
                if not fields:
                    continue
                # (dimension - 1) features + 1 label == dimension fields.
                if len(fields) != self.__dimension:
                    raise ValueError(
                        "%s:%d: expected %d fields, found %d"
                        % (path, line_no, self.__dimension, len(fields))
                    )
                samples.append([1.0] + [float(value) for value in fields[:-1]])
                labels.append(int(fields[-1]))
        if len(samples) != expected_count:
            raise ValueError(
                "%s: expected %d samples, found %d"
                % (path, expected_count, len(samples))
            )
        X = numpy.array(samples, dtype=float).reshape(expected_count, self.__dimension)
        Y = numpy.array(labels, dtype=float).reshape(expected_count, 1)
        return X, Y

    def train_matrix(self, path):
        """Load the training file and return ``(X, Y)`` in random order.

        ``X`` has shape ``(train_count, dimension)`` and ``Y`` has shape
        ``(train_count, 1)``. Inputs and labels are permuted together.
        """
        X, Y = self._read_matrix(path, self.__train_count)
        # Shuffle the dataset; the same permutation indexes inputs and labels.
        permutation = numpy.random.permutation(Y.shape[0])
        return X[permutation], Y[permutation]

    def iteration(self, path, max_updates=50):
        """Train on the file at ``path`` and return the final weights.

        Args:
            path: Training data file.
            max_updates: Stop after this many weight updates.

        Returns:
            A ``(dimension, 1)`` array holding the weights after the last
            update made (the zero vector if no update was made).
        """
        x_train, y_train = self.train_matrix(path)
        w = numpy.zeros((self.__dimension, 1))
        count = 0
        if max_updates <= 0:
            return w
        # Adjust the line on every mistake, at most max_updates times; no
        # comparison against earlier weights is made.
        for i in range(x_train.shape[0]):
            if numpy.dot(x_train[i, :], w)[0] * y_train[i, 0] <= 0:
                # Step of 0.5 * y * x; sizes follow the data, not a fixed 5.
                w += 0.5 * y_train[i, 0] * x_train[i, :].reshape(-1, 1)
                count += 1
                if count >= max_updates:
                    break
        return w

    def test_matrix(self, test_path):
        """Load the test file and return ``(x_test, y_test)`` in file order.

        ``x_test`` has shape ``(test_count, dimension)`` and ``y_test`` has
        shape ``(test_count, 1)``.
        """
        return self._read_matrix(test_path, self.__test_count)

    # Validation
    def test_error(self, train_path, test_path, max_updates=50):
        """Train on ``train_path`` and return the error rate on ``test_path``.

        Args:
            train_path: Training data file.
            test_path: Test data file.
            max_updates: Update cap passed on to ``iteration``.

        Returns:
            The fraction of test samples whose score and label do not have
            the same strict sign, as a float in ``[0, 1]``.

        Raises:
            ZeroDivisionError: If ``test_count`` is 0.
        """
        w = self.iteration(train_path, max_updates)
        x_test, y_test = self.test_matrix(test_path)
        wrong = numpy.count_nonzero(numpy.dot(x_test, w) * y_test <= 0)
        return float(wrong) / self.__test_count
if __name__ == '__main__':
    # Train and evaluate 2000 times, each time with a fresh model (and hence
    # a fresh shuffle of the training data), then print the mean test error.
    error_total = 0
    for _ in range(2000):
        model = Pocket(5, 500, 500)
        error_total = error_total + model.test_error('hw1_18_train.dat.txt', 'hw1_18_test.dat.txt')
    print(error_total / 2000.0)
只需要在第18题的代码上修改 count 的判断条件,把 50 改成 100 即可。