15.
import pandas as pd
import numpy as np
def get_training_set(path='F:\\Kaggle\\hw1_15_train.dat'):
    """Load the training data and prepend a bias column to the features.

    Args:
        path: File path (or file-like object accepted by ``pd.read_csv``)
            with whitespace-separated rows: four feature values followed by
            one label. Defaults to the originally hard-coded location.

    Returns:
        tuple: ``(X_train, y_train, data_size)`` where ``X_train`` has shape
        ``(n, 5)`` with a leading column of ones, ``y_train`` has shape
        ``(n, 1)`` and ``data_size`` is ``n``.
    """
    # names= makes pandas treat the first line as data instead of a header.
    # The raw string keeps the regex intact without an invalid "\s" escape.
    data = pd.read_csv(path, sep=r'\s+', names=['a', 'b', 'c', 'd', 'y'])
    # First four columns are the features.
    features = np.array(data.iloc[:, 0:4])
    data_size = features.shape[0]
    # Prepend a column of ones so that w[0] plays the role of the bias term.
    X_train = np.hstack((np.ones((data_size, 1)), features))
    # Keep the labels two-dimensional, shape (n, 1).
    y_train = np.array(data.iloc[:, 4:5])
    return X_train, y_train, data_size
def PLA(X_train=None, y_train=None):
    """Run the cyclic perceptron learning algorithm until a clean pass.

    Samples are visited in the given order. Each sample with
    ``y * (w . x) <= 0`` triggers the update ``w += y * x``. The loop ends
    after the first full pass without any update, so it only terminates when
    the data can be separated.

    Args:
        X_train: Optional ``(n, d)`` feature matrix that already contains the
            bias column. When omitted, data comes from ``get_training_set()``.
        y_train: Optional labels, ``n`` values in any shape (expected to be
            +1/-1 for the perceptron rule).

    Returns:
        tuple: ``(num_iters, w)`` - the number of updates performed and the
        final weight vector of shape ``(d, 1)``.
    """
    if X_train is None or y_train is None:
        X_train, y_train, _ = get_training_set()
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float).reshape(-1)

    # Size w from the data instead of assuming exactly five columns.
    w = np.zeros((X_train.shape[1], 1))
    num_iters = 0
    made_update = True
    while made_update:
        made_update = False
        for x_i, y_i in zip(X_train, y_train):
            # A non-positive margin means the sample is misclassified
            # (or lies exactly on the boundary).
            if y_i * np.dot(x_i, w)[0] <= 0:
                w += y_i * x_i.reshape(-1, 1)
                made_update = True
                num_iters += 1
    return num_iters, w
if __name__ == '__main__':
    # Question 15: report how many updates PLA needs before it halts.
    # The learned weights are not used here, and the training set does not
    # need to be loaded a second time (the old reload also shadowed the
    # builtin ``iter``).
    num, _ = PLA()
    print(num)
结果为
16.
在15题基础上进行修改,打乱数据序列,使用np.random.permutation(),返回一个新序列而不改变原始数据序列
import pandas as pd
import numpy as np
def get_training_set(path='F:\\Kaggle\\hw1_15_train.dat'):
    """Load the training data, prepend a bias column and shuffle the rows.

    Args:
        path: File path (or file-like object accepted by ``pd.read_csv``)
            with whitespace-separated rows: four feature values followed by
            one label. Defaults to the originally hard-coded location.

    Returns:
        tuple: ``(X_train, y_train, data_size)`` where ``X_train`` has shape
        ``(n, 5)`` with a leading column of ones, ``y_train`` has shape
        ``(n, 1)`` and ``data_size`` is ``n``. Rows of both arrays are
        reordered by the same random permutation.
    """
    # names= makes pandas treat the first line as data instead of a header.
    # The raw string keeps the regex intact without an invalid "\s" escape.
    data = pd.read_csv(path, sep=r'\s+', names=['a', 'b', 'c', 'd', 'y'])
    features = np.array(data.iloc[:, 0:4])
    data_size = features.shape[0]
    # Prepend a column of ones so that w[0] plays the role of the bias term.
    X_train = np.hstack((np.ones((data_size, 1)), features))
    y_train = np.array(data.iloc[:, 4:5])
    # One shared permutation keeps every feature row paired with its label.
    # (Named ``order`` so the builtin ``list`` is not shadowed.)
    order = np.random.permutation(data_size)
    return X_train[order], y_train[order], data_size
def PLA(X_train=None, y_train=None):
    """Run the cyclic perceptron learning algorithm and count its updates.

    Samples are visited in the given order. Each sample with
    ``y * (w . x) <= 0`` triggers the update ``w += y * x``. The loop ends
    after the first full pass without any update, so it only terminates when
    the data can be separated.

    Args:
        X_train: Optional ``(n, d)`` feature matrix that already contains the
            bias column. When omitted, data comes from ``get_training_set()``.
        y_train: Optional labels, ``n`` values in any shape (expected to be
            +1/-1 for the perceptron rule).

    Returns:
        int: Number of weight updates performed before halting.
    """
    if X_train is None or y_train is None:
        X_train, y_train, _ = get_training_set()
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float).reshape(-1)

    # Size w from the data instead of assuming exactly five columns.
    w = np.zeros((X_train.shape[1], 1))
    num_iters = 0
    made_update = True
    while made_update:
        made_update = False
        for x_i, y_i in zip(X_train, y_train):
            # A non-positive margin means the sample is misclassified
            # (or lies exactly on the boundary).
            if y_i * np.dot(x_i, w)[0] <= 0:
                w += y_i * x_i.reshape(-1, 1)
                made_update = True
                num_iters += 1
    return num_iters
if __name__ == '__main__':
    # Question 16: average number of PLA updates over repeated runs.
    # Each PLA() call loads the data, which is shuffled by get_training_set().
    n_runs = 2000
    # Use the builtin sum() instead of shadowing it with a local accumulator.
    total_updates = sum(PLA() for _ in range(n_runs))
    print(total_updates / n_runs)
结果为
17.
在16题的基础上加上一个学习速率alpha,修改PLA()函数,更新w时加上学习速率alpha
import pandas as pd
import numpy as np
def get_training_set(path='F:\\Kaggle\\hw1_15_train.dat'):
    """Load the training data, prepend a bias column and shuffle the rows.

    Args:
        path: File path (or file-like object accepted by ``pd.read_csv``)
            with whitespace-separated rows: four feature values followed by
            one label. Defaults to the originally hard-coded location.

    Returns:
        tuple: ``(X_train, y_train, data_size)`` where ``X_train`` has shape
        ``(n, 5)`` with a leading column of ones, ``y_train`` has shape
        ``(n, 1)`` and ``data_size`` is ``n``. Rows of both arrays are
        reordered by the same random permutation.
    """
    # names= makes pandas treat the first line as data instead of a header.
    # The raw string keeps the regex intact without an invalid "\s" escape.
    data = pd.read_csv(path, sep=r'\s+', names=['a', 'b', 'c', 'd', 'y'])
    features = np.array(data.iloc[:, 0:4])
    data_size = features.shape[0]
    # Prepend a column of ones so that w[0] plays the role of the bias term.
    X_train = np.hstack((np.ones((data_size, 1)), features))
    y_train = np.array(data.iloc[:, 4:5])
    # One shared permutation keeps every feature row paired with its label.
    # (Named ``order`` so the builtin ``list`` is not shadowed.)
    order = np.random.permutation(data_size)
    return X_train[order], y_train[order], data_size
def PLA(X_train=None, y_train=None, alpha=0.5):
    """Run cyclic PLA with a learning rate and count its updates.

    Samples are visited in the given order. Each sample with
    ``y * (w . x) <= 0`` triggers the update ``w += alpha * y * x``. The
    loop ends after the first full pass without any update, so it only
    terminates when the data can be separated.

    Args:
        X_train: Optional ``(n, d)`` feature matrix that already contains the
            bias column. When omitted, data comes from ``get_training_set()``.
        y_train: Optional labels, ``n`` values in any shape (expected to be
            +1/-1 for the perceptron rule).
        alpha: Learning rate applied to every update (default 0.5, the value
            that used to be hard-coded).

    Returns:
        int: Number of weight updates performed before halting.
    """
    if X_train is None or y_train is None:
        X_train, y_train, _ = get_training_set()
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float).reshape(-1)

    # Size w from the data instead of assuming exactly five columns.
    w = np.zeros((X_train.shape[1], 1))
    num_iters = 0
    made_update = True
    while made_update:
        made_update = False
        for x_i, y_i in zip(X_train, y_train):
            # A non-positive margin means the sample is misclassified
            # (or lies exactly on the boundary).
            if y_i * np.dot(x_i, w)[0] <= 0:
                # Scale the correction by the learning rate.
                w += alpha * y_i * x_i.reshape(-1, 1)
                made_update = True
                num_iters += 1
    return num_iters
if __name__ == '__main__':
    # Question 17: average number of updates of PLA with a learning rate,
    # over repeated runs; each PLA() call loads freshly shuffled data.
    n_runs = 2000
    # Use the builtin sum() instead of shadowing it with a local accumulator.
    total_updates = sum(PLA() for _ in range(n_runs))
    print(total_updates / n_runs)
运行结果为
18.
import pandas as pd
import numpy as np
import copy as cp
def get_training_set(path='F:\\Kaggle\\hw1_18_train.dat'):
    """Load the training data, prepend a bias column and shuffle the rows.

    Args:
        path: File path (or file-like object accepted by ``pd.read_csv``)
            with whitespace-separated rows: four feature values followed by
            one label. Defaults to the originally hard-coded location.

    Returns:
        tuple: ``(X_train, y_train, data_size)`` where ``X_train`` has shape
        ``(n, 5)`` with a leading column of ones, ``y_train`` has shape
        ``(n, 1)`` and ``data_size`` is ``n``. Rows of both arrays are
        reordered by the same random permutation.
    """
    # names= makes pandas treat the first line as data instead of a header.
    # The raw string keeps the regex intact without an invalid "\s" escape.
    data = pd.read_csv(path, sep=r'\s+', names=['a', 'b', 'c', 'd', 'y'])
    features = np.array(data.iloc[:, 0:4])
    data_size = features.shape[0]
    # Prepend a column of ones so that w[0] plays the role of the bias term.
    X_train = np.hstack((np.ones((data_size, 1)), features))
    y_train = np.array(data.iloc[:, 4:5])
    # One shared permutation keeps every feature row paired with its label.
    # (Named ``order`` so the builtin ``list`` is not shadowed.)
    order = np.random.permutation(data_size)
    return X_train[order], y_train[order], data_size
def get_test_set(path='F:\\Kaggle\\hw1_18_test.dat'):
    """Load the test data and prepend a bias column to the features.

    Args:
        path: File path (or file-like object accepted by ``pd.read_csv``)
            with whitespace-separated rows: four feature values followed by
            one label. Defaults to the originally hard-coded location.

    Returns:
        tuple: ``(X_val, y_val, test_size)`` where ``X_val`` has shape
        ``(n, 5)`` with a leading column of ones, ``y_val`` has shape
        ``(n, 1)`` and ``test_size`` is ``n``. Row order is left unchanged.
    """
    # The raw string keeps the regex intact without an invalid "\s" escape.
    data = pd.read_csv(path, sep=r'\s+', names=['a', 'b', 'c', 'd', 'y'])
    test_set = np.array(data)
    test_size = test_set.shape[0]
    # Split into features (with a leading bias column of ones) and labels.
    X_val = np.hstack((np.ones((test_size, 1)), test_set[:, 0:4]))
    y_val = test_set[:, 4:5]
    return X_val, y_val, test_size
def Pocket(X_train=None, y_train=None, max_updates=50, alpha=0.5):
    """Train a perceptron and keep the best weights seen (pocket algorithm).

    Samples are visited cyclically in the given order. Each sample with
    ``y * (w . x) <= 0`` triggers ``w += alpha * y * x``. After every update
    the training errors of the new ``w`` are counted, and ``w`` replaces the
    pocket weights only when it makes strictly fewer errors. Training stops
    after ``max_updates`` updates or after a full pass without any update.

    Args:
        X_train: Optional ``(n, d)`` feature matrix that already contains the
            bias column. When omitted, data comes from ``get_training_set()``.
        y_train: Optional labels, ``n`` values in any shape (expected to be
            +1/-1 for the perceptron rule).
        max_updates: Maximum number of weight updates (default 50).
        alpha: Learning rate applied to every update (default 0.5).

    Returns:
        numpy.ndarray: Pocket weight vector of shape ``(d, 1)``.
    """
    if X_train is None or y_train is None:
        X_train, y_train, _ = get_training_set()
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float).reshape(-1, 1)
    data_size = X_train.shape[0]

    w = np.zeros((X_train.shape[1], 1))
    w_pocket = w.copy()
    # Fewest training errors achieved so far; start at the worst case.
    best_errors = data_size
    count = 0
    # Keep cycling so the update budget is not cut short when a single pass
    # over the data contains fewer than max_updates mistakes.
    while count < max_updates:
        updated = False
        for i in range(data_size):
            if y_train[i, 0] * np.dot(X_train[i], w)[0] <= 0:
                w += alpha * y_train[i, 0] * X_train[i].reshape(-1, 1)
                count += 1
                updated = True
                # Count training mistakes of the new w in one vectorised step.
                errors = int(np.count_nonzero(np.dot(X_train, w) * y_train <= 0))
                if errors < best_errors:
                    best_errors = errors
                    w_pocket = w.copy()
                if count == max_updates:
                    break
        if not updated:
            # A clean pass: w classifies every training sample correctly.
            break
    return w_pocket
def Pocket_Error(w_pocket, X_val=None, y_val=None):
    """Return the fraction of test samples misclassified by ``w_pocket``.

    A sample counts as an error when ``y * (w . x) <= 0``.

    Args:
        w_pocket: Weight vector with ``d`` entries (any shape).
        X_val: Optional ``(n, d)`` feature matrix that already contains the
            bias column. When omitted, data comes from ``get_test_set()``.
        y_val: Optional labels, ``n`` values in any shape.

    Returns:
        float: ``errors / n``.

    Raises:
        ZeroDivisionError: If the test set is empty.
    """
    if X_val is None or y_val is None:
        X_val, y_val, _ = get_test_set()
    X_val = np.asarray(X_val, dtype=float)
    y_val = np.asarray(y_val, dtype=float).reshape(-1, 1)
    w_pocket = np.asarray(w_pocket, dtype=float).reshape(-1, 1)
    # One matrix product replaces the per-sample Python loop.
    margins = np.dot(X_val, w_pocket) * y_val
    error_num = int(np.count_nonzero(margins <= 0))
    return error_num / X_val.shape[0]
if __name__ == '__main__':
    # Question 18: average test error rate of the pocket weights over
    # repeated runs; every Pocket() call trains on freshly shuffled data.
    n_runs = 2000
    ratio_sum = 0.0
    for _ in range(n_runs):
        ratio_sum += Pocket_Error(Pocket())
    print(ratio_sum / n_runs)
计算结果为(注:这道题一开始结果算错了,排查了一晚上,最后才发现是误用了前一题的数据文件来算这一题!!!_(¦3」∠)_)
19.
import pandas as pd
import numpy as np
import copy as cp
def get_training_set(path='F:\\Kaggle\\hw1_18_train.dat'):
    """Load the training data, prepend a bias column and shuffle the rows.

    Args:
        path: File path (or file-like object accepted by ``pd.read_csv``)
            with whitespace-separated rows: four feature values followed by
            one label. Defaults to the originally hard-coded location.

    Returns:
        tuple: ``(X_train, y_train, data_size)`` where ``X_train`` has shape
        ``(n, 5)`` with a leading column of ones, ``y_train`` has shape
        ``(n, 1)`` and ``data_size`` is ``n``. Rows of both arrays are
        reordered by the same random permutation.
    """
    # names= makes pandas treat the first line as data instead of a header.
    # The raw string keeps the regex intact without an invalid "\s" escape.
    data = pd.read_csv(path, sep=r'\s+', names=['a', 'b', 'c', 'd', 'y'])
    features = np.array(data.iloc[:, 0:4])
    data_size = features.shape[0]
    # Prepend a column of ones so that w[0] plays the role of the bias term.
    X_train = np.hstack((np.ones((data_size, 1)), features))
    y_train = np.array(data.iloc[:, 4:5])
    # One shared permutation keeps every feature row paired with its label.
    # (Named ``order`` so the builtin ``list`` is not shadowed.)
    order = np.random.permutation(data_size)
    return X_train[order], y_train[order], data_size
def get_test_set(path='F:\\Kaggle\\hw1_18_test.dat'):
    """Load the test data and prepend a bias column to the features.

    Args:
        path: File path (or file-like object accepted by ``pd.read_csv``)
            with whitespace-separated rows: four feature values followed by
            one label. Defaults to the originally hard-coded location.

    Returns:
        tuple: ``(X_val, y_val, test_size)`` where ``X_val`` has shape
        ``(n, 5)`` with a leading column of ones, ``y_val`` has shape
        ``(n, 1)`` and ``test_size`` is ``n``. Row order is left unchanged.
    """
    # The raw string keeps the regex intact without an invalid "\s" escape.
    data = pd.read_csv(path, sep=r'\s+', names=['a', 'b', 'c', 'd', 'y'])
    test_set = np.array(data)
    test_size = test_set.shape[0]
    # Split into features (with a leading bias column of ones) and labels.
    X_val = np.hstack((np.ones((test_size, 1)), test_set[:, 0:4]))
    y_val = test_set[:, 4:5]
    return X_val, y_val, test_size
def Pocket(X_train=None, y_train=None, max_updates=50, alpha=0.5):
    """Run PLA for a bounded number of updates and return the final weights.

    Despite the name, no pocket is kept here: the vector returned is simply
    ``w`` after the last update. Samples are visited cyclically in the given
    order, and each sample with ``y * (w . x) <= 0`` triggers
    ``w += alpha * y * x``. Training stops after ``max_updates`` updates or
    after a full pass without any update.

    Args:
        X_train: Optional ``(n, d)`` feature matrix that already contains the
            bias column. When omitted, data comes from ``get_training_set()``.
        y_train: Optional labels, ``n`` values in any shape (expected to be
            +1/-1 for the perceptron rule).
        max_updates: Maximum number of weight updates (default 50).
        alpha: Learning rate applied to every update (default 0.5).

    Returns:
        numpy.ndarray: Final weight vector of shape ``(d, 1)``.
    """
    if X_train is None or y_train is None:
        X_train, y_train, _ = get_training_set()
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float).reshape(-1)

    w = np.zeros((X_train.shape[1], 1))
    count = 0
    # Keep cycling so the update budget is not cut short when a single pass
    # over the data contains fewer than max_updates mistakes.
    while count < max_updates:
        updated = False
        for x_i, y_i in zip(X_train, y_train):
            if y_i * np.dot(x_i, w)[0] <= 0:
                w += alpha * y_i * x_i.reshape(-1, 1)
                count += 1
                updated = True
                if count == max_updates:
                    break
        if not updated:
            # A clean pass: w classifies every training sample correctly.
            break
    return w
def Pocket_Error(w_pocket, X_val=None, y_val=None):
    """Return the fraction of test samples misclassified by ``w_pocket``.

    A sample counts as an error when ``y * (w . x) <= 0``.

    Args:
        w_pocket: Weight vector with ``d`` entries (any shape).
        X_val: Optional ``(n, d)`` feature matrix that already contains the
            bias column. When omitted, data comes from ``get_test_set()``.
        y_val: Optional labels, ``n`` values in any shape.

    Returns:
        float: ``errors / n``.

    Raises:
        ZeroDivisionError: If the test set is empty.
    """
    if X_val is None or y_val is None:
        X_val, y_val, _ = get_test_set()
    X_val = np.asarray(X_val, dtype=float)
    y_val = np.asarray(y_val, dtype=float).reshape(-1, 1)
    w_pocket = np.asarray(w_pocket, dtype=float).reshape(-1, 1)
    # One matrix product replaces the per-sample Python loop.
    margins = np.dot(X_val, w_pocket) * y_val
    error_num = int(np.count_nonzero(margins <= 0))
    return error_num / X_val.shape[0]
if __name__ == '__main__':
    # Question 19: average test error rate of the weights returned after the
    # bounded PLA run, over repeated runs on freshly shuffled training data.
    n_runs = 2000
    ratio_sum = 0.0
    for _ in range(n_runs):
        ratio_sum += Pocket_Error(Pocket())
    print(ratio_sum / n_runs)
运行结果为
20.
import pandas as pd
import numpy as np
import copy as cp
def get_training_set(path='F:\\Kaggle\\hw1_18_train.dat'):
    """Load the training data, prepend a bias column and shuffle the rows.

    Args:
        path: File path (or file-like object accepted by ``pd.read_csv``)
            with whitespace-separated rows: four feature values followed by
            one label. Defaults to the originally hard-coded location.

    Returns:
        tuple: ``(X_train, y_train, data_size)`` where ``X_train`` has shape
        ``(n, 5)`` with a leading column of ones, ``y_train`` has shape
        ``(n, 1)`` and ``data_size`` is ``n``. Rows of both arrays are
        reordered by the same random permutation.
    """
    # names= makes pandas treat the first line as data instead of a header.
    # The raw string keeps the regex intact without an invalid "\s" escape.
    data = pd.read_csv(path, sep=r'\s+', names=['a', 'b', 'c', 'd', 'y'])
    features = np.array(data.iloc[:, 0:4])
    data_size = features.shape[0]
    # Prepend a column of ones so that w[0] plays the role of the bias term.
    X_train = np.hstack((np.ones((data_size, 1)), features))
    y_train = np.array(data.iloc[:, 4:5])
    # One shared permutation keeps every feature row paired with its label.
    # (Named ``order`` so the builtin ``list`` is not shadowed.)
    order = np.random.permutation(data_size)
    return X_train[order], y_train[order], data_size
def get_test_set(path='F:\\Kaggle\\hw1_18_test.dat'):
    """Load the test data and prepend a bias column to the features.

    Args:
        path: File path (or file-like object accepted by ``pd.read_csv``)
            with whitespace-separated rows: four feature values followed by
            one label. Defaults to the originally hard-coded location.

    Returns:
        tuple: ``(X_val, y_val, test_size)`` where ``X_val`` has shape
        ``(n, 5)`` with a leading column of ones, ``y_val`` has shape
        ``(n, 1)`` and ``test_size`` is ``n``. Row order is left unchanged.
    """
    # The raw string keeps the regex intact without an invalid "\s" escape.
    data = pd.read_csv(path, sep=r'\s+', names=['a', 'b', 'c', 'd', 'y'])
    test_set = np.array(data)
    test_size = test_set.shape[0]
    # Split into features (with a leading bias column of ones) and labels.
    X_val = np.hstack((np.ones((test_size, 1)), test_set[:, 0:4]))
    y_val = test_set[:, 4:5]
    return X_val, y_val, test_size
def Pocket(X_train=None, y_train=None, max_updates=100, alpha=0.5):
    """Train a perceptron and keep the best weights seen (pocket algorithm).

    Samples are visited cyclically in the given order. Each sample with
    ``y * (w . x) <= 0`` triggers ``w += alpha * y * x``. After every update
    the training errors of the new ``w`` are counted, and ``w`` replaces the
    pocket weights only when it makes strictly fewer errors. Training stops
    after ``max_updates`` updates or after a full pass without any update.

    Args:
        X_train: Optional ``(n, d)`` feature matrix that already contains the
            bias column. When omitted, data comes from ``get_training_set()``.
        y_train: Optional labels, ``n`` values in any shape (expected to be
            +1/-1 for the perceptron rule).
        max_updates: Maximum number of weight updates (default 100).
        alpha: Learning rate applied to every update (default 0.5).

    Returns:
        numpy.ndarray: Pocket weight vector of shape ``(d, 1)``.
    """
    if X_train is None or y_train is None:
        X_train, y_train, _ = get_training_set()
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float).reshape(-1, 1)
    data_size = X_train.shape[0]

    w = np.zeros((X_train.shape[1], 1))
    w_pocket = w.copy()
    # Fewest training errors achieved so far; start at the worst case.
    best_errors = data_size
    count = 0
    # Keep cycling so the update budget is not cut short when a single pass
    # over the data contains fewer than max_updates mistakes.
    while count < max_updates:
        updated = False
        for i in range(data_size):
            if y_train[i, 0] * np.dot(X_train[i], w)[0] <= 0:
                w += alpha * y_train[i, 0] * X_train[i].reshape(-1, 1)
                count += 1
                updated = True
                # Count training mistakes of the new w in one vectorised step.
                errors = int(np.count_nonzero(np.dot(X_train, w) * y_train <= 0))
                if errors < best_errors:
                    best_errors = errors
                    w_pocket = w.copy()
                # Stop once the update budget (100 by default) is used up.
                if count == max_updates:
                    break
        if not updated:
            # A clean pass: w classifies every training sample correctly.
            break
    return w_pocket
def Pocket_Error(w_pocket, X_val=None, y_val=None):
    """Return the fraction of test samples misclassified by ``w_pocket``.

    A sample counts as an error when ``y * (w . x) <= 0``.

    Args:
        w_pocket: Weight vector with ``d`` entries (any shape).
        X_val: Optional ``(n, d)`` feature matrix that already contains the
            bias column. When omitted, data comes from ``get_test_set()``.
        y_val: Optional labels, ``n`` values in any shape.

    Returns:
        float: ``errors / n``.

    Raises:
        ZeroDivisionError: If the test set is empty.
    """
    if X_val is None or y_val is None:
        X_val, y_val, _ = get_test_set()
    X_val = np.asarray(X_val, dtype=float)
    y_val = np.asarray(y_val, dtype=float).reshape(-1, 1)
    w_pocket = np.asarray(w_pocket, dtype=float).reshape(-1, 1)
    # One matrix product replaces the per-sample Python loop.
    margins = np.dot(X_val, w_pocket) * y_val
    error_num = int(np.count_nonzero(margins <= 0))
    return error_num / X_val.shape[0]
if __name__ == '__main__':
    # Question 20: average test error rate of the pocket weights over
    # repeated runs; every Pocket() call trains on freshly shuffled data.
    n_runs = 2000
    ratio_sum = 0.0
    for _ in range(n_runs):
        ratio_sum += Pocket_Error(Pocket())
    print(ratio_sum / n_runs)
把Pocket中的count判断值改为100即可,结果为