KNN_Result (collect.py)
```python
# -*- coding: utf-8 -*-
import os
import gzip
import numpy as np
from mpi4py import MPI

dir_path = './MNIST'
data_path = "train-images-idx3-ubyte.gz"
label_path = "train-labels-idx1-ubyte.gz"

def load_data(datapath, label):
    # Read the labels, skipping the 8-byte IDX header
    with gzip.open(os.path.join(datapath, label), 'rb') as f_label:
        y = np.frombuffer(f_label.read(), np.uint8, offset=8)
    return y

def partition():
    par_path = "partition.txt"
    with open(par_path, "r", encoding="utf-8") as f:  # current partition offset
        partition = f.read()
    return int(partition)

def read_result():
    par_path = "result.txt"
    with open(par_path, "r", encoding="utf-8") as f:  # accumulated predictions, one per line
        results = f.readlines()
    results = [int(y_pred.replace("\n", "")) for y_pred in results if y_pred != "\n"]
    return results

comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()
name = MPI.Get_processor_name()

send_data = None
if rank == 0:
    labels = load_data(dir_path, label_path)
    partition = partition()
    print("------- scoring accuracy over samples", 40000, "-", partition + 100)
    labels = labels[40000:partition + 100]
    results = read_result()
    test_len = len(labels)
    per_len = test_len // size
    send_index = []
    for i in range(size - 1):
        send_index.append((i * per_len, (i + 1) * per_len))
    send_index.append(((size - 1) * per_len, test_len))  # last rank absorbs the remainder
    print("test data distribution:", send_index)
    send_data = []
    for index in send_index:
        send_data.append((labels[index[0]:index[1]], results[index[0]:index[1]]))

data = comm.scatter(send_data, root=0)
labels = data[0]
results = data[1]
print('rank %d, partial tally:' % rank, len(labels), len(results), " on", name)

error = 0
for y, y_pred in zip(labels, results):
    if y != y_pred:
        error += 1

errors = comm.gather(error, root=0)
if rank == 0:
    error = sum(errors)
    total = partition + 100 - 40000
    print("classification accuracy over", total, "images:", 1 - error / total)
MPI.Finalize()
```
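The index splitting above is the block-partition scheme both scripts use: equal-sized blocks, with the last rank absorbing the remainder. A minimal standalone sketch of the same logic (plain Python, no MPI; the sizes below are made-up values for illustration):

```python
# Block-partition sketch; test_len and size are hypothetical values.
def block_indices(test_len, size):
    per_len = test_len // size
    index = [(i * per_len, (i + 1) * per_len) for i in range(size - 1)]
    index.append(((size - 1) * per_len, test_len))  # last block takes the remainder
    return index

print(block_indices(10, 4))  # [(0, 2), (2, 4), (4, 6), (6, 10)]
```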
KNN (knn.py)
```python
# -*- coding: utf-8 -*-
import os
import gzip
import numpy as np
from mpi4py import MPI  # must be imported: MPI is used below

dir_path = './MNIST'
data_path = "train-images-idx3-ubyte.gz"
label_path = "train-labels-idx1-ubyte.gz"

def load_data(datapath, data, label):
    # datapath: directory; data: image file name; label: label file name
    # Read the labels, skipping the 8-byte IDX header
    with gzip.open(os.path.join(datapath, label), 'rb') as f_label:
        y = np.frombuffer(f_label.read(), np.uint8, offset=8)
    # Read the images (16-byte IDX header), one flattened 28*28 = 784 vector per sample
    with gzip.open(os.path.join(datapath, data), 'rb') as f_data:
        X = np.frombuffer(
            f_data.read(), np.uint8, offset=16).reshape(len(y), 28 * 28)
    return X, y

def normalization(data):
    # Binarize the images: any nonzero pixel becomes 1
    data[data > 0] = 1
    return data

def KNN(X, traindata, trainlabel, k):
    trainsize = traindata.shape[0]  # number of training samples
    # Squared Euclidean distance from X to every training sample
    distance = np.sum((np.tile(X, (trainsize, 1)) - traindata) ** 2, 1)
    distancesort = distance.argsort()[0:k]  # indices of the k nearest neighbors
    countdict = dict()
    for i in distancesort:
        Xlabel = trainlabel[i]  # map each neighbor index back to its class label
        countdict[Xlabel] = countdict.get(Xlabel, 0) + 1  # vote count per class
    # Sort the (label, count) pairs by count in descending order
    countlist = sorted(countdict.items(), key=lambda x: x[1], reverse=True)
    return countlist[0][0]  # the majority class among the k neighbors

def partition():
    par_path = "partition.txt"
    with open(par_path, "r", encoding="utf-8") as f:  # current partition offset
        partition = f.read()
    return int(partition)

def par_save(par_result):
    res_path = "result.txt"
    with open(res_path, "a", encoding="utf-8") as f:  # append this batch's predictions
        f.writelines([str(i) + '\n' for i in par_result])

comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()
name = MPI.Get_processor_name()

if rank == 0:
    data, labels = load_data(dir_path, data_path, label_path)
    # np.frombuffer yields a read-only array, so copy before binarizing in place
    data = normalization(data.copy())
    partition = partition()
    train_data = data[0:40000]
    train_labels = labels[0:40000]
    print("-------", partition, partition + 100)
    test_data = data[partition:partition + 100]
    test_labels = labels[partition:partition + 100]

data = comm.bcast([train_data, train_labels] if rank == 0 else None, root=0)
train_data = data[0]
train_labels = data[1]
print('rank %d, received training data:' % rank, len(train_data), len(train_labels), " on", name)

send_data = None
if rank == 0:
    test_len = len(test_labels)
    per_len = test_len // size
    send_index = []
    for i in range(size - 1):
        send_index.append((i * per_len, (i + 1) * per_len))
    send_index.append(((size - 1) * per_len, test_len))  # last rank absorbs the remainder
    print("test data distribution:", send_index)
    send_data = []
    for index in send_index:
        send_data.append(test_data[index[0]:index[1]])

test_data = comm.scatter(send_data, root=0)
print('rank %d, received test data:' % rank, len(test_data), " on", name)

pred_labels = []
for X in test_data:
    pred = KNN(X, train_data, train_labels, k=10)
    pred_labels.append(pred)
print('rank %d, classification results:' % rank, pred_labels, " on", name)

pred_labels = comm.gather(pred_labels, root=0)
if rank == 0:
    print("rank %d, gathered results from %d processes" % (rank, len(pred_labels)), " on", name)
    pred_labels = [x for j in pred_labels for x in j]
    error = 0
    for y, y_pred in zip(test_labels, pred_labels):
        if y != y_pred:
            error += 1
    print("classification accuracy:", 1 - error / len(pred_labels))
    par_save(pred_labels)
MPI.Finalize()
```
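The distance-and-vote logic in `KNN` can be checked serially, without MPI or MNIST. A minimal sketch of the same technique on invented toy data (note that NumPy broadcasting can replace `np.tile`):

```python
import numpy as np

def knn_predict(X, traindata, trainlabel, k):
    # Same idea as KNN above: squared Euclidean distances, then a majority vote.
    distance = np.sum((traindata - X) ** 2, axis=1)  # broadcasting instead of np.tile
    votes = {}
    for i in distance.argsort()[:k]:
        votes[trainlabel[i]] = votes.get(trainlabel[i], 0) + 1
    return max(votes.items(), key=lambda kv: kv[1])[0]

train = np.array([[0, 0], [0, 1], [5, 5], [6, 5]])
labels = np.array([0, 0, 1, 1])
print(knn_predict(np.array([1, 0]), train, labels, k=3))  # -> 0
```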
Main
```python
import os

def clean():
    par_path = "result.txt"
    with open(par_path, "w", encoding="utf-8") as f:  # truncate results from earlier runs
        f.write("")

def check():
    ping_path = "ping.txt"
    with open(ping_path, "r", encoding="utf-8") as f:  # candidate node list
        nodes = f.readlines()
    # Keep only the nodes that answer ping; build a new list rather than
    # removing items from the list being iterated
    alive = []
    for node in nodes:
        state = os.system("ping -c3 -W 1 " + node.strip())
        if state == 0:
            alive.append(node)
    print(alive)
    with open("confile.txt", "w", encoding="utf-8") as f:  # hostfile for mpiexec
        f.writelines(alive)

def partition(epoch):
    par_path = "partition.txt"
    with open(par_path, "w", encoding="utf-8") as f:
        f.write(str(40000 + 100 * epoch))

def run(n):  # n: number of samples to classify
    process = int(input("Number of processes: "))
    check()
    for epoch in range(n // 100):
        partition(epoch)
        mpi = 'mpiexec -n {} -f ./confile.txt python3 ./knn.py'.format(process)
        state = os.system(mpi)
        if state != 0:
            print("Run failed, rolling back")
            check()  # rebuild the hostfile from the nodes still alive
            while state != 0:
                state = os.system(mpi)
            print("Rollback complete")
    check()
    mpi = 'mpiexec -n {} -f ./confile.txt python3 ./collect.py'.format(process)
    state = os.system(mpi)
    if state != 0:
        check()  # rebuild the hostfile and retry once
        state = os.system(mpi)

clean()
run(800)
```
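`check()` assumes `ping.txt` lists one candidate host per line and writes the reachable ones to `confile.txt`, which `mpiexec -f` consumes as a hostfile. A hypothetical example of preparing that file (the hostnames are invented):

```python
# Hypothetical ping.txt: one hostname or IP address per line.
with open("ping.txt", "w", encoding="utf-8") as f:
    f.writelines(["node1\n", "node2\n", "192.168.1.12\n"])
```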