2021-07-15

KNN_Result (collect.py): aggregates per-rank predictions and reports overall accuracy.

# -*- coding: utf-8 -*-
import os
import gzip
import numpy as np
from mpi4py import MPI

dir_path = './MNIST'
data_path = "train-images-idx3-ubyte.gz"
label_path = "train-labels-idx1-ubyte.gz"
def load_data(datapath, label):
    """Return the MNIST label vector stored in a gzipped IDX file.

    datapath: directory holding the archive; label: the archive's filename.
    The 8-byte IDX header is skipped; labels come back as a 1-D uint8 array.
    """
    full_path = os.path.join(datapath, label)
    with gzip.open(full_path, 'rb') as archive:
        raw = archive.read()
    return np.frombuffer(raw, np.uint8, offset=8)

def partition():
    """Read partition.txt and return its contents as an int.

    The file holds the start index of the current 100-sample test slice,
    as written by the driver script.
    """
    with open("partition.txt", "r", encoding="utf-8") as fh:
        text = fh.read()
    return int(text)

def read_result():
    """Parse result.txt into a list of predicted labels (ints).

    One prediction per line; bare newline-only lines are skipped.
    """
    with open("result.txt", "r", encoding="utf-8") as fh:
        raw_lines = fh.readlines()
    predictions = []
    for line in raw_lines:
        if line == "\n":
            continue
        predictions.append(int(line.replace("\n", "")))
    return predictions


# --- MPI accuracy-collection driver (executes on every rank) ---
comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = MPI.COMM_WORLD.Get_rank()
name = MPI.Get_processor_name()

# Rank 0 loads the ground-truth labels and the predictions written by the
# classifier runs, then slices out the range scored so far (40000 .. partition+99).
if rank == 0:
    labels = load_data(dir_path, label_path)
    partition = partition()  # NOTE: rebinds the name, shadowing the function
    print("-------统计准确率 ",40000,"-",partition+100)
    labels = labels[40000:partition+100:1]
    results = read_result()

    # Build (start, end) index pairs, one per rank; the last rank absorbs
    # any remainder left by the integer division.
    test_len = len(labels)
    per_len = int(test_len/size)
    send_index=[]
    for i in range(size-1):
        send_index.append((i*per_len,(i+1)*per_len))
    send_index.append(((size - 1) * per_len, test_len))
    print("测试数据分配情况:",send_index)
    send_data = []
    for index in send_index:
        send_data.append((labels[index[0]:index[1]:1],results[index[0]:index[1]:1]))

# Each rank receives its (labels, predictions) slice.
# NOTE(review): `send_data` is only bound on rank 0; the conditional
# expression is evaluated lazily, so other ranks pass None as required.
data = comm.scatter(send_data if rank == 0 else None, root=0)
labels =data[0]
results =data[1]
print('进程%d, 统计分结果:' % (rank),len(labels),len(results),"  来自",name)

# Count local misclassifications.
error = 0
for y, y_pred in zip(labels, results):
    if y != y_pred:
        error += 1

# Reduce the per-rank error counts to rank 0 and report accuracy.
errors = comm.gather(error,root=0)
if rank ==0:
    error = sum(errors)
    total = partition+100 - 40000  # number of samples scored so far
    print(total,"张图片的分类准确率为:", 1 - error / total)

MPI.Finalize()

KNN (knn.py): distributed k-nearest-neighbour classification over MPI.

# -*- coding: utf-8 -*-
import os
import gzip
import numpy as np
from mpi4py import MPI  # re-enabled: this script uses MPI below (COMM_WORLD, Finalize)

dir_path = './MNIST'
data_path = "train-images-idx3-ubyte.gz"
label_path = "train-labels-idx1-ubyte.gz"
def load_data(datapath, data, label):
    """Load MNIST images and labels from gzipped IDX files.

    datapath: directory with both archives; data: image file name;
    label: label file name. Returns (X, y) where X is an (n, 784) uint8
    matrix (one flattened 28x28 image per row) and y is a length-n uint8
    label vector. IDX headers (16 / 8 bytes) are skipped.
    """
    with gzip.open(os.path.join(datapath, label), 'rb') as lf:
        y = np.frombuffer(lf.read(), np.uint8, offset=8)
    with gzip.open(os.path.join(datapath, data), 'rb') as df:
        flat = np.frombuffer(df.read(), np.uint8, offset=16)
    X = flat.reshape(len(y), 28 * 28)  # one 784-pixel row per image
    return X, y
# NOTE(review): leftover debug code — this loads the full dataset at import
# time on EVERY MPI rank (rank 0 loads it again below), and the chained
# assignment makes x,y and data,labels alias the same arrays. Consider removing.
x,y=data, labels = load_data(dir_path, data_path, label_path)
print(len(x),len(y))

def normalization(data):
    """Binarize *data* in place: every strictly positive entry becomes 1.

    Returns the same (mutated) array object.
    """
    positive = data > 0
    data[positive] = 1
    return data

def KNN(X, traindata, trainlabel, k):
    """Classify sample X by a majority vote of its k nearest training samples.

    X: one flattened sample; traindata: (n, d) training matrix;
    trainlabel: length-n labels; k: number of neighbours consulted.
    Returns the winning label; a tied vote goes to the label seen first
    among the nearest neighbours (matching a stable descending sort).
    """
    # Squared Euclidean distance from X to every training row. Broadcasting
    # replaces the original np.tile copy but keeps the same dtype arithmetic
    # (for uint8 inputs the mod-256 squares are identical either way).
    diff = traindata - X
    distance = np.sum(diff ** 2, 1)
    nearest = distance.argsort()[:k]  # indices of the k closest rows
    tally = dict()
    for idx in nearest:
        vote = trainlabel[idx]
        tally[vote] = tally.get(vote, 0) + 1
    # max() yields the first key in insertion order among equal counts —
    # exactly the head of a stable reverse-sorted item list.
    winner = max(tally.items(), key=lambda item: item[1])
    return winner[0]

def partition():
    """Return the integer offset stored in partition.txt — the start index
    of the current 100-sample test window."""
    with open("partition.txt", "r", encoding="utf-8") as fh:
        return int(fh.read())

def par_save(par_result):
    """Append one prediction per line to result.txt.

    par_result: iterable of labels; each is written as str(label) + newline.
    The file is opened in append mode so successive epochs accumulate.
    """
    formatted = [str(label) + '\n' for label in par_result]
    with open("result.txt", "a", encoding="utf-8") as fh:
        fh.writelines(formatted)


# --- MPI classification driver (executes on every rank; needs mpi4py's MPI
# in scope at module top) ---
comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = MPI.COMM_WORLD.Get_rank()
name = MPI.Get_processor_name()

# Rank 0 loads and binarizes MNIST, then splits it: samples 0-39999 train,
# and the 100 samples starting at `partition` form this epoch's test set.
if rank == 0:
    data, labels = load_data(dir_path, data_path, label_path)
    data.flags.writeable = True  # frombuffer arrays are read-only by default
    data = normalization(data)

    partition = partition()  # NOTE: rebinds the name, shadowing the function

    train_data = data[0:40000:1]
    train_labels = labels[0:40000:1]
    print("-------",partition,partition+100)
    test_data = data[partition:partition+100:1]
    test_labels = labels[partition:partition+100:1]

# Broadcast the full training set to every rank.
# NOTE(review): `train_data` is only bound on rank 0; the conditional is
# evaluated lazily, so other ranks pass None as bcast requires.
data = comm.bcast([train_data,train_labels] if rank == 0 else None, root=0)
train_data =data[0]
train_labels=data[1]
print('进程%d, 获得样本数据:' % (rank),len(train_data),len(train_labels),"  来自",name)

# Rank 0 carves the 100 test samples into `size` contiguous chunks;
# the last chunk absorbs any remainder from the integer division.
if rank ==0:
    test_len = len(test_labels)
    per_len = int(test_len/size)
    send_index=[]
    for i in range(size-1):
        send_index.append((i*per_len,(i+1)*per_len))
    send_index.append(((size - 1) * per_len, test_len))
    print("测试数据分配情况:",send_index)
    send_data = []
    for index in send_index:
        send_data.append(test_data[index[0]:index[1]:1])


# Each rank receives its own slice of the test data.
data = comm.scatter(send_data if rank == 0 else None, root=0)
test_data = data
print('进程%d, 获得测试数据:' % (rank),len(test_data),"  来自",name)

# Classify the local slice with k=10 nearest neighbours.
pred_labels=[]
for X in test_data:
    pred = KNN(X,train_data,train_labels,k=10)
    pred_labels.append(pred)
print('进程%d, 分类结果:' % (rank),pred_labels,"  来自",name)


# Gather per-rank prediction lists on rank 0, flatten them (gather preserves
# rank order, matching the scatter order), score, and append to result.txt.
pred_labels = comm.gather(pred_labels,root=0)
if rank ==0:
    print("进程%d, 汇总了%d个进程分类结果 " % (rank,len(pred_labels)),"  来自",name)
    pred_labels = [x for j in pred_labels for x in j]
    error = 0
    for y, y_pred in zip(test_labels, pred_labels):
        if y!=y_pred:
            error+=1
    print("分类准确率为:",1-error/len(pred_labels))
    par_save(pred_labels)


MPI.Finalize()


Main (main.py): driver that orchestrates the epochs and collection.
```python
import os
import time
import subprocess




def clean():
    """Truncate result.txt so a new run starts without stale predictions."""
    with open("result.txt", "w", encoding="utf-8") as fh:  # "w" truncates
        fh.write("")


def check():
    """Ping every host listed in ping.txt and write the reachable ones to confile.txt.

    Returns the list of reachable node lines (trailing newlines preserved, so
    the same lines can be written back verbatim).

    Fixes two bugs in the original: hostnames were passed to ping with their
    trailing newline still attached, and unreachable nodes were removed from
    the list while iterating over it, which skips the entry that follows.
    """
    with open("ping.txt", "r", encoding="utf-8") as f:
        nodes = f.readlines()
    alive = []
    for node in nodes:
        host = node.strip()
        if not host:
            continue  # ignore blank lines
        # -c3: three probes; -W 1: one-second reply timeout (Linux ping).
        # SECURITY NOTE: host comes from a local file and is interpolated
        # into a shell command — keep ping.txt trusted.
        state = os.system("ping -c3 -W 1 " + host)
        if state == 0:
            alive.append(node)
    print(alive)
    with open("confile.txt", "w", encoding="utf-8") as f:
        f.writelines(alive)
    return alive


def partition(epoch):
    """Record where this epoch's test window starts.

    Writes 40000 + 100*epoch to partition.txt: the training set occupies
    indices 0-39999 and each epoch classifies the next 100 samples.
    """
    start_index = 40000 + 100 * epoch
    with open("partition.txt", "w", encoding="utf-8") as fh:
        fh.write(str(start_index))



def run(n):
    """Drive the distributed classification of n samples, 100 per epoch.

    n: number of samples to classify; each mpiexec invocation handles 100.
    Prompts for the MPI process count, verifies cluster nodes, runs knn.py
    once per 100-sample window (retrying a failed epoch until it succeeds),
    and finally runs collect.py to aggregate the accuracy.
    """
    process = int(input("请输入进程数:"))
    check()
    # Hoisted out of the loop: the command is identical every epoch.
    mpi = 'mpiexec -n {} -f ./confile.txt python3 ./knn.py'.format(process)
    for epoch in range(n // 100):
        partition(epoch)
        state = os.system(mpi)
        check()  # refresh the reachable-node list between attempts
        while state != 0:
            print("出错了,回滚中")
            state = os.system(mpi)
            check()
            if state == 0:
                # Original printed this after every retry, even failed ones;
                # only announce completion once the retry actually succeeded.
                print("回滚完成")

    mpi = 'mpiexec -n {} -f ./confile.txt python3 ./collect.py'.format(process)
    state = os.system(mpi)
    if state != 0:
        # Original did `nodes = check(); node(nodes)` — `node` is an undefined
        # name (NameError) and check() returned nothing useful. Refreshing the
        # node list and retrying once is what was intended.
        check()
        state = os.system(mpi)

# Entry point: wipe stale predictions, then classify 800 samples (8 epochs of 100).
clean()
run(800)




  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值