"""k-nearest-neighbours digit classifier for a train.csv / test.csv pair.

Reads the two CSV files from the working directory, scales the pixel
columns by 1/255, classifies every test row with a majority vote over its
nearest training rows and writes one predicted label per line to
answer.csv.
"""
import csv
import operator
# NOTE(review): wildcard import kept only because the original file had it;
# the code below uses csv.writer through the module name.
from csv import *
from os import listdir

import numpy as np
import pandas as pd
from pandas import DataFrame, Series

try:
    import matplotlib.pyplot as plt
except ImportError:
    # Nothing in this script plots, so a missing matplotlib must not stop it.
    plt = None


def classify0(inX, dataSet, labels, k):
    """Return the majority label among the k training rows nearest to inX.

    Args:
        inX: 1-D feature vector of the sample to classify.
        dataSet: 2-D array of training samples, one row per sample.
        labels: Sequence of labels, one per row of dataSet.
        k: Number of nearest neighbours that vote. Values larger than the
            number of training rows are clamped to that number.

    Returns:
        The label with the most votes. Ties go to the label whose first
        voter is closest to inX.

    Raises:
        ValueError: If dataSet has no rows or k is smaller than 1.
    """
    dataSet = np.asarray(dataSet)
    sample_count = dataSet.shape[0]
    if sample_count == 0:
        raise ValueError("dataSet must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")
    k = min(k, sample_count)

    # Broadcasting subtracts inX from every row without materialising a
    # tiled copy of the query vector.
    diff = dataSet - np.asarray(inX)
    distances = (diff ** 2).sum(axis=1) ** 0.5

    # argsort yields row indices ordered from smallest to largest distance.
    nearest = distances.argsort()[:k]

    votes = {}
    for idx in nearest:
        label = labels[idx]
        votes[label] = votes.get(label, 0) + 1

    # max() keeps the first maximal entry, and dicts iterate in insertion
    # order, so ties resolve towards the nearer neighbour.
    return max(votes.items(), key=operator.itemgetter(1))[0]


def saveResult(result, path='answer.csv'):
    """Write each prediction in result on its own CSV row.

    Args:
        result: Iterable of predicted labels.
        path: Output file; defaults to 'answer.csv' in the working directory.
    """
    with open(path, 'w', newline='') as myFile:
        csv.writer(myFile).writerows([label] for label in result)


def handwritingClassTest(testSet, trainSet, trainLabel, k=5):
    """Classify every row of testSet and save the predictions.

    Args:
        testSet: 2-D array of samples to classify, one row per sample.
        trainSet: 2-D array of training samples.
        trainLabel: Labels matching the rows of trainSet.
        k: Number of neighbours used for each vote (default 5).

    Returns:
        The list of predicted labels, in test-row order. The same list is
        written to answer.csv via saveResult.
    """
    resultList = []
    for sample in testSet:
        classifierResult = classify0(sample, trainSet, trainLabel, k)
        print(classifierResult)
        resultList.append(classifierResult)
    saveResult(resultList)
    return resultList


def main():
    """Load the CSV data, normalise it and run the classifier."""
    # Per the original author's notes, train.csv loads as 42000 rows by 785
    # columns: the label first, then the pixel values.
    img_train = np.array(pd.read_csv('train.csv'))
    img_test = np.array(pd.read_csv('test.csv'))

    # Integer labels keep answer.csv free of a trailing ".0" on every line.
    trainLabel = img_train[:, 0].astype(int)

    # Normalisation by 255 (presumably 8-bit pixel intensities - confirm
    # against the data set description).
    trainSet = img_train[:, 1:].astype(float) / 255
    testSet = img_test.astype(float) / 255

    # Training-subset sizes the author intended to try: 1000, 5000, 10000,
    # 20000. The run below uses only the first 10 training rows.
    handwritingClassTest(testSet, trainSet[0:10], trainLabel[0:10])


if __name__ == "__main__":
    main()
kaggle-图像识别-getstart-knn
最新推荐文章于 2023-12-19 16:48:37 发布
这篇博客介绍了如何使用KNN算法进行Kaggle上的图像识别任务。首先,博主导入了必要的库,然后加载并处理了训练集和测试集数据,将像素值归一化。接着,定义了一个classify0函数来执行KNN分类,并实现了保存预测结果到CSV文件的功能。最后,博主通过调用handwritingClassTest函数对一小部分数据进行了测试。
摘要由CSDN通过智能技术生成