机器学习中的各种距离
机器学习中的各种距离:https://blog.csdn.net/qq_26091271/article/details/52528625
理论推导
knn基本实现(欧式距离+曼哈顿距离+夹角余弦+皮尔森相关系数+杰卡德相关系数)
1、python读取txt文件,将txt文件转换成矩阵形式(每一行为每一个样本,每一列为样本的每一个特征)
用一个矩阵存取样本和特征,用一个列表去存类别
即读取txt文件:
def loadData(filename):
    """Load a comma-separated data file into a feature matrix and a label list.

    Each line holds 3 numeric features followed by a class label.
    Returns (returnMat, classLabelVector): an (n, 3) float ndarray and the
    list of label strings, in file order.
    """
    # Context manager guarantees the handle is closed (the original leaked it).
    with open(filename) as fr:
        arrayOLines = fr.readlines()
    numberOfLines = len(arrayOLines)
    # Pre-size the feature matrix: one row per sample, 3 features per sample.
    returnMat = np.zeros((numberOfLines, 3))
    classLabelVector = []
    # Parse each line: first 3 fields are features, last field is the label.
    for index, line in enumerate(arrayOLines):
        listFromLine = line.strip().split(',')
        returnMat[index, :] = listFromLine[0:3]
        classLabelVector.append(listFromLine[-1])
    return returnMat, classLabelVector
↓下面我们的求距离全部用矩阵实现
图为预测值的对比
欧式距离(欧几里得距离)
def O_distance(X, Y):
    """Row-wise Euclidean distance between X and Y (NumPy broadcasting applies)."""
    diff = X - Y
    return np.sqrt((diff * diff).sum(axis=1))
曼哈顿距离
def manhattan_distance(X, Y):
    """Total Manhattan (L1) distance: sum of |X - Y| over every element.

    Note: no axis argument is given, so this collapses to a single scalar
    rather than one distance per row.
    """
    return np.abs(X - Y).sum()
knn算法
def knn(trainSet, label, testSet, k):
    """Classify testSet by majority vote among its k nearest training samples.

    trainSet: (n, d) feature matrix; label: length-n sequence of class labels;
    testSet: one sample, broadcastable against trainSet's rows; k: neighbour count.
    Returns the label that occurs most often among the k nearest neighbours.
    """
    # Bug fix: the original computed O_distance(trainSet, trainSet) and ignored
    # testSet entirely. Euclidean distance from each training row to the query:
    finalDistance = np.sum((trainSet - testSet) ** 2, axis=1) ** 0.5
    sortedIndex = finalDistance.argsort()  # indices sorted by ascending distance
    index = sortedIndex[:k]                # the k closest training samples
    labelCount = {}                        # label -> number of occurrences
    for i in index:
        tempLabel = label[i]
        labelCount[tempLabel] = labelCount.get(tempLabel, 0) + 1
    # Sort by vote count, descending (NOT by distance), and return the winner.
    sortedCount = sorted(labelCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedCount[0][0]
K值选择
一、近似误差与估计误差:
近似误差:对现有训练集的训练误差,关注训练集,如果近似误差过小可能会出现过拟合的现象,对现有的训练集能有很好的预测,但是对未知的测试样本将会出现较大偏差的预测。模型本身不是最接近最佳模型。
估计误差:可以理解为对测试集的测试误差,关注测试集,估计误差小说明对未知数据的预测能力好,模型本身最接近最佳模型。
二、K值确定标准:
K值过小:k值小,特征空间被划分为更多子空间(模型的项越多),整体模型变复杂,容易发生过拟合,k值越小,选择的范围就比较小,训练的时候命中率较高,近似误差小,而用test的时候就容易出错,估计误差大,容易过拟合。
K值=N:无论输入实例是什么,都将简单地预测它属于训练实例中最多的类。
将一个数据集随机分成训练集和测试集(数据预处理)
分割csv文件
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Read the raw housing data set (path is machine-specific).
housing = pd.read_csv('C:\\Users\\Thinkpad\\Desktop\\人工智能\\梯度下降\\301-科研(BK)\\002-AI-数据集-梯度下降\\housing.csv')
#housing_list = housing.values.tolist()
#housing_list = np.array(housing_list)
#print(housing_list)
#print(len(housing_list))
#housing.head()
#print(housing.head())
# Split the full data set into a training set and a test set (80/20, fixed seed).
train_set,test_set = train_test_split(housing,test_size=0.2,random_state=42)
# Convert to a 2-D list, then to a NumPy array; the last column is dropped
# (presumably the target column — TODO confirm against the csv schema).
train_set_list = train_set.values.tolist()
train_set_list = np.array(train_set_list)
train_set_list = train_set_list[:,:-1]
test_set_list = test_set.values.tolist()
test_set_list = np.array(test_set_list)
test_set_list = test_set_list[:,:-1]
print(test_set_list,train_set_list)
#print(train_set.head())
#print(len(housing),len(train_set))
# Save the training and test sets as files (currently disabled).
#np.savetxt('C:\\Users\\Thinkpad\\Desktop\\knn数据集\\train.txt',train_set_list)
分割txt文件
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
def loadData(filename):
    """Load a comma-separated data file into a feature matrix and a label list.

    Each line holds 3 numeric features followed by a class label.
    Returns (allMat, classLabelVector): an (n, 3) float ndarray and the
    list of label strings, in file order.
    """
    # Context manager guarantees the handle is closed (the original leaked it).
    with open(filename) as fr:
        arrayOLines = fr.readlines()
    numberOfLines = len(arrayOLines)
    # Pre-size the feature matrix: one row per sample, 3 features per sample.
    allMat = np.zeros((numberOfLines, 3))
    classLabelVector = []
    # Parse each line: first 3 fields are features, last field is the label.
    for index, line in enumerate(arrayOLines):
        listFromLine = line.strip().split(',')
        allMat[index, :] = listFromLine[0:3]
        classLabelVector.append(listFromLine[-1])
    return allMat, classLabelVector
all_set,all_label = loadData("C:\\Users\\Thinkpad\\Desktop\\人工智能\\knn算法\\003-AI-数据集-KNN\\003-AI-KNN-datasets-Iris.txt");# the full data set
#print(len(allSet))
# 80/20 split with a fixed seed so the partition is reproducible.
train_set,test_set,train_label,test_label = train_test_split(all_set,all_label,test_size=0.2,random_state=42)
print(len(train_set),len(test_set))
mnist数据
首先推荐几篇很好的博客:
https://blog.csdn.net/qq_20936739/article/details/82011320
https://blog.csdn.net/zugexiaodui/article/details/77130862
mnist数据库:手写数字图片的数据库
from PIL import Image
import struct
#将数据库里的图片写到文件中去
def read_image(filename):
    """Parse an MNIST idx3-ubyte image file and save each image as test/<i>.png.

    File layout: a 16-byte big-endian header (magic, image count, rows,
    columns) followed by one unsigned byte per pixel, row-major, image after
    image.
    """
    # Read the whole file into memory; the context manager closes the handle
    # (the original used open/read/close without try/finally).
    with open(filename, 'rb') as f:
        buf = f.read()
    # Bug fix: the original initialized `image_index` but then read from an
    # undefined `index`, raising NameError on the first unpack. One cursor
    # variable is used consistently here.
    index = 0
    magic, images, rows, columns = struct.unpack_from('>IIII', buf, index)
    index += struct.calcsize('>IIII')
    for i in range(images):
        image = Image.new('L', (columns, rows))
        for x in range(rows):
            for y in range(columns):
                # One grayscale byte per pixel; putpixel takes (col, row).
                image.putpixel((y, x), int(struct.unpack_from('>B', buf, index)[0]))
                index += struct.calcsize('>B')
        print('save ' + str(i) + 'image')
        image.save('test/' + str(i) + '.png')
#读取标签
def read_label(filename, saveFilename):
    """Extract MNIST labels from an idx1-ubyte file and write them to a text file.

    The output is a single comma-separated line of decimal labels followed by
    a newline.
    """
    with open(filename, 'rb') as src:
        raw = src.read()
    # Header: magic number and label count, both big-endian uint32.
    magic, count = struct.unpack_from('>II', raw, 0)
    offset = struct.calcsize('>II')
    # One unsigned byte per label, immediately after the header.
    values = [int(struct.unpack_from('>B', raw, offset + pos)[0]) for pos in range(count)]
    with open(saveFilename, 'w') as dst:
        dst.write(','.join(str(v) for v in values))
        dst.write('\n')
    print('save labels success')
if __name__ == '__main__':
    # Image extraction is disabled here; only the label file is extracted.
    #read_image('C:\\Users\\Thinkpad\\Desktop\\人工智能\\knn算法\\003-AI-数据集-KNN\\t10k-images-idx3-ubyte\\t10k-images.idx3-ubyte')
    read_label('C:\\Users\\Thinkpad\\Desktop\\人工智能\\knn算法\\003-AI-数据集-KNN\\t10k-labels-idx1-ubyte\\t10k-labels.idx1-ubyte', 'test_label/label.txt')
手写识别1-9
import cv2
import os
import numpy as np
from sklearn import neighbors
import struct
# Read the raw test image file (kept for reference; currently disabled).
# Bug fix: the closing delimiter was '''' (four quotes), which is a
# SyntaxError — the stray quote opened an unterminated string literal.
'''
def read_test_file():
    with open('C:\\Users\\Thinkpad\\Desktop\\人工智能\\knn算法\\003-AI-数据集-KNN\\t10k-images-idx3-ubyte\\t10k-images.idx3-ubyte') as f1:
        buf1 = f1.read()
        return buf1
'''
def get_train_images():
    """Build the 60000 x 784 training matrix from the saved digit images.

    Each 28x28 image is binarized (pixel >= 127 -> 1, else 0) and flattened
    into one row of 784 features.
    """
    samples = np.zeros([60000, 784], int)
    for idx in range(60000):
        # Load image idx as grayscale (flag 0); path is machine-specific.
        gray = cv2.imread("C:\\Users\\Thinkpad\\Desktop\\人工智能代码实现\\knn算法\\train_data\\" + str(idx) + ".jpg", 0)
        for r in range(28):
            for c in range(28):
                # Binarize at 127, the midpoint of the 0-255 range.
                gray[r, c] = 1 if gray[r, c] >= 127 else 0
                # Flatten the 28x28 grid into row idx of the big matrix.
                samples[idx, r * 28 + c] = gray[r, c]
    return samples
def get_train_labels():
    """Parse the training labels; their order matches the training images.

    NOTE(review): the path points at a .txt file but it is parsed as raw idx
    binary (big-endian header + one byte per label) — verify the file format.
    """
    with open("C:\\Users\\Thinkpad\\Desktop\\人工智能代码实现\\knn算法\\train_label\\train_label.txt", 'rb') as fh:
        raw = fh.read()
    # Header: magic number and label count, both big-endian uint32.
    magic, num = struct.unpack_from(">II", raw, 0)
    offset = struct.calcsize('>II')
    # Unpack all labels at once; returns a tuple of ints.
    return struct.unpack_from('>' + str(num) + 'B', raw, offset)
def get_test_images():
    """Build the 10000 x 784 test matrix; same preprocessing as the training images."""
    samples = np.zeros([10000, 784], int)
    for idx in range(10000):
        # Load image idx as grayscale (flag 0); path is machine-specific.
        gray = cv2.imread("C:\\Users\\Thinkpad\\Desktop\\人工智能代码实现\\knn算法\\test_data\\" + str(idx) + ".jpg", 0)
        for r in range(28):
            for c in range(28):
                # Binarize at 127, then flatten into row idx.
                gray[r, c] = 1 if gray[r, c] >= 127 else 0
                samples[idx, r * 28 + c] = gray[r, c]
    return samples
def get_test_labels():
    """Parse the test labels, mirroring get_train_labels().

    NOTE(review): the path points at a .txt file but it is parsed as raw idx
    binary — verify the file format.
    """
    with open("C:\\Users\\Thinkpad\\Desktop\\人工智能代码实现\\knn算法\\test_label\\test_label.txt", 'rb') as fh:
        raw = fh.read()
    # Header: magic number and label count, both big-endian uint32.
    magic, num = struct.unpack_from(">II", raw, 0)
    offset = struct.calcsize('>II')
    # Unpack all labels at once; returns a tuple of ints.
    return struct.unpack_from('>' + str(num) + 'B', raw, offset)
if __name__ == "__main__":  # Bug fix: was `_name_ == "_main_"`, which raises NameError.
    print("获得训练数据ing")
    train_images = get_train_images()
    print("获得训练标签ing")
    train_labels = get_train_labels()
    print("准备knning")
    # Bug fix: sklearn expects algorithm="kd_tree"; "kdtree" raises ValueError.
    knn = neighbors.KNeighborsClassifier(algorithm="kd_tree", n_neighbors=3)
    print("knn读入训练数据ing")
    knn.fit(train_images, train_labels)
    print("获得测试数据ing")
    test_images = get_test_images()
    print("获得真实测试标签ing")
    test_labels = get_test_labels()
    print("预测测试集ing")
    test_pre = knn.predict(test_images)
    print(test_pre)
    # Accuracy: fraction of predictions matching the ground-truth labels.
    wrong_num = np.sum(np.asarray(test_labels) != test_pre)
    num = len(test_images)
    right_rate = 1 - wrong_num / float(num)
    # Bug fix: right_rate was computed but never reported.
    print(right_rate)