刚开始学习机器学习,在学习knn时接触到了iris,突发奇想,为什么不能通过结果总结出特征来判断种类呢,就像现实中认识一朵花,你得先了解它有哪些特征,再根据特征判断花的种类。不知道这种想法是否已经被他人考虑过,如有雷同请原谅。
为了更好画图,这里只截取了iris的前两个特征。将特征转换成向量形式,画图如下。
很明显,红色和黄色重合在一起,很难区分,但看到黄色长度普遍大于红色长度,想着标准化试试,结果效果不错,如图
上图对于三种类型有了较好的区分。这里使用三者的平均值代表三种类型,如图
对于测试集,通过判断两向量的夹角,即余弦相似度的方法,找夹角最小的作为该测试集合的类别。30个测试数据,正确率在65%-90%,往往是70%-80%,和knn等的准确率差不多。若加入更多特征,准确率想必能更高吧。下图正确25个,错误5个
代码如下:
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 4 09:00:26 2019
@author: ASUS
"""
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
# Load iris, hold out 20% of the rows for testing, and keep only the first
# two features (sepal length / width) so every sample is a drawable 2-D vector.
iris = datasets.load_iris()
x_train, x_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2)
x_train = x_train[:, :2]
x_test = x_test[:, :2]

# Per-class buckets for the training vectors (classes 0, 1, 2).
x0_train, x1_train, x2_train = [], [], []
y0_train, y1_train, y2_train = [], [], []
# Walk samples and labels in lockstep and drop each vector into its
# class bucket (the appended items are views into x_train's rows).
for sample, label in zip(x_train, y_train):
    if label == 0:
        x0_train.append(sample)
    elif label == 1:
        x1_train.append(sample)
    else:
        x2_train.append(sample)
#print(x0_train)
def plant(x_train, color):
    """Draw each 2-D vector in *x_train* as a segment from the origin.

    x_train: iterable of (x, y) pairs; color: matplotlib format string
    (e.g. 'b', 'r', 'y').
    """
    # Iterate the rows directly instead of indexing with range(len(...)).
    for vx, vy in x_train:
        plt.plot([0, vx], [0, vy], color)


plant(x0_train, 'b')
plant(x2_train, 'y')
plant(x1_train, 'r')
plt.show()
# 标准化: min-max scale both features to [-1, 1]. The extrema come from the
# TRAINING set only and are reused for the test set (no test-data leakage).
x0_min = x_train[:, 0].min()
x0_max = x_train[:, 0].max()
x1_min = x_train[:, 1].min()
x1_max = x_train[:, 1].max()
mins = np.array([x0_min, x1_min])
spans = np.array([x0_max - x0_min, x1_max - x1_min])
# Vectorized replacement for the original per-element loops. Slice-assign
# (`[:]`) updates the arrays IN PLACE so the row views already stored in the
# x*_train buckets see the normalized values, exactly as before.
# NOTE(review): spans of 0 (constant feature) would divide by zero — the
# original had the same limitation.
x_train[:] = ((x_train - mins) / spans - 0.5) * 2
x_test[:] = ((x_test - mins) / spans - 0.5) * 2
# Re-bucket the (now normalized) training vectors per class.
# BUG FIX: the original appended to the still-populated buckets, so every
# vector ended up in its list twice; start from empty lists instead.
x0_train, x1_train, x2_train = [], [], []
for sample, label in zip(x_train, y_train):
    if label == 0:
        x0_train.append(sample)
    elif label == 1:
        x1_train.append(sample)
    else:
        x2_train.append(sample)
#print(x0_train)


def plant(x_train, color):
    """Draw each normalized 2-D vector as a segment from the origin."""
    for vx, vy in x_train:
        plt.plot([0, vx], [0, vy], color)


plant(x0_train, 'b')
plant(x2_train, 'y')
plant(x1_train, 'r')
plt.show()
def _centroid(vectors):
    """Return the component-wise mean (x, y) of a non-empty list of 2-D vectors."""
    n = len(vectors)
    return (sum(v[0] for v in vectors) / n,
            sum(v[1] for v in vectors) / n)


# 用平均值代替: represent each class by the mean of its normalized vectors
# (replaces three copy-pasted accumulation loops with one helper).
x0_avx, x0_avy = _centroid(x0_train)
x1_avx, x1_avy = _centroid(x1_train)
x2_avx, x2_avy = _centroid(x2_train)

# Draw the three class-mean vectors.
plt.plot([0, x0_avx], [0, x0_avy], 'b')
plt.plot([0, x1_avx], [0, x1_avy], 'r')
plt.plot([0, x2_avx], [0, x2_avy], 'y')
plt.show()
def getXS(x1, x2, x, y):
    """Cosine similarity between the 2-D vectors (x1, x2) and (x, y).

    Returns a value in [-1, 1]; raises ZeroDivisionError when either
    vector is the zero vector.
    """
    dot = x1 * x + x2 * y
    norm_a = (x1 * x1 + x2 * x2) ** 0.5
    norm_b = (x * x + y * y) ** 0.5
    return dot / (norm_a * norm_b)
# Classify each test vector as the class whose mean vector has the highest
# cosine similarity, then report hit/miss counts and accuracy.
centroids = [(x0_avx, x0_avy), (x1_avx, x1_avy), (x2_avx, x2_avy)]
cntRight = 0
cntWrong = 0
for i in range(x_test.shape[0]):
    vx = x_test[i][0]
    vy = x_test[i][1]
    #print(vx,vy)
    sims = [getXS(vx, vy, cx, cy) for cx, cy in centroids]
    # BUG FIX: the original chain of strict '>' comparisons skipped samples
    # whose top similarity was tied, counting them as neither right nor
    # wrong. argmax (ties -> lowest class index) puts every sample in
    # exactly one bucket.
    predicted = sims.index(max(sims))
    if predicted == y_test[i]:
        cntRight += 1
    else:
        cntWrong += 1
print(cntRight, cntWrong)
# BUG FIX: accuracy denominator was hard-coded to 30; use the real test size.
print(cntRight / x_test.shape[0])