库的引入:
from sklearn import datasets
import matplotlib.pyplot as plt
import numpy as np
1:调用sklearn中的鸢尾花数据集以及样本点的绘制
# Load the iris data set and draw the samples in 2-D.
dt = datasets.load_iris()
x = dt.data[:, :2]  # keep only the first two features so the samples fit a 2-D plot
y = dt.target       # class label of each sample (0, 1 or 2)
# Use scatter, not plot: plot() would connect the points with line segments.
# Also draw all three iris classes (the original skipped class 2).
plt.scatter(x[y == 0, 0], x[y == 0, 1], color="blue")
plt.scatter(x[y == 1, 0], x[y == 1, 1], color="yellow")
plt.scatter(x[y == 2, 0], x[y == 2, 1], color="green")
plt.show()
2:划分数据集和测试集
def train_test_split(X, Y, test_ratio=0.2, seed=None):
    """Randomly split samples into a training and a test set.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features) — sample matrix.
    Y : ndarray, shape (n_samples,) — labels aligned with the rows of X.
    test_ratio : float — fraction of samples placed in the test set.
    seed : int or None — if given, seeds NumPy's RNG for reproducibility.

    Returns
    -------
    X_train, X_test, Y_train, Y_test
    """
    # `is not None` (not truthiness): seed=0 is a legitimate seed value.
    if seed is not None:
        np.random.seed(seed)
    # Random permutation of the row indices 0..len(X)-1.
    shuffle_indexes = np.random.permutation(len(X))
    test_size = int(test_ratio * len(X))
    test_indexes = shuffle_indexes[:test_size]    # first test_size shuffled rows -> test set
    train_indexes = shuffle_indexes[test_size:]   # remaining rows -> training set
    return X[train_indexes], X[test_indexes], Y[train_indexes], Y[test_indexes]
3:knn算法详解
def knn(k, X_train, Y_train, x, n):
    """Classify each row of x by majority vote of its k nearest training points.

    Parameters
    ----------
    k : int — number of nearest neighbours to vote.
    X_train : ndarray, shape (m, d) — training samples.
    Y_train : ndarray of int, shape (m,) — class index (0..n-1) per sample.
    x : ndarray, shape (q, d) — query points to classify.
    n : int — number of distinct classes.

    Returns
    -------
    list of int — predicted class index for each row of x.
    """
    predict = []
    for sample in x:
        # Euclidean distance from `sample` to every training point, vectorized
        # (same values as the original per-row loop, so argsort order is identical).
        distances = np.sqrt(np.sum((X_train - sample) ** 2, axis=1))
        nearest = np.argsort(distances)  # training indices sorted by increasing distance
        # Vote counts per class among the k closest neighbours.
        votes = [0] * n
        for idx in nearest[:k]:
            votes[Y_train[idx]] += 1
        # Pick the winning class. `best` (renamed from `max`, which shadowed the
        # builtin) starts at the last class and is replaced only on a strictly
        # larger count — preserving the original tie-breaking behaviour.
        best = n - 1
        for cls in range(n - 1):
            if votes[best] < votes[cls]:
                best = cls
        predict.append(best)
    return predict
4:超参数k的选取
#寻找最好的k
def select_k(X_train, Y_train, X_test, Y_test):
    """Grid-search the best hyper-parameters for sklearn's KNeighborsClassifier.

    Tries k in 1..10 and Minkowski exponent p in 1..9 with distance weighting
    (closer neighbours get larger weight; p=2 is the Euclidean distance).

    Returns
    -------
    (best_p, best_k, best_score) — the original returned best_k twice and
    never returned the score it had computed; that bug is fixed here.
    """
    best_score = 0.0
    best_k = -1
    best_p = -1
    for k in range(1, 11):
        for p in range(1, 10):
            clf = KNeighborsClassifier(n_neighbors=k, weights="distance", p=p)
            clf.fit(X_train, Y_train)
            score = clf.score(X_test, Y_test)
            if score > best_score:
                best_score, best_k, best_p = score, k, p
    return best_p, best_k, best_score