# 代码：k近邻算法范例网站约对_k近邻算法实例网约对象-CSDN博客
#
# 本文链接：https://blog.csdn.net/weixin_46870882/article/details/105821124
import pandas as pd
import matplotlib.pyplot as plt
# Load the tab-separated dataset; the file has no header row and must sit
# in the current working directory.
datingTest = pd.read_csv('datingTestSet.txt', sep='\t', header=None)
# Preview of the first rows (only displayed in an interactive session).
datingTest.head()
# Give every sample a plotting color chosen by its class label, which is
# stored in the last column. Rows whose label is not one of the three known
# classes contribute no color entry.
_label_palette = {
    'didntLike': 'black',
    'smallDoses': 'orange',
    'largeDoses': 'red',
}
Colors = [
    _label_palette[tag]
    for tag in datingTest.iloc[:, -1]
    if tag in _label_palette
]


# Scatter plots for every pair of the three features, colored by class.
# Use the SimHei font so the Chinese axis labels render instead of
# showing up as empty boxes.
plt.rcParams['font.sans-serif'] = ['Simhei']
pl = plt.figure(figsize=(12, 8))  # overall figure size

# One entry per panel: (position in the 2x2 grid, x column, y column,
# x-axis label, y-axis label).
_panels = [
    (221, 1, 2, '玩游戏视频所占时间比', '每周消费冰淇淋公升数'),
    (222, 0, 1, '每年飞行常客里程', '玩游戏视频所占时间比'),
    (223, 0, 2, '每年飞行常客里程', '每周消费冰淇淋公升数'),
]
_axes = []
for _slot, _x_col, _y_col, _x_text, _y_text in _panels:
    # add_subplot makes the new axes current, so the pyplot calls below
    # draw into it.
    _axes.append(pl.add_subplot(_slot))
    plt.scatter(
        datingTest.iloc[:, _x_col],
        datingTest.iloc[:, _y_col],
        marker='.',
        c=Colors,
    )
    plt.xlabel(_x_text)
    plt.ylabel(_y_text)
fig1, fig2, fig3 = _axes

# Display the figure. Needed when run as a plain script; notebooks with an
# inline backend typically render the figure even without this call.
plt.show()


def minmax(dataSet):
    """Rescale the data to the [0, 1] range with min-max normalization.

    Each column of a DataFrame (or the single column of a Series) is mapped
    through ``(x - min) / (max - min)``.

    A constant column has ``max == min``; dividing by that zero range would
    fill the column with NaN, so such a column is mapped to 0 instead.

    Args:
        dataSet: numeric pandas DataFrame or Series.

    Returns:
        An object of the same shape as ``dataSet`` holding the scaled values.
    """
    minDf = dataSet.min()  # per-column minimum (a scalar for a Series)
    span = dataSet.max() - minDf  # per-column value range
    if isinstance(span, pd.Series):
        # Replace zero ranges by 1 so constant columns become 0, not NaN.
        span = span.where(span != 0, 1)
    elif span == 0:
        span = 1
    return (dataSet - minDf) / span

# Build the normalized dataset datingT: scale the three feature columns,
# keep the label column (column 3) untouched, then join them side by side.
datingT1 = minmax(datingTest.iloc[:, 0:3])
datingT2 = datingTest.iloc[:, 3]
datingT = pd.concat((datingT1, datingT2), axis='columns')
# Preview of the first rows (only displayed in an interactive session).
datingT.head()


def randSplit(dataSet, rate=0.7, shuffle=False, random_state=None):
    """Split a DataFrame into a training part and a test part.

    The first ``int(n * rate)`` rows form the training set and the remaining
    rows form the test set. By default the rows are taken in their existing
    order; pass ``shuffle=True`` to permute them first.

    Both returned frames are independent copies, so adding or changing
    columns on them later never writes through to ``dataSet``.

    Args:
        dataSet: pandas DataFrame to split.
        rate: fraction of the rows assigned to the training set.
        shuffle: when True, rows are randomly permuted before splitting.
        random_state: seed forwarded to ``DataFrame.sample`` for a
            reproducible shuffle; ignored when ``shuffle`` is False.

    Returns:
        ``(train, test)``. ``train`` keeps the row labels of ``dataSet``;
        ``test`` is re-indexed to 0..len(test)-1.
    """
    if shuffle:
        dataSet = dataSet.sample(frac=1, random_state=random_state)
    n = dataSet.shape[0]  # total number of rows
    m = int(n * rate)  # number of training rows
    train = dataSet.iloc[:m, :].copy()
    test = dataSet.iloc[m:, :].copy()
    test.index = range(test.shape[0])  # renumber test rows from 0
    return train, test

# Split the normalized dataset into the training set train1 and the test set test1
train1,test1 = randSplit(datingT)
"""
一步一步验证原理用的，可以忽略
n=train1.shape[1]-1#训练集数据的列数减一
m=test1.shape[0]#测试集数据行数
result=[]
for i in range(m):
        dist = list((((train1.iloc[:, :n] - test1.iloc[i, :n]) ** 2).sum(1))**0.5)
        dist_l = pd.DataFrame({'dist': dist, 'labels': (train1.iloc[:, n])})
        dr = dist_l.sort_values(by = 'dist')[: 5]
        re = dr.loc[:, 'labels'].value_counts()
        result.append(re.index[0])
result2 = pd.Series(result)
test1['predict'] = result2
"""
def datingClass(train, test, k):
    """Classify every row of ``test`` by a k-nearest-neighbours vote.

    Both frames must share the same column layout: the feature columns
    first, followed by the true label in the last column of ``train``
    (the same position is read from ``test`` to score the predictions).

    For each test row the Euclidean distance to every training row is
    computed, the ``k`` closest training rows are selected, and the most
    frequent label among them becomes the prediction.

    Side effects: a ``'predict'`` column is added to (or overwritten in)
    ``test`` in place, and the accuracy is printed.

    Args:
        train: DataFrame of labelled training samples.
        test: DataFrame of labelled samples to classify.
        k: number of neighbours taking part in the vote; must be >= 1.

    Returns:
        The same ``test`` object, now carrying the ``'predict'`` column.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f'k must be a positive integer, got {k!r}')
    n = train.shape[1] - 1  # number of feature columns
    train_features = train.iloc[:, :n]
    train_labels = train.iloc[:, n]
    # Slice the feature columns first so each row keeps a numeric dtype.
    test_features = test.iloc[:, :n]
    result = []
    for i in range(test.shape[0]):
        # Euclidean distance from test row i to every training row.
        diff = train_features - test_features.iloc[i]
        dist = (diff ** 2).sum(axis=1) ** 0.5
        dist_l = pd.DataFrame({'dist': list(dist), 'labels': train_labels})
        nearest = dist_l.sort_values(by='dist').iloc[:k]
        votes = nearest['labels'].value_counts()
        result.append(votes.index[0])  # most frequent label wins
    # Assign a plain list so values are placed by position; a Series would
    # be aligned on index and yield NaN unless test is indexed 0..m-1.
    test['predict'] = result
    # Compare against the label column by position n rather than by offset
    # from the end, which depends on where 'predict' was inserted.
    acc = (test['predict'] == test.iloc[:, n]).mean()
    print(f'模型预测准确率为{acc}')
    return test


# Classify the test set with k=5; the call prints the accuracy and returns the test set with a 'predict' column
datingClass(train1,test1,5)

"""总结此代码
1.调用了pandas，matplotlib.pyplot模块。
2.给出了调入数据集的办法
3.给出了画散点图的方法
4.给出了划分训练集、测试集的办法
5.给出了k近邻算法的调用
6.给出了预测正确率的一种办法
"""