knn算法,最优k值求解

需求:测试knn算法的精度,并选出精度最高的k值
文件如下:在test和train文件夹里分别有如下文件,文件名中“_”左侧的数字就是该txt文件里由字符1组成的数字(即样本标签);每个txt文件都是32行、32列,共计1024个字符
用train里面的文件进行训练,用test里面的文件进行测试
在这里插入图片描述

在这里插入图片描述
读取文件中的字符,并保存至excel

import os
import numpy as np
import pandas as pd
import time


def transform_data(file_path):
    """Flatten every digit text file in a folder and save them to Excel.

    Each file in ``file_path`` is read as text, its rows are joined into one
    flat list of single characters, and the label (the part of the file name
    before the first ``_``) is appended as the last element.  One row per
    file is written to ``<folder name>.xlsx``.

    Args:
        file_path: Directory containing the digit text files.  Either ``\\``
            or ``/`` may be used as separator, with or without a trailing
            separator.
    """
    # 1. List every file of the data set.
    train_file_name = os.listdir(file_path)
    print(train_file_name)

    file_arr_list = []
    for file_name in train_file_name:
        # 2. Build the full path of the current file.
        file_full_path = os.path.join(file_path, file_name)
        print(file_full_path)

        # 3. Read the file, one text row per line.  dtype=str is required:
        #    without it pandas may parse all-digit rows as integers, which
        #    drops leading zeros and makes the join below fail.
        train_data = pd.read_csv(
            file_full_path,
            header=None,
            engine='python',
            encoding='gbk',
            dtype=str,
        ).values
        print(train_data)

        # 4. Flatten to one list of single characters ('0' / '1').
        train_arr = train_data.ravel()
        train_list = list(''.join(train_arr))

        # 5. The label is the file-name prefix before the first underscore.
        label = file_name.split('_')[0]
        train_list.append(label)

        # 6. Collect the sample (values are kept as text characters).
        file_arr_list.append(train_list)

    # 7. Name the workbook after the last path component.  Both separator
    #    styles are accepted and a trailing separator is ignored, so the
    #    result never degenerates to '' or to the whole path.
    name = file_path.replace('\\', '/').rstrip('/').rsplit('/', 1)[-1]
    train_df = pd.DataFrame(data=file_arr_list)

    train_df.to_excel(f'{name}.xlsx')


if __name__ == '__main__':
    # Export the training folder first, then the test folder; each call
    # writes one workbook named after the folder.
    for digits_dir in (
        r'E:\KNN_手写数字识别\digits\trainingDigits',
        r'E:\KNN_手写数字识别\digits\testDigits',
    ):
        transform_data(digits_dir)

手写算法方法

import pandas as pd
import numpy as np

# 1. Load the training and test sets.
# index_col=0 restores the row index that to_excel saved as the first
# column; without it that column is read back as an extra 'Unnamed: 0'
# feature, shifting every positional slice (iloc[:, :1024], iloc[i, :-1])
# and leaking the row number into the distance computation.
train = pd.read_excel('trainingDigits.xlsx', index_col=0)
test = pd.read_excel('testDigits.xlsx', index_col=0)


# 2. knn算法:

def Knn(k):
    """Classify every test sample with a hand-written k-NN and return accuracy.

    Reads the module-level frames ``train`` and ``test``.  The pixel
    features are the columns labelled 0..1023 and the class label is the
    column labelled 1024.  Columns are selected by label, so an extra
    leading column (for example the saved row index read back from Excel)
    does not leak into the distance.  ``train`` is not modified.

    Args:
        k: Number of nearest neighbours that vote; must be >= 1.

    Returns:
        Fraction of test samples whose predicted label equals the real label.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f'k must be a positive integer, got {k!r}')

    # Select by column label rather than by position.
    feature_columns = list(range(1024))
    train_features = train.loc[:, feature_columns].to_numpy(dtype=float)
    train_labels = train.loc[:, 1024].to_numpy()
    test_features = test.loc[:, feature_columns].to_numpy(dtype=float)
    real_labels = test.loc[:, 1024].to_numpy()

    test_predict_list = []
    for sample, real_label in zip(test_features, real_labels):
        # Euclidean distance from this sample to every training sample.
        distance = np.sqrt(((train_features - sample) ** 2).sum(axis=1))

        # Indices of the k closest training samples.  A stable sort keeps
        # ties in training order, so the result is deterministic.
        nearest = np.argsort(distance, kind='stable')[:k]

        # Majority vote; mode() returns the smallest label on a tie.
        predict_label = pd.Series(train_labels[nearest]).mode().values[0]
        print('预测结果:\n', predict_label)
        print('真实结果:\n', real_label)

        test_predict_list.append(predict_label)

    # Compare as plain arrays so the test frame's index cannot cause a
    # misaligned or failing comparison.
    precision = (np.asarray(test_predict_list) == real_labels).mean()
    print(precision)
    return precision


# Accuracy obtained for each candidate k, in evaluation order.
precision_list = []
if __name__ == '__main__':
    # Candidate k values: 2, 12, 22, ..., 92.
    precision_list.extend(Knn(k) for k in range(2, 100, 10))

print(precision_list)

knn模块算法

import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# 1. Load the training and test sets.  index_col=0 must be used for BOTH
#    files: it restores the saved row index instead of reading it back as
#    an extra feature column.  Without it on the test set, the positional
#    slices below would take the row index as a pixel and the last pixel
#    as the label.
train = pd.read_excel('trainingDigits.xlsx', index_col=0)
test = pd.read_excel('testDigits.xlsx', index_col=0)

# ===== Manual train/test split: fit on train, evaluate on test =====
# 2. Instantiate the classifier; n_neighbors is k.
knn = KNeighborsClassifier(n_neighbors=10)

# 3. Fit the model on the training features (x) and labels (y).
x_train = train.iloc[:, :1024]
y_train = train.loc[:, 1024]
knn.fit(x_train, y_train)


# 4. Predict the test set.  The label is taken by column name, the same
#    way as for the training set.
x_test = test.iloc[:, :1024]
y_test = test.loc[:, 1024]
print(x_test.shape)

y_predict = knn.predict(x_test)
print('真实结果:\n', y_test)

# 5. Accuracy on the test set.
print(knn.score(x_test, y_test))

交叉验证方法

# ===== Cross-validation: no manual train/test split is needed =====
# 1. Stack all samples (training rows first, then test rows).
concat_data = pd.concat([train, test])
print(concat_data.shape)

# 2. Model under evaluation.
knn = KNeighborsClassifier(n_neighbors=5)

# 3. Split the stacked frame into features and labels.
x = concat_data.iloc[:, :1024]
y = concat_data[1024]

# 4. 10-fold cross-validation; one accuracy per fold, then their mean.
score = cross_val_score(estimator=knn, X=x, y=y, cv=10)
print(score)
print(score.mean())

网格搜索方法

#===========================选择参数:网格搜索=========================

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# (1) Base estimator; n_neighbors is supplied by the grid search.
knn = KNeighborsClassifier()

# (2) Candidate values for k (2 through 8) and the 10-fold grid search.
params = {'n_neighbors': list(range(2, 9))}
gridcv = GridSearchCV(estimator=knn, param_grid=params, cv=10)

# (3) Evaluate on all samples.
# a. Stack the training and test rows.
concat_data = pd.concat([train, test])

# b. Split into features and labels.
x = concat_data.iloc[:, :1024]
y = concat_data[1024]

gridcv.fit(x, y)

print('返回精度最高对应的参数:\n', gridcv.best_params_)
print('返回模型的最高精度值:\n', gridcv.best_score_)
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值