需求:测试 KNN 算法的精度,并选出精度最高的 k 值。
文件如下:在 test 和 train 文件夹里分别有若干 txt 文件,文件名中下划线"_"左侧的数字就是该文件里由字符 1 组成的数字(即样本标签),每个 txt 文件都是 32 行、32 列,共计 1024 个字符。
用 train 里面的文件进行训练,用 test 里面的文件进行测试。


读取文件中的字符,并保存至excel
import os
import numpy as np
import pandas as pd
import time
def transform_data(file_path):
    """Flatten every digit text file in a folder into one row of an Excel sheet.

    Each file found in ``file_path`` is read line by line, the lines are
    concatenated in order into one sequence of single-character pixels, and
    the label (the part of the file name before the first underscore) is
    appended as the last element of the row.  All rows are written to
    ``<folder name>.xlsx`` in the current working directory.

    Args:
        file_path: Directory containing the digit text files.

    Raises:
        ValueError: If a file contains a character that is not a digit.
    """
    # 1. Collect the names of all files in the folder.
    train_file_name = os.listdir(file_path)
    print(train_file_name)
    file_arr_list = []
    for file_name in train_file_name:
        # 2. Build the full path of the current file.
        file_full_path = os.path.join(file_path, file_name)
        print(file_full_path)
        # 3. Read the file, one text row per line.  dtype=str stops pandas
        #    from interpreting an all-digit row as a number, which would drop
        #    leading zeros and make the join below fail.
        train_data = pd.read_csv(
            file_full_path,
            header=None,
            engine='python',
            encoding='gbk',
            dtype=str,
        ).values
        print(train_data)  # author's note: shape (32, 1)
        # 4. Flatten the rows and split them into individual integer pixels.
        train_arr = train_data.ravel()
        train_list = [int(ch) for ch in ''.join(train_arr)]
        # 5. The label is the file-name prefix before the first underscore;
        #    store it as an int when it is numeric, otherwise keep it as-is.
        label = file_name.split('_')[0]
        train_list.append(int(label) if label.isdigit() else label)
        # 6. One complete sample (pixels + label) per list entry.
        file_arr_list.append(train_list)
    # 7. Name the workbook after the last path component.  Both '\\' and '/'
    #    separators are accepted and a trailing separator is ignored.
    name = file_path.replace('\\', '/').rstrip('/').split('/')[-1]
    train_df = pd.DataFrame(data=file_arr_list)
    train_df.to_excel(f'{name}.xlsx')
if __name__ == '__main__':
    # Source folders of the raw digit files: training set and test set.
    train_path = r'E:\KNN_手写数字识别\digits\trainingDigits'
    test_path = r'E:\KNN_手写数字识别\digits\testDigits'
    # Convert the training folder first, then the test folder.
    for digits_dir in (train_path, test_path):
        transform_data(digits_dir)
手写 KNN 算法的方法
import pandas as pd
import numpy as np
# 1. Load the training and test sets.  The workbooks carry the saved DataFrame
#    index as their first column; index_col=0 turns it back into the row index
#    so that it is not treated as a pixel feature and the positional slices
#    used below line up with columns 0..1023 (pixels) and 1024 (label).
train = pd.read_excel('trainingDigits.xlsx', index_col=0)
test = pd.read_excel('testDigits.xlsx', index_col=0)
# 2. Hand-written KNN classifier.
def Knn(k, train_df=None, test_df=None):
    """Classify every test sample with k-nearest-neighbours and return the accuracy.

    Columns labelled ``0`` .. ``1023`` are used as features and the column
    labelled ``1024`` as the class label.  Columns are selected by label, so
    any extra column (for example a saved index column) is ignored.  Neither
    input frame is modified.

    Args:
        k: Number of nearest neighbours that vote; must be at least 1.
        train_df: Training samples.  Defaults to the module-level ``train``.
        test_df: Test samples.  Defaults to the module-level ``test``.

    Returns:
        The fraction of test samples whose predicted label equals the real one.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError('k must be a positive integer')
    if train_df is None:
        train_df = train
    if test_df is None:
        test_df = test

    feature_cols = list(range(1024))
    label_col = 1024
    # Work on plain arrays: the caller's frames stay untouched and the
    # per-sample distance computation avoids pandas alignment overhead.
    train_x = train_df.loc[:, feature_cols].to_numpy(dtype=float)
    train_y = train_df.loc[:, label_col].to_numpy()
    test_x = test_df.loc[:, feature_cols].to_numpy(dtype=float)
    real = test_df.loc[:, label_col].to_numpy()

    test_predict_list = []
    for sample, real_label in zip(test_x, real):
        # Euclidean distance from this test sample to every training sample.
        distance = np.sqrt(((train_x - sample) ** 2).sum(axis=1))
        # Positions of the k closest training samples (stable on ties).
        nearest = np.argsort(distance, kind='stable')[:k]
        # Majority vote; mode() is sorted, so ties go to the smallest label.
        predict_label = pd.Series(train_y[nearest]).mode().values[0]
        print('预测结果:\n', predict_label)
        print('真实结果:\n', real_label)
        test_predict_list.append(predict_label)

    # Accuracy: share of predictions that match the real labels.
    precision = (np.asarray(test_predict_list) == real).sum() / real.size
    print(precision)
    return precision
# Precision obtained for each candidate k, in the order the candidates are tried.
precision_list = []
if __name__ == '__main__':
    # Evaluate every candidate k and keep the k <-> precision association.
    k_candidates = list(range(2, 100, 10))
    for k in k_candidates:
        precision = Knn(k)
        precision_list.append(precision)
    print(precision_list)
    # Report the k with the highest precision (the first one wins on ties).
    best_k = k_candidates[precision_list.index(max(precision_list))]
    print('精度最高的k值:\n', best_k)
knn模块算法
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
# 1. Load the training and test sets.  index_col=0 makes the first sheet
#    column the row index.  It must be applied to BOTH files: otherwise the
#    test frame keeps that column as a feature, its positional slices are
#    shifted by one, and position 1024 is the last pixel instead of the label.
train = pd.read_excel('trainingDigits.xlsx', index_col=0)
test = pd.read_excel('testDigits.xlsx', index_col=0)
# ===== Manual train/test split: fit on the training set, score on the test set =====
# 2. Instantiate the classifier; n_neighbors is k.
knn = KNeighborsClassifier(n_neighbors=10)
# 3. Fit the model on the training features (x) and labels (y).
x_train = train.iloc[:, :1024]
y_train = train.loc[:, 1024]
knn.fit(x_train, y_train)
# 4. Predict the test set; the label is selected the same way as for training.
x_test = test.iloc[:, :1024]
y_test = test.loc[:, 1024]
print(x_test.shape)
y_predict = knn.predict(x_test)
print('真实结果:\n', y_test)
# 5. Accuracy on the test set.
print(knn.score(x_test, y_test))
交叉验证方法
# ===== Cross-validation: no manual train/test split is needed =====
# 1. Stack all samples (training rows first, then test rows).
concat_data = pd.concat([train, test], axis=0)
print(concat_data.shape)
# 2. Separate the features from the labels.
x, y = concat_data.iloc[:, :1024], concat_data.loc[:, 1024]
# 3. The model to evaluate.
knn = KNeighborsClassifier(n_neighbors=5)
# 4. 10-fold cross-validation; one accuracy value is returned per fold.
score = cross_val_score(knn, x, y, cv=10)
print(score, score.mean(), sep='\n')
网格搜索方法
# ===== Hyper-parameter selection with a grid search =====
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# (1) Candidate values for the model parameter: k = 2 .. 8.
params = {'n_neighbors': list(range(2, 9))}
# (2) Wrap a fresh classifier in a grid search with 10-fold cross-validation.
knn = KNeighborsClassifier()
gridcv = GridSearchCV(knn, param_grid=params, cv=10)
# (3) Evaluate on all samples:
#   a. stack the training and test rows;
concat_data = pd.concat([train, test], axis=0)
#   b. separate the features from the labels.
x = concat_data.iloc[:, :1024]
y = concat_data.loc[:, 1024]
gridcv.fit(x, y)
# Best parameter combination, then the accuracy it achieved.
print('返回精度最高对应的参数:\n', gridcv.best_params_)
print('返回模型的最高精度值:\n', gridcv.best_score_)

1265

被折叠的 条评论
为什么被折叠?



