# KNN (k-nearest neighbours) implemented in Python

# -*- coding: utf-8 -*-
"""
Created on Sun Apr 12 15:00:44 2020

@author: asus
"""
'''
knn实现
'''

import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# Path to the iris CSV (Windows path from the original author's machine —
# adjust for your environment).
path = r'F:\大三下\李艳老师数据挖掘实践\knn\iris.csv'

# Load the dataset. NOTE(review): the original file used `data` later but
# never read the CSV, so the script crashed with a NameError.
data = pd.read_csv(path)

def train_test_train(data, target_columns, test_size=0.25, random_state=None):
    """Shuffle *data* and split it into train/test feature and label arrays.

    Parameters
    ----------
    data : pandas.DataFrame
        Full dataset, features plus the label column.
    target_columns : str or list of str
        Name of the label column(s); for the iris data this is 'Name'.
    test_size : float, default 0.25
        Fraction of rows assigned to the test split.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample`` for a reproducible shuffle.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray
    """
    # The source data is ordered by class, so shuffle before splitting;
    # reset_index keeps positional slicing aligned with the new order.
    shuffled = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # First `split_index` rows become the training split.
    split_index = int(len(shuffled) * (1 - test_size))

    # Separate labels (y) from features (x).
    y = np.array(shuffled[target_columns])
    x = np.array(shuffled.drop(columns=target_columns))

    X_train, X_test = x[:split_index], x[split_index:]
    y_train, y_test = y[:split_index], y[split_index:]
    return X_train, X_test, y_train, y_test

class KNNClassfier(object):
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    Mirrors the relevant parts of sklearn's KNeighborsClassifier interface:
    ``fit`` memorises the training data, ``predict`` does a majority vote
    among the k closest training samples, ``score`` returns accuracy.
    """

    def __init__(self, n_neighbors=3):
        """Initialise with the number of neighbours used in the vote."""
        self.n_neighbors = n_neighbors
        self.X_train = None  # set by fit(): array [n_samples, n_features]
        self.y_train = None  # set by fit(): array [n_samples]

    def fit(self, X_train, y_train):
        """Memorise the training data (lazy learner — no computation here).

        X_train : array of shape [n_samples, n_features]
        y_train : array of shape [n_samples]
        """
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Return a list with the predicted label for each row of X_test.

        X_test : array of shape [n_samples, n_features]
        """
        y_pred = []
        for i in range(X_test.shape[0]):
            # Euclidean distance from test sample i to every training sample
            # in one vectorised call (replaces the original per-row loop).
            dis = np.linalg.norm(self.X_train - X_test[i], axis=1)

            # Indices of the k closest training samples; stable sort keeps
            # the original tie-breaking order (first occurrence wins).
            nearest = np.argsort(dis, kind='stable')[:self.n_neighbors]
            labels = [self.y_train[j] for j in nearest]

            # Majority vote: counts[j] is how often labels[j] occurs, so
            # argmax picks the most frequent label (ties go to the nearest
            # neighbour carrying a maximal count, as in the original).
            counts = [labels.count(label) for label in labels]
            y_pred.append(labels[np.argmax(counts)])

        return y_pred

    def score(self, X_test, y_test):
        """Return the accuracy of the classifier on (X_test, y_test)."""
        pred = self.predict(X_test)
        # BUG FIX: the original compared the undefined name `y_pred`
        # (a NameError, or silently the caller's global) instead of `pred`.
        return np.mean(np.array(pred) == np.array(y_test))

if __name__ == '__main__':

    # Load the iris data. NOTE(review): the original script used `data`
    # here without ever reading the CSV, so it crashed with a NameError.
    data = pd.read_csv(path)

    # 70/30 split with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_train(
        data, target_columns='Name', test_size=0.3, random_state=85)

    # Fit the hand-written classifier and report test-set accuracy.
    knn = KNNClassfier(n_neighbors=3)
    knn.fit(X_train, y_train)
    print("Test set score: {:.2f}".format(knn.score(X_test, y_test)))

03-25 336

08-18 399

07-15 1865

11-04 307

10-14 1418

12-21 966

12-06 365

03-07 27

09-04 271

12-05 1262

10-13 197

11-20 1448

11-09 297

05-14 74

09-30 127

08-20 30

10-13 172

10-14 49

10-14 153

10-14 78

11-09 31

11-09 204

11-09 270

11-10 145

11-20 86

11-21 113

12-02 278

12-07 138

12-16 286

02-29 1359

01-20 34

02-22 1582

03-03 316

03-05 207

04-18 31

04-19 2066

04-21 29

04-27 54

04-28 140

05-01 98

04-29 124

04-30 501

05-01 85

05-03 177

05-03 60

05-04 46

05-06 86

05-07 258

05-09 81

05-10 46

05-13 59

05-14 197

05-26 2万+

12-15 3446

04-06 1927

05-28 4608

07-09 3516

08-16 1497

10-13 704

10-17 810

11-20 1011

12-13 45

12-18 9102

01-27 2725

01-25 1213

01-31 5532

02-27 6013

03-01 262

05-10 948

04-24 993

05-15 156

05-16 1149

05-26 350

06-14 5752

06-26 106

07-02 202

#### 机器学习 day01(二)

©️2019 CSDN 皮肤主题: 大白 设计师: CSDN官方博客