原理部分可以参考其他文章,网上已有很多讲解,下面直接给出代码。
### 导入所需包 ###
import numpy as np
import pandas as pd
# --- Load the dataset ---
irsflowers = pd.read_csv("iris.csv")

# --- Preprocessing: handle missing values ---
# Inspect missing-value counts if needed:
# print(irsflowers.isnull().sum())
# Drop every row that contains a missing value.
irsflowers.dropna(axis=0, inplace=True)
# Re-check to confirm no missing values remain:
# print(irsflowers.isnull().sum())

# --- Encoding: map the three species names to integer class labels ---
# setosa -> 0, versicolor -> 1, virginica -> 2
datas = irsflowers.values
for label_code, species in enumerate(('setosa', 'versicolor', 'virginica')):
    datas[datas == species] = label_code
# Features are every column but the last; the label is the last column.
train_datas = datas[:, :-1].astype('float32')
train_labels = datas[:, -1:].astype('int64')
'''
标准化:采用(0,1)标准化
'''
def Normalization(data):
    """Min-max scale each column of *data* into the range [0, 1].

    Parameters
    ----------
    data : 2-D ndarray of shape (n_samples, n_features).

    Returns
    -------
    ndarray of the same shape where each column is (x - min) / (max - min).
    Constant columns are mapped to 0 instead of producing NaN/inf.
    """
    col_min = data.min(axis=0)
    col_range = data.max(axis=0) - col_min
    # Guard against constant features: the original divided by zero here,
    # turning the whole column into NaN.
    col_range = np.where(col_range == 0, 1, col_range)
    # NumPy broadcasting replaces the original np.tile copies (and avoids
    # shadowing the builtins `max`/`min`).
    return (data - col_min) / col_range
## 划分训练集与测试集(可尝试用不同方法实现)
'''
随机分配百分之80的数据作为训练集,随机分配百分之20的数据作为测试集
使用sklearn.model_selection里的train_test_split模块用于分割数据
'''
from sklearn.model_selection import train_test_split
def splitdata(after_normalize_def, train_labels_def):
    """Split the normalized features/labels into an 80/20 train/test split.

    Returns X_train, Y_train, X_test, Y_test (note the order differs from
    sklearn's native return order).
    """
    # Flatten the (n, 1) label column into a 1-D Series.
    labels = pd.Series([row[0] for row in train_labels_def])
    features = pd.DataFrame(after_normalize_def)
    # Fixed random_state keeps the split reproducible.
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, labels, test_size=0.20, random_state=9)
    return X_train, Y_train, X_test, Y_test
# Scale the features, split 80/20, then keep plain ndarray copies for the KNN.
normalization = Normalization(train_datas)
X_train, Y_train, X_test, Y_test = splitdata(normalization, train_labels)
X_train_list, Y_train_list = X_train.values, Y_train.values
X_test_list, Y_test_list = X_test.values, Y_test.values
import operator
class MYKNN:
    """K-nearest-neighbours classifier for the three iris classes (labels 0/1/2).

    Parameters
    ----------
    k : int
        Number of neighbours that vote on the predicted class.
    _distance_type : int
        0 = Euclidean, 1 = Manhattan, 2 = Chebyshev.
    """
    def __init__(self, k, _distance_type):
        self.k = k
        self._distance_type = _distance_type
        self.x_train = None
        self.y_train = None

    def fit(self, X_train, Y_train):
        # Lazy learner: just memorise the training data.
        self.x_train = X_train
        self.y_train = Y_train

    def predict(self, X_predict):
        """Predict a label for every row of X_predict; returns an int array."""
        return np.array([self.predict_method(x) for x in X_predict])

    def predict_method(self, x):
        """Classify one sample by majority vote among its k nearest neighbours."""
        # Distance from x to every training point, per the chosen metric.
        if self._distance_type == 0:        # Euclidean
            distances = np.sqrt(np.sum((x - self.x_train) ** 2, axis=1))
        elif self._distance_type == 1:      # Manhattan
            distances = np.sum(np.abs(self.x_train - x), axis=1)
        else:                               # Chebyshev
            distances = np.max(np.abs(self.x_train - x), axis=1)
        # BUG FIX: the original excluded already-counted points by overwriting
        # their distance with the constant 2, but Manhattan distances over 4
        # normalized features can reach 4, so a visited point could be picked
        # again. A stable argsort selects the k nearest correctly for every
        # metric (and replaces k O(n) min-scans with one O(n log n) sort).
        nearest = np.argsort(distances, kind='stable')[:self.k]
        neighbour_labels = self.y_train[nearest]
        # Vote over the three classes. np.argmax returns the first maximum,
        # so ties resolve to the smallest label — same as the original's
        # dict-ordering behaviour over keys "0", "1", "2".
        votes = [np.count_nonzero(neighbour_labels == c) for c in (0, 1, 2)]
        return int(np.argmax(votes))

    def acc(self, x_test, label):
        """Return (accuracy, predictions) of the classifier on a test set."""
        y_predict = self.predict(x_test)
        right = 0
        for i, e in enumerate(label):
            if y_predict[i] == e:
                right += 1
        return right / len(label), y_predict
#可视化roc
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
def visualization(target, predictions):
    """Plot the ROC curve (class 2 as the positive label) with its AUC."""
    fpr, tpr, thresholds = roc_curve(target, predictions, pos_label=2)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='ROC (area = {0:.2f})'.format(roc_auc), lw=2, color='red')
    # Pad the axes slightly so the curve endpoints stay visible.
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()
## 结果导出result.csv
def to_csv(_result, _predictions, _acc_csv):
    """Write each target/prediction pair plus the best accuracy to result_KNN.csv."""
    lines = [
        "target is : {} , prediction_result is : {}\n".format(_result[i], _predictions[i])
        for i in range(len(_result))
    ]
    lines.append("The Best Acc is : {}".format(_acc_csv))
    with open('result_KNN.csv', 'w', encoding='utf-8') as f:
        f.writelines(lines)
# --- Driver: grid-search over k (1..15) and the three distance metrics,
# keeping the predictions of the best-scoring configuration. ---
best_acc = 0
best_prediction = 0
for k in range(1, 16):
    for distance_type in range(3):
        KNN = MYKNN(k, distance_type)
        KNN.fit(X_train_list, Y_train_list)
        accuracy, predictions = KNN.acc(X_test_list, Y_test_list)
        print("K is {},distance_type is {},ACC is {}".format(k, distance_type, accuracy))
        if accuracy > best_acc:
            best_acc = accuracy
            best_prediction = predictions
to_csv(Y_test_list, best_prediction, best_acc)
visualization(Y_test_list, best_prediction)
选取不同的 K 值和不同的距离计算方法,输出结果如下: