KNN-以鸢尾花数据集为例

原理部分可参考网上已有的其他文章,此处不再赘述,下面直接给出完整代码。

### Import required packages ###
import numpy as np
import pandas as pd

## Read the iris data set; "iris.csv" must be in the working directory
irsflowers = pd.read_csv("iris.csv")

'''
    数据预处理
    查看是否有缺失值
'''
# Data preprocessing: inspect missing-value counts (left commented after inspection).
# print(irsflowers.isnull().sum())

'''缺失值处理:直接删除含有缺失值的行'''
# Missing-value handling: drop every row containing a NaN, in place.
irsflowers.dropna(axis=0, inplace=True)

'''再次查看,缺失值处理完毕'''
# Re-check: no missing values should remain.
# print(irsflowers.isnull().sum())


## Data encoding and standardisation
# Encode the three species names as integer class labels:
#   setosa -> 0, versicolor -> 1, virginica -> 2
datas = irsflowers.values
for code, species in enumerate(('setosa', 'versicolor', 'virginica')):
    datas[datas == species] = code

# All columns except the last are features; the last column is the numeric label.
train_datas = datas[:, :-1].astype('float32')
train_labels = datas[:, -1:].astype('int64')

'''
    标准化:采用(0,1)标准化
'''


def Normalization(data):
    """Min-max normalise each column of *data* into the [0, 1] range.

    Parameters
    ----------
    data : np.ndarray of shape (m, n)
        Feature matrix; each column is scaled independently.

    Returns
    -------
    np.ndarray of shape (m, n) with every column mapped into [0, 1].

    NOTE(review): a constant column (max == min) divides by zero and
    yields NaN/inf, exactly as the original np.tile version did.
    """
    # Avoid shadowing the built-ins `max`/`min`; NumPy broadcasting makes
    # the original np.tile(..., (m, 1)) copies unnecessary.
    col_max = data.max(axis=0)
    col_min = data.min(axis=0)
    return (data - col_min) / (col_max - col_min)


## 划分训练集与测试集(可尝试用不同方法实现)
'''
    随机分配百分之80的数据作为训练集,随机分配百分之20的数据作为测试集
    使用sklearn.model_selection里的train_test_split模块用于分割数据
'''

from sklearn.model_selection import train_test_split


def splitdata(after_normalize_def, train_labels_def):
    """Split normalised features and labels into 80% train / 20% test.

    Parameters
    ----------
    after_normalize_def : array-like of shape (m, n)
        Normalised feature rows.
    train_labels_def : array-like of shape (m, 1)
        One label per row, each wrapped in a length-1 sequence.

    Returns
    -------
    (X_train, Y_train, X_test, Y_test) — note this order differs from
    train_test_split's native (X_train, X_test, Y_train, Y_test).
    """
    # Flatten the (m, 1) label rows to scalars; a list comprehension
    # replaces the original lazy map object that pandas had to consume.
    labels = pd.Series([row[0] for row in train_labels_def])
    features = pd.DataFrame(after_normalize_def)
    # Fixed random_state keeps the 80/20 split reproducible across runs.
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, labels, test_size=0.20, random_state=9)
    return X_train, Y_train, X_test, Y_test


# Normalise the features, then split into 80% train / 20% test.
normalization = Normalization(train_datas)
X_train, Y_train, X_test, Y_test = splitdata(normalization, train_labels)

# Convert the pandas objects back to plain NumPy arrays for the KNN code below.
X_train_list = X_train.values
Y_train_list = Y_train.values
X_test_list = X_test.values
Y_test_list = Y_test.values

import operator

class MYKNN:
    """Hand-rolled k-nearest-neighbours classifier for the 3-class iris data."""

    def __init__(self, k, _distance_type):
        # k: number of neighbours to vote.
        # _distance_type: 0 = Euclidean, 1 = Manhattan, 2 = Chebyshev.
        self.k = k
        self._distance_type = _distance_type
        self.x_train = None
        self.y_train = None

    def fit(self, X_train, Y_train):
        """Memorise the training data (KNN is a lazy learner: no real fitting)."""
        self.x_train = X_train
        self.y_train = Y_train

    def predict(self, X_predict):
        """Predict an integer label for every row of X_predict."""
        return np.array([self.predict_method(x) for x in X_predict])

    def predict_method(self, x):
        """Classify one sample by majority vote among its k nearest neighbours."""
        dict_eachtype = {"0": 0, "1": 0, "2": 0}
        # Distance from x to every training point.
        '''  0:欧式距离    1:曼哈顿距离   2:切比雪夫距离'''
        if self._distance_type == 0:      # Euclidean
            distances = np.sqrt(np.sum((x - self.x_train) ** 2, axis=1))
        elif self._distance_type == 1:    # Manhattan
            distances = np.sum(abs(self.x_train - x), axis=1)
        else:                             # Chebyshev
            distances = np.max(abs(self.x_train - x), axis=1)
        # BUG FIX: the original excluded the current minimum from the next
        # round by overwriting it with the constant 2, but distances can
        # exceed 2 (Manhattan over 4 normalised features ranges up to 4,
        # Euclidean up to exactly 2.0), so neighbours could be skipped or
        # the same point re-picked.  A stable argsort yields the k nearest
        # indices directly and keeps the original first-occurrence
        # tie-breaking for equal distances.
        nearest = np.argsort(distances, kind="stable")[: self.k]
        for idx in nearest:
            key = str(int(np.ravel(self.y_train[idx])[0]))
            if key in dict_eachtype:  # labels outside 0/1/2 are ignored, as before
                dict_eachtype[key] += 1
        # Majority vote; on a tie max() returns the first key with the
        # maximal count, so ties resolve to the smallest label ("0" < "1"
        # < "2" in insertion order), matching the original behaviour.
        max1 = max(dict_eachtype, key=dict_eachtype.get)
        return int(max1)

    # Accuracy on a labelled test set.
    def acc(self, x_test, label):
        """Return (accuracy, predictions) for the given test samples."""
        y_predict = self.predict(x_test)
        all_sample = len(label)
        right = 0
        for i, e in enumerate(label):
            if y_predict[i] == e:
                right += 1
        ACC = right / all_sample
        return ACC, y_predict


#可视化roc
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
def visualization(target, predictions):
    """Plot the ROC curve (class 2 as positive label) for the given predictions.

    NOTE(review): the labels are 3-class; roc_curve with pos_label=2 treats
    this as class-2-vs-rest on the hard predictions — confirm this is the
    intended evaluation.
    """
    # Name the outputs properly instead of clobbering the parameters.
    fpr, tpr, _thresholds = roc_curve(target, predictions, pos_label=2)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='ROC (area = {0:.2f})'.format(roc_auc), lw=2, color='red')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()


## 结果导出result.csv
def to_csv(_result, _predictions, _acc_csv):
    """Write each (target, prediction) pair and the best accuracy to result_KNN.csv."""
    lines = [
        "target is : {} , prediction_result is : {}\n".format(t, p)
        for t, p in zip(_result, _predictions)
    ]
    lines.append("The Best Acc is : {}".format(_acc_csv))
    with open('result_KNN.csv', 'w', encoding='utf-8') as f:
        f.writelines(lines)


# Grid-search k (1..15) against the three distance metrics and keep the best run.
_acc = 0
best_prediction = 0
for neighbours in range(1, 16):
    for metric in range(3):
        model = MYKNN(neighbours, metric)
        model.fit(X_train_list, Y_train_list)
        accuracy, predictions = model.acc(X_test_list, Y_test_list)
        print("K is {},distance_type is {},ACC is {}".format(neighbours, metric, accuracy))
        if accuracy > _acc:
            _acc = accuracy
            best_prediction = predictions
to_csv(Y_test_list, best_prediction, _acc)
visualization(Y_test_list, best_prediction)


选取不同K值,不同距离计算方法,输出结果:

  • 0
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值