KNN考试准备

一、流程

KNN基本算法:
1.计算距离
2.递增排序
3.取K (奇数)个最近点
4.通过简单多数或其他规则确定分类

二、手动实现KNN-性别判断

任务描述
本关任务:根据KNN原理,通过“属性分配”的49个数据(不包含你自己)进行训练,分别令K=5、7、9,判断你的性别。

import numpy as np
import pandas as pd
import csv
import warnings
warnings.filterwarnings("ignore")

# Load the gender dataset. Column layout used below: col 1-5 are the
# five feature columns, col 7 is the gender label.
f=pd.read_csv(r'/data/workspace/myshixun/step2/data.csv')
f=np.array(f) 
# NOTE(review): pandas already consumed the CSV header, so f[1:, ...] also
# drops the first *data* row -- presumably the file contains an extra
# header-like row; confirm against data.csv.
data=f[1:,1:6]
label=f[1:,7]
#print(data,label)

def classify(inX, data, labels, k):
    """Classify one sample with the kNN majority vote.

    :param inX: query sample, 1-D array of features
    :param data: training data, ndarray of shape (n_samples, n_features)
    :param labels: training labels, indexable, length n_samples
    :param k: number of nearest neighbours to vote
    :return: the label that appears most often among the k nearest samples
    """
    # Local imports so the function is self-contained; the original code
    # referenced bare `tile` and `json` without importing them (NameError).
    import json

    dataSetSize = data.shape[0]  # number of training samples
    print(dataSetSize)
    print('复制输入向量 用于和样本中的每条数据进行计算 [矩阵的加减乘除]')
    print(np.tile(inX, (dataSetSize, 1)))

    # Difference matrix: each row is (query - training sample).
    diffmat = np.tile(inX, (dataSetSize, 1)) - data
    print('\n相减后:')
    print(diffmat)

    sqDiffMat = diffmat ** 2  # element-wise square
    print('\n平方后:')
    print(sqDiffMat)
    sqDistances = sqDiffMat.sum(axis=1)  # axis=1: sum across features per row
    print('\n各个特征点差值相加[即坐标差值相加]:')
    print(sqDistances)

    distances = sqDistances ** 0.5  # Euclidean distance per training sample
    print('\n距离:')
    print(distances)
    sortedDistIndexes = distances.argsort()  # indices sorted by ascending distance

    classCount = {}  # label -> vote count among the k nearest samples
    for i in range(k):
        voteLabel = labels[sortedDistIndexes[i]]  # label of the i-th nearest sample
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    print('标签出现的次数:')
    print(json.dumps(classCount, ensure_ascii=False))
    # Sort by vote count, descending; lambda replaces operator.itemgetter
    # (operator was never imported in the original).
    sortedClassCount = sorted(classCount.items(), key=lambda item: item[1], reverse=True)

    print('\n排序后:')
    print(json.dumps(sortedClassCount, ensure_ascii=False))
    # e.g. sortedClassCount == [('A', 2), ('B', 1)]

    return sortedClassCount[0][0]  # label with the most votes

三、调库实现KNN-性别判断

#encoding=utf8
import numpy as np
import student #module under test (see the dir() dump below)
'''
print(dir(student))
['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', 
'__name__', '__package__', '__spec__', 'np', 'sigmoid']

注意:删掉if __name__ == "__main__"函数!
'''
import pandas as pd
from sklearn.neighbors import  KNeighborsClassifier as knn
import warnings
warnings.filterwarnings("ignore")

# Preprocess the data with pandas.
f=pd.read_csv(r'/data/workspace/myshixun/step2/data.csv')
f=np.array(f) 
# NOTE(review): f[1:, ...] drops the first data row in addition to the CSV
# header pandas already removed -- presumably intentional for this file; confirm.
x=f[1:,1:6]
y=f[1:,7]


# y only takes the values male/female, so encode it as a 0-1 dummy variable.
from sklearn.preprocessing import LabelEncoder
y = LabelEncoder().fit_transform(y)

# Split into training and test sets (train_test_split also shuffles the rows).
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

# Standardise features: fit the scaler on the training split only, then reuse
# the same scaler for the test split to avoid information leakage.
from sklearn.preprocessing import StandardScaler
scaler1 = StandardScaler()
scaler1.fit(x_train)
x_train = scaler1.transform(x_train)
x_test = scaler1.transform(x_test)

from sklearn.neighbors import KNeighborsClassifier 
# NOTE(review): this rebinding shadows the `knn` class alias imported above.
knn=KNeighborsClassifier() 
knn.fit(x_train,y_train) 

该样本空间里有宅男和文艺青年这两个类别(手动)

#encoding=utf8
import numpy as np
from collections import Counter

class kNNClassifier(object):
    """A minimal k-nearest-neighbour classifier.

    Memorises the training set at fit time; for each query point it votes
    among the labels of its k closest training samples (Euclidean distance,
    majority wins, earliest-seen label breaks ties via Counter ordering).
    """

    def __init__(self, k):
        '''
        Initialiser.
        :param k: the k of the kNN algorithm (neighbours per prediction)
        '''
        self.k = k
        # Training data, ndarray; populated by fit().
        self.train_feature = None
        # Training labels, ndarray; populated by fit().
        self.train_label = None

    def fit(self, feature, label):
        '''
        Training step of kNN: store the data (lazy learner).
        :param feature: training data, ndarray
        :param label: training labels, ndarray
        :return: None
        '''

        #********* Begin *********#
        self.train_feature = feature
        self.train_label = label
        self.train_vars = feature.shape[0]  # number of stored samples
        #********* End *********#

    def predict(self, feature):
        '''
        Prediction step of kNN.
        :param feature: test data, ndarray (one sample per row)
        :return: predicted labels, list
        '''

        #********* Begin *********#
        predictions = []
        for sample in feature:
            dists = self.calculateDistance(sample)
            neighbour_labels = self.getKLabels(dists)
            predictions.append(self.getAppearMostLabel(neighbour_labels))
        return predictions

    # Euclidean distance from one query point to every training sample.
    def calculateDistance(self, feature):
        # Broadcasting subtracts the query from every stored row; this is
        # equivalent to the classic np.tile construction.
        gaps = self.train_feature - feature
        return (gaps ** 2).sum(axis=1) ** 0.5

    # Labels of the k training samples closest to the query point.
    def getKLabels(self, distance):
        nearest = distance.argsort()[:self.k]
        return [self.train_label[idx] for idx in nearest]

    # Most frequent label; on ties Counter keeps the first-seen label.
    def getAppearMostLabel(self, KLabels):
        (winner, _), = Counter(KLabels).most_common(1)
        return winner
        #********* End *********#

红酒分类(调库knn)

import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

def classification(train_feature, train_label, test_feature):
    '''
    Classify the red-wine samples in test_feature.
    :param train_feature: training data, ndarray (possibly flattened)
    :param train_label: training labels, ndarray
    :param test_feature: test data, ndarray (possibly flattened)
    :return: predicted classes for the test set
    '''

    #********* Begin *********#
    scaler = StandardScaler()
    train_feature = np.asarray(train_feature)
    test_feature = np.asarray(test_feature)
    train_label = np.asarray(train_label)
    # Accept flattened input: infer the shapes instead of the hard-coded
    # reshape(133, 13) / reshape(45, 13) of the original grading data,
    # so the function works for any dataset size.
    if train_feature.ndim == 1:
        train_feature = train_feature.reshape(len(train_label), -1)
    if test_feature.ndim == 1:
        test_feature = test_feature.reshape(-1, train_feature.shape[1])
    # Fit the scaler on the training data only, then apply the same
    # transform to the test data (no information leakage).
    train_feature = scaler.fit_transform(train_feature)
    test_feature = scaler.transform(test_feature)
    # Build, train and apply the k-nearest-neighbour classifier.
    clf = KNeighborsClassifier()
    clf.fit(train_feature, train_label.astype('int'))
    predict_result = clf.predict(test_feature)
    return predict_result
    #********* End **********#

红酒5关

使用sklearn中的kNN算法进行分类

from sklearn.neighbors import KNeighborsClassifier

def classification(train_feature, train_label, test_feature):
    '''
    Classify test_feature with a KNeighborsClassifier fitted on the training set.
    :param train_feature: training data
    :param train_label: training labels
    :param test_feature: test data
    :return: predictions for the test set
    '''

    #********* Begin *********#
    model = KNeighborsClassifier()
    model.fit(train_feature, train_label)
    predictions = model.predict(test_feature)
    return predictions
    #********* End *********#

使用sklearn中的kNN算法进行回归

from sklearn.neighbors import KNeighborsRegressor

def regression(train_feature, train_label, test_feature):
    '''
    Predict values for test_feature with KNeighborsRegressor.
    :param train_feature: training data
    :param train_label: training targets
    :param test_feature: test data
    :return: predictions for the test set
    '''

    #********* Begin *********#
    # Build and train the k-nearest-neighbour regressor, then predict.
    model = KNeighborsRegressor()
    model.fit(train_feature, train_label)
    predictions = model.predict(test_feature)
    return predictions
    #********* End *********#

分析红酒数据

import numpy as np

def alcohol_mean(data):
    '''
    Return the average alcohol content in the wine data.
    :param data: wine dataset object (alcohol is the first feature column)
    :return: mean alcohol content, float
    '''

    #********* Begin *********#
    alcohol_column = data.data[:, 0]  # first column holds the alcohol values
    return alcohol_column.mean()
    #********* End **********#


对数据进行标准化

from sklearn.preprocessing import StandardScaler

def scaler(data):
    '''
    Return the standardised wine features.
    :param data: wine dataset object (supports the 'data' key)
    :return: standardised feature matrix, ndarray
    '''

    #********* Begin *********#
    raw_features = data['data']
    return StandardScaler().fit_transform(raw_features)
    #********* End **********#


使用kNN算法进行预测

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

def classification(train_feature, train_label, test_feature):
    '''
    Classify the red-wine samples in test_feature.
    :param train_feature: training data, ndarray
    :param train_label: training labels, ndarray
    :param test_feature: test data, ndarray
    :return: predicted classes for the test set
    '''

    #********* Begin *********#
    # Fit the scaler on the training data only, then reuse the fitted
    # statistics for the test data (avoids information leakage).
    std = StandardScaler()
    scaled_train = std.fit_transform(train_feature)
    scaled_test = std.transform(test_feature)

    model = KNeighborsClassifier()
    model.fit(scaled_train, train_label)
    return model.predict(scaled_test)
    #********* End **********#

K近邻实战(鸢尾花)

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, svm, tree, ensemble 
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings("ignore")   # suppress warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=DeprecationWarning)
    # NOTE(review): numpy.core.umath_tests was removed in modern NumPy
    # (>= 1.25); this import only succeeds on older versions -- confirm
    # the target environment pins an old NumPy.
    from numpy.core.umath_tests import inner1d


# set the number of neighbors
n_neighbors = 15

# import the iris dataset
#------------------begin--------------------
iris = datasets.load_iris()

# only take the first two features
X = iris.data[:, :2]
y = iris.target

#-------------------end---------------------

h = .02  # step size in the mesh

# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# Draw the decision boundary produced by the kNN classification.
#------------------begin--------------------
for weights in ['uniform', 'distance']:
  # create an instance of KNN Classifier and fit the data.
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

#-------------------end---------------------

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,
                edgecolor='k', s=20)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')"
              % (n_neighbors, weights))

# Only the last figure drawn in the loop is saved to disk.
plt.savefig("step3/结果/result.png")

from sklearn.ensemble import RandomForestClassifier

# Synthetic 4-feature dataset: 2 informative features, 0 redundant,
# fixed random_state for reproducibility.
X, y = datasets.make_classification(n_samples=1000, n_features=4,
                           n_informative=2, n_redundant=0,
                           random_state=0, shuffle=False)
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=2,
                             random_state=0)
clf_rf.fit(X, y)

# Print clf_rf's feature importances and the predicted class of [0,0,0,0].
#------------------begin--------------------
print(clf_rf.feature_importances_)
print(clf_rf.predict([[0,0,0,0]]))

#-------------------end---------------------

自己输入数据

输入说明:输入由四行组成,每行由一个数组成,第一行表示要生成的数据组数,第二行表示生成数据时所使用的随机状态,第三行和第四行表示待测点的特征值(待测点只有两个特征)。

输出说明:输出由一行组成,即待测点的类别。

# -*- coding: utf-8 -*-

from sklearn.datasets import make_blobs
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# Read the four input lines (sample count, random state, then the two
# feature values of the query point), generate blob data, fit a kNN
# classifier and print the predicted class of the query point.
#
# ********** Begin *********#
samples=int(input())
random_state=int(input())
data=make_blobs(n_samples=samples, random_state=random_state)

X, Y=data
clf=KNeighborsClassifier()
clf.fit(X,Y)
x=int(input())
y=int(input())
print(clf.predict([[x,y]]))
# ********** End **********#
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值