1.步骤

2.举例

3.例子

4.优缺点

5.交叉验证和网格搜索来确定k

6.归一化

7.标准化

# 1.步骤

1）计算已知类别数据集中的点与当前点之间的距离；2）按距离递增次序排序；3）选取与当前点距离最小的k个点；4）统计前k个点所在的类别出现的频率；5）返回前k个点出现频率最高的类别作为当前点的预测分类。

# 2.举例

from sklearn.neighbors import KNeighborsClassifier

# Training samples: eight 3-feature vectors, one row per sample.
x = [
    [39, 0, 31],
    [3, 2, 65],
    [2, 3, 55],
    [9, 38, 2],
    [8, 34, 17],
    [5, 2, 57],
    [21, 17, 5],
    [45, 2, 9],
]
# Label encoding: 0 = comedy, 1 = action, 2 = romance.
y = [0, 1, 2, 2, 2, 1, 0, 0]

# Build a 5-nearest-neighbour classifier and train it on the samples above
# (fit() returns the estimator itself, so the call can be chained).
knn = KNeighborsClassifier(n_neighbors=5).fit(x, y)

# Classify one unseen sample and show the predicted label.
prediction = knn.predict([[23, 3, 17]])
print(prediction)

# 3.例子

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# 1. Load the dataset.
# The original cell used `iris` without ever assigning it (NameError);
# load_iris is already imported above, so call it here.
iris = load_iris()

# 2. Basic data handling: split into training and test sets.
# x_train, x_test, y_train, y_test are the training features, test features,
# training targets and test targets respectively. 20% of the samples are held
# out for testing; random_state is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=22
)

# 3. Feature engineering: standardization.
# The scaler is fitted on the training set only; the test set is transformed
# with the statistics learned from the training set.
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# 4. Machine learning (model training) with k = 9 neighbours.
estimator = KNeighborsClassifier(n_neighbors=9)
estimator.fit(x_train, y_train)

# 5. Model evaluation.
# Method 1: compare the predictions with the true values element-wise.
y_predict = estimator.predict(x_test)
print("预测结果为:\n", y_predict)
print("⽐对真实值和预测值：\n", y_predict == y_test)
# Method 2: compute the accuracy directly.
score = estimator.score(x_test, y_test)
print("准确率为：\n", score)

# 4.优缺点

4.1优点

4.2缺点

# 5.交叉验证和网格搜索来确定k

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# 1. Load the dataset.
# The original cell used `iris` without ever assigning it (NameError);
# load_iris is already imported above, so call it here.
iris = load_iris()

# 2. Basic data handling: split into training and test sets.
# No test_size is passed, so scikit-learn's default split ratio applies;
# random_state is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=22
)

# 3. Feature engineering: standardization.
# Instantiate a transformer, fit it on the training set only, then apply the
# same learned statistics to the test set.
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# 4. KNN estimator workflow.
# 4.1 Instantiate the base estimator (k is chosen by the search below).
estimator = KNeighborsClassifier()
# 4.2 Model selection and tuning: grid search with cross-validation.
# Candidate values for the hyper-parameter being tuned.
param_dict = {"n_neighbors": [1, 3, 5, 7]}
# `estimator` is rebound to the search object wrapping the base classifier;
# each candidate is evaluated with 5-fold cross-validation.
estimator = GridSearchCV(estimator, param_grid=param_dict, cv=5)
# 4.3 Fit on the training data (runs the whole search).
estimator.fit(x_train, y_train)

# 5. Evaluate the model.
# Method a: compare the predictions with the true values element-wise.
y_predict = estimator.predict(x_test)
print("⽐对预测结果和真实值：\n", y_predict == y_test)
# Method b: compute the accuracy directly.
score = estimator.score(x_test, y_test)
print("直接计算准确率：\n", score)

# Search diagnostics: best cross-validated score, the best estimator found,
# and the full cross-validation results.
print("在交叉验证中验证的最好结果：\n", estimator.best_score_)
print("最好的参数模型：\n", estimator.best_estimator_)
print("每次交叉验证后的准确率结果：\n", estimator.cv_results_)

# 6.归一化

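通过对原始数据进行变换，把数据映射到指定区间（默认为 [0, 1]）：

$$X' = \frac{x - \min}{\max - \min}, \qquad X'' = X' \cdot (mx - mi) + mi$$

作用于每一列，max 为一列的最大值，min 为一列的最小值，那么 X'' 为最终结果，mx、mi 分别为指定区间的上界和下界，默认 mx 为 1，mi 为 0。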

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): `data` is not assigned anywhere in the visible source.
# Presumably it is a pandas DataFrame loaded beforehand that contains the
# columns 'milage', 'Liters' and 'Consumtime' -- confirm and add the load step.
print(data)

# 1. Instantiate a transformer that rescales each column into [0, 1].
transfer = MinMaxScaler(feature_range=(0, 1))

# 2. Learn the scaling from the three feature columns, then apply it.
#    `data` is rebound to the transformed result.
features = data[['milage', 'Liters', 'Consumtime']]
transfer.fit(features)
data = transfer.transform(features)
print("最⼩值最⼤值归⼀化处理的结果：\n", data)

# 7.标准化

import pandas as pd
from sklearn.preprocessing import StandardScaler

# NOTE(review): `data` is not assigned anywhere in the visible source.
# Presumably it is a pandas DataFrame loaded beforehand that contains the
# columns 'milage', 'Liters' and 'Consumtime' -- confirm and add the load step.
print(data)

# 1. Instantiate a standardizing transformer.
transfer = StandardScaler()

# 2. Learn the per-column statistics from the three feature columns, then
#    apply the transformation. `data` is rebound to the transformed result.
features = data[['milage', 'Liters', 'Consumtime']]
transfer.fit(features)
data = transfer.transform(features)
print("标准化的结果:\n", data)

# Statistics the fitted scaler stores for each feature column.
print("每⼀列特征的平均值：\n", transfer.mean_)
print("每⼀列特征的⽅差：\n", transfer.var_)

