目的应用:让电脑代替人工对数据样本分类(本文以鸢尾花数据集为例,并非图片分类)
1.数据加载
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
# 1. Load the bundled iris dataset (a Bunch with "data", feature_names, target).
iris=load_iris()
2. DataFrame 使用花萼长、花萼宽、花瓣长、花瓣宽 作为列名
# 2. Wrap the 2-D feature matrix in a DataFrame, using sepal length/width
# and petal length/width (iris.feature_names) as the column labels.
feature_matrix = iris["data"]
pd_data = pd.DataFrame(feature_matrix, columns=iris.feature_names)
print(pd_data.head())
3.加入label:将iris.target值赋给label加进pd_data
# 3. Append the class labels: copy iris.target into a new "label" column
# so each row carries its features and its target together.
labels = iris.target
pd_data["label"] = labels
print(pd_data.head(6), pd_data.shape)
4.数据打乱 特征、标签在同一行
# 4. Shuffle the rows; features and label move together row-wise.
# FIX: the original snippet called shuffle() without importing it, which
# raises NameError when run on its own — import it here.
from sklearn.utils import shuffle

df = shuffle(pd_data)
print(df.head())
5.索引重新排序 reset_index
# 5. Renumber the index 0..n-1 after shuffling; drop=True discards the
# old (shuffled) index instead of keeping it as a column.
df = df.reset_index(drop=True)
print(df.head(10), df.shape)
6.数据分割
# 6. Split the frame into a feature matrix and a label vector.
x = df.iloc[:, :4].values  # first four columns -> ndarray of shape (n_samples, 4)
print(x)
# FIX: the original used np.ravel() here, but numpy was never imported at
# this point in the tutorial. Selecting a single column by position already
# yields a flat 1-D array, so neither ravel() nor numpy is needed.
y = df.iloc[:, 4].values
print(y, y.shape, type(y))
7.测试集20% 训练集120 测试集30
# 7. Hold out 20% as a test set (~120 train / 30 test rows); the fixed
# random_state keeps the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=6)
8、超参数k为邻居个数,如何找出最优k以及对应最高评分的呢?先定义一个best_k,给初始值为-1,循环让一个k值取3-12,求对应的score,找出最大的score,对应的k也就是best_k.
# 8. Hyper-parameter search over k (number of neighbours): fit one model
# per k in 3..12 and keep the k with the highest test-set accuracy.
# (Rule of thumb: k should stay below sqrt(n_samples).)
best_k = -1      # sentinel: no k chosen yet
best_score = 0   # highest accuracy seen so far
for k in range(3, 13):
    # Build and evaluate one model for this candidate k.
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(x_train, y_train)
    score = knn_model.score(x_test, y_test)
    if best_score < score:  # strictly better -> remember this k
        best_score = score
        best_k = k
print("best_k:",best_k,"best_sc:", best_score)
可以得出最大score为0.9666666...,对应的最优best_k则为7。用这个k代入进去训练模型、评分、预测。
# Refit a single model with the best k found above, then compare its
# predictions on the test set against the true labels.
print("best_k:", best_k, "best_sc:", best_score)
knn_model = KNeighborsClassifier(n_neighbors=best_k).fit(x_train, y_train)
score = knn_model.score(x_test, y_test)
y_predict = knn_model.predict(x_test)
print(y_predict)
print(y_test)
print(y_predict == y_test)  # element-wise: True where prediction matches truth
得出最优k,还要求best_p(闵可夫斯基距离的幂指数 p,p=1 为曼哈顿距离、p=2 为欧氏距离)。定义best_p初始化为1,加入p循环、p参数,求最优p。
# Grid-search k and p jointly; p is the Minkowski-distance exponent
# (p=1 Manhattan, p=2 Euclidean). The unused W list from the original
# snippet has been removed — weights are introduced in the next step.
best_k = -1      # sentinel: no k chosen yet
best_p = 1
best_score = 0   # highest accuracy seen so far
for k in range(3, 13):
    for p in range(1, 8):
        knn_model = KNeighborsClassifier(n_neighbors=k, p=p)
        knn_model.fit(x_train, y_train)
        score = knn_model.score(x_test, y_test)
        if best_score < score:
            best_score = score
            best_k = k
            best_p = p
print("best_k:", best_k, "best_sc:", best_score, "best_p:", best_p)
# Refit with the best (k, p) pair and inspect the predictions.
knn_model = KNeighborsClassifier(n_neighbors=best_k, p=best_p)
knn_model.fit(x_train, y_train)
score = knn_model.score(x_test, y_test)
y_predict = knn_model.predict(x_test)
print(y_predict)
print(y_test)
print(y_predict == y_test)  # element-wise equality of prediction vs truth
最优p为2.
还有超参数 weights(邻居投票权重,取 'uniform' 或 'distance')。定义并初始化 weight,求最优解。
# Grid-search k, p and the neighbour-vote weighting scheme jointly.
best_k = -1      # sentinel: no k chosen yet
best_p = 1
best_score = 0   # highest accuracy seen so far
W = ['uniform', 'distance']  # candidate weighting schemes
weight = ''
for k in range(3, 13):
    for p in range(1, 8):  # Minkowski-distance exponent
        for w in W:
            knn_model = KNeighborsClassifier(n_neighbors=k, p=p, weights=w)
            knn_model.fit(x_train, y_train)
            score = knn_model.score(x_test, y_test)
            if best_score < score:
                best_score = score
                best_k = k
                best_p = p
                weight = w
print("best_k:", best_k, "best_sc:", best_score,"best_p:",best_p, "weight:",weight)
# BUG FIX: the original refit passed weights=w — the leftover loop variable,
# which is always the LAST candidate ('distance') — instead of the best
# weight actually found. Use the recorded `weight` instead.
knn_model = KNeighborsClassifier(n_neighbors=best_k, p=best_p, weights=weight)
knn_model.fit(x_train,y_train)
score = knn_model.score(x_test, y_test)
y_predict = knn_model.predict(x_test)
print(y_predict)
print(y_test)
print(y_predict == y_test)  # element-wise equality of prediction vs truth
算出所有最优超参。
完整代码:
import numpy as np  # FIX: original listing read "mport numpy as np" (SyntaxError)
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import shuffle
import pandas as pd

# 1. Load the bundled iris dataset.
iris = load_iris()
# 2. Feature table: sepal length/width and petal length/width as columns.
pd_data = pd.DataFrame(iris["data"], columns=iris.feature_names)
print(pd_data.head())
# 3. Attach the targets as a "label" column, aligned row-by-row.
pd_data["label"] = iris.target
print(pd_data.head(6), pd_data.shape)
# 4. Shuffle rows; features and label move together.
df = shuffle(pd_data)
print(df.head())
# 5. Renumber the index 0..n-1 after shuffling.
df.reset_index(drop=True, inplace=True)
print(df.head(10), df.shape)
# 6. Features = first four columns; label column flattened to a 1-D array.
x = df.iloc[:, :4].values
print(x)
y = np.ravel(df.iloc[:, 4:].values)
print(y, y.shape, type(y))
# 7. 20% held out as the test set (~120 train / 30 test); fixed seed for
# reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=6)
# 8. Joint grid search over k (number of neighbours), p (Minkowski-distance
# exponent: 1=Manhattan, 2=Euclidean) and the neighbour-vote weighting scheme.
best_k = -1      # sentinel: no k chosen yet (rule of thumb: k < sqrt(n_samples))
best_p = 1
best_score = 0   # highest accuracy seen so far
W = ['uniform', 'distance']
weight = ''
for k in range(3, 13):
    for p in range(1, 8):
        for w in W:
            knn_model = KNeighborsClassifier(n_neighbors=k, p=p, weights=w)
            knn_model.fit(x_train, y_train)
            score = knn_model.score(x_test, y_test)
            if best_score < score:
                best_score = score
                best_k = k
                best_p = p
                weight = w
print("best_k:", best_k, "best_sc:", best_score,"best_p:",best_p, "weight:",weight)
# Refit with the best hyper-parameters and compare predictions with truth.
knn_model = KNeighborsClassifier(n_neighbors=best_k, p=best_p, weights=weight)
knn_model.fit(x_train,y_train)
score = knn_model.score(x_test, y_test)
y_predict = knn_model.predict(x_test)
print(y_predict)
print(y_test)
print(y_predict == y_test)  # element-wise equality of prediction vs truth
这种操作方法很麻烦。