交叉验证
作用
避免因为数据分配不均匀造成一些标签验证不出效果。
建立网格模型,求最佳超参数。
步骤
超参调试—得到最优模型
# 网格模型
# Hyper-parameter search space for the KNN grid search.
param_grid = [{
    "n_neighbors": list(range(3, 13)),   # candidate neighbour counts
    "weights": ['uniform', 'distance'],  # neighbour weighting schemes
    "p": list(range(1, 8)),              # Minkowski distance power
}]
# Base estimator; hyper-parameters are supplied by the grid search.
knn = KNeighborsClassifier()
# Exhaustive search over the grid with 10-fold cross-validation.
grid = GridSearchCV(knn, param_grid, cv=10)
# Fit every parameter combination on the training data.
grid.fit(x_train, y_train)
print("11----:", grid.best_estimator_)
print("22----:", grid.best_score_)
print("33----:", grid.best_params_)
2、【完整代码】将得到的最优超参代入进去求最优模型。
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
# 1.数据加载
from sklearn.utils import shuffle
iris = load_iris()
# 2. Put the four measurements (sepal length/width, petal length/width) into a DataFrame.
iris_df = pd.DataFrame(iris["data"], columns=iris.feature_names)
print(iris_df.head())
# 3. Attach the class labels (iris.target) as a "label" column.
iris_df["label"] = iris.target
print(iris_df.head(6), iris_df.shape)
# 4. Shuffle the rows; features and label move together, row by row.
shuffled = shuffle(iris_df)
print(shuffled.head())
# 5. Rebuild a clean 0..n-1 index after shuffling (reset_index).
shuffled.reset_index(drop=True, inplace=True)
print(shuffled.head(10), shuffled.shape)
# 6. Split into features (first four columns) and label.
x = shuffled.iloc[:, :4].values  # convert to ndarray
print(x)
# Flatten the (n, 1) label slice down to a 1-D array,
# which is the shape sklearn expects for y.
y = np.ravel(shuffled.iloc[:, 4:].values)
print(y, y.shape, type(y))
# 7. Hold out 20% for testing: 120 training samples, 30 test samples.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=6)
# # k初始值为-1
# best_k = -1 # 邻居 如何得到一半的邻居 k要小于数据个数的平方根
# best_p = 1
# best_score = 0 # 最高评分
# W = ['uniform', 'distance']
# weight = ''
# for k in range(3, 13):
# # 创建出每一个k对应的模型
# for p in range(1, 8): # 表示求和次方数
# for w in W:
# knn_model = KNeighborsClassifier(n_neighbors=k, p=p, weights=w)
# knn_model.fit(x_train, y_train)
# score = knn_model.score(x_test, y_test)
# if best_score < score: # 如果bestscore小于score
# best_score = score
# best_k = k
# best_p = p
# weight = w
# # 网格模型
# param_list = [{
# "n_neighbors": list(range(3, 13)),
# "weights": ['uniform', 'distance'],
# "p": [i for i in range(1, 8)] #快速生成列表生成式
# }]
# # 创建算法
# knn_model = KNeighborsClassifier()
# # 网格 (每十条分组)
# grid=GridSearchCV(knn_model,param_list, cv=10)
# # 训练模型
# grid.fit(x_train,y_train)
# print("11----:",grid.best_estimator_)
# print("22----:",grid.best_score_)
# print("33----:",grid.best_params_)
# Final model, built with the best hyper-parameters found by the grid search above.
knn_model = KNeighborsClassifier(n_neighbors=5, p=6, weights='uniform')
knn_model.fit(x_train, y_train)
# Accuracy on the held-out test set.
# Fix: the original computed this score but never used it — report it.
score = knn_model.score(x_test, y_test)
print("test accuracy:", score)
y_predict = knn_model.predict(x_test)
print(y_predict)
print(y_test)
print(y_predict == y_test)  # element-wise: True where the prediction matches the label
#
# # 模型保存
# import joblib
# joblib.dump()
#
交叉验证
# Fix: cross_val_score was called without ever being imported (NameError at runtime).
# Import it here, matching this file's existing mid-script import style (cf. joblib below).
from sklearn.model_selection import cross_val_score

# Mean accuracy over 10-fold cross-validation on the training data.
print(cross_val_score(knn_model, x_train, y_train, cv=10, scoring='accuracy').mean())
y_predict = knn_model.predict(x_test)
print(y_predict == y_test)  # element-wise: True where the prediction matches the label
模型保存
# Persist the trained classifier to disk so it can be reloaded later.
import joblib

joblib.dump(knn_model, 'knnModelIris.model')
#
使用训练的模型
1.加载模型
import joblib
import numpy as np

# Restore the classifier that was persisted above.
model = joblib.load("knnModelIris.model")
2.给数据
# One unlabeled sample: sepal length, sepal width, petal length, petal width.
data = np.array([4.6, 3.1, 1.5, 0.2])
3.将数据变为二维
# sklearn's predict() expects a 2-D (n_samples, n_features) array, so
# promote the single sample to shape (1, 4).
data = np.reshape(data, (1, -1))
print(data, data.shape)
4、使用模型对数据进行分类
# 4. Classify the sample with the restored model.
# predict() returns an array with one predicted class index per row of `data`.
y_predict = model.predict(data)
print(y_predict)