一、鸢尾花案例
import numpy as np
import matplotlib.pylab as plt
from sklearn import datasets
# Load the iris dataset that ships with scikit-learn and list the fields
# available on the returned object.
iris = datasets.load_iris()
iris.keys()
dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])
# Feature matrix (x) and label vector (y) of the iris dataset.
x, y = iris.data, iris.target
# Show the dimensions of both arrays.
print("x", x.shape)
print("y", y.shape)
x (150, 4)
y (150,)
1.1 训练集和测试集拆分
from sklearn.model_selection import train_test_split

# test_size: fraction of the samples held out as the test set.
# random_state: seed for the shuffling, so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=666,
)
1.2 未进行数据归一化
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbors classifier with k=3, fitted on the training split
# as it is at this point (no feature scaling applied yet).
clf1 = KNeighborsClassifier(n_neighbors=3)
clf1.fit(X_train,y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, n_neighbors=3, p=2, weights='uniform')
# Mean accuracy of the unscaled model on the held-out test split.
clf1.score(X_test,y_test) # expected score: 1
1.3 数据归一化:StandardScaler
- 为什么数据归一化:将所有数据映射到同一尺度,避免数值范围较大的特征主导距离计算
- 最值归一化:所有数据映射到0-1之间。适用于分布有明显边界的情况,受离群值影响较大
- 均值方差归一化:所有数据归一到均值为0、方差为1的分布中。适用于分布没有明显边界,可能存在极端值的情况
from sklearn.preprocessing import StandardScaler
# Standardization: rescale each feature to zero mean and unit variance
scaler = StandardScaler()
# Learn the scaling parameters (e.g. mean and std) from X_train only
scaler.fit(X_train)
StandardScaler(copy=True, with_mean=True, with_std=True)
# Per-feature mean learned by the scaler
scaler.mean_ # array([5.83416667, 3.08666667, 3.70833333, 1.17 ])
# Per-feature spread of the data (standard deviation)
scaler.scale_ # array([0.81019502, 0.44327067, 1.76401924, 0.75317107])
# Standardize the training data (X_train is overwritten with the scaled values)
X_train = scaler.transform(X_train)
# Standardize the test data with the same fitted scaler
# (X_test is overwritten with the scaled values)
X_test = scaler.transform(X_test)
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbors classifier with k=3, fitted on the standardized training data
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train,y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, n_neighbors=3, p=2, weights='uniform')
# Accuracy on the standardized test data
clf.score(X_test,y_test) # 1.0
二、手写数字图片案例
import numpy as np
import matplotlib.pylab as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
# Load the handwritten-digits dataset that ships with scikit-learn and list
# the fields available on the returned object.
digits = datasets.load_digits()
digits.keys()
dict_keys(['data', 'target', 'target_names', 'images', 'DESCR'])
# Feature matrix (X) and label vector (y) of the digits dataset.
X, y = digits.data, digits.target
print(X.shape)  # (1797, 64)
print(y.shape)  # (1797,)
2.1 训练集、测试集拆分
# test_size: fraction of the samples held out as the test set.
# random_state: seed for the shuffling, so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=666,
)

from sklearn.neighbors import KNeighborsClassifier

# Build the classifier (k = 3 neighbors); it is not fitted yet.
clf = KNeighborsClassifier(n_neighbors=3)
2.2 训练
# Fit the k-NN classifier on the training split.
clf.fit(X_train, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=3, p=2,
weights='uniform')
2.3 预测
# Predict labels for the held-out test samples.
y_predict = clf.predict(X_test)
from sklearn.metrics import accuracy_score
# Accuracy: compare the true test labels with the predicted ones
accuracy_score(y_test,y_predict) # 0.9888888888888889
# Same metric without a separate predict() step: score() takes the test
# features and labels directly (the recorded value matches the one above)
clf.score(X_test,y_test) # 0.9888888888888889
2.4 Grid Search 网格搜索,查询最优模型参数
# Hyper-parameter grid for KNeighborsClassifier, consumed by GridSearchCV.
#
# weights='uniform':  all neighbors get the same weight.
# weights='distance': neighbors are weighted by the inverse of their
#                     distance, so closer points influence the prediction
#                     more than distant ones.
# p (int, optional, default 2): exponent of the Minkowski distance;
#                     p=1 is equivalent to the Manhattan distance,
#                     p=2 is equivalent to the Euclidean distance.
#
# This grid varies p only in the 'distance' sub-grid; the 'uniform'
# sub-grid leaves p at its default.
# NOTE(review): in scikit-learn p selects the distance metric independently
# of `weights`, so it could be searched for 'uniform' as well -- confirm
# whether restricting it to 'distance' is intentional.
param = [
    {
        'weights': ['uniform'],
        'n_neighbors': list(range(1, 11)),
    },
    {
        'weights': ['distance'],
        'n_neighbors': list(range(1, 11)),
        'p': list(range(1, 6)),
    },
]
说明:cross-validation(交叉验证)将训练数据分成多份,轮流取其中一份作为验证集用于调参,其余部分用于训练;最后再用测试数据进行验证。这样可以有效避免在调参阶段模型对测试数据过拟合
# Estimator with default settings; it is handed to GridSearchCV as the
# base model whose hyper-parameters are searched.
knn_clf = KNeighborsClassifier()
# cross-validation
from sklearn.model_selection import GridSearchCV
%%time
# n_jobs: how many CPU cores to use; -1 means all of them.
# verbose: progress output; the larger the value, the more detail is printed.
search = GridSearchCV(
    knn_clf,
    param,
    n_jobs=-1,
    verbose=4,
)
search.fit(X_train, y_train)
# The best model found by the search (an estimator configured with the best parameters)
search.best_estimator_
'''
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=1, p=2,
weights='uniform')
'''
# Best score reached during the search
search.best_score_ # 0.9860820751064653
# Parameter combination that produced the best score
search.best_params_ # {'n_neighbors': 1, 'weights': 'uniform'}
# Rebind knn_clf to the best model found by the search
knn_clf = search.best_estimator_
# Evaluate the selected model on the held-out test split
knn_clf.score(X_test, y_test) # 0.9833333333333333