import time

import matplotlib
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
# Load sklearn's built-in handwritten-digits dataset and unpack the
# feature matrix (1797 x 64 flattened 8x8 images) and target labels.
dig = datasets.load_digits()
print(dig.keys())
X = dig.data
y = dig.target
# NOTE(review): the original transcript ended with a bare `X,y`
# expression, which only displays in a notebook and is a no-op in a
# script, so it was removed.
这段代码载入digits数据,并读取数据的信息
运行如下:
# Pick one arbitrary sample, reshape its 64 flat features back into the
# original 8x8 image and display it — you can see it is a handwritten digit.
# BUG FIX: the original called `plt.imshow` but only `import matplotlib`
# appears at the top of the file, so `plt` was undefined (NameError);
# `matplotlib.pyplot` is now imported as `plt` in the import block.
some = X[555]
print(y[555])  # the label of the sample being shown
some1 = some.reshape(8, 8)
plt.imshow(some1, cmap=matplotlib.cm.binary)
plt.show()
运行如下:
# Split the data with train_test_split (80% train / 20% test).
# `random_state` pins the split so the accuracies quoted later in this
# transcript are reproducible from run to run.
# NOTE(review): the redundant re-import of train_test_split was dropped —
# it is already imported at the top of the file.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(X_train.shape)  # bare `X_train.shape` only displays in a notebook
# Train sklearn's k-nearest-neighbours classifier with k=3 and measure
# accuracy on the held-out test set by hand.
from sklearn.neighbors import KNeighborsClassifier

KNN = KNeighborsClassifier(n_neighbors=3)
KNN.fit(X_train, y_train)  # fit the model on the training split
y_predict = KNN.predict(X_test)
# fraction of test samples whose predicted label matches the true label
correct = sum(y_predict == y_test)
accuracy = correct / len(y_test)
print("预测结果准确度:", accuracy)
或者用sklearn中的方法求准确度,两个结果一样
# Same accuracy via sklearn's accuracy_score — the result matches the
# manual sum/len computation above.
# BUG FIX: the original left the return value as a bare expression,
# which displays in a notebook but is silently discarded in a script.
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, y_predict))
这差不多也算一个简单的机器学习过程了,输入数据,建立模型,预测。与完整的相比,还少了数据预处理、模型优化、调参等等优化步骤
超参数:指在运行机器学习算法之前设置的参数,比如knn中的k
模型参数:算法过程中学习的参数,knn中没有模型参数
下面寻找最好的k
# Manual hyperparameter search: try k = 1..10 and keep the k that gives
# the best test-set accuracy.
# BUG FIX: in the original transcript the loop body and the `if` body had
# lost all indentation, which is a SyntaxError in Python; the structure
# is restored here.
best_k = -1
best_score = 0.0
for k in range(1, 11):
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(X_train, y_train)
    score = knn_clf.score(X_test, y_test)
    if score > best_score:
        best_k = k
        best_score = score
print("best_k=", best_k)
print("best_score=", best_score)
使用网格搜索方法找最好的超参数的值
# weights, n_neighbors and p are all KNN hyperparameters (typo "weigths"
# in the original comment fixed). `p` (the Minkowski distance power) is
# only searched together with distance weighting.
# Idiom fix: `[i for i in range(...)]` is an unnecessary comprehension —
# `list(range(...))` builds the same list directly.
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': list(range(1, 11)),
    },
    {
        'weights': ['distance'],
        'n_neighbors': list(range(1, 11)),
        'p': list(range(1, 6)),
    },
]
# Exhaustive grid search over param_grid for the best KNN hyperparameters.
knn_clf = KNeighborsClassifier()
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(knn_clf, param_grid)
# BUG FIX: the transcript contained the IPython cell magic `%%time` here,
# which is a SyntaxError in a plain Python script (and a cell magic must
# be the first line of its cell anyway); the fit is timed explicitly with
# time.perf_counter() instead.
start = time.perf_counter()
grid_search.fit(X_train, y_train)
print("grid search fit time: %.2fs" % (time.perf_counter() - start))
运行结果:
# Best cross-validation score found by the grid search. The original bare
# expression only displays in a notebook; print it so a script shows it too.
print(grid_search.best_score_)
0.9874739039665971,相比上面的还低些,原因是评估标准不同:GridSearchCV 的 best_score_ 是训练集上的交叉验证得分,而上面是测试集上的准确率
# Print the best hyperparameter combination found by the grid search
# (a bare expression would be silently discarded outside a notebook).
print(grid_search.best_params_)
{'n_neighbors': 3, 'p': 2, 'weights': 'distance'}
# best_estimator_ is the classifier refit with the winning hyperparameters;
# evaluate it on the held-out test set.
# BUG FIX: the score was a bare expression (no-op in a script) — print it.
knn_clf = grid_search.best_estimator_
print(knn_clf.score(X_test, y_test))
0.986111111111111
这些就是使用sklearn中的model_selection模块封装的GridSearchCV方法找到最好超参数的步骤