1. Cross-validation: cross_val_score
sklearn.model_selection.cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, verbose=0, fit_params=None, pre_dispatch='2*n_jobs')
(the old sklearn.cross_validation module has been removed; the function lives in sklearn.model_selection, which is also what the code below imports)
Parameters
estimator: the estimator (model) object to evaluate
X: the feature data to fit
y: the target values to predict
scoring: the scoring method to use, e.g. 'accuracy'
cv: a cross-validation generator, an iterable of splits, or the number of folds
n_jobs: number of CPUs to use in parallel (-1 means all)
verbose: verbosity level
fit_params: parameters to pass to the estimator's fit method
pre_dispatch: controls the number of jobs dispatched during parallel execution. Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched than the CPUs can process. This parameter can be:
- None, in which case all jobs are immediately created and spawned; use this for lightweight and fast-running jobs to avoid delays from on-demand spawning
- an int, giving the exact total number of jobs that are spawned
- a string, giving an expression as a function of n_jobs, such as '2*n_jobs'
Returns
An array of scores, one per cross-validation run
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.model_selection import cross_val_score
#load the data
iris=load_iris()
X=iris.data
Y=iris.target
#-------------------------------------------------------
#cross-validation
KNN=KNeighborsClassifier()
score=cross_val_score(KNN,X,Y,scoring='accuracy',cv=6) #cv is the number of folds
score.mean()
'''
0.9660493827160493
'''
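#cv can also be an explicit splitter object instead of a fold count, and n_jobs can
#parallelize the runs; a minimal sketch (shuffle and random_state here are illustrative
#assumptions, not values from the example above):
from sklearn.model_selection import KFold
kf=KFold(n_splits=6,shuffle=True,random_state=0)
score=cross_val_score(KNN,X,Y,scoring='accuracy',cv=kf,n_jobs=-1) #-1 uses all CPUs
score.mean()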
#------------------------------------------------------
#-------------------------------------------------------
#the upper bound for K is usually the square root of the sample size (here sqrt(150) ≈ 12, hence range(1, 14))
#cross-validation to pick a suitable value of K
errors=[]
for k in range(1,14):
    KNN=KNeighborsClassifier(n_neighbors=k)
    score=cross_val_score(KNN,X,Y,scoring='accuracy',cv=6).mean()
    #the smaller the error, the better this choice of K
    errors.append(1-score)
import matplotlib.pyplot as plt
import numpy as np
#the error is smallest at K=11, so K=11 is the most suitable value of K for this case
plt.plot(np.arange(1,14),errors)
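#rather than reading the best K off the plot, it can be recovered programmatically;
#a minimal sketch using the errors list built above:
best_k=np.arange(1,14)[np.argmin(errors)] #K with the smallest cross-validation error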
#-------------------------------------------------------
#-------------------------------------------------------
#cross-validation to pick suitable K and weights values, i.e. model tuning
weights=['uniform','distance']
result={}
for k in range(1,14):
    for w in weights:
        KNN=KNeighborsClassifier(n_neighbors=k,weights=w)
        sm=cross_val_score(KNN,X,Y,scoring='accuracy',cv=6).mean()
        result[w+str(k)]=sm
#position of the best score (a dict has no .index attribute, so look the key up by position)
best=np.array(list(result.values())).argmax()
list(result.keys())[best]
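#since pandas is already imported, the same lookup is a one-liner; a minimal
#sketch over the result dict built above:
pd.Series(result).idxmax() #key of the combination with the highest mean score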
2. Grid search: GridSearchCV
GridSearchCV(estimator, param_grid, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score=nan, return_train_score=False)
(the fit_params and iid arguments existed in older scikit-learn versions but have since been removed)
estimator: the classifier to use, with any parameters other than the ones being tuned already set. Each estimator needs a scoring parameter or a score method, e.g.:
estimator=RandomForestClassifier(min_samples_split=100,min_samples_leaf=20,max_depth=8,max_features='sqrt',random_state=10)
param_grid: the parameter values to optimize over, as a dict or a list of dicts (see the sketch after this list)
scoring=None: the model evaluation criterion; with the default None, the estimator's score method is used. Alternatives such as scoring='roc_auc' depend on the chosen model and task.
n_jobs: number of parallel jobs; an int count, or -1 to use all CPU cores; the default (None) behaves like 1
cv=None: the cross-validation setting; with the default None, 5-fold cross-validation is used in recent scikit-learn (3-fold before version 0.22). It can also be an int number of folds or a generator yielding train/test splits.
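Because param_grid accepts a list of dicts, disjoint parameter subspaces can be searched in one call; a minimal sketch for KNeighborsClassifier (the values below are illustrative assumptions):
param_grid=[
    {'algorithm':['brute'],'n_neighbors':[3,5,7]},
    #leaf_size only applies to the tree-based search algorithms
    {'algorithm':['ball_tree'],'n_neighbors':[3,5,7],'leaf_size':[20,30,40]},
]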
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
#grid search
from sklearn.model_selection import GridSearchCV
import pandas as pd
#load the data
data=load_breast_cancer()
data_cancer=data.data
target=data.target
cancer=pd.DataFrame(data_cancer,columns=data.feature_names)
X=cancer.copy()
cancer['Diagnosis']=data.target
#split into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(pd.DataFrame(data_cancer,columns=data.feature_names),data.target,test_size=0.2,random_state=44)
knn=KNeighborsClassifier()
params={"n_neighbors":[i for i in range(1,30)],
"weights":['uniform','distance'],
"p":[1,2]
}
gcv=GridSearchCV(knn,params,scoring='accuracy',cv=6)
#fit the model on the training set (X_train, Y_train)
gcv.fit(X_train,Y_train)
#the estimator refit with the best parameter combination
gcv.best_estimator_
#mean cross-validated score of the best estimator
gcv.best_score_
'''
0.9384615384615385
'''
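#beyond best_estimator_ and best_score_, the fitted search exposes the full results
#table; a minimal sketch assuming gcv has been fitted as above:
gcv.best_params_ #the winning parameter combination as a dict
pd.DataFrame(gcv.cv_results_)[['params','mean_test_score','rank_test_score']].head()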
#predict
y_=gcv.predict(X_test)
##evaluate accuracy: method 1
gcv.score(X_test,Y_test)
##evaluate accuracy: method 2
from sklearn.metrics import accuracy_score
accuracy_score(Y_test,y_)
#method 3: confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test,y_)
#method 4: precision, recall, and the F1 score
from sklearn.metrics import classification_report
print(classification_report(Y_test,y_,target_names=['M','B'])) #in load_breast_cancer, label 0 is malignant and 1 is benign
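#the report's numbers can be checked by hand from the confusion matrix; a minimal
#sketch for the positive class (label 1 = benign):
tn,fp,fn,tp=confusion_matrix(Y_test,y_).ravel() #rows are true labels, columns predicted
precision=tp/(tp+fp)
recall=tp/(tp+fn)
f1=2*precision*recall/(precision+recall) #F1 is the harmonic mean of precision and recall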
#----------------------------------------------
###improving accuracy, precision, and recall via normalization
#data normalization: method 1: min-max scaling
X_normal=(X-X.min())/(X.max()-X.min())
#split into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X_normal,data.target,test_size=0.2,random_state=44)
knn=KNeighborsClassifier()
params={"n_neighbors":[i for i in range(1,30)],
"weights":['uniform','distance'],
"p":[1,2]
}
gcv=GridSearchCV(knn,params,scoring='accuracy',cv=6)
#fit the model on the training set (X_train, Y_train)
gcv.fit(X_train,Y_train)
#the estimator refit with the best parameter combination
gcv.best_estimator_
#mean cross-validated score of the best estimator
gcv.best_score_
'''
0.9736263736263736
'''
#predict
y_=gcv.predict(X_test)
##evaluate accuracy: method 1
gcv.score(X_test,Y_test)
'''
0.9824561403508771
'''
#----------------------------------------------
#data normalization: method 2: Z-score standardization
X_norm2=(X-X.mean())/X.std()
X_train, X_test, Y_train, Y_test = train_test_split(X_norm2,data.target,test_size=0.2,random_state=44)
knn=KNeighborsClassifier()
params={"n_neighbors":[i for i in range(1,30)],
"weights":['uniform','distance'],
"p":[1,2]
}
gcv=GridSearchCV(knn,params,scoring='accuracy',cv=6)
#fit the model on the training set (X_train, Y_train)
gcv.fit(X_train,Y_train)
#the estimator refit with the best parameter combination
gcv.best_estimator_
#mean cross-validated score of the best estimator
gcv.best_score_
'''
0.967032967032967
'''
#predict
y_=gcv.predict(X_test)
##evaluate accuracy: method 1
gcv.score(X_test,Y_test)
'''
0.9736842105263158
'''
#------------------------------
from sklearn.preprocessing import MinMaxScaler,StandardScaler
#MinMaxScaler gives the same result as the manual min-max normalization above
mms=MinMaxScaler()
mms.fit(X)
X_mms=mms.transform(X) #same values as (X-X.min())/(X.max()-X.min())
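#StandardScaler is the library counterpart of the manual Z-score above; note it divides
#by the population standard deviation (ddof=0), while pandas' std() defaults to the
#sample standard deviation (ddof=1), so the two results differ very slightly
ss=StandardScaler()
X_ss=ss.fit_transform(X) #roughly (X-X.mean())/X.std(ddof=0)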