摘要
对于模型超参数的调整,为了加快搜索速度,我们可以借助 GridSearch 来穷举找到一个最优组合,也可以使用 RandomizedSearch 在更大的搜索域内进行随机采样来找到一个较优组合。这里我参照官方 demo,分别用这两种搜索方法,结合 SVM 和 RandomForest 进行简单的超参数搜索。
测试环境
Entity | Value |
---|---|
Python | 3.5.2 |
sklearn | 0.18.1 |
代码
# 6.4.1: 寻找最优的超参值的组合
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import validation_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint as sp_randint # 用于产生随机分布的整数
# Load data (presumably the UCI WDBC breast-cancer set, judging by the path — verify).
df = pd.read_csv('./Data/UCI/wdbc.data.txt')
# Slicing: features from column 2 onward, class label from column 1.
X, y = df.iloc[:, 2:].values, df.iloc[:, 1].values
# Encode the string class labels as integers.
le = LabelEncoder()
y = le.fit_transform(y)
# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)
# Randomized hyper-parameter search with an SVM classifier.
def RPO_SVM(X_train, y_train, n_iter=20):
    """Run a randomized hyper-parameter search for a scaled SVM pipeline.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    n_iter : int, optional
        Number of random parameter settings sampled (default 20; previously
        a hard-coded constant).

    Returns
    -------
    The best estimator found, already refit on (X_train, y_train).
    """
    # Standardize features before feeding them to the SVM.
    pipe_svc = Pipeline([('scl', StandardScaler()),
                         ('clf', SVC(random_state=1))])
    # Logarithmic range 1e-4 .. 1e3 for both C and gamma.
    param_range = [10 ** c for c in range(-4, 4)]
    param_dist = {  # a kernel SVM needs both C and gamma tuned
        'clf__C': param_range,
        'clf__gamma': param_range,
        'clf__kernel': ['linear', 'rbf']
    }
    # Run the randomized search and print the top-ranked candidates.
    random_search = RandomizedSearchCV(pipe_svc,
                                       param_distributions=param_dist,
                                       n_iter=n_iter)
    random_search.fit(X_train, y_train)
    report(random_search.cv_results_)
    return random_search.best_estimator_
# Randomized hyper-parameter search with a RandomForest classifier.
def RPO_RF(X_train, y_train):
    """Randomly sample RandomForest hyper-parameters and return the best model.

    Draws 20 candidate settings from the distributions below, evaluates each
    by cross-validation, prints the top-ranked ones via report(), and returns
    the refit best estimator.
    """
    forest = RandomForestClassifier(n_estimators=20)
    # Distributions / discrete choices to sample hyper-parameters from.
    search_space = {
        "max_depth": [3, None],
        "max_features": sp_randint(1, 11),
        "min_samples_split": sp_randint(2, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"]
    }
    searcher = RandomizedSearchCV(forest,
                                  param_distributions=search_space,
                                  n_iter=20)
    searcher.fit(X_train, y_train)
    report(searcher.cv_results_)
    return searcher.best_estimator_
# Grid search with an SVM classifier.
def GS_SVM(X_train, y_train):
    """Exhaustively grid-search SVM hyper-parameters on a scaled pipeline.

    Uses 10-fold cross-validated accuracy over separate grids for the linear
    and RBF kernels, prints the top-ranked candidates via report(), and
    returns the best (refit) estimator.
    """
    # Standardize features before feeding them to the SVM.
    pipe_svc = Pipeline([('scl', StandardScaler()),
                         ('clf', SVC(random_state=1))])
    # Logarithmic range 1e-4 .. 1e3.
    param_range = [10 ** c for c in range(-4, 4)]
    param_grid = [
        # a linear SVM only needs the regularization strength C tuned
        {'clf__C': param_range, 'clf__kernel': ['linear']},
        # a kernel SVM needs both C and gamma tuned
        {'clf__C': param_range, 'clf__gamma': param_range, 'clf__kernel': ['rbf']}
    ]
    gs = GridSearchCV(estimator=pipe_svc,
                      param_grid=param_grid,
                      scoring='accuracy',
                      cv=10,
                      n_jobs=-1)
    gs = gs.fit(X_train, y_train)
    report(gs.cv_results_)
    return gs.best_estimator_
# Grid search with a RandomForest classifier.
def GS_RF(X_train, y_train):
    """Exhaustively grid-search RandomForest hyper-parameters.

    Runs 10-fold cross-validated GridSearchCV over a small discrete grid of
    depth/feature/split settings, prints the top-ranked candidates via
    report(), and returns the best (refit) estimator.
    """
    clf = RandomForestClassifier(n_estimators=20)
    # Ranges kept narrow so the exhaustive search stays cheap; a per-criterion
    # grid (one dict per criterion) would also work if the ranges should differ.
    param_grid = [
        {
            'max_depth': [3, None],
            'max_features': range(1, 4),
            "min_samples_split": range(2, 4),
            "min_samples_leaf": range(1, 4),
            "bootstrap": [True, False],
            "criterion": ["gini", "entropy"]
        }
    ]
    # Run the grid search and print the top-ranked candidates.
    gs = GridSearchCV(estimator=clf,
                      param_grid=param_grid,
                      scoring='accuracy',
                      cv=10,
                      n_jobs=-1)
    gs = gs.fit(X_train, y_train)
    report(gs.cv_results_)
    return gs.best_estimator_
# Utility function to report best scores
def report(results, n_top=3):
    """Print the top-ranked candidates from a CV search's cv_results_ dict.

    For each rank 1..n_top, prints every candidate holding that rank: its
    mean/std validation score and its parameter setting.
    """
    ranks = results['rank_test_score']
    for rank in range(1, n_top + 1):
        # A rank can be shared by several candidates (ties).
        for idx in np.flatnonzero(ranks == rank):
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][idx],
                results['std_test_score'][idx]))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")
# Entry point: run one of the four searches (the others are left commented
# out as alternatives to try).
# RPO_SVM(X_train, y_train)
# RPO_RF(X_train, y_train)
GS_RF(X_train, y_train)
# GS_SVM(X_train, y_train)