# Dataset source: the Kaggle mobile price dataset
import numpy as np
import optuna
import pandas as pd
from functools import partial

from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll import scope
from sklearn import ensemble, metrics, model_selection, preprocessing, pipeline, decomposition
from skopt import space, gp_minimize
def optimize(params, param_names, x, y):
    # Objective for skopt's gp_minimize: params arrive as a plain list in the
    # order of the search space, so zip them back into a keyword dict first.
    params = dict(zip(param_names, params))
    model = ensemble.RandomForestClassifier(**params)
    kf = model_selection.StratifiedKFold(n_splits=5)
    accuracies = []
    for train_idx, test_idx in kf.split(X=x, y=y):
        xtrain, ytrain = x[train_idx], y[train_idx]
        xtest, ytest = x[test_idx], y[test_idx]
        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        fold_acc = metrics.accuracy_score(ytest, preds)
        accuracies.append(fold_acc)
    # The optimizers minimize, so return the negative mean accuracy.
    return -1 * np.mean(accuracies)
def optimize_1(params, x, y):
    # Objective for hyperopt's fmin: params already arrive as a dict.
    model = ensemble.RandomForestClassifier(**params)
    kf = model_selection.StratifiedKFold(n_splits=5)
    accuracies = []
    for train_idx, test_idx in kf.split(X=x, y=y):
        xtrain, ytrain = x[train_idx], y[train_idx]
        xtest, ytest = x[test_idx], y[test_idx]
        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        fold_acc = metrics.accuracy_score(ytest, preds)
        accuracies.append(fold_acc)
    return -1 * np.mean(accuracies)
def optimize_2(trial, x, y):
    # Objective for optuna: hyperparameters are sampled from the trial object.
    criterion = trial.suggest_categorical("criterion", ["gini", "entropy"])
    n_estimators = trial.suggest_int("n_estimators", 100, 1500)
    max_depth = trial.suggest_int("max_depth", 3, 15)
    # suggest_uniform is deprecated; suggest_float is the current equivalent.
    max_features = trial.suggest_float("max_features", 0.1, 1.0)
    model = ensemble.RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        criterion=criterion,
    )
    kf = model_selection.StratifiedKFold(n_splits=5)
    accuracies = []
    for train_idx, test_idx in kf.split(X=x, y=y):
        xtrain, ytrain = x[train_idx], y[train_idx]
        xtest, ytest = x[test_idx], y[test_idx]
        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        fold_acc = metrics.accuracy_score(ytest, preds)
        accuracies.append(fold_acc)
    return -1 * np.mean(accuracies)
if __name__ == "__main__":
    df = pd.read_csv("../Data/archive/train.csv")
    X = df.drop("price_range", axis=1).values
    y = df.price_range.values
    classifier = ensemble.RandomForestClassifier(n_jobs=4)
    '''
    Grid search
    '''
    param_grid_1 = {
        "n_estimators": [100, 200, 300, 400],
        "max_depth": [1, 3],
        "criterion": ["gini", "entropy"],
    }
    '''
    Random search
    '''
    param_grid_2 = {
        "n_estimators": np.arange(100, 1500, 100),
        "max_depth": np.arange(1, 20),
        "criterion": ["gini", "entropy"],
    }
    model1 = model_selection.GridSearchCV(
        estimator=classifier,
        param_grid=param_grid_1,
        scoring="accuracy",
        verbose=10,
        n_jobs=4,
        cv=5,
    )
    model2 = model_selection.RandomizedSearchCV(
        estimator=classifier,
        param_distributions=param_grid_2,
        n_iter=10,
        scoring="accuracy",
        verbose=10,
        n_jobs=4,
        cv=5,
    )
    # Random search over a scale -> PCA -> random forest pipeline; pipeline
    # step names prefix the parameter keys (e.g. "rf__n_estimators").
    scl = preprocessing.StandardScaler()
    pca = decomposition.PCA()
    rf = ensemble.RandomForestClassifier(n_jobs=8)
    classifier1 = pipeline.Pipeline([("scaling", scl), ("pca", pca), ("rf", rf)])
    param_grid_3 = {
        "pca__n_components": np.arange(5, 10),
        "rf__n_estimators": np.arange(100, 1500, 100),
        "rf__max_depth": np.arange(1, 20),
        "rf__criterion": ["gini", "entropy"],
    }
    model3 = model_selection.RandomizedSearchCV(
        estimator=classifier1,
        param_distributions=param_grid_3,
        n_iter=10,
        scoring="accuracy",
        verbose=10,
        n_jobs=1,
        cv=5,
    )
    # Fit whichever search is wanted, e.g.:
    # model1.fit(X, y)
    # print(model1.best_score_)
    # print(model1.best_estimator_)
    '''
    scikit-optimize (gp_minimize)
    '''
    # param_space = [
    #     space.Integer(3, 15, name="max_depth"),
    #     space.Integer(100, 600, name="n_estimators"),
    #     space.Categorical(["gini", "entropy"], name="criterion"),
    #     space.Real(0.01, 1, prior="uniform", name="max_features"),
    # ]
    # param_names = ["max_depth", "n_estimators", "criterion", "max_features"]
    # optimization_function = partial(
    #     optimize,
    #     param_names=param_names,
    #     x=X,
    #     y=y,
    # )
    # result = gp_minimize(
    #     optimization_function,
    #     dimensions=param_space,
    #     n_calls=15,
    #     n_random_starts=10,
    #     verbose=10,
    # )
    # print(dict(zip(param_names, result.x)))
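    # If the gp_minimize block above is enabled, result.fun holds the best
    # (minimized) objective, i.e. the negative of the best mean CV accuracy:
    # print(-result.fun)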
    '''
    hyperopt
    '''
    # param_space1 = {
    #     "max_depth": scope.int(hp.quniform("max_depth", 3, 15, 1)),
    #     "n_estimators": scope.int(hp.quniform("n_estimators", 100, 600, 1)),
    #     "criterion": hp.choice("criterion", ["gini", "entropy"]),
    #     "max_features": hp.uniform("max_features", 0.01, 1),
    # }
    # optimization_function_1 = partial(
    #     optimize_1,
    #     x=X,
    #     y=y,
    # )
    # trials = Trials()
    #
    # result = fmin(
    #     optimization_function_1,
    #     space=param_space1,
    #     algo=tpe.suggest,
    #     max_evals=15,
    #     trials=trials,
    # )
    # print(result)
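    # Note that fmin reports hp.choice parameters as option *indices*
    # (e.g. criterion 0 for "gini"); hyperopt.space_eval maps the result
    # back to the actual parameter values:
    # from hyperopt import space_eval
    # print(space_eval(param_space1, result))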
    '''
    optuna
    '''
    optimization_function = partial(optimize_2, x=X, y=y)
    # Persist trials to SQLite; rerunning with the same study_name in the same
    # storage raises an error unless load_if_exists=True is passed.
    study = optuna.create_study(study_name='test', direction="minimize", storage='sqlite:///db.sqlite3')
    # study = optuna.create_study(direction="minimize")
    study.optimize(optimization_function, n_trials=15)
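    # Inspect the finished study: best_value is the minimized objective, so
    # negate it to recover the best mean CV accuracy.
    print(study.best_params)
    print(-study.best_value)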