为了节省选择算法和参数的时间,快速地选择最优的分类算法和参数。一个函数能够同时比较不同的分类算法(例子中使用朴素贝叶斯、支持向量机、随机森林、XGBoost和LightGBM),在利用GridSearchCV搜索最佳参数的同时对测试集的准确率进行评估,最终输出结果。
1导入所需的库
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
import lightgbm as lgb
2定义函数来进行模型评估
该函数使用GridSearchCV搜索最佳参数并计算在测试集上的准确率。该函数接收训练集和测试集的特征变量和目标变量。
def model_evaluation(X_train, y_train, X_test, y_test, cv=10, n_jobs=-1):
    """Compare five classifiers via grid search and report test-set accuracy.

    For each of Naive Bayes, SVM, Random Forest, XGBoost and LightGBM,
    runs GridSearchCV on the training data, evaluates the refit best
    estimator on the held-out test set, and prints one summary line.

    Parameters
    ----------
    X_train : array-like
        Training-set feature matrix.
    y_train : array-like
        Training-set target vector.
    X_test : array-like
        Test-set feature matrix.
    y_test : array-like
        Test-set target vector.
    cv : int, default 10
        Number of cross-validation folds passed to GridSearchCV.
    n_jobs : int, default -1
        Parallelism for the grid search; -1 uses all CPU cores.

    Returns
    -------
    dict
        Mapping of model name -> (best_params_, test accuracy).
    """
    # Each entry pairs an estimator with its parameter grid. GaussianNB has
    # no hyperparameters worth tuning here, so its grid is empty.
    models = {
        'Naive Bayes': (GaussianNB(), {}),
        'SVM': (SVC(), {'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']}),
        'Random Forest': (RandomForestClassifier(),
                          {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7]}),
        'XGBoost': (xgb.XGBClassifier(),
                    {'max_depth': [3, 5, 7], 'learning_rate': [0.1, 0.01]}),
        'LightGBM': (lgb.LGBMClassifier(),
                     {'max_depth': [3, 5, 7], 'learning_rate': [0.1, 0.01]}),
    }
    results = {}
    # Grid-search each candidate, then score the refit best model on the test set.
    for name, (model, params) in models.items():
        clf = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        results[name] = (clf.best_params_, accuracy)
        print('Model: {} , Best Parameters: {} , Accuracy: {}'.format(name, clf.best_params_, round(accuracy, 4)))
    return results
3在白酒数据集上进行模型评估
# Demo on the sklearn wine dataset. A fixed random_state makes the split
# (and therefore the printed accuracies) reproducible across runs, and
# stratify keeps the class proportions equal in train and test sets.
from sklearn.datasets import load_wine
wine = load_wine()
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    wine.data, wine.target, test_size=0.2, random_state=42, stratify=wine.target)
model_evaluation(Xtrain, Ytrain, Xtest, Ytest)
4输出结果如下,在小样本下,运行不会很耗时,如果是大样本会非常耗时,可以根据自己的需要控制参数和模型的数量,在选出合适的模型后再单独进行更多的调整和优化。
Model: Naive Bayes , Best Parameters: {} , Accuracy: 0.9722
Model: SVM , Best Parameters: {'C': 0.1, 'kernel': 'linear'} , Accuracy: 0.9167
Model: Random Forest , Best Parameters: {'max_depth': 3, 'n_estimators': 50} , Accuracy: 1.0
Model: XGBoost , Best Parameters: {'learning_rate': 0.1, 'max_depth': 3} , Accuracy: 0.9444
Model: LightGBM , Best Parameters: {'learning_rate': 0.1, 'max_depth': 3} , Accuracy: 0.9444
完整代码
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
import lightgbm as lgb
def model_evaluation(X_train, y_train, X_test, y_test):
    """Grid-search five classifiers and print each one's test-set accuracy.

    Candidates: Naive Bayes, SVM, Random Forest, XGBoost, LightGBM.
    For each, GridSearchCV picks the best hyperparameters on the training
    data (10-fold CV) and the refit model is scored on the test set.

    Parameters
    ----------
    X_train, y_train : array-like
        Training-set features and targets.
    X_test, y_test : array-like
        Test-set features and targets.
    """
    # (label, estimator, parameter grid) triples; Naive Bayes is tuned
    # over an empty grid, i.e. used with its defaults.
    candidates = [
        ('Naive Bayes', GaussianNB(), {}),
        ('SVM', SVC(),
         {'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']}),
        ('Random Forest', RandomForestClassifier(),
         {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7]}),
        ('XGBoost', xgb.XGBClassifier(),
         {'max_depth': [3, 5, 7], 'learning_rate': [0.1, 0.01]}),
        ('LightGBM', lgb.LGBMClassifier(),
         {'max_depth': [3, 5, 7], 'learning_rate': [0.1, 0.01]}),
    ]
    for label, estimator, grid in candidates:
        # 10-fold cross-validation, parallelized across all CPU cores.
        search = GridSearchCV(estimator, grid, cv=10, n_jobs=-1)
        search.fit(X_train, y_train)
        test_acc = accuracy_score(y_test, search.predict(X_test))
        print('Model: {} , Best Parameters: {} , Accuracy: {}'.format(label, search.best_params_, round(test_acc, 4)))
# Demo: run the comparison on the sklearn wine dataset (80/20 random split).
from sklearn.datasets import load_wine
features, labels = load_wine(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(features, labels, test_size=0.2)
model_evaluation(X_tr, y_tr, X_te, y_te)