摘要
在分类问题中,模型的选择方式和评判标准多种多样,为了方便以后使用,在粗浅的知识储备下,我用sklearn封装了一下各种选择器和一部分评分方法,提供了单一分类器,网格搜索和随机搜索三种方式,使用的时候可以根据需求自由定制参数,添加需要用到的分类器
没准以后还会更新_(:з」∠)_
# 指定模型后对数据进行模型评分评估
import numpy as np
import pandas as pd
# 分类器
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier # 多数投票分类器
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# 评分方法
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
class ModelSelectionTools:
    """Model-selection helpers built on scikit-learn.

    Evaluates a fixed set of classifiers (logistic regression, random
    forest, Gaussian naive Bayes, SVM, kNN) on a data set using one of
    three strategies:

    * ``score_single_classifier``   -- plain cross-validated scoring
    * ``score_single_gridSearch``   -- exhaustive grid search over hyper-parameters
    * ``score_single_randomSearch`` -- randomized hyper-parameter search

    Scale-sensitive estimators (LR, SVM) are wrapped in a ``Pipeline``
    with a ``StandardScaler`` so standardization is re-fit inside each
    cross-validation fold rather than leaking from the full data set.
    """

    # Mapping of short name -> estimator; populated in __init__.
    classifiers_ = None

    def __init__(self):
        # Shared scoring configuration used by every strategy.
        self.config_ = {
            'scoring': 'accuracy',
            # 'scoring': 'roc_auc',  # only applicable to binary problems
            'cv': 5
        }
        self.classifiers_ = {
            # solver='liblinear' supports both the 'l1' and 'l2' penalties
            # that the grid/random searches iterate over; the modern default
            # solver ('lbfgs') rejects penalty='l1'.
            'lr': Pipeline([('scl', StandardScaler()),
                            ('lr', LogisticRegression(penalty='l2', C=1,
                                                      solver='liblinear',
                                                      random_state=1))]),
            'rf': RandomForestClassifier(random_state=1, n_estimators=100),
            'gnb': GaussianNB(),
            'svm': Pipeline([('scl', StandardScaler()),
                             ('svc', SVC(probability=True, random_state=1))]),
            'knn': KNeighborsClassifier(n_neighbors=5,
                                        p=2,
                                        metric='minkowski')
        }

    def score_single_classifier(self, X, y):
        """Cross-validate every configured classifier on (X, y).

        Prints mean +/- std of the configured score for each classifier.
        """
        scoring = self.config_['scoring']
        cv = self.config_['cv']
        for label, clf in self.classifiers_.items():
            scores = cross_val_score(estimator=clf, X=X, y=y, scoring=scoring, cv=cv)
            print(scoring, ': %.3f +/- %.3f [%s]' % (scores.mean(), scores.std(), label))

    def _evaluate_searchers(self, searchers, X, y):
        """Shared evaluation loop for grid/randomized searchers.

        For each searcher: report a nested cross-validated score (the
        outer CV split scores the search procedure as a whole), then fit
        on the full data to report the best hyper-parameters found.
        """
        scoring = self.config_['scoring']
        cv = self.config_['cv']
        # Iterate in classifiers_ order so the report order is stable.
        for label in self.classifiers_:
            if label not in searchers:
                continue
            clf = searchers[label]
            # Nested cross-validation; slow for expensive estimators.
            scores = cross_val_score(estimator=clf, X=X, y=y, scoring=scoring, cv=cv)
            print(scoring, ': %.3f +/- %.3f [%s]' % (scores.mean(), scores.std(), label))
            clf.fit(X, y)
            print('Best Model: ', clf.best_params_)

    # Hyper-parameter search via grid search
    def score_single_gridSearch(self, X, y):
        """Tune hyper-parameters with exhaustive grid search.

        Builds one GridSearchCV per classifier (random forest is skipped
        on purpose: a full RF grid combined with nested cross-validation
        is prohibitively slow), then reports nested-CV score and best
        parameters via _evaluate_searchers.
        """
        scoring = self.config_['scoring']
        cv = self.config_['cv']
        classifiers_ = self.classifiers_
        gs_classifiers_ = {}

        # Logistic regression: regularization strength and penalty type.
        param_range = [10 ** c for c in range(-4, 4)]
        gs_classifiers_['lr'] = GridSearchCV(
            estimator=classifiers_['lr'],
            param_grid=[{'lr__C': param_range, 'lr__penalty': ['l1', 'l2']}],
            scoring=scoring,
            cv=cv,
            n_jobs=-1)

        # kNN: neighbourhood size and Minkowski power parameter.
        gs_classifiers_['knn'] = GridSearchCV(
            estimator=classifiers_['knn'],
            param_grid=[{'n_neighbors': range(3, 10),
                         'p': range(2, 10)}],
            scoring=scoring,
            cv=cv,
            n_jobs=-1)

        # SVM: a linear kernel only needs C tuned; kernel SVMs tune both
        # C and gamma.
        gs_classifiers_['svm'] = GridSearchCV(
            estimator=classifiers_['svm'],
            param_grid=[
                {'svc__C': param_range, 'svc__kernel': ['linear']},
                {'svc__C': param_range, 'svc__gamma': param_range,
                 'svc__kernel': ['rbf', 'poly', 'sigmoid']}
            ],
            scoring=scoring,
            cv=cv,
            n_jobs=-1)

        self._evaluate_searchers(gs_classifiers_, X, y)

    # Hyper-parameter search via randomized search
    def score_single_randomSearch(self, X, y):
        """Tune hyper-parameters with randomized search.

        Builds one RandomizedSearchCV per classifier (random forest is
        skipped for the same speed reason as in score_single_gridSearch);
        each searcher uses the shared scoring/cv configuration and a fixed
        random_state so runs are reproducible.
        """
        scoring = self.config_['scoring']
        cv = self.config_['cv']
        classifiers_ = self.classifiers_
        rs_classifiers_ = {}

        # Logistic regression: wider C range than the grid search since
        # random search only samples n_iter combinations.
        param_range = [10 ** c for c in range(-10, 10)]
        rs_classifiers_['lr'] = RandomizedSearchCV(
            classifiers_['lr'],
            param_distributions={'lr__C': param_range,
                                 'lr__penalty': ['l1', 'l2']},
            n_iter=20,
            scoring=scoring,
            cv=cv,
            n_jobs=-1,
            random_state=1)

        # kNN: neighbourhood size and Minkowski power parameter.
        rs_classifiers_['knn'] = RandomizedSearchCV(
            classifiers_['knn'],
            param_distributions={'n_neighbors': range(3, 10),
                                 'p': range(2, 10)},
            n_iter=5,
            scoring=scoring,
            cv=cv,
            n_jobs=-1,
            random_state=1)

        # SVM: sample C/gamma/kernel jointly.
        rs_classifiers_['svm'] = RandomizedSearchCV(
            classifiers_['svm'],
            param_distributions={'svc__C': param_range,
                                 'svc__gamma': param_range,
                                 'svc__kernel': ['rbf', 'poly', 'sigmoid']},
            n_iter=20,
            scoring=scoring,
            cv=cv,
            n_jobs=-1,
            random_state=1)

        self._evaluate_searchers(rs_classifiers_, X, y)

    # Utility function to report best scores
    def report(self, results, n_top=3):
        """Print the n_top best-ranked candidates from a cv_results_ dict.

        results is the cv_results_ attribute of a fitted search object.
        """
        for i in range(1, n_top + 1):
            candidates = np.flatnonzero(results['rank_test_score'] == i)
            for candidate in candidates:
                # print("Model with rank: {0}".format(i))
                print("Mean validation score: {0:.3f} +/- {1:.3f}".format(
                    results['mean_test_score'][candidate],
                    results['std_test_score'][candidate]))
                print("Parameters: {0}".format(results['params'][candidate]))
                print("")

    # Stratified k-fold cross-validation -> preserves class proportions
    def SKF(self, estimator, X_train, y_train):
        """Score estimator with 10-fold stratified cross-validation.

        X_train / y_train must support numpy fancy indexing (arrays, not
        DataFrames). Prints mean +/- std accuracy; if the estimator is a
        search object, also prints its best parameters.
        """
        # shuffle=True is required for random_state to be meaningful
        # (and current scikit-learn raises without it).
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
        scores = []
        for k, (train, test) in enumerate(skf.split(X_train, y_train)):
            estimator.fit(X_train[train], y_train[train])
            scores.append(estimator.score(X_train[test], y_train[test]))
            # print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k + 1, np.bincount(y_train[train]), scores[-1]))
        print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))  # np.std: standard deviation
        # Only search objects (Grid/RandomizedSearchCV) expose best_params_;
        # guard so plain estimators do not crash here.
        if hasattr(estimator, 'best_params_'):
            print('Best Model: ', estimator.best_params_)
# Demo
if __name__ == "__main__":
    # Load the UCI iris data set. The raw iris.data file has NO header
    # row, so header=None is required; the pandas default (header=0)
    # would silently consume the first sample as column names.
    df = pd.read_csv('./Data/UCI/iris/iris.data', header=None)
    # Data slicing: first four columns are features, fifth is the label.
    X, y = df.iloc[:, 0:4].values, df.iloc[:, 4].values
    # Encode string class labels as integers.
    le = LabelEncoder()
    y = le.fit_transform(y)
    # Model building / evaluation.
    mst = ModelSelectionTools()
    mst.score_single_classifier(X, y)
    # mst.score_single_gridSearch(X, y)
    # mst.score_single_randomSearch(X, y)