[Reading Notes] Python Machine Learning - A Wrapper for Various Model Selection Methods

Abstract

In classification problems there are many ways to select a model and many criteria for judging one. To make this easier to reuse, and with my admittedly shallow knowledge so far, I used sklearn to wrap up several selectors and a handful of scoring methods. The wrapper offers three modes: scoring a single classifier, grid search, and randomized search. When using it you can freely customize the parameters and register whichever classifiers you need.

This may well get updated again later _(:з」∠)_
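
Before the full listing, here is a minimal sketch of the intended usage, assuming a feature matrix X and label vector y are already prepared (the demo at the bottom shows how to load them from the iris dataset):

mst = ModelSelectionTools()
mst.config_['scoring'] = 'accuracy'   # or another sklearn scoring string
mst.config_['cv'] = 10                # number of cross-validation folds
mst.score_single_classifier(X, y)     # score every registered classifier
mst.score_single_gridSearch(X, y)     # grid search for the best hyperparameters
mst.score_single_randomSearch(X, y)   # randomized search as a cheaper alternative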

# Score and evaluate the data with the specified models
import numpy as np
import pandas as pd

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier  # majority-vote classifier (not used yet)
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Scoring methods
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import randint as sp_randint

class ModelSelectionTools:
    """
    使用不同的策略进行模型选择,对于有必要进行标准化的数据都进行标准化
    """

    def __init__(self):
        self.config_ = {
            'scoring': 'accuracy',
            # 'scoring': 'roc_auc',  # only applicable to binary classification
            'cv': 5
        }

        self.classifiers_ = {
            # liblinear supports both the 'l1' and 'l2' penalties searched later
            'lr': Pipeline([('scl', StandardScaler()),
                            ('lr', LogisticRegression(penalty='l2', C=1, solver='liblinear', random_state=1))]),
            'rf': RandomForestClassifier(random_state=1, n_estimators=100),
            'gnb': GaussianNB(),
            'svm': Pipeline([('scl', StandardScaler()),
                             ('svc', SVC(probability=True, random_state=1))]),
            # KNN is distance-based, so it is standardized inside a Pipeline as well
            'knn': Pipeline([('scl', StandardScaler()),
                             ('knn', KNeighborsClassifier(n_neighbors=5,
                                                          p=2,
                                                          metric='minkowski'))])
        }

    def score_single_classifier(self, X, y):
        scoring = self.config_['scoring']
        cv = self.config_['cv']
        classifiers_ = self.classifiers_

        # Collect classifier instances and their names
        classifiers_clf_  = list(classifiers_.values())
        classifiers_name_ = list(classifiers_.keys())

        for clf, label in zip(classifiers_clf_, classifiers_name_):
            scores = cross_val_score(estimator=clf, X=X, y=y, scoring=scoring, cv=cv)
            print('%s: %.3f +/- %.3f [%s]' % (scoring, scores.mean(), scores.std(), label))


    # Hyperparameter tuning via grid search
    def score_single_gridSearch(self, X, y):

        # Step 1: initialize parameters
        # -----------------
        scoring = self.config_['scoring']
        cv = self.config_['cv']
        classifiers_ = self.classifiers_

        # Holds every grid-search estimator
        gs_classifiers_ = {}

        # End of Step 1
        # -----------------

        # Step 2: build the grid searchers
        # ----------------------

        # Grid searcher for lr
        param_range = [10 ** c for c in range(-4, 4)]
        param_grid_lr = [
            {'lr__C': param_range, 'lr__penalty': ['l1', 'l2']}
        ]
        gs_lr = GridSearchCV(estimator=classifiers_['lr'],
                             param_grid=param_grid_lr,
                             scoring=scoring,
                             cv=cv,
                             n_jobs=-1)
        gs_classifiers_['lr'] = gs_lr

        # Grid searcher for rf: rf combined with cross-validation is very slow, so it stays disabled
        # param_grid_rf = [
        #     {
        #         'max_depth':[2,3,4,5,None],
        #         'max_features': range(1, X.shape[1]),
        #         'min_samples_split':[2,3,4,5],
        #         'min_samples_leaf' : [1,2,3,4,5],
        #         'bootstrap' : [True, False],
        #         'criterion': ["gini", "entropy"]
        #     }
        # ]
        # gs_rf = GridSearchCV(estimator=classifiers_['rf'],
        #                      param_grid=param_grid_rf,
        #                      scoring=scoring,
        #                      cv=cv,
        #                      n_jobs=-1)
        # gs_classifiers_['rf'] = gs_rf

        # Grid searcher for knn (parameters are prefixed with 'knn__' because knn is a Pipeline step)
        param_grid_knn = [
            {
                'knn__n_neighbors': range(3, 10),
                # 'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                'knn__p': range(2, 10)
            }
        ]
        gs_knn = GridSearchCV(estimator=classifiers_['knn'],
                              param_grid=param_grid_knn,
                              scoring=scoring,
                              cv=cv,
                              n_jobs=-1)
        gs_classifiers_['knn'] = gs_knn

        # Grid searcher for svm
        param_range = [10 ** c for c in range(-4, 4)]
        param_grid_svm = [
            {'svc__C': param_range, 'svc__kernel': ['linear']},  # a linear SVM only needs the regularization parameter C tuned
            {'svc__C': param_range, 'svc__gamma': param_range, 'svc__kernel': ['rbf', 'poly', 'sigmoid']}  # a kernel SVM needs both C and gamma tuned
        ]
        gs_svm = GridSearchCV(estimator=classifiers_['svm'],
                              param_grid=param_grid_svm,
                              scoring=scoring,
                              cv=cv,
                              n_jobs=-1)
        gs_classifiers_['svm'] = gs_svm

        # Step 3: run the evaluation

        # Collect searcher instances and their names
        classifiers_grid_clf_ = [gs_classifiers_[key] for key in classifiers_ if key in gs_classifiers_]
        classifiers_grid_name_ = [key for key in classifiers_ if key in gs_classifiers_]

        # Evaluate each classifier via grid search and find the best hyperparameters
        for clf, label in zip(classifiers_grid_clf_, classifiers_grid_name_):
            # default scoring
            # clf.fit(X, y)
            # print('[%s]' % label)
            # self.report(results=clf.cv_results_, n_top=1)

            # Nested cross-validation: the outer cross_val_score wraps the
            # inner grid search (very slow with random forest)
            scores = cross_val_score(estimator=clf, X=X, y=y, scoring=scoring, cv=cv)
            print('%s: %.3f +/- %.3f [%s]' % (scoring, scores.mean(), scores.std(), label))
            clf.fit(X, y)
            print('Best Model: ', clf.best_params_)

            # stratified k-fold scoring
            # print('[%s]: ' % label)
            # self.SKF(estimator=clf, X_train=X, y_train=y)



    # Hyperparameter tuning via randomized search
    def score_single_randomSearch(self, X, y):
        # Step 1: initialize parameters
        # -----------------
        scoring = self.config_['scoring']
        cv = self.config_['cv']
        classifiers_ = self.classifiers_

        # Holds every randomized-search estimator
        rs_classifiers_ = {}

        # End of Step 1
        # -----------------

        # Step 2: build the randomized searchers
        # ----------------------

        # Randomized searcher for lr
        param_range = [10 ** c for c in range(-10, 10)]
        param_random_lr = {
            'lr__C': param_range,
            'lr__penalty': ['l1', 'l2']
        }
        n_iter_search = 20
        rs_lr = RandomizedSearchCV(classifiers_['lr'],
                                   param_distributions=param_random_lr,
                                   n_iter=n_iter_search,
                                   scoring=scoring,
                                   cv=cv,
                                   n_jobs=-1)
        rs_classifiers_['lr'] = rs_lr

        # Randomized searcher for rf (left disabled; rf searches are slow)
        # param_random_rf = {
        #     "max_depth": [3, None],
        #     # 'max_features': range(1, X.shape[1]),
        #     # 'min_samples_split': [2, 3, 4, 5],
        #     # 'min_samples_leaf': [1, 2, 3, 4, 5],
        #     "max_features": sp_randint(1, X.shape[1]),
        #     "min_samples_split": sp_randint(2, 11),
        #     "min_samples_leaf": sp_randint(1, 11),
        #     'bootstrap': [True, False],
        #     'criterion': ["gini", "entropy"]
        # }
        # n_iter_search = 10
        # rs_rf = RandomizedSearchCV(classifiers_['rf'],
        #                            param_distributions=param_random_rf,
        #                            n_iter=n_iter_search)
        # rs_classifiers_['rf'] = rs_rf

        # Randomized searcher for knn
        param_random_knn = {
            'knn__n_neighbors': range(3, 10),
            # 'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'knn__p': range(2, 10)
        }
        n_iter_search = 5
        rs_knn = RandomizedSearchCV(classifiers_['knn'],
                                    param_distributions=param_random_knn,
                                    n_iter=n_iter_search,
                                    scoring=scoring,
                                    cv=cv,
                                    n_jobs=-1)
        rs_classifiers_['knn'] = rs_knn


        # Randomized searcher for svm
        param_range = [10 ** c for c in range(-10, 10)]
        param_random_svm = {
            'svc__C': param_range,
            'svc__gamma': param_range,
            'svc__kernel': ['rbf', 'poly', 'sigmoid']
        }
        n_iter_search = 20
        rs_svc = RandomizedSearchCV(classifiers_['svm'],
                                    param_distributions=param_random_svm,
                                    n_iter=n_iter_search,
                                    scoring=scoring,
                                    cv=cv,
                                    n_jobs=-1)
        rs_classifiers_['svm'] = rs_svc
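
        # A possible refinement (just a sketch, not used here): RandomizedSearchCV
        # also accepts continuous distributions instead of a discrete grid of
        # powers of ten, e.g. scipy.stats.loguniform (available in scipy >= 1.4):
        #
        # from scipy.stats import loguniform
        # param_random_svm = {
        #     'svc__C': loguniform(1e-10, 1e10),
        #     'svc__gamma': loguniform(1e-10, 1e10),
        #     'svc__kernel': ['rbf', 'poly', 'sigmoid']
        # }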

        # Step 3: run the evaluation

        # Collect searcher instances and their names
        classifiers_random_clf_ = [rs_classifiers_[key] for key in classifiers_ if key in rs_classifiers_]
        classifiers_random_name_ = [key for key in classifiers_ if key in rs_classifiers_]

        # Evaluate each classifier via randomized search and find the best hyperparameters
        for clf, label in zip(classifiers_random_clf_, classifiers_random_name_):
            # default scoring
            # clf.fit(X, y)
            # print('[%s]' % label)
            # self.report(results=clf.cv_results_, n_top=1)

            # Nested cross-validation: the outer cross_val_score wraps the
            # inner randomized search (very slow with random forest)
            scores = cross_val_score(estimator=clf, X=X, y=y, scoring=scoring, cv=cv)
            print('%s: %.3f +/- %.3f [%s]' % (scoring, scores.mean(), scores.std(), label))
            clf.fit(X, y)
            print('Best Model: ', clf.best_params_)

            # stratified k-fold scoring
            # print('[%s]: ' % label)
            # self.SKF(estimator=clf, X_train=X, y_train=y)

    # Utility function to report best scores
    def report(self, results, n_top=3):
        for i in range(1, n_top + 1):
            candidates = np.flatnonzero(results['rank_test_score'] == i)
            for candidate in candidates:
                # print("Model with rank: {0}".format(i))
                print("Mean validation score: {0:.3f} +/- {1:.3f}".format(
                    results['mean_test_score'][candidate],
                    results['std_test_score'][candidate]))
                print("Parameters: {0}".format(results['params'][candidate]))
                print("")

    # Stratified k-fold cross-validation: each fold keeps the class proportions of the full dataset
    def SKF(self, estimator, X_train, y_train):
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)  # random_state requires shuffle=True
        scores = []
        for k, (train, test) in enumerate(skf.split(X_train, y_train)):
            estimator.fit(X_train[train], y_train[train])
            score = estimator.score(X_train[test], y_train[test])
            scores.append(score)
            # print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k + 1, np.bincount(y_train[train]), score))
        print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))  # np.std is the standard deviation
        if hasattr(estimator, 'best_params_'):  # only fitted search estimators expose best_params_
            print('Best Model: ', estimator.best_params_)
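
# Why stratify? A quick check (just a sketch, assuming X and y are loaded as in
# the demo below): every StratifiedKFold fold keeps roughly the same class
# ratio as the full dataset, which a plain k-fold split does not guarantee on
# sorted data such as iris:
#
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
# for k, (train, _) in enumerate(skf.split(X, y)):
#     print('Fold %d class dist.: %s' % (k + 1, np.bincount(y[train])))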


# Demo
if __name__ == "__main__":
    # Load the data (the UCI iris.data file has no header row)
    df = pd.read_csv('./Data/UCI/iris/iris.data', header=None)

    # Data Slicing
    X, y = df.iloc[:, 0:4].values, df.iloc[:, 4].values
    le = LabelEncoder()
    y = le.fit_transform(y)


    # Model Building
    mst = ModelSelectionTools()

    mst.score_single_classifier(X, y)
    # mst.score_single_gridSearch(X, y)
    # mst.score_single_randomSearch(X, y)
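
As the abstract says, you can register extra classifiers as needed. A minimal sketch, assuming you want to add a decision tree (the 'dt' key and the DecisionTreeClassifier settings are illustrative, not part of the wrapper above):

from sklearn.tree import DecisionTreeClassifier

mst.classifiers_['dt'] = DecisionTreeClassifier(max_depth=4, random_state=1)
mst.score_single_classifier(X, y)  # the new 'dt' entry is scored with the rest

Note that score_single_classifier picks up every entry in classifiers_ automatically, while the two search methods only build searchers for the keys they know ('lr', 'knn', 'svm'), so a new classifier also needs its own parameter grid there.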