AutoML-sklearn and torch

Laughing@me

已于 2023-03-27 21:37:20 修改

阅读量722

点赞数

分类专栏： AutoML 文章标签： sklearn 机器学习人工智能 automl

于 2023-03-27 21:29:55 首次发布

本文链接：https://blog.csdn.net/qq_45503700/article/details/129804473

版权

AutoML 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

一、auto-sklearn

1.1 环境依赖

额外安装swig 第三方库
linux 支持, mac，windows不支持

1.2 示例代码

time_left_for_this_task 设定任务最大时间

per_run_time_limit 每个子任务最大训练时间

include 可以限制任务训练的模型

import autosklearn.classification
import sklearn.model_selection
from sklearn import datasets
import sklearn.metrics

if __name__ == "__main__":
    X, y = datasets.load_breast_cancer(return_X_y=True)

    X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=1)
    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=120,
        per_run_time_limit=30,
        tmp_folder="/tmp/autosklearn_classification_example_tmp",
        include={
            'classifier': ["random_forest"],
            'feature_preprocessor': ["no_preprocessing"]
        }
    )
    automl.fit(X_train, y_train)
    y_hat = automl.predict(X_test)
    automl.get_models_with_weights()
    print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_hat))
    print(automl.leaderboard())
    models_with_weights = automl.get_models_with_weights()
    with open('../../preprocess/models_report.txt', 'w') as f:
        for model in models_with_weights:
            f.write(str(model) + '\n')

结果展示:

可以展示参数任务cost值排列顺序
在这里插入图片描述
以及训练参数配置:

1.3 模块扩展

在不支持的训练模块，可以扩展及自定义模型进行自动调参

代码示例:

继承AutoSklearnClassificationAlgorithm 并重写子方法

autosklearn.pipeline.components.classification.add_classifier(MLPClassifier) 将自定义模块注册至模块中

include 参数添加既可调用

"""
====================================================
Extending Auto-Sklearn with Classification Component
====================================================

The following example demonstrates how to create a new classification
component for using in auto-sklearn.
"""
from typing import Optional
from pprint import pprint

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    CategoricalHyperparameter,
    UniformIntegerHyperparameter,
    UniformFloatHyperparameter,
)

import sklearn.metrics

from autosklearn.askl_typing import FEAT_TYPE_TYPE
import autosklearn.classification
import autosklearn.pipeline.components.classification
from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm
from autosklearn.pipeline.constants import (
    DENSE,
    SIGNED_DATA,
    UNSIGNED_DATA,
    PREDICTIONS,
)

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split


############################################################################
# Create MLP classifier component for auto-sklearn
# ================================================


class MLPClassifier(AutoSklearnClassificationAlgorithm):
    def __init__(
            self,
            hidden_layer_depth,
            num_nodes_per_layer,
            activation,
            alpha,
            solver,
            random_state=None,
    ):
        self.hidden_layer_depth = hidden_layer_depth
        self.num_nodes_per_layer = num_nodes_per_layer
        self.activation = activation
        self.alpha = alpha
        self.solver = solver
        self.random_state = random_state

    def fit(self, X, y):
        self.num_nodes_per_layer = int(self.num_nodes_per_layer)
        self.hidden_layer_depth = int(self.hidden_layer_depth)
        self.alpha = float(self.alpha)

        from sklearn.neural_network import MLPClassifier

        hidden_layer_sizes = tuple(
            self.num_nodes_per_layer for i in range(self.hidden_layer_depth)
        )

        self.estimator = MLPClassifier(
            hidden_layer_sizes=hidden_layer_sizes,
            activation=self.activation,
            alpha=self.alpha,
            solver=self.solver,
            random_state=self.random_state,
        )
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict_proba(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            "shortname": "MLP Classifier",
            "name": "MLP CLassifier",
            "handles_regression": False,
            "handles_classification": True,
            "handles_multiclass": True,
            "handles_multilabel": False,
            "handles_multioutput": False,
            "is_deterministic": False,
            # Both input and output must be tuple(iterable)
            "input": [DENSE, SIGNED_DATA, UNSIGNED_DATA],
            "output": [PREDICTIONS],
        }

    @staticmethod
    def get_hyperparameter_search_space(
            feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None
    ):
        cs = ConfigurationSpace()
        hidden_layer_depth = UniformIntegerHyperparameter(
            name="hidden_layer_depth", lower=1, upper=3, default_value=1
        )
        num_nodes_per_layer = UniformIntegerHyperparameter(
            name="num_nodes_per_layer", lower=16, upper=216, default_value=32
        )
        activation = CategoricalHyperparameter(
            name="activation",
            choices=["identity", "logistic", "tanh", "relu"],
            default_value="relu",
        )
        alpha = UniformFloatHyperparameter(
            name="alpha", lower=0.0001, upper=1.0, default_value=0.0001
        )
        solver = CategoricalHyperparameter(
            name="solver", choices=["lbfgs", "sgd", "adam"], default_value="adam"
        )
        cs.add_hyperparameters(
            [
                hidden_layer_depth,
                num_nodes_per_layer,
                activation,
                alpha,
                solver,
            ]
        )
        return cs


# Add MLP classifier component to auto-sklearn.
autosklearn.pipeline.components.classification.add_classifier(MLPClassifier)
cs = MLPClassifier.get_hyperparameter_search_space()
print(cs)


############################################################################
# Data Loading
# ============
def get_local_csv():
    import pandas as pd
    import numpy as np
    df = pd.read_csv("/data/projects/example/auto_ml/Radiomics-2D/features.csv")
    label = pd.read_csv("/data/projects/example/auto_ml/Radiomics-2D/labels.csv")["label"]
    label = np.array([1 if l == "Positive" else 0 for l in label])
    return df.to_numpy(), label


# local
X, y = get_local_csv()

# breast cancer
# X, y = load_breast_cancer(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y)

############################################################################
# Fit MLP classifier to the data
# ==============================

clf = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=60,
    per_run_time_limit=30,
    include={"classifier": ["gradient_boosting", "adaboost", "MLPClassifier"],
             'feature_preprocessor': ["no_preprocessing"]},
)
clf.fit(X_train, y_train)

############################################################################
# Print test accuracy and statistics
# ==================================

y_pred = clf.predict(X_test)
print("accuracy: ", sklearn.metrics.accuracy_score(y_pred, y_test))
print(clf.sprint_statistics())
print(clf.leaderboard(detailed=False,top_k=30))
pprint(clf.show_models(), indent=4)

models_with_weights = clf.get_models_with_weights()
with open('./models_report.txt', 'w') as f:
    for model in models_with_weights:
        f.write(str(model) + '\n')

二、auto-pytorch

1. 1 环境依赖

额外安装brew install cmake

lightgbm 库依赖第三方库 pip install lightgbm

brew install libomp

pip install autoPyTorch

mac 允许不限制memory, M1 芯片对内容限制的操作目前还有bug

在这里插入图片描述

1.2 支持用法

支持大量的表格型数据，图片数据支持少，且不支持扩展
在这里插入图片描述
代码示例:

用法比较固定，没有更多的文档来作为参考，且无法扩展。

import numpy as np

import sklearn.model_selection

import torchvision.datasets

from autoPyTorch.pipeline.image_classification import ImageClassificationPipeline

# Get the training data for tabular classification
trainset = torchvision.datasets.FashionMNIST(root='../datasets/', train=True, download=True)
data = trainset.data.numpy()
data = np.expand_dims(data, axis=3)
# Create a proof of concept pipeline!
dataset_properties = dict()
pipeline = ImageClassificationPipeline(dataset_properties=dataset_properties)

# Train and test split
train_indices, val_indices = sklearn.model_selection.train_test_split(
    list(range(data.shape[0])),
    random_state=1,
    test_size=0.25,
)

# Configuration space
pipeline_cs = pipeline.get_hyperparameter_search_space()
print("Pipeline CS:\n", '_' * 40, f"\n{pipeline_cs}")
config = pipeline_cs.sample_configuration()
print("Pipeline Random Config:\n", '_' * 40, f"\n{config}")
pipeline.set_hyperparameters(config)

# Fit the pipeline
print("Fitting the pipeline...")

pipeline.fit(X=dict(X_train=data,
                    is_small_preprocess=True,
                    dataset_properties=dict(mean=np.array([np.mean(data[:, :, :, i]) for i in range(1)]),
                                            std=np.array([np.std(data[:, :, :, i]) for i in range(1)]),
                                            num_classes=10,
                                            num_features=data.shape[1] * data.shape[2],
                                            image_height=data.shape[1],
                                            image_width=data.shape[2],
                                            is_small_preprocess=True),
                    train_indices=train_indices,
                    val_indices=val_indices,
                    )
             )

# Showcase some components of the pipeline
print(pipeline)