# Implementing scikit-learn's label propagation algorithm on the Ray framework

from typing import Tuple
import os
from docx import Document
from sklearn.semi_supervised import LabelSpreading
import ray
from ray.data.dataset import Dataset
from ray.ml.batch_predictor import BatchPredictor
from ray.ml.predictors.integrations.sklearn.sklearn_predictor import SklearnPredictor
from ray.ml.preprocessors import Chain, OrdinalEncoder, StandardScaler
from ray.ml.result import Result
from ray.ml.train.integrations.sklearn import SklearnTrainer
from sklearn.model_selection import train_test_split


def prepare_data(data, label) -> Tuple[Dataset, Dataset, Dataset]:
    """Split ``data`` into Ray datasets for training, validation and prediction.

    The input is split 80/20 with ``train_test_split``. No seed is passed, so
    every call produces a different split. Both parts then get the same
    cleaning step: missing values are replaced by ``-1`` and every column is
    cast to ``int``.

    The validation and prediction datasets are both built from the held-out
    20%. Validation therefore scores the model on rows it was not fitted on,
    and prediction features are cleaned exactly like the training features.

    Args:
        data: Table to split. It must be accepted by ``train_test_split`` and
            its parts must support ``fillna``/``astype``/``drop`` --
            presumably a pandas DataFrame; confirm against callers.
        label: Name of the label column; it is dropped from the prediction
            dataset.

    Returns:
        ``(train_dataset, valid_dataset, test_dataset)`` where the first two
        keep the label column and the third does not.
    """
    train_df, test_df = train_test_split(data, test_size=0.2)

    # NOTE(review): -1 is also the value scikit-learn's semi-supervised
    # estimators treat as "unlabeled", which is presumably why it is used as
    # the fill value -- confirm that filling feature columns with it is wanted.
    train_df = train_df.fillna(-1).astype(int)
    test_df = test_df.fillna(-1).astype(int)

    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)
    test_dataset = ray.data.from_pandas(test_df.drop(label, axis=1))
    return train_dataset, valid_dataset, test_dataset


def train_sklearn(data, label, num_cpus: int, use_gpu: bool = False) -> Result:
    """Fit a scikit-learn ``LabelSpreading`` model through ``SklearnTrainer``.

    The train and validation datasets come from ``prepare_data``. The fit
    result and its metrics are printed before the result is returned.

    Args:
        data: Raw table forwarded to ``prepare_data``.
        label: Name of the label column.
        num_cpus: Number of CPUs requested for the trainer when ``use_gpu``
            is false.
        use_gpu: When true, the trainer requests one CPU and one GPU instead
            of ``num_cpus`` CPUs. The estimator itself is the same
            scikit-learn ``LabelSpreading`` either way, so this flag only
            changes the resource request.

    Returns:
        The ``Result`` object returned by ``SklearnTrainer.fit()``.
    """
    train_dataset, valid_dataset, _ = prepare_data(data, label)

    # The estimator does not depend on ``use_gpu``, so it is built once.
    # NOTE(review): per the scikit-learn docs ``n_neighbors`` is the parameter
    # of the ``knn`` kernel; with ``kernel='rbf'`` only ``gamma`` should
    # matter. It is kept so the constructed estimator is unchanged.
    estimator = LabelSpreading(
        kernel='rbf',
        gamma=20,
        n_neighbors=7,
        alpha=0.2,
        max_iter=30,
        tol=1e-3,
    )
    trainer_resources = {"CPU": 1, "GPU": 1} if use_gpu else {"CPU": num_cpus}

    trainer = SklearnTrainer(
        estimator=estimator,
        label_column=label,
        datasets={"train": train_dataset, "valid": valid_dataset},
        cv=5,
        scaling_config={
            "trainer_resources": trainer_resources,
        },
    )
    result = trainer.fit()
    print(result)
    print(result.metrics)

    return result


def predict_sklearn(data, label, result: Result, use_gpu: bool = False):
    """Run batch prediction with the checkpoint stored in ``result``.

    The prediction dataset is the third dataset returned by
    ``prepare_data(data, label)``. Predictions are thresholded at 0.5 into
    0/1 integers and collected into a single pandas object.

    NOTE(review): ``prepare_data`` is called again here, so the rows being
    predicted come from a fresh split, not the one used when training --
    confirm this is intended.

    Args:
        data: Raw table forwarded to ``prepare_data``.
        label: Name of the label column, forwarded to ``prepare_data``.
        result: Training result whose ``checkpoint`` holds the fitted model.
        use_gpu: Converted with ``int()`` and passed as
            ``num_gpus_per_worker`` to the batch predictor.

    Returns:
        The thresholded predictions as returned by ``to_pandas``.
    """
    test_dataset = prepare_data(data, label)[2]

    predictor = BatchPredictor.from_checkpoint(result.checkpoint, SklearnPredictor)
    raw_predictions = predictor.predict(
        test_dataset,
        num_gpus_per_worker=int(use_gpu),
    )

    def _binarize(batch):
        # Values strictly above 0.5 become 1, everything else 0.
        # NOTE(review): this collapses any label outside {0, 1} -- verify it
        # matches the label set in use.
        return (batch > 0.5).astype(int)

    binarized = raw_predictions.map_batches(_binarize, batch_format="pandas")
    # An infinite limit is passed, presumably so no row cap applies.
    predicted_labels = binarized.to_pandas(limit=float("inf"))
    return predicted_labels
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值