"""Train and batch-predict a scikit-learn LabelSpreading model with Ray."""

import os
from typing import Optional, Tuple

import ray
from docx import Document
from ray.data.dataset import Dataset
from ray.ml.batch_predictor import BatchPredictor
from ray.ml.predictors.integrations.sklearn.sklearn_predictor import SklearnPredictor
from ray.ml.preprocessors import Chain, OrdinalEncoder, StandardScaler
from ray.ml.result import Result
from ray.ml.train.integrations.sklearn import SklearnTrainer
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import LabelSpreading


def prepare_data(
    data, label, random_state: Optional[int] = 42
) -> Tuple[Dataset, Dataset, Dataset]:
    """Split ``data`` into train / validation / test Ray datasets.

    Args:
        data: A pandas DataFrame holding the features and the label column.
        label: Name of the label column; it is dropped from the test split.
        random_state: Seed forwarded to ``train_test_split``. It defaults to
            a fixed value because ``train_sklearn`` and ``predict_sklearn``
            each call this function on their own: with an unseeded split the
            rows used for prediction would come from a different partition
            than the one used for training, and most of them would have been
            seen by the model. Pass ``None`` for a fresh random split on
            every call.

    Returns:
        ``(train_dataset, valid_dataset, test_dataset)``. The test dataset
        has no label column.
    """
    train_df, test_df = train_test_split(
        data, test_size=0.2, random_state=random_state
    )

    # Every missing value (in any column) becomes -1 and all columns are cast
    # to int. Presumably this is meant to tag rows without a label with -1,
    # the "unlabeled" marker of scikit-learn's semi-supervised estimators.
    train_df = train_df.fillna(-1).astype(int)

    train_dataset = ray.data.from_pandas(train_df)
    # NOTE(review): the validation set is built from the *training* frame, so
    # validation metrics are training metrics. Confirm this is intended.
    valid_dataset = ray.data.from_pandas(train_df)
    # NOTE(review): the test split does not receive the fillna/astype
    # treatment applied to the training split. Confirm this is intended.
    test_dataset = ray.data.from_pandas(test_df.drop(label, axis=1))
    return train_dataset, valid_dataset, test_dataset


def train_sklearn(data, label, num_cpus: int, use_gpu: bool = False) -> Result:
    """Fit a ``LabelSpreading`` estimator through Ray's ``SklearnTrainer``.

    Args:
        data: A pandas DataFrame holding the features and the label column.
        label: Name of the label column.
        num_cpus: CPUs reserved for the trainer when ``use_gpu`` is false.
        use_gpu: When true, the trainer reserves one CPU and one GPU instead
            of ``num_cpus`` CPUs. The estimator itself is the same
            scikit-learn ``LabelSpreading`` in both cases.

    Returns:
        The ``Result`` returned by ``trainer.fit()``.
    """
    # The former guard ``if use_gpu and not LabelSpreading`` could never fire
    # (an imported class is always truthy), so it has been removed.
    train_dataset, valid_dataset, _ = prepare_data(data, label)

    trainer_resources = {"CPU": 1, "GPU": 1} if use_gpu else {"CPU": num_cpus}

    # The estimator configuration does not depend on ``use_gpu``.
    estimator = LabelSpreading(
        kernel="rbf",
        gamma=20,
        n_neighbors=7,
        alpha=0.2,
        max_iter=30,
        tol=1e-3,
    )

    trainer = SklearnTrainer(
        estimator=estimator,
        label_column=label,
        datasets={"train": train_dataset, "valid": valid_dataset},
        cv=5,
        scaling_config={
            "trainer_resources": trainer_resources,
        },
    )
    result = trainer.fit()
    print(result)
    print(result.metrics)

    return result


def predict_sklearn(data, label, result: Result, use_gpu: bool = False):
    """Run batch prediction on the test split with a trained checkpoint.

    Args:
        data: The same DataFrame that was passed to ``train_sklearn``.
        label: Name of the label column (dropped before prediction).
        result: Training result whose checkpoint holds the fitted estimator.
        use_gpu: When true, each prediction worker requests one GPU.

    Returns:
        A pandas DataFrame of predictions binarised at 0.5 (values greater
        than 0.5 become 1, everything else 0).
    """
    _, _, test_dataset = prepare_data(data, label)

    batch_predictor = BatchPredictor.from_checkpoint(
        result.checkpoint, SklearnPredictor
    )

    predictions = batch_predictor.predict(
        test_dataset,
        num_gpus_per_worker=int(use_gpu),
    )
    binarised = predictions.map_batches(
        lambda df: (df > 0.5).astype(int), batch_format="pandas"
    )
    # An infinite limit asks for every row instead of a truncated sample.
    predicted_labels = binarised.to_pandas(limit=float("inf"))

    return predicted_labels
Ray框架下实现sklearn的标签传播算法
最新推荐文章于 2023-12-21 19:41:32 发布