目录
- 说明:本篇博文中的所有json文件都带有注释,实际实验时应先删除其中的所有注释(JSON本身不支持注释)
1. 制作数据集
- 数据集:波士顿房价预测数据集,样本数506,13个特征,标签是房屋的均价
# Load the Boston housing dataset and display it as a DataFrame
# (one column per feature, named after the dataset's feature names).
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed scikit-learn version still provides it.
import pandas as pd
from sklearn.datasets import load_boston

boston_dataset = load_boston()
feature_names = boston_dataset.feature_names
boston = pd.DataFrame(data=boston_dataset.data, columns=feature_names)
boston

- 切分数据集:将前406条数据作为训练数据,后100条数据作为测试数据
- 训练数据集切分:随机抽取360条数据和前8个特征作为机构A的本地数据,随机抽取380条数据和后5个特征以及标签作为机构B的本地数据
- 测试数据集切分:随机抽取80条数据和前8个特征作为机构A的本地测试数据,随机抽取85条数据和后5个特征以及标签作为机构B的本地测试数据
# Split the training data
import pandas as pd
from sklearn.datasets import load_boston

boston_dataset = load_boston()
boston = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)

# Z-score standardise every feature column. The label is attached afterwards,
# so it is left unscaled.
boston = (boston - boston.mean()) / boston.std()

# Replace the original feature names with positional names x0, x1, ...
columns = {name: "x%d" % pos for pos, name in enumerate(boston.columns.values.tolist())}
boston = boston.rename(columns=columns)

# Append the label, then put a running row id in the first column.
boston['y'] = boston_dataset.target
boston.insert(0, 'idx', list(range(boston.shape[0])))

# The first 406 rows form the training pool.
train = boston.iloc[:406]

# Party A keeps the first 8 features; party B keeps the label and the last 5.
party_a_columns = ["idx"] + ["x%d" % i for i in range(8)]
party_b_columns = ["idx", "y"] + ["x%d" % i for i in range(8, 13)]

# Each party draws its own random subset of rows, so the two files only
# partially overlap on idx.
df1 = train.sample(360)
df2 = train.sample(380)

housing_1_train = df1[party_a_columns]
housing_1_train.to_csv('data/housing_1_train.csv', index=False, header=True)

housing_2_train = df2[party_b_columns]
housing_2_train.to_csv('data/housing_2_train.csv', index=False, header=True)
# Split the test (evaluation) data
import pandas as pd
from sklearn.datasets import load_boston

boston_dataset = load_boston()
boston = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)

# Z-score standardise every feature column. The label is attached afterwards,
# so it is left unscaled.
boston = (boston - boston.mean()) / boston.std()

# Replace the original feature names with positional names x0, x1, ...
columns = {name: "x%d" % pos for pos, name in enumerate(boston.columns.values.tolist())}
boston = boston.rename(columns=columns)

# Append the label, then put a running row id in the first column.
boston['y'] = boston_dataset.target
boston.insert(0, 'idx', list(range(boston.shape[0])))

# Rows from position 406 onwards form the evaluation pool. The frame is named
# eval_df so that the builtin eval() is not shadowed.
eval_df = boston.iloc[406:]

# Party A keeps the first 8 features; party B keeps the label and the last 5.
party_a_columns = ["idx"] + ["x%d" % i for i in range(8)]
party_b_columns = ["idx", "y"] + ["x%d" % i for i in range(8, 13)]

# Each party draws its own random subset of rows, so the two files only
# partially overlap on idx.
df1 = eval_df.sample(80)
df2 = eval_df.sample(85)

# index=False keeps the CSV layout identical to the training files: idx is the
# first column and there is no extra unnamed pandas-index column.
# NOTE(review): FATE upload presumably takes the first CSV column as the sample
# id -- confirm; with index=True, idx would be shifted to the second column.
housing_1_eval = df1[party_a_columns]
housing_1_eval.to_csv('data/housing_1_eval.csv', index=False, header=True)

housing_2_eval = df2[party_b_columns]
housing_2_eval.to_csv('data/housing_2_eval.csv', index=False, header=True)
2. 通过DSL Conf运行训练和预测任务
2.1 数据输入
- upload_train_host_conf.json
{
"file": "workspace/VFL_lr/data/housing_1_train.csv",
"table_name": "homo_housing_1_train",
"namespace": "homo_host_housing_train",
"head": 1,
"partition": 16,
"work_mode": 0,
"backend": 0
}
- upload_train_guest_conf.json
{
"file": "workspace/VFL_lr/data/housing_2_train.csv",
"table_name": "homo_housing_2_train",
"namespace": "homo_guest_housing_train",
"head": 1,
"partition": 16,
"work_mode": 0,
"backend": 0
}
- upload_eval_host_conf.json
{
"file": "workspace/VFL_lr/data/housing_1_eval.csv",
"table_name": "homo_housing_1_eval",
"namespace": "homo_host_housing_eval",
"head": 1,
"partition": 16,
"work_mode": 0,
"backend": 0
}
- upload_eval_guest_conf.json
{
"file": "workspace/VFL_lr/data/housing_2_eval.csv",
"table_name": "homo_housing_2_eval",
"namespace": "homo_guest_housing_eval",
"head": 1,
"partition": 16,
"work_mode": 0,
"backend": 0
}
- 上传数据命令
workspace/VFL_lr/ 是我在fate根目录下建立的目录
$ flow data upload -c workspace/VFL_lr/upload_train_host_conf.json
$ flow data upload -c workspace/VFL_lr/upload_train_guest_conf.json
$ flow data upload -c workspace/VFL_lr/upload_eval_host_conf.json
$ flow data upload -c workspace/VFL_lr/upload_eval_guest_conf.json
2.2 模型训练
2.2.1 配置DSL文件
- 与横向联邦学习相比较,纵向联邦学习需要进行样本对齐,即在不泄露双方数据的前提下,求取出双方用户的交集,从而确定模型训练的训练数据集。
- 关于组件的说明
- reader_0: 数据读取组件(v2版本加入),支持图像数据
- dataio_0: 数据 I/O 组件
- intersection_0: 样本对齐组件
- hetero_linr_0: 纵向线性回归模型组件
- evaluation_0: 模型评估组件
- 官方提供的示例可以在以下目录找到
- /examples/dsl/v1/hetero_linear_regression/test_hetero_linr_train_job_dsl.json,可以直接使用
// v1版本
{
"components" : {
"dataio_0": {
"module": "DataIO",
"input": {
"data": {
"data": ["args.train_data"]
}
},
"output": {
"data": ["train"],
"model": ["dataio"]
}
},
"intersection_0": {
"module": "Intersection",
"input": {
"data": {
"data": ["dataio_0.train"]
}
},
"output": {
"data": ["train"]
}
},
"hetero_linr_0": {
"module": "HeteroLinR",
"input": {
"data": {
"train_data": ["intersection_0.train"]
}
},
"output": {
"data": ["train"],
"model": ["hetero_linr"]
}
},
"evaluation_0": {
"module": "Evaluation",
"input": {
"data": {
"data": ["hetero_linr_0.train"]
}
}
}
}
}
- /examples/dsl/v2/hetero_linear_regression/test_hetero_linr_train_job_dsl.json,需要修改,增加evaluation_0组件。(相比v1版本,v2版本增加了reader组件,支持图像输入)
// v2版本
{
"components": {
"reader_0": {
"module": "Reader",
"output": {
"data": ["data"]
}
},
"dataio_0": {
"module": "DataIO",
"input": {
"data": {
"data": ["reader_0.data"]
}
},
"output": {
"data": ["data"],
"model": ["model"]
}
},
"intersection_0": {
"module": "Intersection",
"input": {
"data": {
"data": ["dataio_0.data"]
}
},
"output": {
"data": ["data"]
}
},
"hetero_linr_0": {
"module": "HeteroLinR",
"input": {
"data": {
"train_data": ["intersection_0.data"]
}
},
"output": {
"data": ["data"],
"model": ["model"]
}
},
"evaluation_0": {
"module": "Evaluation",
"input": {
"data": {
"data": ["hetero_linr_0.data"]
}
},
"output": {
"data": ["data"]
}
}
}
}
2.2.2 运行配置Submit Runtime Conf
- /examples/dsl/v1/hetero_linear_regression/test_hetero_linr_train_job_conf.json
- 需要修改party ID为对应ID
- 指定数据对应上传数据时的设置
- 修改label_name
- 设置模型参数
{
"initiator": {
"role": "guest",
"party_id": 10000
},
"job_parameters": {
"work_mode": 0
},
"role": {
"guest": [10000],
"host": [10000],
"arbiter": [10000]
},
"role_parameters": {
"guest": {
"args": {
"data": {
"train_data": [
{
"name": "homo_housing_2_train",
"namespace": "homo_guest_housing_train"
}
]
}
},
"dataio_0": {
"with_label": [true],
"label_name": ["y"],
"label_type": ["float"],
"output_format": ["dense"],
"missing_fill": [true],
"outlier_replace": [false]
},
"evaluation_0": {
"eval_type": ["regression"],
"pos_label": [1]
}
},
"host": {
"args": {
"data": {
"train_data": [
{
"name": "homo_housing_1_train",
"namespace": "homo_host_housing_train"
}
]
}
},
"dataio_0": {
"with_label": [false],
"output_format": ["dense"],
"outlier_replace": [false]
},
"evaluation_0": {
"need_run": [false]
}
}
},
"algorithm_parameters": {
"hetero_linr_0": {
"penalty": "L2",
"optimizer": "sgd",
"tol": 0.001,
"alpha": 0.01,
"max_iter": 20,
"early_stop": "weight_diff",
"batch_size": -1,
"learning_rate": 0.15,
"decay": 0.0,
"decay_sqrt": false,
"init_param": {"init_method": "zeros"},
"encrypted_mode_calculator_param": {"mode": "fast"}
}
}
}
- /examples/dsl/v2/hetero_linear_regression/test_hetero_linr_train_job_conf.json,需要修改
- 需要修改party ID为对应ID
- 指定数据对应上传数据时的设置
- 修改label_name
- 设置模型参数
- 设置evaluation_0相关参数
{
"dsl_version": 2,
"initiator": {
"role": "guest",
"party_id": 10000
},
"role": {
"arbiter": [10000],
"host": [10000],
"guest": [10000]
},
"job_parameters": {
"common": {
"job_type": "train",
"backend": 0,
"work_mode": 0
}
},
"component_parameters": {
"common": {
"hetero_linr_0": {
"penalty": "L2",
"tol": 0.001,
"alpha": 0.01,
"optimizer": "sgd",
"batch_size": -1,
"learning_rate": 0.15,
"init_param": {"init_method": "zeros"},
"max_iter": 20,
"early_stop": "weight_diff",
"encrypted_mode_calculator_param": {"mode": "fast"},
"decay": 0.0,
"decay_sqrt": false,
"floating_point_precision": 23
},
"evaluation_0": {
"eval_type": "regression",
"pos_label": 1
}
},
"role": {
"host": {
"0": {
"reader_0": {
"table": {
"name": "homo_housing_1_train",
"namespace": "homo_host_housing_train"
}
},
"dataio_0": {"with_label": false}
}
},
"guest": {
"0": {
"reader_0": {
"table": {
"name": "homo_housing_2_train",
"namespace": "homo_guest_housing_train"
}
},
"dataio_0": {
"with_label": true,
"label_name": "y",
"label_type": "float",
"output_format": "dense"
}
}
}
}
}
}
2.2.3 提交任务,训练模型
- 执行pipeline任务
flow job submit -c ${conf_path} -d ${dsl_path}
- 对比v1版本与v2版本的DAG(有向无环图)
-
v1

-
v2,相比v1,只多了reader组件

-
- 在arbiter节点中查看训练过程中loss的变化

- 在guest查看模型在训练数据上的效果

2.3 模型评估
2.3.1 修改DSL
- /examples/dsl/v1/hetero_linear_regression/test_hetero_linr_validate_job_dsl.json,无需修改
// v1版本
{
"components" : {
"dataio_0": {
"module": "DataIO",
"input": {
"data": {
"data": ["args.train_data"]
}
},
"output": {
"data": ["train"],
"model": ["dataio"]
}
},
"dataio_1": {
"module": "DataIO",
"input": {
"data": {
"data": ["args.eval_data"]
},
"model": ["dataio_0.dataio"]
},
"output": {
"data": ["eval"],
"model": ["dataio"]
}
},
"intersection_0": {
"module": "Intersection",
"input": {
"data": {
"data": ["dataio_0.train"]
}
},
"output": {
"data": ["train"]
}
},
"intersection_1": {
"module": "Intersection",
"input": {
"data": {
"data": ["dataio_1.eval"]
}
},
"output": {
"data": ["eval"]
}
},
"hetero_linr_0": {
"module": "HeteroLinR",
"input": {
"data": {
"train_data": ["intersection_0.train"],
"eval_data": ["intersection_1.eval"]
}
},
"output": {
"data": ["train"],
"model": ["hetero_linr"]
}
},
"evaluation_0": {
"module": "Evaluation",
"input": {
"data": {
"data": ["hetero_linr_0.train"]
}
}
}
}
}
- /examples/dsl/v2/hetero_linear_regression/test_hetero_linr_validate_job_dsl.json,需要修改
- 增加evaluation_0组件
// v2版本
{
"components": {
"reader_0": {
"module": "Reader",
"output": {
"data": ["data"]
}
},
"reader_1": {
"module": "Reader",
"output": {
"data": ["data"]
}
},
"dataio_0": {
"module": "DataIO",
"input": {
"data": {
"data": ["reader_0.data"]
}
},
"output": {
"data": ["data"],
"model": ["model"]
}
},
"dataio_1": {
"module": "DataIO",
"input": {
"data": {
"data": ["reader_1.data"]
},
"model": ["dataio_0.model"]
},
"output": {
"data": ["data"],
"model": ["model"]
}
},
"intersection_0": {
"module": "Intersection",
"input": {
"data": {
"data": ["dataio_0.data"]
}
},
"output": {
"data": ["data"]
}
},
"intersection_1": {
"module": "Intersection",
"input": {
"data": {
"data": ["dataio_1.data"]
}
},
"output": {
"data": ["data"]
}
},
"hetero_linr_0": {
"module": "HeteroLinR",
"input": {
"data": {
"train_data": ["intersection_0.data"],
"validate_data": ["intersection_1.data"]
}
},
"output": {
"data": ["data"],
"model": ["model"]
}
},
"evaluation_0": {
"module": "Evaluation",
"input": {
"data": {
"data": ["hetero_linr_0.data"]
}
}
}
}
}
2.3.2 修改conf
- /examples/dsl/v1/hetero_linear_regression/test_hetero_linr_validate_job_conf.json,修改party ID、数据源、label_name、设置模型参数即可
// v1版本
{
"initiator": {
"role": "guest",
"party_id": 10000
},
"job_parameters": {
"work_mode": 0
},
"role": {
"guest": [10000],
"host": [10000],
"arbiter": [10000]
},
"role_parameters": {
"guest": {
"args": {
"data": {
"train_data": [
{
"name": "homo_housing_2_train",
"namespace": "homo_guest_housing_train"
}
],
"eval_data": [
{
"name": "homo_housing_2_eval",
"namespace": "homo_guest_housing_eval"
}
]
}
},
"dataio_0": {
"with_label": [true],
"label_name": ["y"],
"label_type": ["float"],
"output_format": ["dense"],
"missing_fill": [true],
"outlier_replace": [false]
},
"dataio_1": {
"with_label": [true],
"label_name": ["y"],
"label_type": ["float"],
"output_format": ["dense"],
"missing_fill": [true],
"outlier_replace": [false]
},
"evaluation_0": {
"eval_type": ["regression"],
"pos_label": [1]
}
},
"host": {
"args": {
"data": {
"train_data": [
{
"name": "homo_housing_1_train",
"namespace": "homo_host_housing_train"
}
],
"eval_data": [
{
"name": "homo_housing_1_eval",
"namespace": "homo_host_housing_eval"
}
]
}
},
"dataio_0": {
"with_label": [false],
"output_format": ["dense"],
"outlier_replace": [false]
},
"dataio_1": {
"with_label": [false],
"output_format": ["dense"],
"outlier_replace": [false]
},
"evaluation_0": {
"need_run": [false]
}
}
},
"algorithm_parameters": {
"hetero_linr_0": {
"penalty": "L2",
"optimizer": "sgd",
"tol": 0.001,
"alpha": 0.01,
"max_iter": 20,
"early_stop": "weight_diff",
"batch_size": -1,
"learning_rate": 0.15,
"decay": 0.0,
"decay_sqrt": false,
"early_stopping_rounds": 1,
"validation_freqs": 5,
"metrics": [
"mean_absolute_error",
"root_mean_squared_error"
],
"use_first_metric_only": false,
"init_param": {"init_method": "zeros"},
"encrypted_mode_calculator_param": {"mode": "fast"}
}
}
}
- /examples/dsl/v2/hetero_linear_regression/test_hetero_linr_validate_job_conf.json,需要修改
- 修改party ID
- 修改数据源,修改label_name
- 设置evaluation_0组件的参数
- 设置模型参数
// v2版本
{
"dsl_version": 2,
"initiator": {
"role": "guest",
"party_id": 10000
},
"role": {
"arbiter": [10000],
"host": [10000],
"guest": [10000]
},
"job_parameters": {
"common": {
"work_mode": 0,
"backend": 0
}
},
"component_parameters": {
"common": {
"hetero_linr_0": {
"penalty": "L2",
"tol": 0.001,
"alpha": 0.01,
"optimizer": "sgd",
"batch_size": -1,
"learning_rate": 0.15,
"init_param": {"init_method": "zeros"},
"max_iter": 20,
"early_stop": "weight_diff",
"encrypted_mode_calculator_param": {"mode": "fast"},
"decay": 0.0,
"decay_sqrt": false,
"validation_freqs": 1,
"early_stopping_rounds": 5,
"metrics": [
"mean_absolute_error",
"root_mean_squared_error"
],
"use_first_metric_only": false
},
"evaluation_0": {
"eval_type": "regression",
"pos_label": 1
}
},
"role": {
"host": {
"0": {
"dataio_0": {"with_label": false},
"reader_0": {
"table": {
"name": "homo_housing_1_train",
"namespace": "homo_host_housing_train"
}
},
"reader_1": {
"table": {
"name": "homo_housing_1_eval",
"namespace": "homo_host_housing_eval"
}
},
"dataio_1": {"with_label": false}
}
},
"guest": {
"0": {
"dataio_0": {
"with_label": true,
"label_name": "y",
"label_type": "float",
"output_format": "dense"
},
"reader_0": {
"table": {
"name": "homo_housing_2_train",
"namespace": "homo_guest_housing_train"
}
},
"reader_1": {
"table": {
"name": "homo_housing_2_eval",
"namespace": "homo_guest_housing_eval"
}
},
"dataio_1": {
"with_label": true,
"label_name": "y",
"label_type": "float",
"output_format": "dense"
}
}
}
}
}
}
2.3.3 提交任务
flow job submit -c ${conf_path} -d ${dsl_path}
- 查看DAG图

- 查看模型效果

7821

被折叠的 条评论
为什么被折叠?



