1.1 预处理
- DataFrame是联邦表格数据的封装,由多个参与方的数据块构成,支持水平切分、垂直切分和混合切分:
- 水平:特征一致,但参与方各自有各自的样本,对应API:HDataFrame
- 垂直:参与方的样本是对齐的,但样本特征不同,对应API:VDataFrame
- 混合切分:既有水平又有垂直的切分模式。对应API:MixDataFrame。
- FedNdarray是联邦ndarray的封装,同样由多个参与方的数据块构成,每一方的数据使用numpy ndarray来表示,支持水平和垂直切分,对应统一的API:FedNdarray。
DataFrame 和 FedNdarray 各自提供了一些读写 api 可供直接使用。
隐语也提供多种预处理工具来处理这些数据,可以直接使用 DataFrame API 处理数据,或者使用
sf.preprocessing 包内的各类预处理组件处理。
1.2 隐私求交PSI
隐私求交(Private Set Intersection)是一种使用密码学方法,获取两份数据内容的交集的算法。PSI 过程中不泄露任何交集以外的信息。
在隐语中 PSI 有两种使用方式:
import tempfile

# Temporary files that receive each party's PSI output.
_, alice_psi_path = tempfile.mkstemp()
_, bob_psi_path = tempfile.mkstemp()

# Usage 1: call psi_csv on the parties' CSV files directly.
# `spu`, `alice`, `bob`, `alice_path` and `bob_path` are expected to be
# defined before this snippet runs.
spu.psi_csv(
    key="uid",
    input_path={alice: alice_path, bob: bob_path},
    output_path={alice: alice_psi_path, bob: bob_psi_path},
    receiver="alice",
    protocol="ECDH_PSI_2PC",
    sort=True,
)
from secretflow.data.vertical import read_csv as v_read_csv

# Usage 2: request PSI while reading a vertically partitioned DataFrame.
# `spu`, `alice`, `bob`, `alice_path` and `bob_path` are expected to be
# defined before this snippet runs.
vdf = v_read_csv(
    {alice: alice_path, bob: bob_path},
    spu=spu,
    keys="uid",
    drop_keys="uid",
    # NOTE(review): `psi_protocl` (sic) is kept exactly as the snippet spells
    # the keyword; confirm the spelling against the installed secretflow.
    psi_protocl="ECDH_PSI_2PC",
)
vdf.columns
同时隐语支持多种 PSI 算法,可根据参与方数量、带宽、算力、数据不平衡度等不同场景合理选择。
2.1 决策树
算法 | SS-XGB | SecureBoost | 水平XGBoost |
API | secretflow.ml.boost.ss_xgb_v.Xgb | secretflow.ml.boost.sgb_v.Sgb | secretflow.ml.boost.homo_boost.SFXgboost |
场景 | 垂直切分 | 垂直切分 | 水平切分 |
安全性 | 可证安全,安全性依赖使用的秘密分享协议安全性 | 非可证安全,存在可能导致数据泄露的已知攻击 | 非可证安全 |
性能 | 通信成本更高 | 计算量更大,但通信量更小 | — |
import sys
import time
import logging
import secretflow as sf
from secretflow.ml.boost.ss_xgb_v import Xgb
from secretflow.device.driver import wait, reveal
from secretflow.data import FedNdarray, PartitionWay
from secretflow.data.split import train_test_split
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, classification_report
# Send log records to stdout at INFO level.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

# Start all nodes in local standalone mode.
sf.init(['alice', 'bob'], address='local')

# PYU (Python Processing Unit): processes plaintext on each party's node.
alice, bob = sf.PYU('alice'), sf.PYU('bob')

# SPU (Secure Processing Unit): processes ciphertext under the protection of
# a multi-party secure computing protocol.
spu = sf.SPU(
    sf.utils.testing.cluster_def(['alice', 'bob']),
)
# read data in each party
def read_x(start: int, end: int):
    """Return feature columns ``[start, end)`` of the breast-cancer dataset.

    Args:
        start: Index of the first feature column to keep (inclusive).
        end: Index one past the last feature column to keep (exclusive).

    Returns:
        The ``load_breast_cancer()['data']`` matrix restricted to the
        selected columns; every row is kept.
    """
    # Imported inside the function so the import happens wherever the
    # function body is actually executed.
    from sklearn.datasets import load_breast_cancer

    x = load_breast_cancer()['data']
    return x[:, start:end]
def read_y():
    """Return the ``target`` label array of the breast-cancer dataset."""
    # Imported inside the function so the import happens wherever the
    # function body is actually executed.
    from sklearn.datasets import load_breast_cancer

    return load_breast_cancer()['target']
# Vertically partitioned features: alice holds feature columns [0, 10) and
# bob holds feature columns [10, 20). Columns from 20 onward are not used.
v_data = FedNdarray(
    partitions={
        alice: alice(read_x)(0, 10),
        bob: bob(read_x)(10, 20),
    },
    partition_way=PartitionWay.VERTICAL,
)
# Y label belongs to alice
label_data = FedNdarray(
    partitions={alice: alice(read_y)()},
    partition_way=PartitionWay.VERTICAL,
)

# wait IO finished
wait([p.data for p in v_data.partitions.values()])
wait([p.data for p in label_data.partitions.values()])

# Split into train and test data. The same train_size and random_state are
# passed to both calls.
random_state = 1234
split_factor = 0.8
v_train_data, v_test_data = train_test_split(
    v_data, train_size=split_factor, random_state=random_state
)
v_train_label, v_test_label = train_test_split(
    label_data, train_size=split_factor, random_state=random_state
)
# run SS-XGB
xgb = Xgb(spu)
# Timer starts before the parameter dict is built, as in the original snippet.
start = time.time()
params = {
    # for more detail, see Xgb API doc
    'num_boost_round': 5,
    'max_depth': 5,
    'learning_rate': 0.1,
    'sketch_eps': 0.08,
    'objective': 'logistic',
    'reg_lambda': 0.1,
    'subsample': 1,
    'colsample_by_tree': 1,
    'base_score': 0.5,
}
model = xgb.train(params, v_train_data, v_train_label)
logging.info(f"train time: {time.time() - start}")
# Prediction: time the predict call together with the reveal of its result.
start = time.time()
# The prediction is held by the SPU as ciphertext at this point.
spu_yhat = model.predict(v_test_data)
# Reveal it so AUC, accuracy and the classification report can be computed.
yhat = reveal(spu_yhat)
logging.info(f"predict time: {time.time() - start}")

# Test labels are revealed from alice's partition.
y = reveal(v_test_label.partitions[alice])
# Threshold the predictions at 0.5 to get hard 0/1 class labels.
binary_class_results = np.where(yhat > 0.5, 1, 0)

# Area under the curve (AUC) of the classification.
logging.info(f"auc: {roc_auc_score(y, yhat)}")
# Accuracy of the thresholded predictions.
logging.info(f"acc: {accuracy_score(y, binary_class_results)}")
# Full classification report.
print("classification report:")
print(classification_report(y, binary_class_results))
#### 2.2 线性回归模型
算法 | SS-SGD | HESS-SGD | SS-GLM | 混合联邦LR |
API | secretflow.ml.linear.SSRegression | secretflow.ml.linear.HESSLogisticRegression | secretflow.ml.linear.SSGLM | secretflow.ml.linear.FlLogisticRegressionMix |
场景 | 垂直 | 垂直 | 垂直 | 混合切分(2+n) |
安全性 | 可证安全,安全性依赖于使用的秘密分享协议 | 可证安全,安全性依赖于使用的秘密分享协议和同态加密算法 | 可证安全,安全性依赖于使用的秘密分享协议 | 非可证安全,泄露了部分中间信息 |
算法 | 线性回归、逻辑回归 | 逻辑回归 | 广义线性回归 | 逻辑回归 |
性能 | 通信量更大,大带宽(万兆/局域网)下速度更快 | 计算量更大,网络受限(带宽延迟)的情况下速度更快 | —— | —— |
import sys
import time
import logging
import numpy as np
import spu
import secretflow as sf
from secretflow.data.split import train_test_split
from secretflow.device.driver import wait, reveal
from secretflow.data import FedNdarray, PartitionWay
from secretflow.ml.linear.ss_sgd import SSRegression
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
# Send log records to stdout at INFO level.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

# Start all nodes in local standalone mode.
sf.init(['alice', 'bob'], address='local')

# PYU (Python Processing Unit): processes plaintext on each party's node.
alice, bob = sf.PYU('alice'), sf.PYU('bob')

# SPU (Secure Processing Unit): processes ciphertext under the protection of
# a multi-party secure computing protocol.
# NOTE(review): this assignment rebinds the name `spu`, which this snippet
# also imports as a module; after this line `spu` refers to the device.
spu = sf.SPU(
    sf.utils.testing.cluster_def(['alice', 'bob']),
)
# read data in each party
def read_x(start: int, end: int):
    """Return standardized breast-cancer feature columns ``[start, end)``.

    The scaler is fitted on the full feature matrix first; the column slice
    is taken from the standardized result.

    Args:
        start: Index of the first feature column to keep (inclusive).
        end: Index one past the last feature column to keep (exclusive).

    Returns:
        The standardized feature matrix restricted to the selected columns;
        every row is kept.
    """
    # use breast_cancer as example
    from sklearn.datasets import load_breast_cancer
    from sklearn.preprocessing import StandardScaler

    x = load_breast_cancer()['data']
    # LR's train dataset must be standardized or normalized
    scaler = StandardScaler()
    x = scaler.fit_transform(x)
    return x[:, start:end]
def read_y():
    """Return the ``target`` label array of the breast-cancer dataset."""
    # Imported inside the function so the import happens wherever the
    # function body is actually executed.
    from sklearn.datasets import load_breast_cancer

    return load_breast_cancer()['target']
# Vertically partitioned features: alice holds feature columns [0, 15) and
# bob holds feature columns [15, 30).
# read_x is invoked through each party's PYU (`alice(read_x)`, `bob(read_x)`).
v_data = FedNdarray(
    partitions={
        alice: alice(read_x)(0, 15),
        bob: bob(read_x)(15, 30),
    },
    partition_way=PartitionWay.VERTICAL,
)
# Y label belongs to alice
label_data = FedNdarray(
    partitions={alice: alice(read_y)()},
    partition_way=PartitionWay.VERTICAL,
)

# wait IO finished
wait([p.data for p in v_data.partitions.values()])
wait([p.data for p in label_data.partitions.values()])

# Split into train and test data. The same train_size and random_state are
# passed to both calls.
random_state = 1234
split_factor = 0.8
v_train_data, v_test_data = train_test_split(
    v_data, train_size=split_factor, random_state=random_state
)
v_train_label, v_test_label = train_test_split(
    label_data, train_size=split_factor, random_state=random_state
)
# run SS-SGD
# SSRegression uses the SPU to fit the model.
model = SSRegression(spu)
start = time.time()
# Arguments are passed positionally; the trailing comments give the
# parameter each value is meant for.
model.fit(
    v_train_data,  # x
    v_train_label,  # y
    5,  # epochs
    0.3,  # learning_rate
    32,  # batch_size
    't1',  # sig_type
    'logistic',  # reg_type
    'l2',  # penalty
    0.1,  # l2_norm
)
logging.info(f"train time: {time.time() - start}")
# Prediction: time the predict call together with the reveal of its result.
start = time.time()
# The prediction is held by the SPU as ciphertext at this point.
spu_yhat = model.predict(v_test_data)
# Reveal it so AUC, accuracy and the classification report can be computed.
yhat = reveal(spu_yhat)
logging.info(f"predict time: {time.time() - start}")

# Test labels are revealed from alice's partition.
y = reveal(v_test_label.partitions[alice])
# Threshold the predictions at 0.5 to get hard 0/1 class labels.
binary_class_results = np.where(yhat > 0.5, 1, 0)

# Area under the curve (AUC) of the classification.
logging.info(f"auc: {roc_auc_score(y, yhat)}")
# Accuracy of the thresholded predictions.
logging.info(f"acc: {accuracy_score(y, binary_class_results)}")
# Full classification report.
print("classification report:")
print(classification_report(y, binary_class_results))
### 3、神经网络算法