一、背景
基于 Kubeflow 创建内置的 Job, 如 PyTorchJob、PaddleJob、XGBoostJob、TFJob 等。本文使用工厂模式对上述 Job 做统一处理: 通过接口传递枚举类型, 创建相对应的 Job 客户端实例, 然后调用其内部 API 函数。下面以获取 Job 名称为例做一个演示。不对之处请指教, 老样子不废话, 直接开干。
二、构建介绍
整体目录结构如下
❯ tree
.
├── bo
│   └── KubeJobBo.go
├── enum
│   └── framework.enum.go
├── go.mod
├── go.sum
├── kubeflow
│   ├── client
│   │   ├── kubeflow.job.client.service.go
│   │   ├── paddle.job.client.service.go
│   │   └── pytorch.job.client.service.go
│   └── factory
│       ├── kubeflow.factory.go
│       ├── paddle.factory.go
│       └── pytorch.factory.go
└── main.go
创建枚举
// FrameworkType names a training framework; its underlying type is string.
type FrameworkType string

// Framework identifiers accepted by the job factory.
const (
	PYTORCH FrameworkType = "PYTORCH"
	PADDLE  FrameworkType = "PADDLE"
)

// Value returns the framework identifier converted to lower case.
func (f FrameworkType) Value() string {
	raw := string(f)
	return strings.ToLower(raw)
}
创建返回值结构体
package bo
// KubeJobBo is the result object returned by the job clients' Get methods.
type KubeJobBo struct {
// Name is the name of the fetched job object.
Name string
}
创建训练框架客户端接口
package client
import (
"context"
v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
kfClientSetV1 "github.com/kubeflow/training-operator/pkg/client/clientset/versioned/typed/kubeflow.org/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"xincan.com.cn/demo/bo"
)
// KubeflowJobClient is the framework-independent view of a job client.
// PaddleJobClient and PyTorchJobClient in this file both provide a matching Get method.
type KubeflowJobClient interface {
// Get looks up the job called name using opts and returns it as a *bo.KubeJobBo.
Get ( ctx context. Context, name string , opts metav1. GetOptions) ( * bo. KubeJobBo, error )
}
// PaddleJobClient adapts a typed PaddleJob client to the KubeflowJobClient interface.
type PaddleJobClient struct {
	// Client is the underlying typed PaddleJob client that Get delegates to.
	Client kfClientSetV1.PaddleJobInterface
}

// Get fetches the PaddleJob called name and returns its name wrapped in a KubeJobBo.
//
// If the underlying client reports an error, that error is returned as-is
// together with a nil result; previously the error was discarded and the
// lookup was reported as successful.
func (job *PaddleJobClient) Get(ctx context.Context, name string, opts metav1.GetOptions) (*bo.KubeJobBo, error) {
	obj, err := job.Client.Get(ctx, name, opts)
	if err != nil {
		return nil, err
	}
	return &bo.KubeJobBo{Name: obj.Name}, nil
}
// PyTorchJobClient adapts a typed PyTorchJob client to the KubeflowJobClient interface.
type PyTorchJobClient struct {
	// Client is the underlying typed PyTorchJob client that Get delegates to.
	Client kfClientSetV1.PyTorchJobInterface
}

// Get fetches the PyTorchJob called name and returns its name wrapped in a KubeJobBo.
//
// If the underlying client reports an error, that error is returned as-is
// together with a nil result; previously the error was discarded and the
// lookup was reported as successful.
func (job *PyTorchJobClient) Get(ctx context.Context, name string, opts metav1.GetOptions) (*bo.KubeJobBo, error) {
	obj, err := job.Client.Get(ctx, name, opts)
	if err != nil {
		return nil, err
	}
	return &bo.KubeJobBo{Name: obj.Name}, nil
}
创建训练框架工厂接口
// KubeflowFactory describes a factory that produces a KubeflowJobClient.
//
// NOTE(review): the CreateJobClient methods shown on PaddleJobClientFactory and
// PytorchJobClientFactory take no parameters, so as written those types do not
// satisfy this interface — confirm which signature is intended.
type KubeflowFactory interface {
// CreateJobClient returns a job client built from kubeflowClient and namespace.
CreateJobClient ( kubeflowClient * kfClientSetV1. KubeflowV1Client, namespace string ) client. KubeflowJobClient
}
创建PaddleJobClientFactory训练框架客户端工厂, 实现KubeflowFactory接口函数
package factory
import (
v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
kfClientSetV1 "github.com/kubeflow/training-operator/pkg/client/clientset/versioned/typed/kubeflow.org/v1"
"xincan.com.cn/demo/kubeflow/client"
)
// PaddleJobClientFactory produces PaddleJob clients. It embeds
// *KubeflowJobFactory, which holds the Kubeflow client and namespace.
type PaddleJobClientFactory struct {
	*KubeflowJobFactory
}

// NewPaddleJobClientFactory returns a PaddleJobClientFactory whose embedded
// base factory is created from client and namespace.
func NewPaddleJobClientFactory(client *kfClientSetV1.KubeflowV1Client, namespace string) *PaddleJobClientFactory {
	base := NewKubeflowJobFactory(client, namespace)
	return &PaddleJobClientFactory{KubeflowJobFactory: base}
}

// CreateJobClient returns a client.PaddleJobClient wrapping the PaddleJob
// client for the factory's namespace.
func (f *PaddleJobClientFactory) CreateJobClient() client.KubeflowJobClient {
	paddleJobs := f.kubeflowClient.PaddleJobs(f.namespace)
	return &client.PaddleJobClient{Client: paddleJobs}
}
创建PytorchJobClientFactory训练框架客户端工厂, 实现KubeflowFactory接口函数
package factory
import (
v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
kfClientSetV1 "github.com/kubeflow/training-operator/pkg/client/clientset/versioned/typed/kubeflow.org/v1"
"xincan.com.cn/demo/kubeflow/client"
)
// PytorchJobClientFactory produces PyTorchJob clients. It embeds
// *KubeflowJobFactory, which holds the Kubeflow client and namespace.
type PytorchJobClientFactory struct {
	*KubeflowJobFactory
}

// NewPytorchJobClientFactory returns a PytorchJobClientFactory whose embedded
// base factory is created from client and namespace.
func NewPytorchJobClientFactory(client *kfClientSetV1.KubeflowV1Client, namespace string) *PytorchJobClientFactory {
	base := NewKubeflowJobFactory(client, namespace)
	return &PytorchJobClientFactory{KubeflowJobFactory: base}
}

// CreateJobClient returns a client.PyTorchJobClient wrapping the PyTorchJob
// client for the factory's namespace.
func (f *PytorchJobClientFactory) CreateJobClient() client.KubeflowJobClient {
	pytorchJobs := f.kubeflowClient.PyTorchJobs(f.namespace)
	return &client.PyTorchJobClient{Client: pytorchJobs}
}
创建基础工厂
package factory
import (
"fmt"
kfClientSetV1 "github.com/kubeflow/training-operator/pkg/client/clientset/versioned/typed/kubeflow.org/v1"
"xincan.com.cn/demo/enum"
"xincan.com.cn/demo/kubeflow/client"
)
// KubeflowFactory describes a factory that produces a KubeflowJobClient.
//
// NOTE(review): the CreateJobClient methods shown on PaddleJobClientFactory and
// PytorchJobClientFactory take no parameters, so as written those types do not
// satisfy this interface — confirm which signature is intended.
type KubeflowFactory interface {
// CreateJobClient returns a job client built from kubeflowClient and namespace.
CreateJobClient ( kubeflowClient * kfClientSetV1. KubeflowV1Client, namespace string ) client. KubeflowJobClient
}
// KubeflowJobFactory holds the Kubeflow client and namespace from which
// framework-specific job clients are created.
type KubeflowJobFactory struct {
	kubeflowClient *kfClientSetV1.KubeflowV1Client
	namespace      string
}

// NewKubeflowJobFactory returns a KubeflowJobFactory that stores client and namespace.
func NewKubeflowJobFactory(client *kfClientSetV1.KubeflowV1Client, namespace string) *KubeflowJobFactory {
	f := new(KubeflowJobFactory)
	f.kubeflowClient = client
	f.namespace = namespace
	return f
}

// GetJobClient returns the job client matching frameworkType, scoped to the
// factory's namespace. enum.PYTORCH yields a *client.PyTorchJobClient and
// enum.PADDLE a *client.PaddleJobClient; any other value yields a nil client
// and an "unsupported framework type" error.
func (f *KubeflowJobFactory) GetJobClient(frameworkType enum.FrameworkType) (client.KubeflowJobClient, error) {
	if frameworkType == enum.PYTORCH {
		jobs := f.kubeflowClient.PyTorchJobs(f.namespace)
		return &client.PyTorchJobClient{Client: jobs}, nil
	}
	if frameworkType == enum.PADDLE {
		jobs := f.kubeflowClient.PaddleJobs(f.namespace)
		return &client.PaddleJobClient{Client: jobs}, nil
	}
	return nil, fmt.Errorf("unsupported framework type: %s", frameworkType)
}
主函数测试调用
package main
import (
"context"
"fmt"
kfClientSetV1 "github.com/kubeflow/training-operator/pkg/client/clientset/versioned/typed/kubeflow.org/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/tools/clientcmd"
"xincan.com.cn/demo/enum"
"xincan.com.cn/demo/kubeflow/factory"
)
// initKubeflow builds a Kubeflow v1 client from the kubeconfig file at
// "./.kube/1148032689266233344".
//
// It panics if the kubeconfig cannot be loaded, instead of passing a nil
// config on to NewForConfigOrDie.
func initKubeflow() *kfClientSetV1.KubeflowV1Client {
	kubeConfig, err := clientcmd.BuildConfigFromFlags("", "./.kube/1148032689266233344")
	if err != nil {
		panic(fmt.Errorf("loading kubeconfig: %w", err))
	}
	return kfClientSetV1.NewForConfigOrDie(kubeConfig)
}
// main looks up one PyTorchJob and one PaddleJob in the "troila" namespace
// through the job factory and prints each job's name.
//
// Every error is checked; a failure aborts the program with a panic that
// says which step failed, rather than continuing with a nil client or result.
func main() {
	kubeflowV1Client := initKubeflow()
	jobFactory := factory.NewKubeflowJobFactory(kubeflowV1Client, "troila")

	// Jobs to query, in the order their names are printed.
	jobs := []struct {
		framework enum.FrameworkType
		name      string
	}{
		{enum.PYTORCH, "pytorch-1203457869012345678"},
		{enum.PADDLE, "paddle-1203457869012345679"},
	}

	for _, j := range jobs {
		jobClient, err := jobFactory.GetJobClient(j.framework)
		if err != nil {
			panic(fmt.Errorf("creating %s job client: %w", j.framework, err))
		}
		job, err := jobClient.Get(context.TODO(), j.name, metav1.GetOptions{})
		if err != nil {
			panic(fmt.Errorf("getting job %q: %w", j.name, err))
		}
		fmt.Println(job.Name)
	}
}
结果验证
GOROOT=/Users/xincan/software/go/go-1.22.5 #gosetup
GOPATH=/Users/xincan/workspace/goworkspace #gosetup
/Users/xincan/software/go/go-1.22.5/bin/go build -o /Users/xincan/Library/Caches/JetBrains/GoLand2023.3/tmp/GoLand/___1demo xincan.com.cn/demo #gosetup
/Users/xincan/Library/Caches/JetBrains/GoLand2023.3/tmp/GoLand/___1demo
pytorch-1203457869012345678
paddle-1203457869012345679
Process finished with the exit code 0
root@node1:~# kubectl -n troila get PytorchJob,PaddleJob
NAME STATE AGE
pytorchjob.kubeflow.org/pytorch-1203457869012345678 Succeeded 22h
NAME STATE AGE
paddlejob.kubeflow.org/paddle-1203457869012345679 Running 8s
root@node1:~# kubectl -n troila get pod