Submit The Training Job
This API is used to submit a training job.
Path
pkg/apis/arenaclient.TrainingJobClient
Function
func (t *TrainingJobClient) Submit(job *apistraining.Job) error
Parameters
- job (type pkg/apis/training.Job) => the job to be submitted. It must be created by one of the training job builders; please refer to the training job builder APIs to build your training jobs.
Example
Submit a TensorFlow training job
package main
import (
"fmt"
"time"
"github.com/kubeflow/arena/pkg/apis/arenaclient"
"github.com/kubeflow/arena/pkg/apis/training"
"github.com/kubeflow/arena/pkg/apis/types"
)
func main() {
jobName := "test-dist-tfjob"
jobType := types.TFTrainingJob
// create arena client
client, err := arenaclient.NewArenaClient(types.ArenaClientArgs{
Kubeconfig: "",
LogLevel: "info",
Namespace: "default",
})
if err != nil {
fmt.Printf("failed to create arena client,reason: %v", err)
return
}
// create tfjob
/* command:
arena submit tfjob \
--name=tf-distributed-test \
--gpus=1 \
--workers=1 \
--worker-image=cheyang/tf-mnist-distributed:gpu \
--ps-image=cheyang/tf-mnist-distributed:cpu \
--ps=1 \
--tensorboard \
"python /app/main.py"
*/
submitJob, err := training.NewTFJobBuilder().
Name(jobName).
GPUCount(1).
WorkerCount(1).
WorkerImage("cheyang/tf-mnist-distributed:gpu").
PsImage("cheyang/tf-mnist-distributed:cpu").
PsCount(1).
EnableTensorboard().
Command([]string{"'python /app/main.py'"}).Build()
if err != nil {
fmt.Printf("failed to build tfjob,reason: %v\n", err)
return
}
// submit tfjob
if err := client.Training().Submit(submitJob); err != nil {
fmt.Printf("failed to submit job,reason: %v\n", err)
return
}
}