Skip to content

Submit The Training Job

This API is used to submit a training job.

Path

pkg/apis/arenaclient.TrainingJobClient

Function

func (t *TrainingJobClient) Submit(job *apistraining.Job) error

Parameters

  • job (type pkg/apis/training.Job) => the job to be submitted. It must be created by one of the training job builders; please refer to the training job builder APIs to build your training jobs.

Example

Submit a TensorFlow training job

package main

import (
    "fmt"
    "time"

    "github.com/kubeflow/arena/pkg/apis/arenaclient"
    "github.com/kubeflow/arena/pkg/apis/training"
    "github.com/kubeflow/arena/pkg/apis/types"
)

func main() {
    jobName := "test-dist-tfjob"
    jobType := types.TFTrainingJob
    // create arena client
    client, err := arenaclient.NewArenaClient(types.ArenaClientArgs{
        Kubeconfig: "",
        LogLevel:   "info",
        Namespace:  "default",
    })
    if err != nil {
        fmt.Printf("failed to create arena client,reason: %v", err)
        return
    }
    // create tfjob
    /* command:
            arena submit tfjob \
            --name=tf-distributed-test \
            --gpus=1 \
            --workers=1 \
            --worker-image=cheyang/tf-mnist-distributed:gpu \
            --ps-image=cheyang/tf-mnist-distributed:cpu \
            --ps=1 \
            --tensorboard \
            "python /app/main.py"
    */
    submitJob, err := training.NewTFJobBuilder().
        Name(jobName).
        GPUCount(1).
        WorkerCount(1).
        WorkerImage("cheyang/tf-mnist-distributed:gpu").
        PsImage("cheyang/tf-mnist-distributed:cpu").
        PsCount(1).
        EnableTensorboard().
        Command([]string{"'python /app/main.py'"}).Build()
    if err != nil {
        fmt.Printf("failed to build tfjob,reason: %v\n", err)
        return
    }
    // submit tfjob
    if err := client.Training().Submit(submitJob); err != nil {
        fmt.Printf("failed to submit job,reason: %v\n", err)
        return
    }
}