---
# ClusterTrainingRuntime "torch-distributed" (framework label: torch).
# Declares one replicated job named "node" that runs a PyTorch image and is
# scheduled by Volcano.
apiVersion: trainer.kubeflow.org/v1alpha1
kind: ClusterTrainingRuntime
metadata:
  # NOTE(review): creationTimestamp, generation, resourceVersion and uid look
  # like server-populated fields from a live-object dump - presumably they
  # should be stripped before this manifest is re-applied; confirm.
  creationTimestamp: "2026-01-05T10:07:51Z"
  finalizers:
    - trainer.kubeflow.org/resource-in-use
  generation: 8
  labels:
    trainer.kubeflow.org/framework: torch
  name: torch-distributed
  resourceVersion: "183940"
  uid: 61da91e8-8f96-4d7a-920c-66b314ebf5d1
spec:
  mlPolicy:
    numNodes: 1
    torch:
      numProcPerNode: auto
  podGroupPolicy:
    volcano:
      # NOTE(review): exact semantics of mode/highestTierAllowed are defined
      # by Volcano's network-topology feature - verify against its docs.
      networkTopology:
        mode: hard
        highestTierAllowed: 1
  template:
    metadata: {}
    spec:
      replicatedJobs:
        - groupName: default
          name: node
          replicas: 1
          template:
            metadata:
              labels:
                trainer.kubeflow.org/trainjob-ancestor-step: trainer
            spec:
              template:
                metadata: {}
                spec:
                  containers:
                    - image: hub.dev.iot5gx.com:2443/pytorch/pytorch:2.7.1-cuda12.8-cudnn9-runtime
                      name: node
                      resources: {}
                  # Matches the volcano podGroupPolicy declared above.
                  schedulerName: volcano