kubeflow训练2

安装

## 克隆代码仓库并进入目录
git clone https://github.com/kubeflow/trainer.git
cd trainer

git fetch --tags
git checkout v2.1.0
kubectl apply --server-side -k manifests/overlays/manager

## 确认 pod 启动
kubectl get pods -n kubeflow-system
NAME READY STATUS RESTARTS AGE
jobset-controller-manager-7b54585c9c-j7zbx 1/1 Running 8 (3m8s ago) 38m
kubeflow-trainer-controller-manager-86667676b-bshbf 1/1 Running 2 (6m32s ago) 57m

## 注意:如果镜像拉取失败,执行 `kubectl edit deployment jobset-controller-manager -n kubeflow-system`,修改 deployment 中的镜像地址

## 安装 runtimes
kubectl apply --server-side -k manifests/overlays/runtimes

## 查看 runtimes
kubectl get clustertrainingruntimes
NAME AGE
deepspeed-distributed 38m
mlx-distributed 38m
torch-distributed 38m
torchtune-llama3.2-1b 38m
torchtune-llama3.2-3b 38m
torchtune-qwen2.5-1.5b 38m

## 修改 runtime 中的镜像配置
kubectl edit clustertrainingruntimes torch-distributed

## 编辑后的 torch-distributed 内容如下(已替换镜像地址,并增加 volcano 调度配置)
# Volcano configuration
#
# ClusterTrainingRuntime "torch-distributed" as shown by
# `kubectl edit clustertrainingruntimes torch-distributed`, after switching the
# trainer image to a private registry and enabling Volcano scheduling.
apiVersion: trainer.kubeflow.org/v1alpha1
kind: ClusterTrainingRuntime
metadata:
  # NOTE(review): creationTimestamp, generation, resourceVersion and uid are
  # values read back from the live object; presumably they should be dropped
  # if this is reused as a manifest for `kubectl apply` - confirm.
  creationTimestamp: "2026-01-05T10:07:51Z"
  finalizers:
  - trainer.kubeflow.org/resource-in-use
  generation: 8
  labels:
    trainer.kubeflow.org/framework: torch
  name: torch-distributed
  resourceVersion: "183940"
  uid: 61da91e8-8f96-4d7a-920c-66b314ebf5d1
spec:
  mlPolicy:
    numNodes: 1
    torch:
      numProcPerNode: auto
  # Gang-scheduling policy: use the Volcano plugin.
  podGroupPolicy:
    volcano:
      # NOTE(review): presumably a hard constraint that keeps all pods within
      # network-topology tier 1 - confirm against the Volcano documentation.
      networkTopology:
        mode: hard
        highestTierAllowed: 1
  # JobSet template used for the training job.
  template:
    metadata: {}
    spec:
      replicatedJobs:
      - groupName: default
        name: node
        replicas: 1
        template:
          metadata:
            labels:
              trainer.kubeflow.org/trainjob-ancestor-step: trainer
          spec:
            template:
              metadata: {}
              spec:
                containers:
                # Image is pulled from a private registry instead of the
                # default one.
                - image: hub.dev.iot5gx.com:2443/pytorch/pytorch:2.7.1-cuda12.8-cudnn9-runtime
                  name: node
                  resources: {}
                # Pods are scheduled by Volcano rather than the default
                # scheduler.
                schedulerName: volcano