
Commit edbca21

Add coverage of RayClusters and RayJobs to e2e testing (#183)
1 parent c07153b commit edbca21

3 files changed: +172 -5 lines changed

hack/e2e-util.sh

Lines changed: 35 additions & 4 deletions
@@ -25,6 +25,9 @@ DUMP_LOGS="true"
 export KUBEFLOW_VERSION=v1.7.0
 export IMAGE_KUBEFLOW_OPERATOR="docker.io/kubeflow/training-operator:v1-855e096"
 
+export KUBERAY_VERSION=1.1.0
+export IMAGE_KUBERAY_OPERATOR="quay.io/kuberay/operator:v1.1.1"
+
 # These are small images used by the e2e tests.
 # Pull and kind load to avoid long delays during testing
 export IMAGE_ECHOSERVER="quay.io/project-codeflare/echo-server:1.0"
@@ -66,6 +69,18 @@ function update_test_host {
        echo "Kind was sucessfully installed."
    fi
 
+   which helm >/dev/null 2>&1
+   if [ $? -ne 0 ]
+   then
+       # Installing helm3
+       echo "Downloading and installing helm..."
+       curl -fsSL -o ${ROOT_DIR}/get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 &&
+           chmod 700 ${ROOT_DIR}/get_helm.sh && ${ROOT_DIR}/get_helm.sh
+       [ $? -ne 0 ] && echo "Failed to download and install helm" && exit 1
+       echo "Helm was sucessfully installed."
+       rm -rf ${ROOT_DIR}/get_helm.sh
+   fi
+
    kubectl kuttl version >/dev/null 2>&1
    if [ $? -ne 0 ]
    then
@@ -113,10 +128,19 @@ function check_prerequisites {
    else
        echo -n "found kuttl plugin for kubectl, " && kubectl kuttl version
    fi
+
+   which helm >/dev/null 2>&1
+   if [ $? -ne 0 ]
+   then
+       echo "helm not installed, exiting."
+       exit 1
+   else
+       echo -n "found helm, " && helm version
+   fi
 }
 
 function pull_images {
-   for image in ${IMAGE_ECHOSERVER} ${IMAGE_BUSY_BOX_LATEST} ${IMAGE_KUBEFLOW_OPERATOR}
+   for image in ${IMAGE_ECHOSERVER} ${IMAGE_BUSY_BOX_LATEST} ${IMAGE_KUBEFLOW_OPERATOR} ${IMAGE_KUBERAY_OPERATOR}
    do
        docker pull $image
        if [ $? -ne 0 ]
@@ -139,7 +163,7 @@ function kind_up_cluster {
    fi
    CLUSTER_STARTED="true"
 
-   for image in ${IMAGE_ECHOSERVER} ${IMAGE_BUSY_BOX_LATEST} ${IMAGE_KUBEFLOW_OPERATOR}
+   for image in ${IMAGE_ECHOSERVER} ${IMAGE_BUSY_BOX_LATEST} ${IMAGE_KUBEFLOW_OPERATOR} ${IMAGE_KUBERAY_OPERATOR}
    do
        kind load docker-image ${image} ${CLUSTER_CONTEXT}
        if [ $? -ne 0 ]
@@ -153,14 +177,21 @@ function kind_up_cluster {
 function configure_cluster {
    echo "Installing Kubeflow operator version $KUBEFLOW_VERSION"
    kubectl apply -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=$KUBEFLOW_VERSION"
-
-   # Sleep until the kubeflow operator is running
    echo "Waiting for pods in the kubeflow namespace to become ready"
    while [[ $(kubectl get pods -n kubeflow -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' | tr ' ' '\n' | sort -u) != "True" ]]
    do
        echo -n "." && sleep 1;
    done
    echo ""
+
+   echo "Installing Kuberay operator version $KUBERAY_VERSION"
+   helm install kuberay-operator kuberay-operator --repo https://ray-project.github.io/kuberay-helm/ --version $KUBERAY_VERSION --create-namespace -n kuberay-system
+   echo "Waiting for pods in the kuberay namespace to become ready"
+   while [[ $(kubectl get pods -n kuberay-system -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' | tr ' ' '\n' | sort -u) != "True" ]]
+   do
+       echo -n "." && sleep 1;
+   done
+   echo ""
 }
 
 function wait_for_appwrapper_controller {
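For local debugging, the state configure_cluster sets up can be verified by hand. A minimal sketch, assuming a shell pointed at the kind cluster created by these scripts (the release name and namespace match the helm install call in the diff above):

# Confirm the Helm release installed by configure_cluster.
helm status kuberay-operator -n kuberay-system

# Readiness check equivalent to the script's polling loop, using kubectl wait.
kubectl wait --for=condition=Ready pods --all -n kuberay-system --timeout=120s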

test/e2e/appwrapper_test.go

Lines changed: 15 additions & 1 deletion
@@ -82,7 +82,21 @@ var _ = Describe("AppWrapper E2E Test", func() {
        })
    })
 
-   // TODO: KubeRay GVKs (would have to deploy KubeRay operator on e2e test cluster)
+   Describe("Creation of Kuberay GVKs", Label("Kueue", "Standalone"), func() {
+       It("RayClusters", func() {
+           aw := createAppWrapper(ctx, raycluster(500, 2, 250))
+           appwrappers = append(appwrappers, aw)
+           // Non-functonal RayCluster; will never reach Running Phase
+           Eventually(AppWrapperPhase(ctx, aw), 15*time.Second).Should(Equal(workloadv1beta2.AppWrapperResuming))
+       })
+
+       It("RayJobs", func() {
+           aw := createAppWrapper(ctx, rayjob(500, 2, 250))
+           appwrappers = append(appwrappers, aw)
+           // Non-functonal RayJob; will never reach Running Phase
+           Eventually(AppWrapperPhase(ctx, aw), 15*time.Second).Should(Equal(workloadv1beta2.AppWrapperResuming))
+       })
+   })
 
    // TODO: JobSets (would have to deploy JobSet controller on e2e test cluster)
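Because the new Describe block carries the Kueue and Standalone labels, the Ray specs can be selected with Ginkgo v2 filtering. A hypothetical invocation; the repository's actual Makefile targets and test flags may differ:

# Run only the Kuberay e2e specs against the current kubeconfig context.
go test ./test/e2e/ -v -args --ginkgo.label-filter="Standalone" --ginkgo.focus="Kuberay"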

test/e2e/fixtures_test.go

Lines changed: 122 additions & 0 deletions
@@ -341,6 +341,128 @@ func pytorchjob(replicasWorker int, milliCPUWorker int64) workloadv1beta2.AppWrapperComponent {
        }
    }
 
+// This is not a functional RayCluster:
+// 1. Using a dummy busybox image to avoid pulling a large & rate-limited image from dockerhub,
+//    which means the command injected by the kuberay operator will never work.
+//
+// It is only useful to check that we validate the PodSpecTemplates and can reach the Resuming state.
+const rayclusterYAML = `
+apiVersion: ray.io/v1
+kind: RayCluster
+metadata:
+  name: %v
+spec:
+  rayVersion: '2.9.0'
+  headGroupSpec:
+    rayStartParams: {}
+    template:
+      spec:
+        containers:
+        - name: ray-head
+          image: quay.io/project-codeflare/busybox:1.36
+          command: ["sh", "-c", "sleep 10"]
+          resources:
+            requests:
+              cpu: %v
+
+  workerGroupSpecs:
+  - replicas: %v
+    minReplicas: %v
+    maxReplicas: %v
+    groupName: small-group
+    rayStartParams: {}
+    # Pod template
+    template:
+      spec:
+        containers:
+        - name: ray-worker
+          image: quay.io/project-codeflare/busybox:1.36
+          command: ["sh", "-c", "sleep 10"]
+          resources:
+            requests:
+              cpu: %v
+`
+
+func raycluster(milliCPUHead int64, replicasWorker int, milliCPUWorker int64) workloadv1beta2.AppWrapperComponent {
+   yamlString := fmt.Sprintf(rayclusterYAML,
+       randName("raycluster"),
+       resource.NewMilliQuantity(milliCPUHead, resource.DecimalSI),
+       replicasWorker, replicasWorker, replicasWorker,
+       resource.NewMilliQuantity(milliCPUWorker, resource.DecimalSI),
+   )
+   jsonBytes, err := yaml.YAMLToJSON([]byte(yamlString))
+   Expect(err).NotTo(HaveOccurred())
+   return workloadv1beta2.AppWrapperComponent{
+       DeclaredPodSets: []workloadv1beta2.AppWrapperPodSet{
+           {Replicas: ptr.To(int32(1)), Path: "template.spec.headGroupSpec.template"},
+           {Replicas: ptr.To(int32(replicasWorker)), Path: "template.spec.workerGroupSpecs[0].template"},
+       },
+       Template: runtime.RawExtension{Raw: jsonBytes},
+   }
+}
+
+// This is not a functional RayJob:
+// 1. Using a dummy busybox image to avoid pulling a large & rate-limited image from dockerhub,
+//    which means the command injected by the kuberay operator will never work.
+//
+// It is only useful to check that we validate the PodSpecTemplates and can reach the Resuming state.
+const rayjobYAML = `
+apiVersion: ray.io/v1
+kind: RayJob
+metadata:
+  name: %v
+spec:
+  shutdownAfterJobFinishes: true
+  rayClusterSpec:
+    rayVersion: '2.9.0'
+    headGroupSpec:
+      rayStartParams: {}
+      template:
+        spec:
+          containers:
+          - name: ray-head
+            image: quay.io/project-codeflare/busybox:1.36
+            command: ["sh", "-c", "sleep 10"]
+            resources:
+              requests:
+                cpu: %v
+
+    workerGroupSpecs:
+    - replicas: %v
+      minReplicas: %v
+      maxReplicas: %v
+      groupName: small-group
+      rayStartParams: {}
+      # Pod template
+      template:
+        spec:
+          containers:
+          - name: ray-worker
+            image: quay.io/project-codeflare/busybox:1.36
+            command: ["sh", "-c", "sleep 10"]
+            resources:
+              requests:
+                cpu: %v
+`
+
+func rayjob(milliCPUHead int64, replicasWorker int, milliCPUWorker int64) workloadv1beta2.AppWrapperComponent {
+   yamlString := fmt.Sprintf(rayjobYAML,
+       randName("raycluster"),
+       resource.NewMilliQuantity(milliCPUHead, resource.DecimalSI),
+       replicasWorker, replicasWorker, replicasWorker,
+       resource.NewMilliQuantity(milliCPUWorker, resource.DecimalSI),
+   )
+   jsonBytes, err := yaml.YAMLToJSON([]byte(yamlString))
+   Expect(err).NotTo(HaveOccurred())
+   return workloadv1beta2.AppWrapperComponent{
+       DeclaredPodSets: []workloadv1beta2.AppWrapperPodSet{
+           {Replicas: ptr.To(int32(1)), Path: "template.spec.rayClusterSpec.headGroupSpec.template"},
+           {Replicas: ptr.To(int32(replicasWorker)), Path: "template.spec.rayClusterSpec.workerGroupSpecs[0].template"},
+       },
+       Template: runtime.RawExtension{Raw: jsonBytes},
+   }
+}
+
 const jobSetYAML = `
 apiVersion: jobset.x-k8s.io/v1alpha2
 kind: JobSet
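Both fixtures fill their %v placeholders in order: the generated name, the head CPU request, replicas/minReplicas/maxReplicas (all set to replicasWorker), and the per-worker CPU request, so raycluster(500, 2, 250) produces a 500m head and two 250m workers. Once the operator installed by configure_cluster is running, the ray.io/v1 kinds the fixtures target can be sanity-checked on the test cluster; a minimal sketch:

# Verify that the RayCluster and RayJob CRDs used by the fixtures are served.
kubectl api-resources --api-group=ray.io
kubectl explain rayclusters.spec.workerGroupSpecs | head -n 20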
