Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .github/actions/kind/action.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
name: "Set up KinD"
description: "Step to start and configure KinD cluster"

inputs:
kind-node-hostname:
description: "Hostname of the main kind node"
required: false
default: kind

runs:
using: "composite"
steps:
Expand Down Expand Up @@ -56,3 +62,13 @@ runs:
curl https://raw.githubusercontent.com/kubernetes/ingress-nginx/"${VERSION}"/deploy/static/provider/kind/deploy.yaml | sed "s/--publish-status-address=localhost/--report-node-internal-ip-address\\n - --status-update-interval=10/g" | kubectl apply -f -
kubectl annotate ingressclass nginx "ingressclass.kubernetes.io/is-default-class=true"
kubectl -n ingress-nginx wait --timeout=300s --for=condition=Available deployments --all

- name: Add ${{ inputs.kind-node-hostname }} host to machine hosts
shell: bash
run: echo "127.0.0.1 ${{ inputs.kind-node-hostname }}" | sudo tee -a /etc/hosts

- name: Set env variables for tests to properly leverage KinD cluster
shell: bash
run: |
echo "CLUSTER_TYPE=KIND" >> $GITHUB_ENV
echo "CLUSTER_HOSTNAME=${{ inputs.kind-node-hostname }}" >> $GITHUB_ENV
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ go 1.19
require (
github.com/onsi/gomega v1.27.10
github.com/openshift/api v0.0.0-20230213134911-7ba313770556
github.com/project-codeflare/codeflare-common v0.0.0-20231023092720-93d03492db16
github.com/project-codeflare/codeflare-common v0.0.0-20231110155354-042fb171fcdb
github.com/project-codeflare/instascale v0.3.0
github.com/project-codeflare/multi-cluster-app-dispatcher v1.38.0
github.com/ray-project/kuberay/ray-operator v1.0.0-rc.1
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -391,8 +391,8 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/project-codeflare/codeflare-common v0.0.0-20231023092720-93d03492db16 h1:TRMLDP6IYt0CAd3+BkvY/r2lkpjI3sOsxf3tnQojZ9k=
github.com/project-codeflare/codeflare-common v0.0.0-20231023092720-93d03492db16/go.mod h1:zdi2GCYJX+QyxFWyCLMoTme3NMz/aucWDJWMqKfigxk=
github.com/project-codeflare/codeflare-common v0.0.0-20231110155354-042fb171fcdb h1:L2Gdr2SlvshDKZY2KK6507AwzQ1NSfRbMQuz5dOsYNM=
github.com/project-codeflare/codeflare-common v0.0.0-20231110155354-042fb171fcdb/go.mod h1:zdi2GCYJX+QyxFWyCLMoTme3NMz/aucWDJWMqKfigxk=
github.com/project-codeflare/instascale v0.3.0 h1:PSlwbqqUsFTkTQ5KUhMFRebfokySnEZwav97xZixLQs=
github.com/project-codeflare/instascale v0.3.0/go.mod h1:IU1Wl+zqTpMpZ49BOcr6U+A6gF3AjcmFdKo9ZwP3TDI=
github.com/project-codeflare/multi-cluster-app-dispatcher v1.38.0 h1:dU2Ev0SijdNm30Y9mjdKJL1Fp6l07rnRBKhSbx1kX9g=
Expand Down
24 changes: 23 additions & 1 deletion test/e2e/mnist_raycluster_sdk.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import sys
import os

from time import sleep

Expand All @@ -8,17 +9,38 @@
from codeflare_sdk.job.jobs import DDPJobDefinition

namespace = sys.argv[1]
ray_image = os.getenv('RAY_IMAGE')
host = os.getenv('CLUSTER_HOSTNAME')

ingress_options = {}
if host is not None:
ingress_options = {
"ingresses": [
{
"ingressName": "ray-dashboard",
"port": 8265,
"pathType": "Prefix",
"path": "/",
"host": host,
},
]
}


cluster = Cluster(ClusterConfiguration(
name='mnist',
namespace=namespace,
num_workers=1,
head_cpus='500m',
head_memory=2,
min_cpus='500m',
max_cpus=1,
min_memory=0.5,
max_memory=1,
max_memory=2,
num_gpus=0,
instascale=False,
image=ray_image,
ingress_options=ingress_options,
))

cluster.up()
Expand Down
151 changes: 63 additions & 88 deletions test/e2e/mnist_raycluster_sdk_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,104 +40,59 @@ func TestMNISTRayClusterSDK(t *testing.T) {
test := With(t)
test.T().Parallel()

// Currently blocked by https://github.com/project-codeflare/codeflare-sdk/pull/251 , remove the skip once SDK with the PR is released
test.T().Skip("Requires https://github.com/project-codeflare/codeflare-sdk/pull/251")

// Create a namespace
namespace := test.NewTestNamespace()

// Test configuration
config := &corev1.ConfigMap{
TypeMeta: metav1.TypeMeta{
APIVersion: corev1.SchemeGroupVersion.String(),
Kind: "ConfigMap",
},
ObjectMeta: metav1.ObjectMeta{
Name: "mnist-raycluster-sdk",
Namespace: namespace.Name,
config := CreateConfigMap(test, namespace.Name, map[string][]byte{
// SDK script
"mnist_raycluster_sdk.py": ReadFile(test, "mnist_raycluster_sdk.py"),
// pip requirements
"requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"),
// MNIST training script
"mnist.py": ReadFile(test, "mnist.py"),
})

// Create RBAC, retrieve token for user with limited rights
policyRules := []rbacv1.PolicyRule{
{
Verbs: []string{"get", "create", "delete", "list", "patch", "update"},
APIGroups: []string{mcadv1beta1.GroupName},
Resources: []string{"appwrappers"},
},
BinaryData: map[string][]byte{
// SDK script
"mnist_raycluster_sdk.py": ReadFile(test, "mnist_raycluster_sdk.py"),
// pip requirements
"requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"),
// MNIST training script
"mnist.py": ReadFile(test, "mnist.py"),
{
Verbs: []string{"get", "list"},
APIGroups: []string{rayv1.GroupVersion.Group},
Resources: []string{"rayclusters", "rayclusters/status"},
},
Immutable: Ptr(true),
}
config, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), config, metav1.CreateOptions{})
test.Expect(err).NotTo(HaveOccurred())
test.T().Logf("Created ConfigMap %s/%s successfully", config.Namespace, config.Name)

// SDK client RBAC
serviceAccount := &corev1.ServiceAccount{
TypeMeta: metav1.TypeMeta{
APIVersion: corev1.SchemeGroupVersion.String(),
Kind: "ServiceAccount",
{
Verbs: []string{"get", "list"},
APIGroups: []string{"route.openshift.io"},
Resources: []string{"routes"},
},
ObjectMeta: metav1.ObjectMeta{
Name: "sdk-user",
Namespace: namespace.Name,
{
Verbs: []string{"get", "list"},
APIGroups: []string{"networking.k8s.io"},
Resources: []string{"ingresses"},
},
}
serviceAccount, err = test.Client().Core().CoreV1().ServiceAccounts(namespace.Name).Create(test.Ctx(), serviceAccount, metav1.CreateOptions{})
test.Expect(err).NotTo(HaveOccurred())

role := &rbacv1.Role{
TypeMeta: metav1.TypeMeta{
APIVersion: rbacv1.SchemeGroupVersion.String(),
Kind: "Role",
},
ObjectMeta: metav1.ObjectMeta{
Name: "sdk",
Namespace: namespace.Name,
},
Rules: []rbacv1.PolicyRule{
{
Verbs: []string{"get", "create", "delete", "list", "patch", "update"},
APIGroups: []string{mcadv1beta1.GroupName},
Resources: []string{"appwrappers"},
},
{
Verbs: []string{"get", "list"},
APIGroups: []string{rayv1.GroupVersion.Group},
Resources: []string{"rayclusters", "rayclusters/status"},
},
{
Verbs: []string{"get", "list"},
APIGroups: []string{"route.openshift.io"},
Resources: []string{"routes"},
},
// Create cluster wide RBAC, required for SDK OpenShift check
// TODO reevaluate once SDK change OpenShift detection logic
clusterPolicyRules := []rbacv1.PolicyRule{
{
Verbs: []string{"get", "list"},
APIGroups: []string{"config.openshift.io"},
Resources: []string{"ingresses"},
ResourceNames: []string{"cluster"},
},
}
role, err = test.Client().Core().RbacV1().Roles(namespace.Name).Create(test.Ctx(), role, metav1.CreateOptions{})
test.Expect(err).NotTo(HaveOccurred())

roleBinding := &rbacv1.RoleBinding{
TypeMeta: metav1.TypeMeta{
APIVersion: rbacv1.SchemeGroupVersion.String(),
Kind: "RoleBinding",
},
ObjectMeta: metav1.ObjectMeta{
Name: "sdk",
},
RoleRef: rbacv1.RoleRef{
APIGroup: rbacv1.SchemeGroupVersion.Group,
Kind: "Role",
Name: role.Name,
},
Subjects: []rbacv1.Subject{
{
Kind: "ServiceAccount",
APIGroup: corev1.SchemeGroupVersion.Group,
Name: serviceAccount.Name,
Namespace: serviceAccount.Namespace,
},
},
}
_, err = test.Client().Core().RbacV1().RoleBindings(namespace.Name).Create(test.Ctx(), roleBinding, metav1.CreateOptions{})
test.Expect(err).NotTo(HaveOccurred())
sa := CreateServiceAccount(test, namespace.Name)
role := CreateRole(test, namespace.Name, policyRules)
CreateRoleBinding(test, namespace.Name, sa, role)
clusterRole := CreateClusterRole(test, clusterPolicyRules)
CreateClusterRoleBinding(test, sa, clusterRole)

job := &batchv1.Job{
TypeMeta: metav1.TypeMeta{
Expand All @@ -161,7 +116,8 @@ func TestMNISTRayClusterSDK(t *testing.T) {
// See https://github.com/project-codeflare/codeflare-sdk/pull/146
Image: "quay.io/opendatahub/notebooks:jupyter-minimal-ubi8-python-3.8-4c8f26e",
Env: []corev1.EnvVar{
corev1.EnvVar{Name: "PYTHONUSERBASE", Value: "/workdir"},
{Name: "PYTHONUSERBASE", Value: "/workdir"},
{Name: "RAY_IMAGE", Value: GetRayImage()},
},
Command: []string{"/bin/sh", "-c", "pip install codeflare-sdk==" + GetCodeFlareSDKVersion() + " && cp /test/* . && python mnist_raycluster_sdk.py" + " " + namespace.Name},
VolumeMounts: []corev1.VolumeMount{
Expand Down Expand Up @@ -206,12 +162,31 @@ func TestMNISTRayClusterSDK(t *testing.T) {
},
},
RestartPolicy: corev1.RestartPolicyNever,
ServiceAccountName: serviceAccount.Name,
ServiceAccountName: sa.Name,
},
},
},
}
job, err = test.Client().Core().BatchV1().Jobs(namespace.Name).Create(test.Ctx(), job, metav1.CreateOptions{})
if GetClusterType(test) == KindCluster {
// Take first KinD node and redirect pod hostname requests there
node := GetNodes(test)[0]
hostname := GetClusterHostname(test)
IP := GetNodeInternalIP(test, node)

test.T().Logf("Setting KinD cluster hostname '%s' to node IP '%s' for SDK pod", hostname, IP)
job.Spec.Template.Spec.HostAliases = []corev1.HostAlias{
{
IP: IP,
Hostnames: []string{hostname},
},
}

// Propagate hostname into Python code as env variable
hostnameEnvVar := corev1.EnvVar{Name: "CLUSTER_HOSTNAME", Value: hostname}
job.Spec.Template.Spec.Containers[0].Env = append(job.Spec.Template.Spec.Containers[0].Env, hostnameEnvVar)
}

job, err := test.Client().Core().BatchV1().Jobs(namespace.Name).Create(test.Ctx(), job, metav1.CreateOptions{})
test.Expect(err).NotTo(HaveOccurred())
test.T().Logf("Created Job %s/%s successfully", job.Namespace, job.Name)

Expand Down
4 changes: 2 additions & 2 deletions test/e2e/mnist_rayjob_mcad_raycluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) {
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1G"),
corev1.ResourceMemory: resource.MustParse("2G"),
},
},
VolumeMounts: []corev1.VolumeMount{
Expand Down Expand Up @@ -168,7 +168,7 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) {
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1G"),
corev1.ResourceMemory: resource.MustParse("2G"),
},
},
},
Expand Down