Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions api/v1beta2/appwrapper_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ const (
)

const (
AdmissionGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/admissionGracePeriodDuration"
WarmupGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/warmupGracePeriodDuration"
FailureGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/failureGracePeriodDuration"
ResetPauseDurationAnnotation = "workload.codeflare.dev.appwrapper/resetPauseDuration"
Expand Down
20 changes: 18 additions & 2 deletions internal/controller/appwrapper/appwrapper_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -261,8 +261,13 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
podDetailsMessage := fmt.Sprintf("%v pods pending; %v pods running; %v pods succeeded", podStatus.pending, podStatus.running, podStatus.succeeded)
clearCondition(aw, workloadv1beta2.PodsReady, "InsufficientPodsReady", podDetailsMessage)
whenDeployed := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.ResourcesDeployed)).LastTransitionTime
warmupDuration := r.warmupGraceDuration(ctx, aw)
if time.Now().Before(whenDeployed.Add(warmupDuration)) {
var graceDuration time.Duration
if podStatus.pending+podStatus.running+podStatus.succeeded >= podStatus.expected {
graceDuration = r.warmupGraceDuration(ctx, aw)
} else {
graceDuration = r.admissionGraceDuration(ctx, aw)
}
if time.Now().Before(whenDeployed.Add(graceDuration)) {
return ctrl.Result{RequeueAfter: 5 * time.Second}, r.Status().Update(ctx, aw)
} else {
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
Expand Down Expand Up @@ -459,6 +464,17 @@ func (r *AppWrapperReconciler) limitDuration(desired time.Duration) time.Duratio
}
}

// admissionGraceDuration returns the admission grace period that applies to aw.
//
// If aw carries the AdmissionGracePeriodDurationAnnotation and its value parses
// as a time.Duration, that value is used; otherwise the configured
// FaultTolerance.AdmissionGracePeriod is used. Either way the result is passed
// through limitDuration before being returned. A malformed annotation is logged
// and otherwise ignored.
func (r *AppWrapperReconciler) admissionGraceDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
	if userPeriod, ok := aw.Annotations[workloadv1beta2.AdmissionGracePeriodDurationAnnotation]; ok {
		duration, err := time.ParseDuration(userPeriod)
		if err == nil {
			return r.limitDuration(duration)
		}
		// The annotation is present but unparsable: report it under the correct
		// parameter name and fall back to the configured default below.
		log.FromContext(ctx).Info("Malformed admission grace period annotation", "annotation", userPeriod, "error", err)
	}
	return r.limitDuration(r.Config.FaultTolerance.AdmissionGracePeriod)
}

func (r *AppWrapperReconciler) warmupGraceDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
if userPeriod, ok := aw.Annotations[workloadv1beta2.WarmupGracePeriodDurationAnnotation]; ok {
if duration, err := time.ParseDuration(userPeriod); err == nil {
Expand Down
34 changes: 20 additions & 14 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,14 @@ type AppWrapperConfig struct {
}

// FaultToleranceConfig groups the fault-tolerance settings: the admission,
// warmup, failure, and deletion grace periods, the reset pause, the retry
// limit, and the GracePeriodCeiling and SuccessTTLCeiling limits.
// NOTE(review): this listing appears to come from a diff view, so both the
// pre-change field list (without AdmissionGracePeriod) and the post-change
// field list are shown below — confirm against the actual file.
type FaultToleranceConfig struct {
WarmupGracePeriod time.Duration `json:"warmupGracePeriod,omitempty"`
FailureGracePeriod time.Duration `json:"failureGracePeriod,omitempty"`
ResetPause time.Duration `json:"resetPause,omitempty"`
RetryLimit int32 `json:"retryLimit,omitempty"`
DeletionGracePeriod time.Duration `json:"deletionGracePeriod,omitempty"`
GracePeriodCeiling time.Duration `json:"gracePeriodCeiling,omitempty"`
SuccessTTLCeiling time.Duration `json:"successTTLCeiling,omitempty"`
AdmissionGracePeriod time.Duration `json:"admissionGracePeriod,omitempty"`
WarmupGracePeriod time.Duration `json:"warmupGracePeriod,omitempty"`
FailureGracePeriod time.Duration `json:"failureGracePeriod,omitempty"`
ResetPause time.Duration `json:"resetPause,omitempty"`
RetryLimit int32 `json:"retryLimit,omitempty"`
DeletionGracePeriod time.Duration `json:"deletionGracePeriod,omitempty"`
GracePeriodCeiling time.Duration `json:"gracePeriodCeiling,omitempty"`
SuccessTTLCeiling time.Duration `json:"successTTLCeiling,omitempty"`
}

type CertManagementConfig struct {
Expand Down Expand Up @@ -81,13 +82,14 @@ func NewAppWrapperConfig() *AppWrapperConfig {
DisableChildAdmissionCtrl: false,
UserRBACAdmissionCheck: true,
FaultTolerance: &FaultToleranceConfig{
WarmupGracePeriod: 5 * time.Minute,
FailureGracePeriod: 1 * time.Minute,
ResetPause: 90 * time.Second,
RetryLimit: 3,
DeletionGracePeriod: 10 * time.Minute,
GracePeriodCeiling: 24 * time.Hour,
SuccessTTLCeiling: 7 * 24 * time.Hour,
AdmissionGracePeriod: 1 * time.Minute,
WarmupGracePeriod: 5 * time.Minute,
FailureGracePeriod: 1 * time.Minute,
ResetPause: 90 * time.Second,
RetryLimit: 3,
DeletionGracePeriod: 10 * time.Minute,
GracePeriodCeiling: 24 * time.Hour,
SuccessTTLCeiling: 7 * 24 * time.Hour,
},
}
}
Expand All @@ -105,6 +107,10 @@ func ValidateAppWrapperConfig(config *AppWrapperConfig) error {
return fmt.Errorf("FailureGracePeriod %v exceeds GracePeriodCeiling %v",
config.FaultTolerance.FailureGracePeriod, config.FaultTolerance.GracePeriodCeiling)
}
if config.FaultTolerance.AdmissionGracePeriod > config.FaultTolerance.GracePeriodCeiling {
return fmt.Errorf("AdmissionGracePeriod %v exceeds GracePeriodCeiling %v",
config.FaultTolerance.AdmissionGracePeriod, config.FaultTolerance.GracePeriodCeiling)
}
if config.FaultTolerance.WarmupGracePeriod > config.FaultTolerance.GracePeriodCeiling {
return fmt.Errorf("WarmupGracePeriod %v exceeds GracePeriodCeiling %v",
config.FaultTolerance.WarmupGracePeriod, config.FaultTolerance.GracePeriodCeiling)
Expand Down
34 changes: 19 additions & 15 deletions site/_pages/arch-fault-tolerance.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,17 @@ classes: wide
### Overall Design

The `podSets` contained in the AppWrapper specification enable the AppWrapper
controller to inject labels into every `Pod` that is created by
controller to inject labels into every Pod that is created by
the workload during its execution. Throughout the execution of the
workload, the AppWrapper controller monitors the number and health of
all labeled `Pods` and uses this information to determine if a
workload is unhealthy. A workload can be deemed *unhealthy* either
because it contains a non-zero number of `Failed` pods or because
after the `WarmupGracePeriod` has passed and it has fewer
`Running` and `Completed` pods than expected.
all labeled Pods and uses this information to determine if a
workload is unhealthy. A workload can be deemed *unhealthy* if any of
the following conditions are true:
+ There is a non-zero number of `Failed` Pods.
+ It takes longer than the `AdmissionGracePeriod` for the expected
number of Pods to at least reach the `Pending` state.
+ It takes longer than the `WarmupGracePeriod` for the expected
number of Pods to at least reach the `Running` state.

If a workload is determined to be unhealthy, the AppWrapper controller
first waits for a `FailureGracePeriod` to allow the primary resource
Expand Down Expand Up @@ -54,15 +57,16 @@ and can be customized on a per-AppWrapper basis by adding annotations.
The table below lists the parameters, gives their default, and the annotation that
can be used to customize them.

| Parameter | Default Value | Annotation |
|---------------------|---------------|---------------------------------------------------------------|
| WarmupGracePeriod | 5 Minutes | workload.codeflare.dev.appwrapper/warmupGracePeriodDuration |
| FailureGracePeriod | 1 Minute | workload.codeflare.dev.appwrapper/failureGracePeriodDuration |
| ResetPause | 90 Seconds | workload.codeflare.dev.appwrapper/resetPauseDuration |
| RetryLimit | 3 | workload.codeflare.dev.appwrapper/retryLimit |
| DeletionGracePeriod | 10 Minutes | workload.codeflare.dev.appwrapper/deletionGracePeriodDuration |
| GracePeriodCeiling | 24 Hours | Not Applicable |
| SuccessTTLCeiling | 7 Days | workload.codeflare.dev.appwrapper/successTTLDuration |
| Parameter | Default Value | Annotation |
|------------------------|---------------|------------------------------------------------------------------|
| AdmissionGracePeriod | 1 Minute | workload.codeflare.dev.appwrapper/admissionGracePeriodDuration |
| WarmupGracePeriod | 5 Minutes | workload.codeflare.dev.appwrapper/warmupGracePeriodDuration |
| FailureGracePeriod | 1 Minute | workload.codeflare.dev.appwrapper/failureGracePeriodDuration |
| ResetPause | 90 Seconds | workload.codeflare.dev.appwrapper/resetPauseDuration |
| RetryLimit | 3 | workload.codeflare.dev.appwrapper/retryLimit |
| DeletionGracePeriod | 10 Minutes | workload.codeflare.dev.appwrapper/deletionGracePeriodDuration |
| GracePeriodCeiling | 24 Hours | Not Applicable |
| SuccessTTLCeiling | 7 Days | workload.codeflare.dev.appwrapper/successTTLDuration |


The `GracePeriodCeiling` imposes an upper limit on the other grace periods to
Expand Down