
Commit 5a6ec6b

🐛 set taint on managedcluster before starting spoke cleanup (#76)
* fix: set taint on managedcluster before starting spoke cleanup
* fix: order of operations
* fix: use different taints for regular workloads vs addons
* chore: split out preflight cleanup to reduce complexity
* refactor: tighten up requeues
* chore: shorten requeues during deletion
* chore: make cluster drain optional, default to false
* fix: update helm chart
* test: update e2e test
* chore: rabbit
* chore: bump fcc to 0.1.2

Signed-off-by: Artur Shad Nik <[email protected]>
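In outline, spoke cleanup now happens in two phases: a workload-cleanup taint drains Placement-scheduled workloads while addons that tolerate it keep running, and a terminating taint later removes everything, addons included. A minimal sketch of the first phase as it might appear on a ManagedCluster mid-cleanup (the taint key comes from this commit; the cluster name and the effect are illustrative assumptions):

# Hypothetical ManagedCluster during the first cleanup phase. The workload-cleanup
# taint deschedules Placement-created workloads; addons that tolerate it keep running.
# The terminating taint (not shown) would later remove addons as well.
apiVersion: cluster.open-cluster-management.io/v1
kind: ManagedCluster
metadata:
  name: spoke-1                  # illustrative name
spec:
  hubAcceptsClient: true
  taints:
    - key: fleetconfig.open-cluster-management.io/workload-cleanup
      effect: NoSelect           # effect is an assumption, not taken from this commit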
1 parent 6e016a2 commit 5a6ec6b

File tree

13 files changed, +364 -210 lines changed


fleetconfig-controller/api/v1beta1/constants.go

Lines changed: 8 additions & 0 deletions
@@ -129,6 +129,14 @@ const (

     // AgentCleanupWatcherName is the name of the watcher for cleaning up the spoke agent.
     AgentCleanupWatcherName = "agent-cleanup-watcher"
+
+    // ManagedClusterWorkloadCleanupTaint is applied to a ManagedCluster to remove non-addon workloads.
+    // Addons can tolerate this taint to continue running during initial cleanup phase.
+    ManagedClusterWorkloadCleanupTaint = "fleetconfig.open-cluster-management.io/workload-cleanup"
+
+    // ManagedClusterTerminatingTaint is applied to remove all workloads including addons.
+    // Nothing should tolerate this taint - it signals final cluster termination.
+    ManagedClusterTerminatingTaint = "fleetconfig.open-cluster-management.io/terminating"
 )

 // SupportedInstanceTypes are the valid cluster types that the controller can be installed in.
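The comments above say that addons may tolerate the workload-cleanup taint. For a Placement-installed addon, that toleration would live on the Placement; a hedged sketch using the standard OCM Placement API (names and namespace are illustrative and not taken from this commit):

apiVersion: cluster.open-cluster-management.io/v1beta1
kind: Placement
metadata:
  name: addon-placement              # illustrative
  namespace: open-cluster-management # illustrative
spec:
  tolerations:
    # Keep scheduling this addon onto clusters carrying the workload-cleanup taint,
    # so it can reconcile deletion of other ManifestWorks during the first phase.
    - key: fleetconfig.open-cluster-management.io/workload-cleanup
      operator: Exists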

fleetconfig-controller/api/v1beta1/spoke_types.go

Lines changed: 9 additions & 1 deletion
@@ -89,7 +89,7 @@ type SpokeSpec struct {

 // CleanupConfig is the configuration for cleaning up resources during Spoke cleanup.
 type CleanupConfig struct {
-    // If true, the agent will attempt to garbage collect its own namespace after the spoke cluster is unjoined.
+    // If set, the agent will attempt to garbage collect its own namespace after the spoke cluster is unjoined.
     // +kubebuilder:default:=false
     // +optional
     PurgeAgentNamespace bool `json:"purgeAgentNamespace,omitempty"`
@@ -104,6 +104,14 @@ type CleanupConfig struct {
     // +kubebuilder:default:=false
     // +optional
     PurgeKubeconfigSecret bool `json:"purgeKubeconfigSecret,omitempty"`
+
+    // If set, all ManifestWorks which were created using a Placement will be automatically descheduled from the Spoke cluster during deletion.
+    // This includes AddOns installed using installStrategy.type=Placements. If an AddOn must stay running to reconcile deletion of other ManifestWorks,
+    // it should tolerate the `fleetconfig.open-cluster-management.io/workload-cleanup` taint.
+    // Manually created ManifestWorks will not be affected and must be manually cleaned up for Spoke deletion to proceed.
+    // +kubebuilder:default:=false
+    // +optional
+    ForceClusterDrain bool `json:"forceClusterDrain,omitempty"`
 }

 // HubRef is the information required to get a Hub resource.
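For reference, a sketch of how the new field surfaces on a Spoke resource. The API group, version, and field names come from this commit's types and CRD; the metadata and hubRef values are illustrative:

apiVersion: fleetconfig.open-cluster-management.io/v1beta1
kind: Spoke
metadata:
  name: spoke-1                  # illustrative
  namespace: fleetconfig-system  # illustrative
spec:
  hubRef:
    name: hub                    # illustrative
  cleanupConfig:
    purgeKlusterletOperator: true
    purgeKubeconfigSecret: false
    purgeAgentNamespace: false
    forceClusterDrain: true      # opt in to descheduling Placement-created ManifestWorks on deletion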

fleetconfig-controller/charts/fleetconfig-controller/README.md

Lines changed: 74 additions & 73 deletions
Large diffs are not rendered by default.

fleetconfig-controller/charts/fleetconfig-controller/crds/fleetconfig.open-cluster-management.io_spokes.yaml

Lines changed: 9 additions & 1 deletion
@@ -73,9 +73,17 @@ spec:
             description: CleanupConfig is used to configure which resources should
               be automatically garbage collected during cleanup.
             properties:
+              forceClusterDrain:
+                default: false
+                description: |-
+                  If set, all ManifestWorks which were created using a Placement will be automatically descheduled from the Spoke cluster during deletion.
+                  This includes AddOns installed using installStrategy.type=Placements. If an AddOn must stay running to reconcile deletion of other ManifestWorks,
+                  it should tolerate the `fleetconfig.open-cluster-management.io/workload-cleanup` taint.
+                  Manually created ManifestWorks will not be affected and must be manually cleaned up for Spoke deletion to proceed.
+                type: boolean
               purgeAgentNamespace:
                 default: false
-                description: If true, the agent will attempt to garbage collect
+                description: If set, the agent will attempt to garbage collect
                   its own namespace after the spoke cluster is unjoined.
                 type: boolean
               purgeKlusterletOperator:

fleetconfig-controller/charts/fleetconfig-controller/templates/fleetconfig.yaml

Lines changed: 6 additions & 1 deletion
@@ -75,9 +75,14 @@ metadata:
 spec:
   {{- with .cleanupConfig }}
   cleanupConfig:
-    purgeKlusterletOperator: {{ .purgeKlusterletOperator | default true }}
+    {{- if hasKey . "purgeKlusterletOperator" }}
+    purgeKlusterletOperator: {{ .purgeKlusterletOperator }}
+    {{- else }}
+    purgeKlusterletOperator: true
+    {{- end }}
     purgeKubeconfigSecret: {{ .purgeKubeconfigSecret | default false }}
     purgeAgentNamespace: {{ .purgeAgentNamespace | default false }}
+    forceClusterDrain: {{ .forceClusterDrain | default false }}
   {{- end }}
   hubRef:
     name: {{ .hubRef.name }}
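The switch from `| default true` to a `hasKey` guard matters for booleans: Helm's `default` treats `false` as empty, so an explicit `purgeKlusterletOperator: false` in values would previously have rendered as `true`. With the guard, an explicit value is passed through and `true` remains the fallback only when the key is omitted. An illustrative values fragment (structure inferred from this chart's values.yaml; other required spoke fields omitted):

fleetConfig:
  spokes:
    - name: spoke-1              # illustrative entry
      cleanupConfig:
        purgeKlusterletOperator: false   # now honored instead of being coerced back to true
        forceClusterDrain: false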

fleetconfig-controller/charts/fleetconfig-controller/values.yaml

Lines changed: 3 additions & 1 deletion
@@ -184,6 +184,7 @@ fleetConfig:
   ## @param fleetConfig.spokes[0].cleanupConfig.purgeKlusterletOperator If set, the klusterlet operator will be purged and all open-cluster-management namespaces deleted when the klusterlet is unjoined from its Hub cluster.
   ## @param fleetConfig.spokes[0].cleanupConfig.purgeKubeconfigSecret If set, the kubeconfig secret will be automatically deleted after the agent has taken over managing the Spoke.
   ## @param fleetConfig.spokes[0].cleanupConfig.purgeAgentNamespace If true, the agent will attempt to garbage collect its own namespace after the spoke cluster is unjoined.
+  ## @param fleetConfig.spokes[0].cleanupConfig.forceClusterDrain If set, all ManifestWorks which were created using a Placement will be automatically descheduled from the Spoke cluster during deletion. This includes AddOns installed using installStrategy.type=Placements. If an AddOn must stay running to reconcile deletion of other ManifestWorks, it should tolerate the `fleetconfig.open-cluster-management.io/workload-cleanup` taint. Manually created ManifestWorks will not be affected and must be manually cleaned up for Spoke deletion to proceed.
   ## @param fleetConfig.spokes[0].kubeconfig.context The context to use in the kubeconfig file. Leave empty to use the current context.
   ## @param fleetConfig.spokes[0].kubeconfig.inCluster If set, the kubeconfig will be read from the cluster. Only applicable for same-cluster operations.
   ## @param fleetConfig.spokes[0].kubeconfig.secretReference.name The name of the secret.
@@ -216,6 +217,7 @@ fleetConfig:
       purgeKlusterletOperator: true
       purgeKubeconfigSecret: true
       purgeAgentNamespace: true
+      forceClusterDrain: false
     ## Kubeconfig details for the Spoke cluster.
     kubeconfig:
       context: ""
@@ -292,7 +294,7 @@ imageRegistry: ""
 ## @param image.pullPolicy Image pull policy
 image:
   repository: quay.io/open-cluster-management/fleetconfig-controller
-  tag: v0.1.1
+  tag: v0.1.2
   pullPolicy: IfNotPresent

 ## @param imagePullSecrets Image pull secrets

fleetconfig-controller/devspace.yaml

Lines changed: 6 additions & 0 deletions
@@ -117,6 +117,8 @@ deployments:
           enabled: ${FLEETCONFIG_ENABLED}
           addonMode: ${ADDON_MODE}
           enableLegacyControllers: ${ENABLE_LEGACY_CONTROLLERS}
+        cert-manager:
+          enabled: false
       valuesFiles:
         - ${CONTEXT}/charts/fleetconfig-controller/values.yaml
       updateImageTags: false
@@ -131,6 +133,8 @@ deployments:
           enabled: ${FLEETCONFIG_ENABLED}
           addonMode: ${ADDON_MODE}
           enableLegacyControllers: ${ENABLE_LEGACY_CONTROLLERS}
+        cert-manager:
+          enabled: false
       valuesFiles:
         - ${CONTEXT}/charts/fleetconfig-controller/values.yaml

@@ -146,6 +150,8 @@ deployments:
             tag: local
           addonMode: ${ADDON_MODE}
          enableLegacyControllers: ${ENABLE_LEGACY_CONTROLLERS}
+        cert-manager:
+          enabled: false
       valuesFiles:
         - ${CONTEXT}/charts/fleetconfig-controller/values.yaml
         - ${CONTEXT}/test/data/fleetconfig-values.yaml

fleetconfig-controller/internal/controller/v1beta1/constants.go

Lines changed: 8 additions & 3 deletions
@@ -7,9 +7,14 @@ import (

 // generic
 const (
-    clusteradm     = "clusteradm"
-    requeue        = 30 * time.Second
-    amwExistsError = "you should manually clean them, uninstall kluster will cause those works out of control."
+    clusteradm = "clusteradm"
+
+    hubRequeuePreInit    = 30 * time.Second
+    hubRequeuePostInit   = 2 * time.Minute
+    requeueDeleting      = 5 * time.Second
+    spokeRequeuePreJoin  = 15 * time.Second
+    spokeRequeuePostJoin = 1 * time.Minute
+    spokeWatchInterval   = 30 * time.Second
 )

 var csrSuffixPattern = regexp.MustCompile(`-[a-zA-Z0-9]{5}$`)

fleetconfig-controller/internal/controller/v1beta1/hub_controller.go

Lines changed: 39 additions & 35 deletions
@@ -94,7 +94,7 @@ func (r *HubReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
     // Add a finalizer and requeue if not already present
     if !slices.Contains(hub.Finalizers, v1beta1.HubCleanupFinalizer) {
         hub.Finalizers = append(hub.Finalizers, v1beta1.HubCleanupFinalizer)
-        return ret(ctx, ctrl.Result{RequeueAfter: requeue}, nil)
+        return ret(ctx, ctrl.Result{RequeueAfter: hubRequeuePreInit}, nil)
     }

     hubKubeconfig, err := kube.KubeconfigFromSecretOrCluster(ctx, r.Client, hub.Spec.Kubeconfig, hub.Namespace)
@@ -106,20 +106,22 @@ func (r *HubReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
     if !hub.DeletionTimestamp.IsZero() {
         if hub.Status.Phase != v1beta1.Deleting {
             hub.Status.Phase = v1beta1.Deleting
-            return ret(ctx, ctrl.Result{RequeueAfter: requeue}, nil)
+            return ret(ctx, ctrl.Result{RequeueAfter: requeueDeleting}, nil)
         }

         if slices.Contains(hub.Finalizers, v1beta1.HubCleanupFinalizer) {
-            if err := r.cleanHub(ctx, hub, hubKubeconfig); err != nil {
+            requeue, err := r.cleanHub(ctx, hub, hubKubeconfig)
+            if err != nil {
                 hub.SetConditions(true, v1beta1.NewCondition(
                     err.Error(), v1beta1.CleanupFailed, metav1.ConditionTrue, metav1.ConditionFalse,
                 ))
                 return ret(ctx, ctrl.Result{}, err)
             }
+            if requeue {
+                return ret(ctx, ctrl.Result{RequeueAfter: requeueDeleting}, nil)
+            }
         }
-        hub.Finalizers = slices.DeleteFunc(hub.Finalizers, func(s string) bool {
-            return s == v1beta1.HubCleanupFinalizer
-        })
+
         // end reconciliation
         return ret(ctx, ctrl.Result{}, nil)
     }
@@ -145,7 +147,7 @@ func (r *HubReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R

     if previousPhase == "" {
         // set initial phase/conditions and requeue
-        return ret(ctx, ctrl.Result{RequeueAfter: requeue}, nil)
+        return ret(ctx, ctrl.Result{RequeueAfter: hubRequeuePreInit}, nil)
     }

     // Handle Hub cluster: initialization and/or upgrade
@@ -155,22 +157,22 @@ func (r *HubReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
     }
     hubInitializedCond := hub.GetCondition(v1beta1.HubInitialized)
     if hubInitializedCond == nil || hubInitializedCond.Status == metav1.ConditionFalse {
-        return ret(ctx, ctrl.Result{RequeueAfter: requeue}, nil)
+        return ret(ctx, ctrl.Result{RequeueAfter: hubRequeuePreInit}, nil)
     }

     // Finalize phase
     for _, c := range hub.Status.Conditions {
         if c.Status != c.WantStatus {
             logger.Info("WARNING: condition does not have the desired status", "type", c.Type, "reason", c.Reason, "message", c.Message, "status", c.Status, "wantStatus", c.WantStatus)
             hub.Status.Phase = v1beta1.Unhealthy
-            return ret(ctx, ctrl.Result{RequeueAfter: requeue}, nil)
+            return ret(ctx, ctrl.Result{RequeueAfter: hubRequeuePreInit}, nil)
         }
     }
     if hub.Status.Phase == v1beta1.HubStarting {
         hub.Status.Phase = v1beta1.HubRunning
     }

-    return ret(ctx, ctrl.Result{RequeueAfter: requeue}, nil)
+    return ret(ctx, ctrl.Result{RequeueAfter: hubRequeuePostInit}, nil)
 }

 type contextKey int
@@ -185,56 +187,55 @@ func withOriginalHub(ctx context.Context, hub *v1beta1.Hub) context.Context {
 }

 // cleanup cleans up a Hub and its associated resources.
-func (r *HubReconciler) cleanHub(ctx context.Context, hub *v1beta1.Hub, hubKubeconfig []byte) error {
+func (r *HubReconciler) cleanHub(ctx context.Context, hub *v1beta1.Hub, hubKubeconfig []byte) (bool, error) {
     logger := log.FromContext(ctx)
     logger.V(0).Info("cleanHub", "hub", hub.Name)

     // Check if there are any Spokes that need to be deleted
     spokeList := &v1beta1.SpokeList{}
     err := r.List(ctx, spokeList)
     if err != nil {
-        return err
+        return true, err
     }

-    spokes := spokeList.Items
-    if len(spokes) > 0 {
-        // Mark all Spokes for deletion if they haven't been deleted yet
-        for i := range spokes {
-            spoke := &spokes[i]
-            if spoke.DeletionTimestamp.IsZero() {
-                if !spoke.IsManagedBy(hub.ObjectMeta) {
-                    continue
-                }
-                logger.Info("Marking Spoke for deletion", "spoke", spoke.Name)
-                if err := r.Delete(ctx, spoke); err != nil && !kerrs.IsNotFound(err) {
-                    return fmt.Errorf("failed to delete spoke %s: %w", spoke.Name, err)
-                }
+    managedRemaining := 0
+    for i := range spokeList.Items {
+        s := &spokeList.Items[i]
+        if !s.IsManagedBy(hub.ObjectMeta) {
+            continue
+        }
+        if s.DeletionTimestamp.IsZero() {
+            logger.Info("Marking Spoke for deletion", "spoke", s.Name)
+            if err := r.Delete(ctx, s); err != nil && !kerrs.IsNotFound(err) {
+                return true, fmt.Errorf("failed to delete spoke %s: %w", s.Name, err)
             }
         }
-
-        logger.V(1).Info("Waiting for all Spokes to be deleted before proceeding with Hub cleanup",
-            "remainingSpokes", len(spokes))
-        // Return a retriable error to requeue and check again later
-        return fmt.Errorf("waiting for background spoke deletion. Remaining: %d spokes", len(spokes))
+        // Count managed spokes until they're fully deleted
+        managedRemaining++
+    }
+    if managedRemaining > 0 {
+        logger.V(1).Info("Waiting for managed Spokes to be deleted before proceeding with Hub cleanup",
+            "remainingSpokes", managedRemaining)
+        return true, nil
     }

     logger.Info("All Spokes have been deleted, proceeding with Hub cleanup")

     addonC, err := common.AddOnClient(hubKubeconfig)
     if err != nil {
-        return fmt.Errorf("failed to create addon client for cleanup: %w", err)
+        return true, fmt.Errorf("failed to create addon client for cleanup: %w", err)
     }

     hubCopy := hub.DeepCopy()
     hubCopy.Spec.AddOnConfigs = nil
     hubCopy.Spec.HubAddOns = nil
     _, err = handleAddonConfig(ctx, r.Client, addonC, hubCopy)
     if err != nil {
-        return err
+        return true, err
     }
     _, err = handleHubAddons(ctx, addonC, hubCopy)
     if err != nil {
-        return err
+        return true, err
     }

     purgeOperator := false
@@ -253,12 +254,15 @@ func (r *HubReconciler) cleanHub(ctx context.Context, hub *v1beta1.Hub, hubKubec
     stdout, stderr, err := exec_utils.CmdWithLogs(ctx, cmd, "waiting for 'clusteradm clean' to complete...")
     if err != nil {
         out := append(stdout, stderr...)
-        return fmt.Errorf("failed to clean hub cluster: %v, output: %s", err, string(out))
+        return true, fmt.Errorf("failed to clean hub cluster: %v, output: %s", err, string(out))
     }
     logger.V(1).Info("hub cleaned", "output", string(stdout))

-    return nil
+    hub.Finalizers = slices.DeleteFunc(hub.Finalizers, func(s string) bool {
+        return s == v1beta1.HubCleanupFinalizer
+    })

+    return false, nil
 }

 // handleHub manages Hub cluster init and upgrade operations
