Skip to content

Commit feeb1f7

Browse files
norbertcyran authored and MaximilianoUribe committed
Force delete nodes with errors
DeleteNodes checks whether the node group is below its min size. If it is and a scale-up fails, CA is unable to clean up the instances with errors, which causes CA to end up in an error loop. Use ForceDeleteNodes instead, so that the min size is not validated when removing failed instances.
1 parent adadd3d commit feeb1f7

File tree

3 files changed

+372
-366
lines changed

3 files changed

+372
-366
lines changed

cluster-autoscaler/config/autoscaling_options.go

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -309,6 +309,8 @@ type AutoscalingOptions struct {
309309
CheckCapacityProvisioningRequestBatchTimebox time.Duration
310310
// ForceDeleteLongUnregisteredNodes is used to enable/disable ignoring min size constraints during removal of long unregistered nodes
311311
ForceDeleteLongUnregisteredNodes bool
312+
// ForceDeleteFailedNodes is used to enable/disable ignoring min size constraints during removal of failed nodes
313+
ForceDeleteFailedNodes bool
312314
// DynamicResourceAllocationEnabled configures whether logic for handling DRA objects is enabled.
313315
DynamicResourceAllocationEnabled bool
314316
}

cluster-autoscaler/core/static_autoscaler.go

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -866,8 +866,15 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() {
866866
nodeGroup := nodeGroups[nodeGroupId]
867867
if nodeGroup == nil {
868868
err = fmt.Errorf("node group %s not found", nodeGroupId)
869-
} else if nodesToDelete, err = overrideNodesToDeleteForZeroOrMax(a.NodeGroupDefaults, nodeGroup, nodesToDelete); err == nil {
870-
err = nodeGroup.DeleteNodes(nodesToDelete)
869+
} else if nodesToDelete, err = overrideNodesToDeleteForZeroOrMax(a.NodeGroupDefaults, nodeGroup, nodesToDelete); err == nil && len(nodesToDelete) > 0 {
870+
if a.ForceDeleteFailedNodes {
871+
err = nodeGroup.ForceDeleteNodes(nodesToDelete)
872+
if errors.Is(err, cloudprovider.ErrNotImplemented) {
873+
err = nodeGroup.DeleteNodes(nodesToDelete)
874+
}
875+
} else {
876+
err = nodeGroup.DeleteNodes(nodesToDelete)
877+
}
871878
}
872879

873880
if err != nil {

0 commit comments

Comments
 (0)