Skip to content

Commit 921b399

Browse files
authored
Merge pull request #7068 from bskiba/cleanup-for-stockout-2
Extract getting nodes to delete for atomic node groups
2 parents 45dfba4 + 14b3357 commit 921b399

File tree

2 files changed

+83
-37
lines changed

2 files changed

+83
-37
lines changed

cluster-autoscaler/core/static_autoscaler.go

Lines changed: 32 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -743,7 +743,7 @@ func (a *StaticAutoscaler) removeOldUnregisteredNodes(allUnregisteredNodes []clu
743743
csr *clusterstate.ClusterStateRegistry, currentTime time.Time, logRecorder *utils.LogEventRecorder) (bool, error) {
744744

745745
nodeGroups := a.nodeGroupsById()
746-
nodesToBeDeletedByNodeGroupId := make(map[string][]clusterstate.UnregisteredNode)
746+
nodesToDeleteByNodeGroupId := make(map[string][]clusterstate.UnregisteredNode)
747747
for _, unregisteredNode := range allUnregisteredNodes {
748748
nodeGroup, err := a.CloudProvider.NodeGroupForNode(unregisteredNode.Node)
749749
if err != nil {
@@ -762,12 +762,12 @@ func (a *StaticAutoscaler) removeOldUnregisteredNodes(allUnregisteredNodes []clu
762762

763763
if unregisteredNode.UnregisteredSince.Add(maxNodeProvisionTime).Before(currentTime) {
764764
klog.V(0).Infof("Marking unregistered node %v for removal", unregisteredNode.Node.Name)
765-
nodesToBeDeletedByNodeGroupId[nodeGroup.Id()] = append(nodesToBeDeletedByNodeGroupId[nodeGroup.Id()], unregisteredNode)
765+
nodesToDeleteByNodeGroupId[nodeGroup.Id()] = append(nodesToDeleteByNodeGroupId[nodeGroup.Id()], unregisteredNode)
766766
}
767767
}
768768

769769
removedAny := false
770-
for nodeGroupId, unregisteredNodesToDelete := range nodesToBeDeletedByNodeGroupId {
770+
for nodeGroupId, unregisteredNodesToDelete := range nodesToDeleteByNodeGroupId {
771771
nodeGroup := nodeGroups[nodeGroupId]
772772

773773
klog.V(0).Infof("Removing %v unregistered nodes for node group %v", len(unregisteredNodesToDelete), nodeGroupId)
@@ -787,21 +787,11 @@ func (a *StaticAutoscaler) removeOldUnregisteredNodes(allUnregisteredNodes []clu
787787
}
788788
nodesToDelete := toNodes(unregisteredNodesToDelete)
789789

790-
opts, err := nodeGroup.GetOptions(a.NodeGroupDefaults)
791-
if err != nil && err != cloudprovider.ErrNotImplemented {
792-
klog.Warningf("Failed to get node group options for %s: %s", nodeGroupId, err)
790+
nodesToDelete, err = overrideNodesToDeleteForZeroOrMax(a.NodeGroupDefaults, nodeGroup, nodesToDelete)
791+
if err != nil {
792+
klog.Warningf("Failed to remove unregistered nodes from node group %s: %v", nodeGroupId, err)
793793
continue
794794
}
795-
// If a scale-up of "ZeroOrMaxNodeScaling" node group failed, the cleanup
796-
// should stick to the all-or-nothing principle. Deleting all nodes.
797-
if opts != nil && opts.ZeroOrMaxNodeScaling {
798-
instances, err := nodeGroup.Nodes()
799-
if err != nil {
800-
klog.Warningf("Failed to fill in unregistered nodes from group %s based on ZeroOrMaxNodeScaling option: %s", nodeGroupId, err)
801-
continue
802-
}
803-
nodesToDelete = instancesToFakeNodes(instances)
804-
}
805795

806796
err = nodeGroup.DeleteNodes(nodesToDelete)
807797
csr.InvalidateNodeInstancesCacheEntry(nodeGroup)
@@ -835,35 +825,19 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
835825
// We always schedule deleting of incoming errornous nodes
836826
// TODO[lukaszos] Consider adding logic to not retry delete every loop iteration
837827
nodeGroups := a.nodeGroupsById()
838-
nodesToBeDeletedByNodeGroupId := a.clusterStateRegistry.GetCreatedNodesWithErrors()
828+
nodesToDeleteByNodeGroupId := a.clusterStateRegistry.GetCreatedNodesWithErrors()
839829

840830
deletedAny := false
841831

842-
for nodeGroupId, nodesToBeDeleted := range nodesToBeDeletedByNodeGroupId {
832+
for nodeGroupId, nodesToDelete := range nodesToDeleteByNodeGroupId {
843833
var err error
844-
klog.V(1).Infof("Deleting %v from %v node group because of create errors", len(nodesToBeDeleted), nodeGroupId)
834+
klog.V(1).Infof("Deleting %v from %v node group because of create errors", len(nodesToDelete), nodeGroupId)
845835

846836
nodeGroup := nodeGroups[nodeGroupId]
847837
if nodeGroup == nil {
848838
err = fmt.Errorf("node group %s not found", nodeGroupId)
849-
} else {
850-
var opts *config.NodeGroupAutoscalingOptions
851-
opts, err = nodeGroup.GetOptions(a.NodeGroupDefaults)
852-
if err != nil && err != cloudprovider.ErrNotImplemented {
853-
klog.Warningf("Failed to get node group options for %s: %s", nodeGroupId, err)
854-
continue
855-
}
856-
// If a scale-up of "ZeroOrMaxNodeScaling" node group failed, the cleanup
857-
// should stick to the all-or-nothing principle. Deleting all nodes.
858-
if opts != nil && opts.ZeroOrMaxNodeScaling {
859-
instances, err := nodeGroup.Nodes()
860-
if err != nil {
861-
klog.Warningf("Failed to fill in failed nodes from group %s based on ZeroOrMaxNodeScaling option: %s", nodeGroupId, err)
862-
continue
863-
}
864-
nodesToBeDeleted = instancesToFakeNodes(instances)
865-
}
866-
err = nodeGroup.DeleteNodes(nodesToBeDeleted)
839+
} else if nodesToDelete, err = overrideNodesToDeleteForZeroOrMax(a.NodeGroupDefaults, nodeGroup, nodesToDelete); err == nil {
840+
err = nodeGroup.DeleteNodes(nodesToDelete)
867841
}
868842

869843
if err != nil {
@@ -877,6 +851,27 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
877851
return deletedAny
878852
}
879853

854+
// overrideNodesToDeleteForZeroOrMax returns a list of nodes to delete, taking into account that
855+
// node deletion for a "ZeroOrMaxNodeScaling" node group is atomic and should delete all nodes.
856+
// For a non-"ZeroOrMaxNodeScaling" node group it returns the unchanged list of nodes to delete.
857+
func overrideNodesToDeleteForZeroOrMax(defaults config.NodeGroupAutoscalingOptions, nodeGroup cloudprovider.NodeGroup, nodesToDelete []*apiv1.Node) ([]*apiv1.Node, error) {
858+
opts, err := nodeGroup.GetOptions(defaults)
859+
if err != nil && err != cloudprovider.ErrNotImplemented {
860+
return []*apiv1.Node{}, fmt.Errorf("Failed to get node group options for %s: %s", nodeGroup.Id(), err)
861+
}
862+
// If a scale-up of "ZeroOrMaxNodeScaling" node group failed, the cleanup
863+
// should stick to the all-or-nothing principle. Deleting all nodes.
864+
if opts != nil && opts.ZeroOrMaxNodeScaling {
865+
instances, err := nodeGroup.Nodes()
866+
if err != nil {
867+
return []*apiv1.Node{}, fmt.Errorf("Failed to fill in nodes to delete from group %s based on ZeroOrMaxNodeScaling option: %s", nodeGroup.Id(), err)
868+
}
869+
return instancesToFakeNodes(instances), nil
870+
}
871+
// No override needed.
872+
return nodesToDelete, nil
873+
}
874+
880875
// instancesToNodes returns a list of fake nodes with just names populated,
881876
// so that they can be passed as nodes to delete
882877
func instancesToFakeNodes(instances []cloudprovider.Instance) []*apiv1.Node {

cluster-autoscaler/core/static_autoscaler_test.go

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1910,6 +1910,57 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
19101910
}
19111911
return names["D1"] && names["D2"] && names["D3"]
19121912
}))
1913+
1914+
// Node group with getOptions error gets no deletes.
1915+
nodeGroupError := &mockprovider.NodeGroup{}
1916+
nodeGroupError.On("Exist").Return(true)
1917+
nodeGroupError.On("Autoprovisioned").Return(false)
1918+
nodeGroupError.On("TargetSize").Return(1, nil)
1919+
nodeGroupError.On("Id").Return("E")
1920+
nodeGroupError.On("DeleteNodes", mock.Anything).Return(nil)
1921+
nodeGroupError.On("GetOptions", options.NodeGroupDefaults).Return(nil, fmt.Errorf("Failed to get options"))
1922+
nodeGroupError.On("Nodes").Return([]cloudprovider.Instance{
1923+
{
1924+
Id: "E1",
1925+
Status: &cloudprovider.InstanceStatus{
1926+
State: cloudprovider.InstanceRunning,
1927+
},
1928+
},
1929+
{
1930+
1931+
Id: "E2",
1932+
Status: &cloudprovider.InstanceStatus{
1933+
State: cloudprovider.InstanceCreating,
1934+
ErrorInfo: &cloudprovider.InstanceErrorInfo{
1935+
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
1936+
ErrorCode: "QUOTA",
1937+
},
1938+
},
1939+
},
1940+
}, nil)
1941+
1942+
provider = &mockprovider.CloudProvider{}
1943+
provider.On("NodeGroups").Return([]cloudprovider.NodeGroup{nodeGroupError})
1944+
provider.On("NodeGroupForNode", mock.Anything).Return(
1945+
func(node *apiv1.Node) cloudprovider.NodeGroup {
1946+
if strings.HasPrefix(node.Spec.ProviderID, "E") {
1947+
return nodeGroupError
1948+
}
1949+
return nil
1950+
}, nil).Times(2)
1951+
1952+
clusterState = clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), nodeGroupConfigProcessor)
1953+
clusterState.RefreshCloudProviderNodeInstancesCache()
1954+
autoscaler.CloudProvider = provider
1955+
autoscaler.clusterStateRegistry = clusterState
1956+
// propagate nodes info in cluster state
1957+
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)
1958+
1959+
// delete nodes with create errors
1960+
removedNodes = autoscaler.deleteCreatedNodesWithErrors()
1961+
assert.False(t, removedNodes)
1962+
1963+
nodeGroupError.AssertNumberOfCalls(t, "DeleteNodes", 0)
19131964
}
19141965

19151966
type candidateTrackingFakePlanner struct {

0 commit comments

Comments
 (0)