Skip to content

Commit d67f22b

Browse files
authored
Merge pull request #7103 from abdelrahman882/async-node-group-creation
Async node group creation
2 parents 5ba61cb + e30bf14 commit d67f22b

File tree

18 files changed

+630
-144
lines changed

18 files changed

+630
-144
lines changed

cluster-autoscaler/cloudprovider/test/test_cloud_provider.go

Lines changed: 19 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -214,62 +214,51 @@ func (tcp *TestCloudProvider) NewNodeGroup(machineType string, labels map[string
214214
}, nil
215215
}
216216

217-
// NewNodeGroupWithId creates a new node group with custom ID suffix.
218-
func (tcp *TestCloudProvider) NewNodeGroupWithId(machineType string, labels map[string]string, systemLabels map[string]string,
219-
taints []apiv1.Taint, extraResources map[string]resource.Quantity, id string) (cloudprovider.NodeGroup, error) {
220-
return &TestNodeGroup{
221-
cloudProvider: tcp,
222-
id: "autoprovisioned-" + machineType + "-" + id,
223-
minSize: 0,
224-
maxSize: 1000,
225-
targetSize: 0,
226-
exist: false,
227-
autoprovisioned: true,
228-
machineType: machineType,
229-
labels: labels,
230-
taints: taints,
231-
}, nil
232-
}
233-
234-
// InsertNodeGroup adds already created node group to test cloud provider.
235-
func (tcp *TestCloudProvider) InsertNodeGroup(nodeGroup cloudprovider.NodeGroup) {
236-
tcp.Lock()
237-
defer tcp.Unlock()
238-
239-
tcp.groups[nodeGroup.Id()] = nodeGroup
240-
}
241-
242217
// BuildNodeGroup returns a test node group.
243-
func (tcp *TestCloudProvider) BuildNodeGroup(id string, min, max, size int, autoprovisioned bool, machineType string, opts *config.NodeGroupAutoscalingOptions) *TestNodeGroup {
218+
func (tcp *TestCloudProvider) BuildNodeGroup(id string, min, max, size int, exists bool, autoprovisioned bool, machineType string, opts *config.NodeGroupAutoscalingOptions) *TestNodeGroup {
244219
return &TestNodeGroup{
245220
cloudProvider: tcp,
246221
id: id,
247222
minSize: min,
248223
maxSize: max,
249224
targetSize: size,
250-
exist: true,
225+
exist: exists,
251226
autoprovisioned: autoprovisioned,
252227
machineType: machineType,
253228
opts: opts,
254229
}
255230
}
256231

232+
// InsertNodeGroup adds already created node group to test cloud provider.
233+
func (tcp *TestCloudProvider) InsertNodeGroup(nodeGroup cloudprovider.NodeGroup) {
234+
tcp.Lock()
235+
defer tcp.Unlock()
236+
237+
tcp.groups[nodeGroup.Id()] = nodeGroup
238+
}
239+
257240
// AddNodeGroup adds node group to test cloud provider.
258241
func (tcp *TestCloudProvider) AddNodeGroup(id string, min int, max int, size int) {
259-
nodeGroup := tcp.BuildNodeGroup(id, min, max, size, false, "", nil)
242+
nodeGroup := tcp.BuildNodeGroup(id, min, max, size, true, false, "", nil)
243+
tcp.InsertNodeGroup(nodeGroup)
244+
}
245+
246+
// AddUpcomingNodeGroup adds upcoming node group to test cloud provider.
247+
func (tcp *TestCloudProvider) AddUpcomingNodeGroup(id string, min int, max int, size int) {
248+
nodeGroup := tcp.BuildNodeGroup(id, min, max, size, false, false, "", nil)
260249
tcp.InsertNodeGroup(nodeGroup)
261250
}
262251

263252
// AddNodeGroupWithCustomOptions adds node group with custom options
264253
// to test cloud provider.
265254
func (tcp *TestCloudProvider) AddNodeGroupWithCustomOptions(id string, min int, max int, size int, opts *config.NodeGroupAutoscalingOptions) {
266-
nodeGroup := tcp.BuildNodeGroup(id, min, max, size, false, "", opts)
255+
nodeGroup := tcp.BuildNodeGroup(id, min, max, size, true, false, "", opts)
267256
tcp.InsertNodeGroup(nodeGroup)
268257
}
269258

270259
// AddAutoprovisionedNodeGroup adds node group to test cloud provider.
271260
func (tcp *TestCloudProvider) AddAutoprovisionedNodeGroup(id string, min int, max int, size int, machineType string) *TestNodeGroup {
272-
nodeGroup := tcp.BuildNodeGroup(id, min, max, size, true, machineType, nil)
261+
nodeGroup := tcp.BuildNodeGroup(id, min, max, size, true, true, machineType, nil)
273262
tcp.InsertNodeGroup(nodeGroup)
274263
return nodeGroup
275264
}

cluster-autoscaler/clusterstate/clusterstate.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
"k8s.io/autoscaler/cluster-autoscaler/clusterstate/utils"
3030
"k8s.io/autoscaler/cluster-autoscaler/metrics"
3131
"k8s.io/autoscaler/cluster-autoscaler/processors/nodegroupconfig"
32+
"k8s.io/autoscaler/cluster-autoscaler/processors/nodegroups/asyncnodegroups"
3233
"k8s.io/autoscaler/cluster-autoscaler/utils/backoff"
3334
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
3435
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
@@ -141,6 +142,7 @@ type ClusterStateRegistry struct {
141142
cloudProviderNodeInstancesCache *utils.CloudProviderNodeInstancesCache
142143
interrupt chan struct{}
143144
nodeGroupConfigProcessor nodegroupconfig.NodeGroupConfigProcessor
145+
asyncNodeGroupStateChecker asyncnodegroups.AsyncNodeGroupStateChecker
144146

145147
// scaleUpFailures contains information about scale-up failures for each node group. It should be
146148
// cleared periodically to avoid unnecessary accumulation.
@@ -155,7 +157,7 @@ type NodeGroupScalingSafety struct {
155157
}
156158

157159
// NewClusterStateRegistry creates new ClusterStateRegistry.
158-
func NewClusterStateRegistry(cloudProvider cloudprovider.CloudProvider, config ClusterStateRegistryConfig, logRecorder *utils.LogEventRecorder, backoff backoff.Backoff, nodeGroupConfigProcessor nodegroupconfig.NodeGroupConfigProcessor) *ClusterStateRegistry {
160+
func NewClusterStateRegistry(cloudProvider cloudprovider.CloudProvider, config ClusterStateRegistryConfig, logRecorder *utils.LogEventRecorder, backoff backoff.Backoff, nodeGroupConfigProcessor nodegroupconfig.NodeGroupConfigProcessor, asyncNodeGroupStateChecker asyncnodegroups.AsyncNodeGroupStateChecker) *ClusterStateRegistry {
159161
return &ClusterStateRegistry{
160162
scaleUpRequests: make(map[string]*ScaleUpRequest),
161163
scaleDownRequests: make([]*ScaleDownRequest, 0),
@@ -175,6 +177,7 @@ func NewClusterStateRegistry(cloudProvider cloudprovider.CloudProvider, config C
175177
interrupt: make(chan struct{}),
176178
scaleUpFailures: make(map[string][]ScaleUpFailure),
177179
nodeGroupConfigProcessor: nodeGroupConfigProcessor,
180+
asyncNodeGroupStateChecker: asyncNodeGroupStateChecker,
178181
}
179182
}
180183

@@ -684,6 +687,11 @@ func (csr *ClusterStateRegistry) updateIncorrectNodeGroupSizes(currentTime time.
684687
klog.Warningf("Acceptable range for node group %s not found", nodeGroup.Id())
685688
continue
686689
}
690+
if csr.asyncNodeGroupStateChecker.IsUpcoming(nodeGroup) {
691+
// Nodes for upcoming node groups reside in-memory and wait for node group to be fully
692+
// created. There is no need to mark their sizes incorrect.
693+
continue
694+
}
687695
readiness, found := csr.perNodeGroupReadiness[nodeGroup.Id()]
688696
if !found {
689697
// if MinNodes == 0 node group has been scaled to 0 and everything's fine
@@ -981,6 +989,13 @@ func (csr *ClusterStateRegistry) GetUpcomingNodes() (upcomingCounts map[string]i
981989
registeredNodeNames = map[string][]string{}
982990
for _, nodeGroup := range csr.cloudProvider.NodeGroups() {
983991
id := nodeGroup.Id()
992+
if csr.asyncNodeGroupStateChecker.IsUpcoming(nodeGroup) {
993+
size, err := nodeGroup.TargetSize()
994+
if size >= 0 || err != nil {
995+
upcomingCounts[id] = size
996+
}
997+
continue
998+
}
984999
readiness := csr.perNodeGroupReadiness[id]
9851000
ar := csr.acceptableRanges[id]
9861001
// newNodes is the number of nodes that

0 commit comments

Comments
 (0)