Skip to content

Commit 9226cf6

Browse files
authored
Merge pull request #7145 from abdelrahman882/proactive-scaleup
Add proactive scaleup
2 parents 527de12 + 01e9433 commit 9226cf6

14 files changed

+1558
-0
lines changed

cluster-autoscaler/main.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,9 @@ import (
5959
ca_processors "k8s.io/autoscaler/cluster-autoscaler/processors"
6060
"k8s.io/autoscaler/cluster-autoscaler/processors/nodegroupset"
6161
"k8s.io/autoscaler/cluster-autoscaler/processors/nodeinfosprovider"
62+
"k8s.io/autoscaler/cluster-autoscaler/processors/podinjection"
63+
podinjectionbackoff "k8s.io/autoscaler/cluster-autoscaler/processors/podinjection/backoff"
64+
"k8s.io/autoscaler/cluster-autoscaler/processors/pods"
6265
"k8s.io/autoscaler/cluster-autoscaler/processors/provreq"
6366
"k8s.io/autoscaler/cluster-autoscaler/processors/scaledowncandidates"
6467
"k8s.io/autoscaler/cluster-autoscaler/processors/scaledowncandidates/emptycandidates"
@@ -266,6 +269,8 @@ var (
266269
provisioningRequestsEnabled = flag.Bool("enable-provisioning-requests", false, "Whether the clusterautoscaler will be handling the ProvisioningRequest CRs.")
267270
frequentLoopsEnabled = flag.Bool("frequent-loops-enabled", false, "Whether clusterautoscaler triggers new iterations more frequently when it's needed")
268271
asyncNodeGroupsEnabled = flag.Bool("async-node-groups", false, "Whether clusterautoscaler creates and deletes node groups asynchronously. Experimental: requires cloud provider supporting async node group operations, enable at your own risk.")
272+
proactiveScaleupEnabled = flag.Bool("enable-proactive-scaleup", false, "Whether to enable/disable proactive scale-ups, defaults to false")
273+
podInjectionLimit = flag.Int("pod-injection-limit", 5000, "Limits total number of pods while injecting fake pods. If unschedulable pods already exceeds the limit, pod injection is disabled but pods are not truncated.")
269274
)
270275

271276
func isFlagPassed(name string) bool {
@@ -527,6 +532,20 @@ func buildAutoscaler(debuggingSnapshotter debuggingsnapshot.DebuggingSnapshotter
527532
podListProcessor.AddProcessor(injector)
528533
podListProcessor.AddProcessor(provreqProcesor)
529534
}
535+
536+
if *proactiveScaleupEnabled {
537+
podInjectionBackoffRegistry := podinjectionbackoff.NewFakePodControllerRegistry()
538+
539+
podInjectionPodListProcessor := podinjection.NewPodInjectionPodListProcessor(podInjectionBackoffRegistry)
540+
enforceInjectedPodsLimitProcessor := podinjection.NewEnforceInjectedPodsLimitProcessor(*podInjectionLimit)
541+
542+
podListProcessor = pods.NewCombinedPodListProcessor([]pods.PodListProcessor{podInjectionPodListProcessor, podListProcessor, enforceInjectedPodsLimitProcessor})
543+
544+
// FakePodsScaleUpStatusProcessor processor needs to be the first processor in ScaleUpStatusProcessor as it filters out fake pods from
545+
// Scale Up status so that we don't emit events.
546+
opts.Processors.ScaleUpStatusProcessor = podinjection.NewFakePodsScaleUpStatusProcessor(podInjectionBackoffRegistry)
547+
}
548+
530549
opts.Processors.PodListProcessor = podListProcessor
531550
scaleDownCandidatesComparers := []scaledowncandidates.CandidatesComparer{}
532551
if autoscalingOptions.ParallelDrain {
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
/*
2+
Copyright 2024 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package podinjectionbackoff
18+
19+
import (
20+
"time"
21+
22+
"github.com/cenkalti/backoff/v4"
23+
"k8s.io/apimachinery/pkg/types"
24+
)
25+
26+
const (
	// baseBackoff is the first interval handed out by a controller's
	// exponential backoff (it is used as the backoff's InitialInterval).
	baseBackoff = 5 * time.Minute
	// backoffThreshold is the ceiling a single backoff interval may grow to
	// (it is used as the backoff's MaxInterval).
	backoffThreshold = 30 * time.Minute
)
30+
31+
// controllerEntry describes a backed off controller
32+
type controllerEntry struct {
33+
until time.Time
34+
backoff backoff.ExponentialBackOff
35+
}
36+
37+
// ControllerRegistry contains backed off controllers to be used in time-based backing off of controllers considered in fake pod injection
38+
type ControllerRegistry struct {
39+
backedOffControllers map[types.UID]controllerEntry
40+
}
41+
42+
// NewFakePodControllerRegistry Creates & returns an instance of fakePodControllerBackoffRegistry
43+
func NewFakePodControllerRegistry() *ControllerRegistry {
44+
return &ControllerRegistry{
45+
backedOffControllers: make(map[types.UID]controllerEntry),
46+
}
47+
}
48+
49+
// newExponentialBackOff creates an instance of ExponentialBackOff using non-default values.
50+
func newExponentialBackOff(clock backoff.Clock) backoff.ExponentialBackOff {
51+
b := backoff.ExponentialBackOff{
52+
InitialInterval: baseBackoff,
53+
// Disables randomization for easier testing and better predictability
54+
RandomizationFactor: 0,
55+
Multiplier: backoff.DefaultMultiplier,
56+
MaxInterval: backoffThreshold,
57+
// Disable stopping if it reaches threshold
58+
MaxElapsedTime: 0,
59+
Stop: backoff.Stop,
60+
Clock: clock,
61+
}
62+
b.Reset()
63+
return b
64+
}
65+
66+
// BackoffController Backs off a controller
67+
// If the controller is already in backoff it's backoff time is exponentially increased
68+
// If the controller was in backoff, it resets its entry and makes it in backoff
69+
// If the controller is not in backoff and not stored, a new entry is created
70+
func (r *ControllerRegistry) BackoffController(ownerUID types.UID, now time.Time) {
71+
if ownerUID == "" {
72+
return
73+
}
74+
75+
controller, found := r.backedOffControllers[ownerUID]
76+
77+
if !found || now.After(controller.until) {
78+
controller = controllerEntry{
79+
backoff: newExponentialBackOff(backoff.SystemClock),
80+
}
81+
}
82+
// NextBackOff() needs to be called to increase the next interval
83+
controller.until = now.Add(controller.backoff.NextBackOff())
84+
85+
r.backedOffControllers[ownerUID] = controller
86+
}
87+
88+
// BackOffUntil Returns the back off status a controller with id `uid`
89+
func (r *ControllerRegistry) BackOffUntil(uid types.UID, now time.Time) time.Time {
90+
controller, found := r.backedOffControllers[uid]
91+
92+
if !found {
93+
return time.Time{}
94+
}
95+
96+
return controller.until
97+
}
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
/*
2+
Copyright 2024 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package podinjectionbackoff
18+
19+
import (
20+
"testing"
21+
"time"
22+
23+
"github.com/cenkalti/backoff/v4"
24+
"github.com/stretchr/testify/assert"
25+
"k8s.io/apimachinery/pkg/types"
26+
)
27+
28+
func TestBackoffControllerOfPod(t *testing.T) {
29+
c1 := types.UID("c1")
30+
c2 := types.UID("c2")
31+
clock := &clock{}
32+
33+
testCases := map[string]struct {
34+
backoffCounts map[types.UID]int
35+
spendTime time.Duration
36+
expectedBackedoffControllers map[types.UID]controllerEntry
37+
}{
38+
"backing-off a controller adds its controller UID in backoff correctly": {
39+
backoffCounts: map[types.UID]int{
40+
c1: 1,
41+
},
42+
expectedBackedoffControllers: map[types.UID]controllerEntry{
43+
c1: {
44+
until: clock.now.Add(baseBackoff),
45+
},
46+
},
47+
},
48+
"backing-off an already backed-off controller exponentially increases backoff duration": {
49+
backoffCounts: map[types.UID]int{
50+
c1: 2,
51+
},
52+
expectedBackedoffControllers: map[types.UID]controllerEntry{
53+
c1: {
54+
until: clock.now.Add(time.Duration(float64(baseBackoff) * backoff.DefaultMultiplier)),
55+
},
56+
},
57+
},
58+
"backing-off a controller doesn't affect other controllers": {
59+
backoffCounts: map[types.UID]int{
60+
c1: 1,
61+
c2: 2,
62+
},
63+
expectedBackedoffControllers: map[types.UID]controllerEntry{
64+
c1: {
65+
until: clock.now.Add(baseBackoff),
66+
},
67+
c2: {
68+
until: clock.now.Add(time.Duration(float64(baseBackoff) * backoff.DefaultMultiplier)),
69+
},
70+
},
71+
},
72+
"backing-off a past backed-off controller resets backoff": {
73+
backoffCounts: map[types.UID]int{
74+
c1: 1,
75+
},
76+
spendTime: baseBackoff * 2,
77+
expectedBackedoffControllers: map[types.UID]controllerEntry{
78+
c1: {
79+
until: clock.now.Add(baseBackoff * 2).Add(baseBackoff),
80+
},
81+
},
82+
},
83+
"back-off duration doesn't exceed backoffThreshold": {
84+
backoffCounts: map[types.UID]int{
85+
c1: 15,
86+
},
87+
expectedBackedoffControllers: map[types.UID]controllerEntry{
88+
c1: {
89+
until: clock.now.Add(backoffThreshold),
90+
},
91+
},
92+
},
93+
}
94+
95+
for name, tc := range testCases {
96+
t.Run(name, func(t *testing.T) {
97+
// Reset time between test cases
98+
clock.now = time.Time{}
99+
clock.now = clock.now.Add(tc.spendTime)
100+
101+
registry := NewFakePodControllerRegistry()
102+
103+
for uid, backoffCount := range tc.backoffCounts {
104+
for i := 0; i < backoffCount; i++ {
105+
registry.BackoffController(uid, clock.now)
106+
}
107+
}
108+
109+
assert.Equal(t, len(registry.backedOffControllers), len(tc.expectedBackedoffControllers))
110+
for uid, backoffController := range tc.expectedBackedoffControllers {
111+
assert.NotNil(t, registry.backedOffControllers[uid])
112+
assert.Equal(t, backoffController.until, registry.backedOffControllers[uid].until)
113+
}
114+
})
115+
}
116+
}
117+
118+
// clock is a manually driven time source for tests: it holds a fixed instant
// that only changes when the test assigns a new one.
type clock struct {
	// now is the instant the clock currently stands at; the zero value is the
	// zero time.
	now time.Time
}
121+
122+
func (c *clock) Now() time.Time {
123+
return c.now
124+
}
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/*
2+
Copyright 2024 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package podinjection
18+
19+
import (
20+
apiv1 "k8s.io/api/core/v1"
21+
"k8s.io/autoscaler/cluster-autoscaler/context"
22+
)
23+
24+
// EnforceInjectedPodsLimitProcessor is a PodListProcessor that limits the
// number of injected fake pods in the unschedulable pod list.
type EnforceInjectedPodsLimitProcessor struct {
	// podLimit is the number of pods the processed list is trimmed towards;
	// only fake pods are dropped to get there.
	podLimit int
}
28+
29+
// NewEnforceInjectedPodsLimitProcessor return an instance of EnforceInjectedPodsLimitProcessor
30+
func NewEnforceInjectedPodsLimitProcessor(podLimit int) *EnforceInjectedPodsLimitProcessor {
31+
return &EnforceInjectedPodsLimitProcessor{
32+
podLimit: podLimit,
33+
}
34+
}
35+
36+
// Process filters unschedulablePods and enforces the limit of the number of injected pods
37+
func (p *EnforceInjectedPodsLimitProcessor) Process(ctx *context.AutoscalingContext, unschedulablePods []*apiv1.Pod) ([]*apiv1.Pod, error) {
38+
39+
numberOfFakePodsToRemove := len(unschedulablePods) - p.podLimit
40+
var unschedulablePodsAfterProcessing []*apiv1.Pod
41+
42+
for _, pod := range unschedulablePods {
43+
if IsFake(pod) && numberOfFakePodsToRemove > 0 {
44+
numberOfFakePodsToRemove -= 1
45+
continue
46+
}
47+
48+
unschedulablePodsAfterProcessing = append(unschedulablePodsAfterProcessing, pod)
49+
}
50+
51+
return unschedulablePodsAfterProcessing, nil
52+
}
53+
54+
// CleanUp is called at CA termination
55+
func (p *EnforceInjectedPodsLimitProcessor) CleanUp() {
56+
}

0 commit comments

Comments
 (0)