Skip to content

Commit b7b84de

Browse files
committed
feat(recommender): add OOMMinBumpUp&OOMBumpUpRatio to CRD
Signed-off-by: Omer Aplatony <[email protected]>
1 parent 386c294 commit b7b84de

File tree

8 files changed

+216
-29
lines changed

8 files changed

+216
-29
lines changed

vertical-pod-autoscaler/deploy/vpa-v1-crd-gen.yaml

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ kind: CustomResourceDefinition
44
metadata:
55
annotations:
66
api-approved.kubernetes.io: https://github.com/kubernetes/kubernetes/pull/63797
7-
controller-gen.kubebuilder.io/version: v0.16.5
7+
controller-gen.kubebuilder.io/version: v0.17.2
88
name: verticalpodautoscalercheckpoints.autoscaling.k8s.io
99
spec:
1010
group: autoscaling.k8s.io
@@ -225,7 +225,7 @@ kind: CustomResourceDefinition
225225
metadata:
226226
annotations:
227227
api-approved.kubernetes.io: https://github.com/kubernetes/kubernetes/pull/63797
228-
controller-gen.kubebuilder.io/version: v0.16.5
228+
controller-gen.kubebuilder.io/version: v0.17.2
229229
name: verticalpodautoscalers.autoscaling.k8s.io
230230
spec:
231231
group: autoscaling.k8s.io
@@ -372,6 +372,16 @@ spec:
372372
- Auto
373373
- "Off"
374374
type: string
375+
oomBumpUpRatio:
376+
description: OOMBumpUpRatio is the ratio to increase resources
377+
when OOM is detected.
378+
minimum: 1
379+
type: number
380+
oomMinBumpUp:
381+
description: OOMMinBumpUp is the minimum increase in resources
382+
when OOM is detected.
383+
minimum: 0
384+
type: number
375385
type: object
376386
type: array
377387
type: object

vertical-pod-autoscaler/docs/api.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ _Appears in:_
4848
| `maxAllowed` _[ResourceList](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#resourcelist-v1-core)_ | Specifies the maximum amount of resources that will be recommended<br />for the container. The default is no maximum. | | |
4949
| `controlledResources` _[ResourceName](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#resourcename-v1-core)_ | Specifies the type of recommendations that will be computed<br />(and possibly applied) by VPA.<br />If not specified, the default of [ResourceCPU, ResourceMemory] will be used. | | |
5050
| `controlledValues` _[ContainerControlledValues](#containercontrolledvalues)_ | Specifies which resource values should be controlled.<br />The default is "RequestsAndLimits". | | Enum: [RequestsAndLimits RequestsOnly] <br /> |
51+
| `oomBumpUpRatio` _float_ | OOMBumpUpRatio is the ratio to increase resources when OOM is detected. | | Minimum: 1 <br /> |
52+
| `oomMinBumpUp` _float_ | OOMMinBumpUp is the minimum increase in resources when OOM is detected. | | Minimum: 0 <br /> |
5153

5254

5355
#### ContainerScalingMode

vertical-pod-autoscaler/docs/flags.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,8 @@ This document is auto-generated from the flag definitions in the VPA recommender
9393
| `--metric-for-pod-labels` | "up{job=\"kubernetes-pods\"}" | Which metric to look for pod labels in metrics |
9494
| `--min-checkpoints` | 10 | Minimum number of checkpoints to write per recommender's main loop |
9595
| `--one-output` | | If true, only write logs to their native severity level (vs also writing to each lower severity level; no effect when -logtostderr=true) |
96-
| `--oom-bump-up-ratio` | 1.2 | The memory bump up ratio when OOM occurred, default is 1.2. |
97-
| `--oom-min-bump-up-bytes` | 1.048576e+08 | The minimal increase of memory when OOM occurred in bytes, default is 100 * 1024 * 1024 |
96+
| `--oom-bump-up-ratio` | 1.2 | Default memory bump up ratio when OOM occurs. This value applies to all VPAs unless overridden in the VPA spec. Default is 1.2. |
97+
| `--oom-min-bump-up-bytes` | 1.048576e+08 | Default minimal increase of memory (in bytes) when OOM occurs. This value applies to all VPAs unless overridden in the VPA spec. Default is 100 * 1024 * 1024 (100Mi). |
9898
| `--password` | | The password used in the prometheus server basic auth |
9999
| `--pod-label-prefix` | "pod_label_" | Which prefix to look for pod labels in metrics |
100100
| `--pod-name-label` | "kubernetes_pod_name" | Label name to look for pod names |

vertical-pod-autoscaler/e2e/v1/admission_controller.go

Lines changed: 143 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -831,26 +831,149 @@ var _ = AdmissionControllerE2eDescribe("Admission-controller", func() {
831831
err := InstallRawVPA(f, validVPA)
832832
gomega.Expect(err).NotTo(gomega.HaveOccurred(), "Valid VPA object rejected")
833833

834-
ginkgo.By("Setting up invalid VPA object")
835-
// The invalid object differs by name and minAllowed - there is an invalid "requests" field.
836-
invalidVPA := []byte(`{
837-
"kind": "VerticalPodAutoscaler",
838-
"apiVersion": "autoscaling.k8s.io/v1",
839-
"metadata": {"name": "hamster-vpa-invalid"},
840-
"spec": {
841-
"targetRef": {
842-
"apiVersion": "apps/v1",
843-
"kind": "Deployment",
844-
"name":"hamster"
845-
},
846-
"resourcePolicy": {
847-
"containerPolicies": [{"containerName": "*", "minAllowed":{"requests":{"cpu":"50m"}}}]
848-
}
849-
}
850-
}`)
851-
err2 := InstallRawVPA(f, invalidVPA)
852-
gomega.Expect(err2).To(gomega.HaveOccurred(), "Invalid VPA object accepted")
853-
gomega.Expect(err2.Error()).To(gomega.MatchRegexp(`.*admission webhook .*vpa.* denied the request: .*`))
834+
ginkgo.By("Setting up invalid VPA objects")
835+
testCases := []struct {
836+
name string
837+
vpaJSON string
838+
expectedErr string
839+
}{
840+
{
841+
name: "Invalid oomBumpUpRatio (negative value)",
842+
vpaJSON: `{
843+
"apiVersion": "autoscaling.k8s.io/v1",
844+
"kind": "VerticalPodAutoscaler",
845+
"metadata": {"name": "oom-test-vpa"},
846+
"spec": {
847+
"targetRef": {
848+
"apiVersion": "apps/v1",
849+
"kind": "Deployment",
850+
"name": "oom-test"
851+
},
852+
"updatePolicy": {
853+
"updateMode": "Auto"
854+
},
855+
"resourcePolicy": {
856+
"containerPolicies": [{
857+
"containerName": "*",
858+
"oomBumpUpRatio": -1,
859+
"oomMinBumpUp": 104857600
860+
}]
861+
}
862+
}
863+
}`,
864+
expectedErr: "spec.resourcePolicy.containerPolicies[0].oomBumpUpRatio: Invalid value: -1: spec.resourcePolicy.containerPolicies[0].oomBumpUpRatio in body should be greater than or equal to 1",
865+
},
866+
{
867+
name: "Invalid oomBumpUpRatio (string value)",
868+
vpaJSON: `{
869+
"apiVersion": "autoscaling.k8s.io/v1",
870+
"kind": "VerticalPodAutoscaler",
871+
"metadata": {"name": "oom-test-vpa"},
872+
"spec": {
873+
"targetRef": {
874+
"apiVersion": "apps/v1",
875+
"kind": "Deployment",
876+
"name": "oom-test"
877+
},
878+
"updatePolicy": {
879+
"updateMode": "Auto"
880+
},
881+
"resourcePolicy": {
882+
"containerPolicies": [{
883+
"containerName": "*",
884+
"oomBumpUpRatio": "12",
885+
"oomMinBumpUp": 104857600
886+
}]
887+
}
888+
}
889+
}`,
890+
expectedErr: "json: cannot unmarshal string into Go struct field ContainerResourcePolicy.spec.resourcePolicy.containerPolicies.oomBumpUpRatio of type float64",
891+
},
892+
{
893+
name: "Invalid oomBumpUpRatio (less than 1)",
894+
vpaJSON: `{
895+
"apiVersion": "autoscaling.k8s.io/v1",
896+
"kind": "VerticalPodAutoscaler",
897+
"metadata": {"name": "oom-test-vpa"},
898+
"spec": {
899+
"targetRef": {
900+
"apiVersion": "apps/v1",
901+
"kind": "Deployment",
902+
"name": "oom-test"
903+
},
904+
"updatePolicy": {
905+
"updateMode": "Auto"
906+
},
907+
"resourcePolicy": {
908+
"containerPolicies": [{
909+
"containerName": "*",
910+
"oomBumpUpRatio": 0.5,
911+
"oomMinBumpUp": 104857600
912+
}]
913+
}
914+
}
915+
}`,
916+
expectedErr: "spec.resourcePolicy.containerPolicies[0].oomBumpUpRatio: Invalid value: 0.5: spec.resourcePolicy.containerPolicies[0].oomBumpUpRatio in body should be greater than or equal to 1",
917+
},
918+
{
919+
name: "Invalid oomMinBumpUp (negative value)",
920+
vpaJSON: `{
921+
"apiVersion": "autoscaling.k8s.io/v1",
922+
"kind": "VerticalPodAutoscaler",
923+
"metadata": {"name": "oom-test-vpa"},
924+
"spec": {
925+
"targetRef": {
926+
"apiVersion": "apps/v1",
927+
"kind": "Deployment",
928+
"name": "oom-test"
929+
},
930+
"updatePolicy": {
931+
"updateMode": "Auto"
932+
},
933+
"resourcePolicy": {
934+
"containerPolicies": [{
935+
"containerName": "*",
936+
"oomBumpUpRatio": 2,
937+
"oomMinBumpUp": -1
938+
}]
939+
}
940+
}
941+
}`,
942+
expectedErr: "spec.resourcePolicy.containerPolicies[0].oomMinBumpUp: Invalid value: -1: spec.resourcePolicy.containerPolicies[0].oomMinBumpUp in body should be greater than or equal to 0",
943+
},
944+
{
945+
name: "Invalid minAllowed (invalid requests field)",
946+
vpaJSON: `{
947+
"apiVersion": "autoscaling.k8s.io/v1",
948+
"kind": "VerticalPodAutoscaler",
949+
"metadata": {"name": "hamster-vpa-invalid"},
950+
"spec": {
951+
"targetRef": {
952+
"apiVersion": "apps/v1",
953+
"kind": "Deployment",
954+
"name": "hamster"
955+
},
956+
"resourcePolicy": {
957+
"containerPolicies": [{
958+
"containerName": "*",
959+
"minAllowed": {
960+
"requests": {
961+
"cpu": "50m"
962+
}
963+
}
964+
}]
965+
}
966+
}
967+
}`,
968+
expectedErr: "admission webhook .*vpa.* denied the request:",
969+
},
970+
}
971+
for _, tc := range testCases {
972+
ginkgo.By(fmt.Sprintf("Testing %s", tc.name))
973+
err := InstallRawVPA(f, []byte(tc.vpaJSON))
974+
gomega.Expect(err).To(gomega.HaveOccurred(), "Invalid VPA object accepted")
975+
gomega.Expect(err.Error()).To(gomega.MatchRegexp(tc.expectedErr))
976+
}
854977
})
855978

856979
ginkgo.It("reloads the webhook leaf and CA certificate", func(ctx ginkgo.SpecContext) {

vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1/types.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,16 @@ type ContainerResourcePolicy struct {
214214
// The default is "RequestsAndLimits".
215215
// +optional
216216
ControlledValues *ContainerControlledValues `json:"controlledValues,omitempty" protobuf:"bytes,6,rep,name=controlledValues"`
217+
218+
// OOMBumpUpRatio is the ratio to increase resources when OOM is detected.
219+
// +kubebuilder:validation:Minimum=1.0
220+
// +optional
221+
OOMBumpUpRatio *float64 `json:"oomBumpUpRatio,omitempty" protobuf:"bytes,1,opt,name=oomBumpUpRatio"`
222+
223+
// OOMMinBumpUp is the minimum increase in resources when OOM is detected.
224+
// +kubebuilder:validation:Minimum=0
225+
// +optional
226+
OOMMinBumpUp *float64 `json:"oomMinBumpUp,omitempty" protobuf:"bytes,2,opt,name=oomMinBumpUp"`
217227
}
218228

219229
const (

vertical-pod-autoscaler/pkg/recommender/main.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,8 @@ var (
9696
memoryAggregationIntervalCount = flag.Int64("memory-aggregation-interval-count", model.DefaultMemoryAggregationIntervalCount, `The number of consecutive memory-aggregation-intervals which make up the MemoryAggregationWindowLength which in turn is the period for memory usage aggregation by VPA. In other words, MemoryAggregationWindowLength = memory-aggregation-interval * memory-aggregation-interval-count.`)
9797
memoryHistogramDecayHalfLife = flag.Duration("memory-histogram-decay-half-life", model.DefaultMemoryHistogramDecayHalfLife, `The amount of time it takes a historical memory usage sample to lose half of its weight. In other words, a fresh usage sample is twice as 'important' as one with age equal to the half life period.`)
9898
cpuHistogramDecayHalfLife = flag.Duration("cpu-histogram-decay-half-life", model.DefaultCPUHistogramDecayHalfLife, `The amount of time it takes a historical CPU usage sample to lose half of its weight.`)
99-
oomBumpUpRatio = flag.Float64("oom-bump-up-ratio", model.DefaultOOMBumpUpRatio, `The memory bump up ratio when OOM occurred, default is 1.2.`)
100-
oomMinBumpUp = flag.Float64("oom-min-bump-up-bytes", model.DefaultOOMMinBumpUp, `The minimal increase of memory when OOM occurred in bytes, default is 100 * 1024 * 1024`)
99+
oomBumpUpRatio = flag.Float64("oom-bump-up-ratio", model.DefaultOOMBumpUpRatio, `Default memory bump up ratio when OOM occurs. This value applies to all VPAs unless overridden in the VPA spec. Default is 1.2.`)
100+
oomMinBumpUp = flag.Float64("oom-min-bump-up-bytes", model.DefaultOOMMinBumpUp, `Default minimal increase of memory (in bytes) when OOM occurs. This value applies to all VPAs unless overridden in the VPA spec. Default is 100 * 1024 * 1024 (100Mi).`)
101101
)
102102

103103
// Post processors flags

vertical-pod-autoscaler/pkg/recommender/model/aggregate_container_state.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,10 @@ type ContainerStateAggregator interface {
8181
// GetUpdateMode returns the update mode of VPA controlling this aggregator,
8282
// nil if aggregator is not autoscaled.
8383
GetUpdateMode() *vpa_types.UpdateMode
84+
// GetOomBumpUpRatio returns the OOM bump up ratio for this container
85+
GetOomBumpUpRatio() float64
86+
// GetOOMMinBumpUp returns the minimum OOM bump up value for this container
87+
GetOOMMinBumpUp() float64
8488
}
8589

8690
// AggregateContainerState holds input signals aggregated from a set of containers.
@@ -109,6 +113,8 @@ type AggregateContainerState struct {
109113
IsUnderVPA bool
110114
UpdateMode *vpa_types.UpdateMode
111115
ScalingMode *vpa_types.ContainerScalingMode
116+
OomBumpUpRatio float64
117+
OOMMinBumpUp float64
112118
ControlledResources *[]ResourceName
113119
}
114120

@@ -143,6 +149,16 @@ func (a *AggregateContainerState) GetControlledResources() []ResourceName {
143149
return DefaultControlledResources
144150
}
145151

152+
// GetOomBumpUpRatio returns the ratio by which to increase the memory recommendation in case of OOM
153+
func (a *AggregateContainerState) GetOomBumpUpRatio() float64 {
154+
return a.OomBumpUpRatio
155+
}
156+
157+
// GetOOMMinBumpUp returns the minimum absolute increase in memory recommendation in case of OOM
158+
func (a *AggregateContainerState) GetOOMMinBumpUp() float64 {
159+
return a.OOMMinBumpUp
160+
}
161+
146162
// MarkNotAutoscaled registers that this container state is not controlled by
147163
// a VPA object.
148164
func (a *AggregateContainerState) MarkNotAutoscaled() {
@@ -175,6 +191,8 @@ func NewAggregateContainerState() *AggregateContainerState {
175191
AggregateCPUUsage: util.NewDecayingHistogram(config.CPUHistogramOptions, config.CPUHistogramDecayHalfLife),
176192
AggregateMemoryPeaks: util.NewDecayingHistogram(config.MemoryHistogramOptions, config.MemoryHistogramDecayHalfLife),
177193
CreationTime: time.Now(),
194+
OomBumpUpRatio: config.OOMBumpUpRatio,
195+
OOMMinBumpUp: config.OOMMinBumpUp,
178196
}
179197
}
180198

@@ -276,6 +294,12 @@ func (a *AggregateContainerState) UpdateFromPolicy(resourcePolicy *vpa_types.Con
276294
// ContainerScalingModeAuto is the default scaling mode
277295
scalingModeAuto := vpa_types.ContainerScalingModeAuto
278296
a.ScalingMode = &scalingModeAuto
297+
if resourcePolicy != nil && resourcePolicy.OOMBumpUpRatio != nil {
298+
a.OomBumpUpRatio = *resourcePolicy.OOMBumpUpRatio
299+
}
300+
if resourcePolicy != nil && resourcePolicy.OOMMinBumpUp != nil {
301+
a.OOMMinBumpUp = *resourcePolicy.OOMMinBumpUp
302+
}
279303
if resourcePolicy != nil && resourcePolicy.Mode != nil {
280304
a.ScalingMode = resourcePolicy.Mode
281305
}
@@ -351,3 +375,11 @@ func (p *ContainerStateAggregatorProxy) GetScalingMode() *vpa_types.ContainerSca
351375
aggregator := p.cluster.findOrCreateAggregateContainerState(p.containerID)
352376
return aggregator.GetScalingMode()
353377
}
378+
379+
func (p *ContainerStateAggregatorProxy) GetOOMMinBumpUp() float64 {
380+
return 0
381+
}
382+
383+
func (p *ContainerStateAggregatorProxy) GetOomBumpUpRatio() float64 {
384+
return 0
385+
}

vertical-pod-autoscaler/pkg/recommender/model/container.go

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,14 @@ func (container *ContainerState) GetMaxMemoryPeak() ResourceAmount {
125125
return ResourceAmountMax(container.memoryPeak, container.oomPeak)
126126
}
127127

128+
func (container *ContainerState) GetOomBumpUpRatio() float64 {
129+
return container.aggregator.GetOomBumpUpRatio()
130+
}
131+
132+
func (container *ContainerState) GetOOMMinBumpUp() float64 {
133+
return container.aggregator.GetOOMMinBumpUp()
134+
}
135+
128136
func (container *ContainerState) addMemorySample(sample *ContainerUsageSample, isOOM bool) bool {
129137
ts := sample.MeasureStart
130138
// We always process OOM samples.
@@ -183,14 +191,16 @@ func (container *ContainerState) addMemorySample(sample *ContainerUsageSample, i
183191
// RecordOOM adds info regarding OOM event in the model as an artificial memory sample.
184192
func (container *ContainerState) RecordOOM(timestamp time.Time, requestedMemory ResourceAmount) error {
185193
// Discard old OOM
186-
if timestamp.Before(container.WindowEnd.Add(-1 * GetAggregationsConfig().MemoryAggregationInterval)) {
194+
config := GetAggregationsConfig()
195+
// TODO(omerap12): remove MemoryAggregationInterval to per-container configuration as well
196+
if timestamp.Before(container.WindowEnd.Add(-1 * config.MemoryAggregationInterval)) {
187197
return fmt.Errorf("OOM event will be discarded - it is too old (%v)", timestamp)
188198
}
189199
// Get max of the request and the recent usage-based memory peak.
190200
// Omitting oomPeak here to protect against recommendation running too high on subsequent OOMs.
191201
memoryUsed := ResourceAmountMax(requestedMemory, container.memoryPeak)
192-
memoryNeeded := ResourceAmountMax(memoryUsed+MemoryAmountFromBytes(GetAggregationsConfig().OOMMinBumpUp),
193-
ScaleResource(memoryUsed, GetAggregationsConfig().OOMBumpUpRatio))
202+
memoryNeeded := ResourceAmountMax(memoryUsed+MemoryAmountFromBytes(container.GetOOMMinBumpUp()),
203+
ScaleResource(memoryUsed, container.GetOomBumpUpRatio()))
194204

195205
oomMemorySample := ContainerUsageSample{
196206
MeasureStart: timestamp,

0 commit comments

Comments
 (0)