feat(recommender): add OOMMinBumpUp&OOMBumpUpRatio to CRD

omerap12 · omerap12 · commit b7b84debfce7 · 2025-04-07T16:54:32.000Z
Signed-off-by: Omer Aplatony <omerap12@gmail.com>
diff --git a/vertical-pod-autoscaler/deploy/vpa-v1-crd-gen.yaml b/vertical-pod-autoscaler/deploy/vpa-v1-crd-gen.yaml
@@ -4,7 +4,7 @@ kind: CustomResourceDefinition
 metadata:
   annotations:
     api-approved.kubernetes.io: https://github.com/kubernetes/kubernetes/pull/63797
-    controller-gen.kubebuilder.io/version: v0.16.5
+    controller-gen.kubebuilder.io/version: v0.17.2
   name: verticalpodautoscalercheckpoints.autoscaling.k8s.io
 spec:
   group: autoscaling.k8s.io
@@ -225,7 +225,7 @@ kind: CustomResourceDefinition
 metadata:
   annotations:
     api-approved.kubernetes.io: https://github.com/kubernetes/kubernetes/pull/63797
-    controller-gen.kubebuilder.io/version: v0.16.5
+    controller-gen.kubebuilder.io/version: v0.17.2
   name: verticalpodautoscalers.autoscaling.k8s.io
 spec:
   group: autoscaling.k8s.io
@@ -372,6 +372,16 @@ spec:
                           - Auto
                           - "Off"
                           type: string
+                        oomBumpUpRatio:
+                          description: OOMBumpUpRatio is the ratio to increase resources
+                            when OOM is detected.
+                          minimum: 1
+                          type: number
+                        oomMinBumpUp:
+                          description: OOMMinBumpUp is the minimum increase in resources
+                            when OOM is detected.
+                          minimum: 0
+                          type: number
                       type: object
                     type: array
                 type: object
diff --git a/vertical-pod-autoscaler/docs/api.md b/vertical-pod-autoscaler/docs/api.md
@@ -48,6 +48,8 @@ _Appears in:_
 | `maxAllowed` _[ResourceList](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#resourcelist-v1-core)_ | Specifies the maximum amount of resources that will be recommended<br />for the container. The default is no maximum. |  |  |
 | `controlledResources` _[ResourceName](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#resourcename-v1-core)_ | Specifies the type of recommendations that will be computed<br />(and possibly applied) by VPA.<br />If not specified, the default of [ResourceCPU, ResourceMemory] will be used. |  |  |
 | `controlledValues` _[ContainerControlledValues](#containercontrolledvalues)_ | Specifies which resource values should be controlled.<br />The default is "RequestsAndLimits". |  | Enum: [RequestsAndLimits RequestsOnly] <br /> |
+| `oomBumpUpRatio` _float_ | OOMBumpUpRatio is the ratio to increase resources when OOM is detected. |  | Minimum: 1 <br /> |
+| `oomMinBumpUp` _float_ | OOMMinBumpUp is the minimum increase in resources when OOM is detected. |  | Minimum: 0 <br /> |
 
 
 #### ContainerScalingMode
diff --git a/vertical-pod-autoscaler/docs/flags.md b/vertical-pod-autoscaler/docs/flags.md
@@ -93,8 +93,8 @@ This document is auto-generated from the flag definitions in the VPA recommender
 | `--metric-for-pod-labels` | "up{job=\"kubernetes-pods\"}" |                           Which metric to look for pod labels in metrics |
 | `--min-checkpoints` | 10 |                                    Minimum number of checkpoints to write per recommender's main loop |
 | `--one-output` |  |                                             If true, only write logs to their native severity level (vs also writing to each lower severity level; no effect when -logtostderr=true) |
-| `--oom-bump-up-ratio` | 1.2 |                                The memory bump up ratio when OOM occurred, default is 1.2. |
-| `--oom-min-bump-up-bytes` | 1.048576e+08 |                            The minimal increase of memory when OOM occurred in bytes, default is 100 * 1024 * 1024 |
+| `--oom-bump-up-ratio` | 1.2 |                                Default memory bump up ratio when OOM occurs. This value applies to all VPAs unless overridden in the VPA spec. Default is 1.2. |
+| `--oom-min-bump-up-bytes` | 1.048576e+08 |                            Default minimal increase of memory (in bytes) when OOM occurs. This value applies to all VPAs unless overridden in the VPA spec. Default is 100 * 1024 * 1024 (100Mi). |
 | `--password` |  |                                        The password used in the prometheus server basic auth |
 | `--pod-label-prefix` | "pod_label_" |                                Which prefix to look for pod labels in metrics |
 | `--pod-name-label` | "kubernetes_pod_name" |                                  Label name to look for pod names |
diff --git a/vertical-pod-autoscaler/e2e/v1/admission_controller.go b/vertical-pod-autoscaler/e2e/v1/admission_controller.go
@@ -831,26 +831,149 @@ var _ = AdmissionControllerE2eDescribe("Admission-controller", func() {
 		err := InstallRawVPA(f, validVPA)
 		gomega.Expect(err).NotTo(gomega.HaveOccurred(), "Valid VPA object rejected")
 
-		ginkgo.By("Setting up invalid VPA object")
-		// The invalid object differs by name and minAllowed - there is an invalid "requests" field.
-		invalidVPA := []byte(`{
-			"kind": "VerticalPodAutoscaler",
-			"apiVersion": "autoscaling.k8s.io/v1",
-			"metadata": {"name": "hamster-vpa-invalid"},
-			"spec": {
-				"targetRef": {
-					"apiVersion": "apps/v1",
-					"kind": "Deployment",
-					"name":"hamster"
-				},
-		   	"resourcePolicy": {
-		  		"containerPolicies": [{"containerName": "*", "minAllowed":{"requests":{"cpu":"50m"}}}]
-		  	}
-		  }
-		}`)
-		err2 := InstallRawVPA(f, invalidVPA)
-		gomega.Expect(err2).To(gomega.HaveOccurred(), "Invalid VPA object accepted")
-		gomega.Expect(err2.Error()).To(gomega.MatchRegexp(`.*admission webhook .*vpa.* denied the request: .*`))
+		ginkgo.By("Setting up invalid VPA objects")
+		testCases := []struct {
+			name        string
+			vpaJSON     string
+			expectedErr string // regular expression matched against the returned error (see MatchRegexp below)
+		}{
+			{
+				name: "Invalid oomBumpUpRatio (negative value)",
+				vpaJSON: `{
+            "apiVersion": "autoscaling.k8s.io/v1",
+            "kind": "VerticalPodAutoscaler",
+            "metadata": {"name": "oom-test-vpa"},
+            "spec": {
+                "targetRef": {
+                    "apiVersion": "apps/v1",
+                    "kind": "Deployment",
+                    "name": "oom-test"
+                },
+                "updatePolicy": {
+                    "updateMode": "Auto"
+                },
+                "resourcePolicy": {
+                    "containerPolicies": [{
+                        "containerName": "*",
+                        "oomBumpUpRatio": -1,
+                        "oomMinBumpUp": 104857600
+                    }]
+                }
+            }
+        }`,
+				expectedErr: "spec.resourcePolicy.containerPolicies\\[0\\].oomBumpUpRatio: Invalid value: -1: spec.resourcePolicy.containerPolicies\\[0\\].oomBumpUpRatio in body should be greater than or equal to 1",
+			},
+			{
+				name: "Invalid oomBumpUpRatio (string value)",
+				vpaJSON: `{
+            "apiVersion": "autoscaling.k8s.io/v1",
+            "kind": "VerticalPodAutoscaler",
+            "metadata": {"name": "oom-test-vpa"},
+            "spec": {
+                "targetRef": {
+                    "apiVersion": "apps/v1",
+                    "kind": "Deployment",
+                    "name": "oom-test"
+                },
+                "updatePolicy": {
+                    "updateMode": "Auto"
+                },
+                "resourcePolicy": {
+                    "containerPolicies": [{
+                        "containerName": "*",
+                        "oomBumpUpRatio": "12",
+                        "oomMinBumpUp": 104857600
+                    }]
+                }
+            }
+        }`,
+				expectedErr: "json: cannot unmarshal string into Go struct field ContainerResourcePolicy.spec.resourcePolicy.containerPolicies.oomBumpUpRatio of type float64",
+			},
+			{
+				name: "Invalid oomBumpUpRatio (less than 1)",
+				vpaJSON: `{
+            "apiVersion": "autoscaling.k8s.io/v1",
+            "kind": "VerticalPodAutoscaler",
+            "metadata": {"name": "oom-test-vpa"},
+            "spec": {
+                "targetRef": {
+                    "apiVersion": "apps/v1",
+                    "kind": "Deployment",
+                    "name": "oom-test"
+                },
+                "updatePolicy": {
+                    "updateMode": "Auto"
+                },
+                "resourcePolicy": {
+                    "containerPolicies": [{
+                        "containerName": "*",
+                        "oomBumpUpRatio": 0.5,
+                        "oomMinBumpUp": 104857600
+                    }]
+                }
+            }
+        }`,
+				expectedErr: "spec.resourcePolicy.containerPolicies\\[0\\].oomBumpUpRatio: Invalid value: 0.5: spec.resourcePolicy.containerPolicies\\[0\\].oomBumpUpRatio in body should be greater than or equal to 1",
+			},
+			{
+				name: "Invalid oomMinBumpUp (negative value)",
+				vpaJSON: `{
+            "apiVersion": "autoscaling.k8s.io/v1",
+            "kind": "VerticalPodAutoscaler",
+            "metadata": {"name": "oom-test-vpa"},
+            "spec": {
+                "targetRef": {
+                    "apiVersion": "apps/v1",
+                    "kind": "Deployment",
+                    "name": "oom-test"
+                },
+                "updatePolicy": {
+                    "updateMode": "Auto"
+                },
+                "resourcePolicy": {
+                    "containerPolicies": [{
+                        "containerName": "*",
+                        "oomBumpUpRatio": 2,
+                        "oomMinBumpUp": -1
+                    }]
+                }
+            }
+        }`,
+				expectedErr: "spec.resourcePolicy.containerPolicies\\[0\\].oomMinBumpUp: Invalid value: -1: spec.resourcePolicy.containerPolicies\\[0\\].oomMinBumpUp in body should be greater than or equal to 0",
+			},
+			{
+				name: "Invalid minAllowed (invalid requests field)",
+				vpaJSON: `{
+            "apiVersion": "autoscaling.k8s.io/v1",
+            "kind": "VerticalPodAutoscaler",
+            "metadata": {"name": "hamster-vpa-invalid"},
+            "spec": {
+                "targetRef": {
+                    "apiVersion": "apps/v1",
+                    "kind": "Deployment",
+                    "name": "hamster"
+                },
+                "resourcePolicy": {
+                    "containerPolicies": [{
+                        "containerName": "*",
+                        "minAllowed": {
+                            "requests": {
+                                "cpu": "50m"
+                            }
+                        }
+                    }]
+                }
+            }
+        }`,
+				expectedErr: "admission webhook .*vpa.* denied the request:",
+			},
+		}
+		for _, tc := range testCases {
+			ginkgo.By(fmt.Sprintf("Testing %s", tc.name))
+			err := InstallRawVPA(f, []byte(tc.vpaJSON))
+			gomega.Expect(err).To(gomega.HaveOccurred(), "Invalid VPA object accepted")
+			gomega.Expect(err.Error()).To(gomega.MatchRegexp(tc.expectedErr))
+		}
 	})
 
 	ginkgo.It("reloads the webhook leaf and CA certificate", func(ctx ginkgo.SpecContext) {
diff --git a/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1/types.go b/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1/types.go
@@ -214,6 +214,16 @@ type ContainerResourcePolicy struct {
 	// The default is "RequestsAndLimits".
 	// +optional
 	ControlledValues *ContainerControlledValues `json:"controlledValues,omitempty" protobuf:"bytes,6,rep,name=controlledValues"`
+
+	// OOMBumpUpRatio is the ratio to increase resources when OOM is detected.
+	// +kubebuilder:validation:Minimum=1.0
+	// +optional
+	OOMBumpUpRatio *float64 `json:"oomBumpUpRatio,omitempty" protobuf:"bytes,1,opt,name=oomBumpUpRatio"`
+
+	// OOMMinBumpUp is the minimum increase in resources when OOM is detected.
+	// +kubebuilder:validation:Minimum=0
+	// +optional
+	OOMMinBumpUp *float64 `json:"oomMinBumpUp,omitempty" protobuf:"bytes,2,opt,name=oomMinBumpUp"`
 }
 
 const (
diff --git a/vertical-pod-autoscaler/pkg/recommender/main.go b/vertical-pod-autoscaler/pkg/recommender/main.go
@@ -96,8 +96,8 @@ var (
 	memoryAggregationIntervalCount = flag.Int64("memory-aggregation-interval-count", model.DefaultMemoryAggregationIntervalCount, `The number of consecutive memory-aggregation-intervals which make up the MemoryAggregationWindowLength which in turn is the period for memory usage aggregation by VPA. In other words, MemoryAggregationWindowLength = memory-aggregation-interval * memory-aggregation-interval-count.`)
 	memoryHistogramDecayHalfLife   = flag.Duration("memory-histogram-decay-half-life", model.DefaultMemoryHistogramDecayHalfLife, `The amount of time it takes a historical memory usage sample to lose half of its weight. In other words, a fresh usage sample is twice as 'important' as one with age equal to the half life period.`)
 	cpuHistogramDecayHalfLife      = flag.Duration("cpu-histogram-decay-half-life", model.DefaultCPUHistogramDecayHalfLife, `The amount of time it takes a historical CPU usage sample to lose half of its weight.`)
-	oomBumpUpRatio                 = flag.Float64("oom-bump-up-ratio", model.DefaultOOMBumpUpRatio, `The memory bump up ratio when OOM occurred, default is 1.2.`)
-	oomMinBumpUp                   = flag.Float64("oom-min-bump-up-bytes", model.DefaultOOMMinBumpUp, `The minimal increase of memory when OOM occurred in bytes, default is 100 * 1024 * 1024`)
+	oomBumpUpRatio                 = flag.Float64("oom-bump-up-ratio", model.DefaultOOMBumpUpRatio, `Default memory bump up ratio when OOM occurs. This value applies to all VPAs unless overridden in the VPA spec. Default is 1.2.`)
+	oomMinBumpUp                   = flag.Float64("oom-min-bump-up-bytes", model.DefaultOOMMinBumpUp, `Default minimal increase of memory (in bytes) when OOM occurs. This value applies to all VPAs unless overridden in the VPA spec. Default is 100 * 1024 * 1024 (100Mi).`)
 )
 
 // Post processors flags
diff --git a/vertical-pod-autoscaler/pkg/recommender/model/aggregate_container_state.go b/vertical-pod-autoscaler/pkg/recommender/model/aggregate_container_state.go
@@ -81,6 +81,10 @@ type ContainerStateAggregator interface {
 	// GetUpdateMode returns the update mode of VPA controlling this aggregator,
 	// nil if aggregator is not autoscaled.
 	GetUpdateMode() *vpa_types.UpdateMode
+	// GetOomBumpUpRatio returns the OOM bump up ratio for this container
+	GetOomBumpUpRatio() float64
+	// GetOOMMinBumpUp returns the minimum OOM bump up value for this container
+	GetOOMMinBumpUp() float64
 }
 
 // AggregateContainerState holds input signals aggregated from a set of containers.
@@ -109,6 +113,8 @@ type AggregateContainerState struct {
 	IsUnderVPA          bool
 	UpdateMode          *vpa_types.UpdateMode
 	ScalingMode         *vpa_types.ContainerScalingMode
+	OomBumpUpRatio      float64
+	OOMMinBumpUp        float64
 	ControlledResources *[]ResourceName
 }
 
@@ -143,6 +149,16 @@ func (a *AggregateContainerState) GetControlledResources() []ResourceName {
 	return DefaultControlledResources
 }
 
+// GetOomBumpUpRatio returns the OOM bump-up ratio held by this aggregate: config.OOMBumpUpRatio as set by NewAggregateContainerState, unless UpdateFromPolicy replaced it with the policy's OOMBumpUpRatio.
+func (a *AggregateContainerState) GetOomBumpUpRatio() float64 {
+	return a.OomBumpUpRatio
+}
+
+// GetOOMMinBumpUp returns the minimum OOM bump-up held by this aggregate: config.OOMMinBumpUp as set by NewAggregateContainerState, unless UpdateFromPolicy replaced it with the policy's OOMMinBumpUp.
+func (a *AggregateContainerState) GetOOMMinBumpUp() float64 {
+	return a.OOMMinBumpUp
+}
+
 // MarkNotAutoscaled registers that this container state is not controlled by
 // a VPA object.
 func (a *AggregateContainerState) MarkNotAutoscaled() {
@@ -175,6 +191,8 @@ func NewAggregateContainerState() *AggregateContainerState {
 		AggregateCPUUsage:    util.NewDecayingHistogram(config.CPUHistogramOptions, config.CPUHistogramDecayHalfLife),
 		AggregateMemoryPeaks: util.NewDecayingHistogram(config.MemoryHistogramOptions, config.MemoryHistogramDecayHalfLife),
 		CreationTime:         time.Now(),
+		OomBumpUpRatio:       config.OOMBumpUpRatio,
+		OOMMinBumpUp:         config.OOMMinBumpUp,
 	}
 }
 
@@ -276,6 +294,12 @@ func (a *AggregateContainerState) UpdateFromPolicy(resourcePolicy *vpa_types.Con
 	// ContainerScalingModeAuto is the default scaling mode
 	scalingModeAuto := vpa_types.ContainerScalingModeAuto
 	a.ScalingMode = &scalingModeAuto
+	if resourcePolicy != nil && resourcePolicy.OOMBumpUpRatio != nil {
+		a.OomBumpUpRatio = *resourcePolicy.OOMBumpUpRatio
+	}
+	if resourcePolicy != nil && resourcePolicy.OOMMinBumpUp != nil {
+		a.OOMMinBumpUp = *resourcePolicy.OOMMinBumpUp
+	}
 	if resourcePolicy != nil && resourcePolicy.Mode != nil {
 		a.ScalingMode = resourcePolicy.Mode
 	}
@@ -351,3 +375,11 @@ func (p *ContainerStateAggregatorProxy) GetScalingMode() *vpa_types.ContainerSca
 	aggregator := p.cluster.findOrCreateAggregateContainerState(p.containerID)
 	return aggregator.GetScalingMode()
 }
+
+// GetOOMMinBumpUp returns the minimum OOM bump-up of the aggregate container
+// state that this proxy resolves to for its container ID.
+func (p *ContainerStateAggregatorProxy) GetOOMMinBumpUp() float64 {
+	// Delegate to the live aggregate, as GetScalingMode does, so callers see the
+	// configured default or the per-policy override instead of a constant.
+	aggregator := p.cluster.findOrCreateAggregateContainerState(p.containerID)
+	return aggregator.GetOOMMinBumpUp()
+}
+
+// GetOomBumpUpRatio returns the OOM bump-up ratio of the aggregate container
+// state that this proxy resolves to for its container ID.
+func (p *ContainerStateAggregatorProxy) GetOomBumpUpRatio() float64 {
+	// Delegate to the live aggregate, as GetScalingMode does, so callers see the
+	// configured default or the per-policy override instead of a constant.
+	aggregator := p.cluster.findOrCreateAggregateContainerState(p.containerID)
+	return aggregator.GetOomBumpUpRatio()
+}
diff --git a/vertical-pod-autoscaler/pkg/recommender/model/container.go b/vertical-pod-autoscaler/pkg/recommender/model/container.go
@@ -125,6 +125,14 @@ func (container *ContainerState) GetMaxMemoryPeak() ResourceAmount {
 	return ResourceAmountMax(container.memoryPeak, container.oomPeak)
 }
 
+// GetOomBumpUpRatio returns the OOM bump-up ratio reported by this container's aggregator.
+func (container *ContainerState) GetOomBumpUpRatio() float64 {
+	aggregator := container.aggregator
+	return aggregator.GetOomBumpUpRatio()
+}
+
+// GetOOMMinBumpUp returns the minimum OOM bump-up reported by this container's aggregator.
+func (container *ContainerState) GetOOMMinBumpUp() float64 {
+	aggregator := container.aggregator
+	return aggregator.GetOOMMinBumpUp()
+}
+
 func (container *ContainerState) addMemorySample(sample *ContainerUsageSample, isOOM bool) bool {
 	ts := sample.MeasureStart
 	// We always process OOM samples.
@@ -183,14 +191,16 @@ func (container *ContainerState) addMemorySample(sample *ContainerUsageSample, i
 // RecordOOM adds info regarding OOM event in the model as an artificial memory sample.
 func (container *ContainerState) RecordOOM(timestamp time.Time, requestedMemory ResourceAmount) error {
 	// Discard old OOM
-	if timestamp.Before(container.WindowEnd.Add(-1 * GetAggregationsConfig().MemoryAggregationInterval)) {
+	config := GetAggregationsConfig()
+	// TODO(omerap12): move MemoryAggregationInterval to per-container configuration as well
+	if timestamp.Before(container.WindowEnd.Add(-1 * config.MemoryAggregationInterval)) {
 		return fmt.Errorf("OOM event will be discarded - it is too old (%v)", timestamp)
 	}
 	// Get max of the request and the recent usage-based memory peak.
 	// Omitting oomPeak here to protect against recommendation running too high on subsequent OOMs.
 	memoryUsed := ResourceAmountMax(requestedMemory, container.memoryPeak)
-	memoryNeeded := ResourceAmountMax(memoryUsed+MemoryAmountFromBytes(GetAggregationsConfig().OOMMinBumpUp),
-		ScaleResource(memoryUsed, GetAggregationsConfig().OOMBumpUpRatio))
+	memoryNeeded := ResourceAmountMax(memoryUsed+MemoryAmountFromBytes(container.GetOOMMinBumpUp()),
+		ScaleResource(memoryUsed, container.GetOomBumpUpRatio()))
 
 	oomMemorySample := ContainerUsageSample{
 		MeasureStart: timestamp,