Skip to content

Commit 5e23c1d

Browse files
committed
feat(recommender): add OOMMinBumpUp&OOMBumpUpRatio to CRD
Signed-off-by: Omer Aplatony <[email protected]>
1 parent 1de2160 commit 5e23c1d

File tree

13 files changed

+312
-53
lines changed

13 files changed

+312
-53
lines changed

vertical-pod-autoscaler/deploy/vpa-v1-crd-gen.yaml

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ kind: CustomResourceDefinition
44
metadata:
55
annotations:
66
api-approved.kubernetes.io: https://github.com/kubernetes/kubernetes/pull/63797
7-
controller-gen.kubebuilder.io/version: v0.16.5
7+
controller-gen.kubebuilder.io/version: v0.17.2
88
name: verticalpodautoscalercheckpoints.autoscaling.k8s.io
99
spec:
1010
group: autoscaling.k8s.io
@@ -225,7 +225,7 @@ kind: CustomResourceDefinition
225225
metadata:
226226
annotations:
227227
api-approved.kubernetes.io: https://github.com/kubernetes/kubernetes/pull/63797
228-
controller-gen.kubebuilder.io/version: v0.16.5
228+
controller-gen.kubebuilder.io/version: v0.17.2
229229
name: verticalpodautoscalers.autoscaling.k8s.io
230230
spec:
231231
group: autoscaling.k8s.io
@@ -254,6 +254,9 @@ spec:
254254
- jsonPath: .metadata.creationTimestamp
255255
name: Age
256256
type: date
257+
- jsonPath: .spec.recommenderConfig
258+
name: RecommenderConfig
259+
type: string
257260
name: v1
258261
schema:
259262
openAPIV3Schema:
@@ -284,6 +287,21 @@ spec:
284287
Specification of the behavior of the autoscaler.
285288
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status.
286289
properties:
290+
recommenderConfig:
291+
description: RecommenderConfig specifies the configuration for the
292+
recommender.
293+
properties:
294+
oomBumpUpRatio:
295+
description: OOMBumpUpRatio is the ratio to increase resources
296+
when OOM is detected.
297+
minimum: 1
298+
type: number
299+
oomMinBumpUp:
300+
description: OOMMinBumpUp is the minimum increase in resources
301+
when OOM is detected.
302+
minimum: 0
303+
type: number
304+
type: object
287305
recommenders:
288306
description: |-
289307
Recommender responsible for generating recommendation for this object.

vertical-pod-autoscaler/docs/api.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,23 @@ _Appears in:_
201201
| `containerRecommendations` _[RecommendedContainerResources](#recommendedcontainerresources) array_ | Resources recommended by the autoscaler for each container. | | |
202202

203203

204+
#### RecommenderConfig
205+
206+
207+
208+
RecommenderConfig contains configuration for the recommender.
209+
210+
211+
212+
_Appears in:_
213+
- [VerticalPodAutoscalerSpec](#verticalpodautoscalerspec)
214+
215+
| Field | Description | Default | Validation |
216+
| --- | --- | --- | --- |
217+
| `oomBumpUpRatio` _float_ | OOMBumpUpRatio is the ratio to increase resources when OOM is detected. | | Minimum: 1 <br /> |
218+
| `oomMinBumpUp` _float_ | OOMMinBumpUp is the minimum increase in resources when OOM is detected. | | Minimum: 0 <br /> |
219+
220+
204221
#### UpdateMode
205222

206223
_Underlying type:_ _string_
@@ -376,6 +393,7 @@ _Appears in:_
376393
| `updatePolicy` _[PodUpdatePolicy](#podupdatepolicy)_ | Describes the rules on how changes are applied to the pods.<br />If not specified, all fields in the `PodUpdatePolicy` are set to their<br />default values. | | |
377394
| `resourcePolicy` _[PodResourcePolicy](#podresourcepolicy)_ | Controls how the autoscaler computes recommended resources.<br />The resource policy may be used to set constraints on the recommendations<br />for individual containers.<br />If any individual containers need to be excluded from getting the VPA recommendations, then<br />it must be disabled explicitly by setting mode to "Off" under containerPolicies.<br />If not specified, the autoscaler computes recommended resources for all containers in the pod,<br />without additional constraints. | | |
378395
| `recommenders` _[VerticalPodAutoscalerRecommenderSelector](#verticalpodautoscalerrecommenderselector) array_ | Recommender responsible for generating recommendation for this object.<br />List should be empty (then the default recommender will generate the<br />recommendation) or contain exactly one recommender. | | |
396+
| `recommenderConfig` _[RecommenderConfig](#recommenderconfig)_ | RecommenderConfig specifies the configuration for the recommender. | | |
379397

380398

381399
#### VerticalPodAutoscalerStatus

vertical-pod-autoscaler/docs/flags.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,8 @@ This document is auto-generated from the flag definitions in the VPA recommender
9393
| `--metric-for-pod-labels` | "up{job=\"kubernetes-pods\"}" | Which metric to look for pod labels in metrics |
9494
| `--min-checkpoints` | 10 | Minimum number of checkpoints to write per recommender's main loop |
9595
| `--one-output` | | If true, only write logs to their native severity level (vs also writing to each lower severity level; no effect when -logtostderr=true) |
96-
| `--oom-bump-up-ratio` | 1.2 | The memory bump up ratio when OOM occurred, default is 1.2. |
97-
| `--oom-min-bump-up-bytes` | 1.048576e+08 | The minimal increase of memory when OOM occurred in bytes, default is 100 * 1024 * 1024 |
96+
| `--oom-bump-up-ratio` | 1.2 | Default memory bump up ratio when OOM occurs. This value applies to all VPAs unless overridden in the VPA spec. Default is 1.2. |
97+
| `--oom-min-bump-up-bytes` | 1.048576e+08 | Default minimal increase of memory (in bytes) when OOM occurs. This value applies to all VPAs unless overridden in the VPA spec. Default is 100 * 1024 * 1024 (100Mi). |
9898
| `--password` | | The password used in the prometheus server basic auth |
9999
| `--pod-label-prefix` | "pod_label_" | Which prefix to look for pod labels in metrics |
100100
| `--pod-name-label` | "kubernetes_pod_name" | Label name to look for pod names |

vertical-pod-autoscaler/e2e/v1/admission_controller.go

Lines changed: 131 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -831,26 +831,137 @@ var _ = AdmissionControllerE2eDescribe("Admission-controller", func() {
831831
err := InstallRawVPA(f, validVPA)
832832
gomega.Expect(err).NotTo(gomega.HaveOccurred(), "Valid VPA object rejected")
833833

834-
ginkgo.By("Setting up invalid VPA object")
835-
// The invalid object differs by name and minAllowed - there is an invalid "requests" field.
836-
invalidVPA := []byte(`{
837-
"kind": "VerticalPodAutoscaler",
838-
"apiVersion": "autoscaling.k8s.io/v1",
839-
"metadata": {"name": "hamster-vpa-invalid"},
840-
"spec": {
841-
"targetRef": {
842-
"apiVersion": "apps/v1",
843-
"kind": "Deployment",
844-
"name":"hamster"
845-
},
846-
"resourcePolicy": {
847-
"containerPolicies": [{"containerName": "*", "minAllowed":{"requests":{"cpu":"50m"}}}]
848-
}
849-
}
850-
}`)
851-
err2 := InstallRawVPA(f, invalidVPA)
852-
gomega.Expect(err2).To(gomega.HaveOccurred(), "Invalid VPA object accepted")
853-
gomega.Expect(err2.Error()).To(gomega.MatchRegexp(`.*admission webhook .*vpa.* denied the request: .*`))
834+
ginkgo.By("Setting up invalid VPA objects")
835+
testCases := []struct {
836+
name string
837+
vpaJSON string
838+
expectedErr string
839+
}{
840+
{
841+
name: "Invalid oomBumpUpRatio (negative value)",
842+
vpaJSON: `{
843+
"apiVersion": "autoscaling.k8s.io/v1",
844+
"kind": "VerticalPodAutoscaler",
845+
"metadata": {"name": "oom-test-vpa"},
846+
"spec": {
847+
"targetRef": {
848+
"apiVersion": "apps/v1",
849+
"kind": "Deployment",
850+
"name": "oom-test"
851+
},
852+
"updatePolicy": {
853+
"updateMode": "Auto"
854+
},
855+
"recommenderConfig": {
856+
"oomBumpUpRatio": -1,
857+
"oomMinBumpUp": 104857600
858+
}
859+
}
860+
}`,
861+
expectedErr: "spec.recommenderConfig.oomBumpUpRatio: Invalid value: -1: spec.recommenderConfig.oomBumpUpRatio in body should be greater than or equal to 1",
862+
},
863+
{
864+
name: "Invalid oomBumpUpRatio (string value)",
865+
vpaJSON: `{
866+
"apiVersion": "autoscaling.k8s.io/v1",
867+
"kind": "VerticalPodAutoscaler",
868+
"metadata": {"name": "oom-test-vpa"},
869+
"spec": {
870+
"targetRef": {
871+
"apiVersion": "apps/v1",
872+
"kind": "Deployment",
873+
"name": "oom-test"
874+
},
875+
"updatePolicy": {
876+
"updateMode": "Auto"
877+
},
878+
"recommenderConfig": {
879+
"oomBumpUpRatio": "12",
880+
"oomMinBumpUp": 104857600
881+
}
882+
}
883+
}`,
884+
expectedErr: "json: cannot unmarshal string into Go struct field RecommenderConfig.spec.recommenderConfig.oomBumpUpRatio of type float64",
885+
},
886+
{
887+
name: "Invalid oomBumpUpRatio (less than 1)",
888+
vpaJSON: `{
889+
"apiVersion": "autoscaling.k8s.io/v1",
890+
"kind": "VerticalPodAutoscaler",
891+
"metadata": {"name": "oom-test-vpa"},
892+
"spec": {
893+
"targetRef": {
894+
"apiVersion": "apps/v1",
895+
"kind": "Deployment",
896+
"name": "oom-test"
897+
},
898+
"updatePolicy": {
899+
"updateMode": "Auto"
900+
},
901+
"recommenderConfig": {
902+
"oomBumpUpRatio": 0.5,
903+
"oomMinBumpUp": 104857600
904+
}
905+
}
906+
}`,
907+
expectedErr: "spec.recommenderConfig.oomBumpUpRatio: Invalid value: 0.5: spec.recommenderConfig.oomBumpUpRatio in body should be greater than or equal to 1",
908+
},
909+
{
910+
name: "Invalid oomMinBumpUp (negative value)",
911+
vpaJSON: `{
912+
"apiVersion": "autoscaling.k8s.io/v1",
913+
"kind": "VerticalPodAutoscaler",
914+
"metadata": {"name": "oom-test-vpa"},
915+
"spec": {
916+
"targetRef": {
917+
"apiVersion": "apps/v1",
918+
"kind": "Deployment",
919+
"name": "oom-test"
920+
},
921+
"updatePolicy": {
922+
"updateMode": "Auto"
923+
},
924+
"recommenderConfig": {
925+
"oomBumpUpRatio": 2,
926+
"oomMinBumpUp": -1
927+
}
928+
}
929+
}`,
930+
expectedErr: "spec.recommenderConfig.oomMinBumpUp: Invalid value: -1: spec.recommenderConfig.oomMinBumpUp in body should be greater than or equal to 0",
931+
},
932+
{
933+
name: "Invalid minAllowed (invalid requests field)",
934+
vpaJSON: `{
935+
"apiVersion": "autoscaling.k8s.io/v1",
936+
"kind": "VerticalPodAutoscaler",
937+
"metadata": {"name": "hamster-vpa-invalid"},
938+
"spec": {
939+
"targetRef": {
940+
"apiVersion": "apps/v1",
941+
"kind": "Deployment",
942+
"name": "hamster"
943+
},
944+
"resourcePolicy": {
945+
"containerPolicies": [{
946+
"containerName": "*",
947+
"minAllowed": {
948+
"requests": {
949+
"cpu": "50m"
950+
}
951+
}
952+
}]
953+
}
954+
}
955+
}`,
956+
expectedErr: "admission webhook .*vpa.* denied the request:",
957+
},
958+
}
959+
for _, tc := range testCases {
960+
ginkgo.By(fmt.Sprintf("Testing %s", tc.name))
961+
err := InstallRawVPA(f, []byte(tc.vpaJSON))
962+
gomega.Expect(err).To(gomega.HaveOccurred(), "Invalid VPA object accepted")
963+
gomega.Expect(err.Error()).To(gomega.MatchRegexp(tc.expectedErr))
964+
}
854965
})
855966

856967
ginkgo.It("reloads the webhook leaf and CA certificate", func(ctx ginkgo.SpecContext) {

vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1/types.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ type VerticalPodAutoscalerList struct {
4545
// +kubebuilder:printcolumn:name="Mem",type="string",JSONPath=".status.recommendation.containerRecommendations[0].target.memory"
4646
// +kubebuilder:printcolumn:name="Provided",type="string",JSONPath=".status.conditions[?(@.type=='RecommendationProvided')].status"
4747
// +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp"
48+
// +kubebuilder:printcolumn:name="RecommenderConfig",type="string",JSONPath=".spec.recommenderConfig"
4849
// +kubebuilder:metadata:annotations="api-approved.kubernetes.io=https://github.com/kubernetes/kubernetes/pull/63797"
4950

5051
// VerticalPodAutoscaler is the configuration for a vertical pod
@@ -107,6 +108,23 @@ type VerticalPodAutoscalerSpec struct {
107108
// recommendation) or contain exactly one recommender.
108109
// +optional
109110
Recommenders []*VerticalPodAutoscalerRecommenderSelector `json:"recommenders,omitempty" protobuf:"bytes,4,opt,name=recommenders"`
111+
112+
// RecommenderConfig specifies the configuration for the recommender.
113+
// +optional
114+
RecommenderConfig *RecommenderConfig `json:"recommenderConfig,omitempty" protobuf:"bytes,5,opt,name=recommenderConfig"`
115+
}
116+
117+
// RecommenderConfig contains configuration for the recommender.
118+
type RecommenderConfig struct {
119+
// OOMBumpUpRatio is the ratio to increase resources when OOM is detected.
120+
// +kubebuilder:validation:Minimum=1.0
121+
// +optional
122+
OOMBumpUpRatio *float64 `json:"oomBumpUpRatio,omitempty" protobuf:"bytes,1,opt,name=oomBumpUpRatio"`
123+
124+
// OOMMinBumpUp is the minimum increase in resources when OOM is detected.
125+
// +kubebuilder:validation:Minimum=0
126+
// +optional
127+
OOMMinBumpUp *float64 `json:"oomMinBumpUp,omitempty" protobuf:"bytes,2,opt,name=oomMinBumpUp"`
110128
}
111129

112130
// EvictionChangeRequirement refers to the relationship between the new target recommendation for a Pod and its current requests, what kind of change is necessary for the Pod to be evicted

vertical-pod-autoscaler/pkg/recommender/input/cluster_feeder.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -524,7 +524,14 @@ Loop:
524524
select {
525525
case oomInfo := <-feeder.oomChan:
526526
klog.V(3).InfoS("OOM detected", "oomInfo", oomInfo)
527-
if err = feeder.clusterState.RecordOOM(oomInfo.ContainerID, oomInfo.Timestamp, oomInfo.Memory); err != nil {
527+
pod := feeder.clusterState.Pods()[oomInfo.ContainerID.PodID]
528+
controlledVpa := feeder.clusterState.GetControllingVPA(pod)
529+
var oomBumpUpRatio, oomMinBumpUp *float64
530+
if controlledVpa != nil && controlledVpa.RecommenderConfig != nil {
531+
oomBumpUpRatio = controlledVpa.RecommenderConfig.OOMBumpUpRatio
532+
oomMinBumpUp = controlledVpa.RecommenderConfig.OOMMinBumpUp
533+
}
534+
if err = feeder.clusterState.RecordOOM(oomInfo.ContainerID, oomInfo.Timestamp, oomInfo.Memory, oomBumpUpRatio, oomMinBumpUp); err != nil {
528535
klog.V(0).InfoS("Failed to record OOM", "oomInfo", oomInfo, "error", err)
529536
}
530537
default:

vertical-pod-autoscaler/pkg/recommender/input/cluster_feeder_test.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -600,11 +600,12 @@ type fakeMetricsClient struct {
600600
snapshots []*metrics.ContainerMetricsSnapshot
601601
}
602602

603-
func (m fakeMetricsClient) GetContainersMetrics() ([]*metrics.ContainerMetricsSnapshot, error) {
603+
func (m fakeMetricsClient) GetContainersMetrics(_ context.Context) ([]*metrics.ContainerMetricsSnapshot, error) {
604604
return m.snapshots, nil
605605
}
606606

607607
func TestClusterStateFeeder_LoadRealTimeMetrics(t *testing.T) {
608+
_, tctx := ktesting.NewTestContext(t)
608609
namespaceName := "test-namespace"
609610
podID := model.PodID{Namespace: namespaceName, PodName: "Pod"}
610611
regularContainer1 := model.ContainerID{PodID: podID, ContainerName: "Container1"}
@@ -639,7 +640,7 @@ func TestClusterStateFeeder_LoadRealTimeMetrics(t *testing.T) {
639640
metricsClient: fakeMetricsClient{snapshots: containerMetricsSnapshots},
640641
}
641642

642-
feeder.LoadRealTimeMetrics()
643+
feeder.LoadRealTimeMetrics(tctx)
643644

644645
assert.Equal(t, 2, len(clusterState.addedSamples))
645646

vertical-pod-autoscaler/pkg/recommender/main.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,8 @@ var (
9696
memoryAggregationIntervalCount = flag.Int64("memory-aggregation-interval-count", model.DefaultMemoryAggregationIntervalCount, `The number of consecutive memory-aggregation-intervals which make up the MemoryAggregationWindowLength which in turn is the period for memory usage aggregation by VPA. In other words, MemoryAggregationWindowLength = memory-aggregation-interval * memory-aggregation-interval-count.`)
9797
memoryHistogramDecayHalfLife = flag.Duration("memory-histogram-decay-half-life", model.DefaultMemoryHistogramDecayHalfLife, `The amount of time it takes a historical memory usage sample to lose half of its weight. In other words, a fresh usage sample is twice as 'important' as one with age equal to the half life period.`)
9898
cpuHistogramDecayHalfLife = flag.Duration("cpu-histogram-decay-half-life", model.DefaultCPUHistogramDecayHalfLife, `The amount of time it takes a historical CPU usage sample to lose half of its weight.`)
99-
oomBumpUpRatio = flag.Float64("oom-bump-up-ratio", model.DefaultOOMBumpUpRatio, `The memory bump up ratio when OOM occurred, default is 1.2.`)
100-
oomMinBumpUp = flag.Float64("oom-min-bump-up-bytes", model.DefaultOOMMinBumpUp, `The minimal increase of memory when OOM occurred in bytes, default is 100 * 1024 * 1024`)
99+
oomBumpUpRatio = flag.Float64("oom-bump-up-ratio", model.DefaultOOMBumpUpRatio, `Default memory bump up ratio when OOM occurs. This value applies to all VPAs unless overridden in the VPA spec. Default is 1.2.`)
100+
oomMinBumpUp = flag.Float64("oom-min-bump-up-bytes", model.DefaultOOMMinBumpUp, `Default minimal increase of memory (in bytes) when OOM occurs. This value applies to all VPAs unless overridden in the VPA spec. Default is 100 * 1024 * 1024 (100Mi).`)
101101
)
102102

103103
// Post processors flags

vertical-pod-autoscaler/pkg/recommender/model/cluster.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ type ClusterState interface {
4747
DeletePod(podID PodID)
4848
AddOrUpdateContainer(containerID ContainerID, request Resources) error
4949
AddSample(sample *ContainerUsageSampleWithKey) error
50-
RecordOOM(containerID ContainerID, timestamp time.Time, requestedMemory ResourceAmount) error
50+
RecordOOM(containerID ContainerID, timestamp time.Time, requestedMemory ResourceAmount, OOMBumpUpRatio *float64, OOMMinBumpUp *float64) error
5151
AddOrUpdateVpa(apiObject *vpa_types.VerticalPodAutoscaler, selector labels.Selector) error
5252
DeleteVpa(vpaID VpaID) error
5353
MakeAggregateStateKey(pod *PodState, containerName string) AggregateStateKey
@@ -250,7 +250,7 @@ func (cluster *clusterState) AddSample(sample *ContainerUsageSampleWithKey) erro
250250
}
251251

252252
// RecordOOM adds info regarding OOM event in the model as an artificial memory sample.
253-
func (cluster *clusterState) RecordOOM(containerID ContainerID, timestamp time.Time, requestedMemory ResourceAmount) error {
253+
func (cluster *clusterState) RecordOOM(containerID ContainerID, timestamp time.Time, requestedMemory ResourceAmount, OOMBumpUpRatio *float64, OOMMinBumpUp *float64) error {
254254
pod, podExists := cluster.pods[containerID.PodID]
255255
if !podExists {
256256
return NewKeyError(containerID.PodID)
@@ -259,7 +259,7 @@ func (cluster *clusterState) RecordOOM(containerID ContainerID, timestamp time.T
259259
if !containerExists {
260260
return NewKeyError(containerID.ContainerName)
261261
}
262-
err := containerState.RecordOOM(timestamp, requestedMemory)
262+
err := containerState.RecordOOM(timestamp, requestedMemory, OOMBumpUpRatio, OOMMinBumpUp)
263263
if err != nil {
264264
return fmt.Errorf("error while recording OOM for %v, Reason: %v", containerID, err)
265265
}
@@ -305,6 +305,7 @@ func (cluster *clusterState) AddOrUpdateVpa(apiObject *vpa_types.VerticalPodAuto
305305
vpa.Recommendation = currentRecommendation
306306
vpa.SetUpdateMode(apiObject.Spec.UpdatePolicy)
307307
vpa.SetResourcePolicy(apiObject.Spec.ResourcePolicy)
308+
vpa.SetRecommenderConfig(apiObject.Spec.RecommenderConfig)
308309
vpa.SetAPIVersion(apiObject.GetObjectKind().GroupVersionKind().Version)
309310
return nil
310311
}

0 commit comments

Comments
 (0)