Skip to content

Commit f655871

Browse files
DRA: Add configurable health check timeout per device
Implements device-specific health check timeouts in the DRA health monitoring system as defined in KEP-4680. This allows DRA drivers to specify custom timeout values for individual devices through the gRPC health API. Changes: - Add HealthCheckTimeout field to state.DeviceHealth struct to store device-specific timeout durations - Add health_check_timeout_seconds field to DeviceHealth proto message in the DRA health gRPC API (v1alpha1) - Update manager.go to extract timeout from gRPC responses and apply DefaultHealthTimeout (30s) when not specified - Handle negative timeout values defensively by logging a warning and falling back to the default timeout - Simplify healthinfo.go by removing redundant fallback logic since timeouts are now always set at creation time - Update tests to include HealthCheckTimeout in test fixtures The timeout behavior is: - Positive values: Use the specified timeout in seconds - Zero or unspecified: Use DefaultHealthTimeout (30 seconds) - Negative values: Log warning and use DefaultHealthTimeout This implementation provides flexibility for DRA drivers to define appropriate health check intervals for different device types while maintaining backward compatibility through sensible defaults. Ref: KEP-4680 (Add Resource Health to Pod Status) Ref: kubernetes/enhancements#5476 Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
1 parent 8c1094d commit f655871

File tree

7 files changed

+110
-51
lines changed

7 files changed

+110
-51
lines changed

pkg/kubelet/cm/dra/healthinfo.go

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,9 @@ import (
2828
"k8s.io/kubernetes/pkg/kubelet/cm/dra/state"
2929
)
3030

31-
// TODO(#133118): Make health timeout configurable.
3231
const (
33-
healthTimeout = 30 * time.Second
32+
// DefaultHealthTimeout is the default timeout for device health checks when not specified by the plugin
33+
DefaultHealthTimeout = 30 * time.Second
3434
)
3535

3636
// healthInfoCache is a cache of known device health.
@@ -129,7 +129,14 @@ func (cache *healthInfoCache) getHealthInfo(driverName, poolName, deviceName str
129129
if driver, ok := (*cache.HealthInfo)[driverName]; ok {
130130
key := poolName + "/" + deviceName
131131
if device, ok := driver.Devices[key]; ok {
132-
if now.Sub(device.LastUpdated) > healthTimeout {
132+
// Use device-specific timeout if set, otherwise use default
133+
timeout := device.HealthCheckTimeout
134+
if timeout == 0 {
135+
timeout = DefaultHealthTimeout
136+
}
137+
138+
// Check if device health has timed out
139+
if now.Sub(device.LastUpdated) > timeout {
133140
res = state.DeviceHealthStatusUnknown
134141
} else {
135142
res = device.Health
@@ -178,7 +185,14 @@ func (cache *healthInfoCache) updateHealthInfo(driverName string, devices []stat
178185
// them. Mark them as "Unknown" if their status has timed out.
179186
for key, existingDevice := range currentDriver.Devices {
180187
if _, wasReported := reportedKeys[key]; !wasReported {
181-
if existingDevice.Health != state.DeviceHealthStatusUnknown && now.Sub(existingDevice.LastUpdated) > healthTimeout {
188+
// Use device-specific timeout if set, otherwise use default
189+
timeout := existingDevice.HealthCheckTimeout
190+
if timeout == 0 {
191+
timeout = DefaultHealthTimeout
192+
}
193+
194+
// Mark as unknown if the device health has timed out
195+
if existingDevice.Health != state.DeviceHealthStatusUnknown && now.Sub(existingDevice.LastUpdated) > timeout {
182196
existingDevice.Health = state.DeviceHealthStatusUnknown
183197
existingDevice.LastUpdated = now
184198
currentDriver.Devices[key] = existingDevice

pkg/kubelet/cm/dra/healthinfo_test.go

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,10 @@ const (
3838

3939
var (
4040
testDeviceHealth = state.DeviceHealth{
41-
PoolName: testPool,
42-
DeviceName: testDevice,
43-
Health: state.DeviceHealthStatusHealthy,
41+
PoolName: testPool,
42+
DeviceName: testDevice,
43+
Health: state.DeviceHealthStatusHealthy,
44+
HealthCheckTimeout: DefaultHealthTimeout,
4445
}
4546
)
4647

@@ -206,7 +207,7 @@ func TestGetHealthInfo(t *testing.T) {
206207
driverState := (*cache.HealthInfo)[testDriver]
207208
deviceKey := testPool + "/" + testDevice
208209
device := driverState.Devices[deviceKey]
209-
device.LastUpdated = time.Now().Add((-healthTimeout) - time.Second)
210+
device.LastUpdated = time.Now().Add((-DefaultHealthTimeout) - time.Second)
210211
driverState.Devices[deviceKey] = device
211212
(*cache.HealthInfo)[testDriver] = driverState
212213
return nil
@@ -237,7 +238,7 @@ func TestGetHealthInfoRobust(t *testing.T) {
237238
name: "device exists and is healthy",
238239
initialState: &state.DevicesHealthMap{
239240
testDriver: {Devices: map[string]state.DeviceHealth{
240-
testPool + "/" + testDevice: {PoolName: testPool, DeviceName: testDevice, Health: state.DeviceHealthStatusHealthy, LastUpdated: time.Now()},
241+
testPool + "/" + testDevice: {PoolName: testPool, DeviceName: testDevice, Health: state.DeviceHealthStatusHealthy, LastUpdated: time.Now(), HealthCheckTimeout: DefaultHealthTimeout},
241242
}},
242243
},
243244
driverName: testDriver,
@@ -249,7 +250,7 @@ func TestGetHealthInfoRobust(t *testing.T) {
249250
name: "device exists and is unhealthy",
250251
initialState: &state.DevicesHealthMap{
251252
testDriver: {Devices: map[string]state.DeviceHealth{
252-
testPool + "/" + testDevice: {PoolName: testPool, DeviceName: testDevice, Health: state.DeviceHealthStatusUnhealthy, LastUpdated: time.Now()},
253+
testPool + "/" + testDevice: {PoolName: testPool, DeviceName: testDevice, Health: state.DeviceHealthStatusUnhealthy, LastUpdated: time.Now(), HealthCheckTimeout: DefaultHealthTimeout},
253254
}},
254255
},
255256
driverName: testDriver,
@@ -261,7 +262,7 @@ func TestGetHealthInfoRobust(t *testing.T) {
261262
name: "device exists but timed out",
262263
initialState: &state.DevicesHealthMap{
263264
testDriver: {Devices: map[string]state.DeviceHealth{
264-
testPool + "/" + testDevice: {PoolName: testPool, DeviceName: testDevice, Health: state.DeviceHealthStatusHealthy, LastUpdated: time.Now().Add((-1 * healthTimeout) - time.Second)},
265+
testPool + "/" + testDevice: {PoolName: testPool, DeviceName: testDevice, Health: state.DeviceHealthStatusHealthy, LastUpdated: time.Now().Add((-1 * DefaultHealthTimeout) - time.Second), HealthCheckTimeout: DefaultHealthTimeout},
265266
}},
266267
},
267268
driverName: testDriver,
@@ -273,7 +274,7 @@ func TestGetHealthInfoRobust(t *testing.T) {
273274
name: "device exists, just within timeout",
274275
initialState: &state.DevicesHealthMap{
275276
testDriver: {Devices: map[string]state.DeviceHealth{
276-
testPool + "/" + testDevice: {PoolName: testPool, DeviceName: testDevice, Health: state.DeviceHealthStatusHealthy, LastUpdated: time.Now().Add((-1 * healthTimeout) + time.Second)},
277+
testPool + "/" + testDevice: {PoolName: testPool, DeviceName: testDevice, Health: state.DeviceHealthStatusHealthy, LastUpdated: time.Now().Add((-1 * DefaultHealthTimeout) + time.Second), HealthCheckTimeout: DefaultHealthTimeout},
277278
}},
278279
},
279280
driverName: testDriver,
@@ -285,7 +286,7 @@ func TestGetHealthInfoRobust(t *testing.T) {
285286
name: "device does not exist, just outside of timeout",
286287
initialState: &state.DevicesHealthMap{
287288
testDriver: {Devices: map[string]state.DeviceHealth{
288-
testPool + "/" + testDevice: {PoolName: testPool, DeviceName: testDevice, Health: state.DeviceHealthStatusHealthy, LastUpdated: time.Now().Add((-1 * healthTimeout) - time.Second)},
289+
testPool + "/" + testDevice: {PoolName: testPool, DeviceName: testDevice, Health: state.DeviceHealthStatusHealthy, LastUpdated: time.Now().Add((-1 * DefaultHealthTimeout) - time.Second), HealthCheckTimeout: DefaultHealthTimeout},
289290
}},
290291
},
291292
driverName: testDriver,
@@ -297,7 +298,7 @@ func TestGetHealthInfoRobust(t *testing.T) {
297298
name: "device does not exist",
298299
initialState: &state.DevicesHealthMap{
299300
testDriver: {Devices: map[string]state.DeviceHealth{
300-
testPool + "/" + testDevice: {PoolName: testPool, DeviceName: testDevice, Health: state.DeviceHealthStatusHealthy, LastUpdated: time.Now()},
301+
testPool + "/" + testDevice: {PoolName: testPool, DeviceName: testDevice, Health: state.DeviceHealthStatusHealthy, LastUpdated: time.Now(), HealthCheckTimeout: DefaultHealthTimeout},
301302
}},
302303
},
303304
driverName: testDriver,
@@ -309,7 +310,7 @@ func TestGetHealthInfoRobust(t *testing.T) {
309310
name: "driver does not exist",
310311
initialState: &state.DevicesHealthMap{
311312
testDriver: {Devices: map[string]state.DeviceHealth{
312-
testPool + "/" + testDevice: {PoolName: testPool, DeviceName: testDevice, Health: state.DeviceHealthStatusHealthy, LastUpdated: time.Now()},
313+
testPool + "/" + testDevice: {PoolName: testPool, DeviceName: testDevice, Health: state.DeviceHealthStatusHealthy, LastUpdated: time.Now(), HealthCheckTimeout: DefaultHealthTimeout},
313314
}},
314315
},
315316
driverName: "driver2",
@@ -321,7 +322,7 @@ func TestGetHealthInfoRobust(t *testing.T) {
321322
name: "pool does not exist",
322323
initialState: &state.DevicesHealthMap{
323324
testDriver: {Devices: map[string]state.DeviceHealth{
324-
testPool + "/" + testDevice: {PoolName: testPool, DeviceName: testDevice, Health: state.DeviceHealthStatusHealthy, LastUpdated: time.Now()},
325+
testPool + "/" + testDevice: {PoolName: testPool, DeviceName: testDevice, Health: state.DeviceHealthStatusHealthy, LastUpdated: time.Now(), HealthCheckTimeout: DefaultHealthTimeout},
325326
}},
326327
},
327328
driverName: testDriver,
@@ -333,8 +334,8 @@ func TestGetHealthInfoRobust(t *testing.T) {
333334
name: "multiple devices",
334335
initialState: &state.DevicesHealthMap{
335336
testDriver: {Devices: map[string]state.DeviceHealth{
336-
testPool + "/" + testDevice: {PoolName: testPool, DeviceName: testDevice, Health: state.DeviceHealthStatusHealthy, LastUpdated: time.Now()},
337-
testPool + "/device2": {PoolName: testPool, DeviceName: "device2", Health: state.DeviceHealthStatusUnhealthy, LastUpdated: time.Now()},
337+
testPool + "/" + testDevice: {PoolName: testPool, DeviceName: testDevice, Health: state.DeviceHealthStatusHealthy, LastUpdated: time.Now(), HealthCheckTimeout: DefaultHealthTimeout},
338+
testPool + "/device2": {PoolName: testPool, DeviceName: "device2", Health: state.DeviceHealthStatusUnhealthy, LastUpdated: time.Now(), HealthCheckTimeout: DefaultHealthTimeout},
338339
}},
339340
},
340341
driverName: testDriver,
@@ -382,7 +383,7 @@ func TestUpdateHealthInfo(t *testing.T) {
382383
assert.Equal(t, state.DeviceHealthStatusUnhealthy, cache.getHealthInfo(testDriver, testPool, testDevice))
383384

384385
// 4 -- Add second device, omit first
385-
secondDevice := state.DeviceHealth{PoolName: testPool, DeviceName: "device2", Health: state.DeviceHealthStatusHealthy}
386+
secondDevice := state.DeviceHealth{PoolName: testPool, DeviceName: "device2", Health: state.DeviceHealthStatusHealthy, HealthCheckTimeout: DefaultHealthTimeout}
386387
// When the first device is omitted, it should be marked as "Unknown" after a timeout.
387388
// For this test, we simulate the timeout by not reporting it.
388389
firstDeviceAsUnknown := newHealth
@@ -392,7 +393,7 @@ func TestUpdateHealthInfo(t *testing.T) {
392393
err = cache.withLock(func() error {
393394
deviceKey := testPool + "/" + testDevice
394395
device := (*cache.HealthInfo)[testDriver].Devices[deviceKey]
395-
device.LastUpdated = time.Now().Add(-healthTimeout * 2)
396+
device.LastUpdated = time.Now().Add(-DefaultHealthTimeout * 2)
396397
(*cache.HealthInfo)[testDriver].Devices[deviceKey] = device
397398
return nil
398399
})
@@ -411,7 +412,7 @@ func TestUpdateHealthInfo(t *testing.T) {
411412
assert.Equal(t, state.DeviceHealthStatusUnknown, cache2.getHealthInfo(testDriver, testPool, testDevice))
412413

413414
// 6 -- Test how updateHealthInfo handles device timeouts
414-
timeoutDevice := state.DeviceHealth{PoolName: testPool, DeviceName: "timeoutDevice", Health: "Unhealthy"}
415+
timeoutDevice := state.DeviceHealth{PoolName: testPool, DeviceName: "timeoutDevice", Health: "Unhealthy", HealthCheckTimeout: DefaultHealthTimeout}
415416
_, err = cache.updateHealthInfo(testDriver, []state.DeviceHealth{timeoutDevice})
416417
require.NoError(t, err)
417418

@@ -420,14 +421,14 @@ func TestUpdateHealthInfo(t *testing.T) {
420421
driverState := (*cache.HealthInfo)[testDriver]
421422
deviceKey := testPool + "/timeoutDevice"
422423
device := driverState.Devices[deviceKey]
423-
device.LastUpdated = time.Now().Add((-healthTimeout) - time.Second)
424+
device.LastUpdated = time.Now().Add((-DefaultHealthTimeout) - time.Second)
424425
driverState.Devices[deviceKey] = device
425426
(*cache.HealthInfo)[testDriver] = driverState
426427
return nil
427428
})
428429
require.NoError(t, err)
429430

430-
expectedTimeoutDeviceUnknown := state.DeviceHealth{PoolName: testPool, DeviceName: "timeoutDevice", Health: state.DeviceHealthStatusUnknown}
431+
expectedTimeoutDeviceUnknown := state.DeviceHealth{PoolName: testPool, DeviceName: "timeoutDevice", Health: state.DeviceHealthStatusUnknown, HealthCheckTimeout: DefaultHealthTimeout}
431432
expectedChanged6 := []state.DeviceHealth{expectedTimeoutDeviceUnknown}
432433
changedDevices, err = cache.updateHealthInfo(testDriver, []state.DeviceHealth{})
433434
require.NoError(t, err)

pkg/kubelet/cm/dra/manager.go

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -933,11 +933,29 @@ func (m *Manager) HandleWatchResourcesStream(ctx context.Context, stream draheal
933933
default:
934934
health = state.DeviceHealthStatusUnknown
935935
}
936+
937+
// Extract the health check timeout from the gRPC response
938+
// If not specified, zero, or negative, use the default timeout
939+
timeout := DefaultHealthTimeout
940+
timeoutSeconds := d.GetHealthCheckTimeoutSeconds()
941+
if timeoutSeconds > 0 {
942+
timeout = time.Duration(timeoutSeconds) * time.Second
943+
} else if timeoutSeconds < 0 {
944+
// Log warning for negative timeout values and use default
945+
logger.V(4).Info("Ignoring negative health check timeout, using default",
946+
"pluginName", pluginName,
947+
"poolName", d.GetDevice().GetPoolName(),
948+
"deviceName", d.GetDevice().GetDeviceName(),
949+
"providedTimeout", timeoutSeconds,
950+
"defaultTimeout", DefaultHealthTimeout)
951+
}
952+
936953
devices[i] = state.DeviceHealth{
937-
PoolName: d.GetDevice().GetPoolName(),
938-
DeviceName: d.GetDevice().GetDeviceName(),
939-
Health: health,
940-
LastUpdated: time.Unix(d.GetLastUpdatedTime(), 0),
954+
PoolName: d.GetDevice().GetPoolName(),
955+
DeviceName: d.GetDevice().GetDeviceName(),
956+
Health: health,
957+
LastUpdated: time.Unix(d.GetLastUpdatedTime(), 0),
958+
HealthCheckTimeout: timeout,
941959
}
942960
}
943961

pkg/kubelet/cm/dra/state/state.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,4 +95,9 @@ type DeviceHealth struct {
9595

9696
// LastUpdated keeps track of the last health status update of this device.
9797
LastUpdated time.Time
98+
99+
// HealthCheckTimeout is the timeout for the health check of the device.
100+
// Zero value means use the default timeout (DefaultHealthTimeout).
101+
// This ensures backward compatibility with existing data.
102+
HealthCheckTimeout time.Duration
98103
}

staging/src/k8s.io/kubelet/go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ godebug default=go1.24
99
require (
1010
github.com/emicklei/go-restful/v3 v3.12.2
1111
github.com/gogo/protobuf v1.3.2
12+
github.com/golang/protobuf v1.5.4
1213
github.com/stretchr/testify v1.10.0
1314
go.uber.org/goleak v1.3.0
1415
google.golang.org/grpc v1.72.2

0 commit comments

Comments
 (0)