Skip to content

Commit e787f54

Browse files
committed
qat: add heartbeat check and use that as a device healthiness indicator
Signed-off-by: Tuomas Katila <[email protected]>
1 parent 7ebb43b commit e787f54

File tree

3 files changed

+164
-11
lines changed

3 files changed

+164
-11
lines changed

.golangci.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,3 +67,4 @@ issues:
6767
# Until the testing package allows pinning variables disable scopelint
6868
# for tests. See https://github.com/kyoh86/scopelint/issues/4.
6969
- scopelint
70+
- gocognit

cmd/qat_plugin/dpdkdrv/dpdkdrv.go

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,35 @@ func readDeviceConfiguration(pfDev string) string {
393393
return devCfg.Section("GENERAL").Key("ServicesEnabled").String()
394394
}
395395

396+
func getDeviceHealthiness(device string, lookup map[string]string) string {
397+
healthiness := pluginapi.Healthy
398+
399+
pfDev, err := filepath.EvalSymlinks(filepath.Join(device, "physfn"))
400+
if err != nil {
401+
klog.Warningf("failed to get PF device ID for %s: %q", filepath.Base(device), err)
402+
return healthiness
403+
}
404+
405+
// VFs share one PF, so all the VFs should return the same result
406+
if _, found := lookup[pfDev]; found {
407+
return lookup[pfDev]
408+
} else {
409+
driver := getCurrentDriver(pfDev)
410+
411+
hbStatusFile := filepath.Join(filepath.Dir(filepath.Join(pfDev, "../../")), "kernel/debug",
412+
fmt.Sprintf("qat_%s_%s/heartbeat/status", driver, filepath.Base(pfDev)))
413+
414+
// If status reads "-1", the device is considered bad
415+
if data, err := os.ReadFile(hbStatusFile); err == nil && string(data) == "-1" {
416+
healthiness = pluginapi.Unhealthy
417+
}
418+
419+
lookup[pfDev] = healthiness
420+
421+
return healthiness
422+
}
423+
}
424+
396425
func getDeviceCapabilities(device string) (string, error) {
397426
devID, err := getDeviceID(device)
398427
if err != nil {
@@ -583,6 +612,8 @@ func (dp *DevicePlugin) scan() (dpapi.DeviceTree, error) {
583612
devTree := dpapi.NewDeviceTree()
584613
n := 0
585614

615+
pfHealthLookup := map[string]string{}
616+
586617
for _, vfDevice := range dp.getVfDevices() {
587618
vfBdf := filepath.Base(vfDevice)
588619

@@ -610,14 +641,16 @@ func (dp *DevicePlugin) scan() (dpapi.DeviceTree, error) {
610641
return nil, err
611642
}
612643

613-
klog.V(1).Infof("Device %s with %s capabilities found", vfBdf, cap)
644+
healthiness := getDeviceHealthiness(vfDevice, pfHealthLookup)
645+
646+
klog.V(1).Infof("Device %s with %s capabilities found (%s)", vfBdf, cap, healthiness)
614647

615648
n = n + 1
616649
envs := map[string]string{
617650
fmt.Sprintf("%s%d", envVarPrefix, n): vfBdf,
618651
}
619652

620-
devinfo := dpapi.NewDeviceInfo(pluginapi.Healthy, dp.getDpdkDeviceSpecs(dpdkDeviceName), dp.getDpdkMounts(dpdkDeviceName), envs, nil)
653+
devinfo := dpapi.NewDeviceInfo(healthiness, dp.getDpdkDeviceSpecs(dpdkDeviceName), dp.getDpdkMounts(dpdkDeviceName), envs, nil)
621654

622655
devTree.AddDevice(cap, vfBdf, devinfo)
623656
}

cmd/qat_plugin/dpdkdrv/dpdkdrv_test.go

Lines changed: 128 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ package dpdkdrv
1616

1717
import (
1818
"flag"
19+
"fmt"
1920
"os"
2021
"path"
2122
"reflect"
@@ -162,15 +163,16 @@ func TestGetPreferredAllocation(t *testing.T) {
162163

163164
func TestScan(t *testing.T) {
164165
tcases := []struct {
165-
name string
166-
dpdkDriver string
167-
dirs []string
168-
files map[string][]byte
169-
symlinks map[string]string
170-
kernelVfDrivers []string
171-
expectedErr bool
172-
maxDevNum int
173-
expectedDevNum int
166+
name string
167+
dpdkDriver string
168+
dirs []string
169+
files map[string][]byte
170+
symlinks map[string]string
171+
kernelVfDrivers []string
172+
expectedErr bool
173+
maxDevNum int
174+
expectedDevNum int
175+
expectedUnhealthyNum int
174176
}{
175177
{
176178
name: "No error returned for uninitialized device plugin",
@@ -519,7 +521,119 @@ func TestScan(t *testing.T) {
519521
maxDevNum: 2,
520522
expectedDevNum: 2,
521523
},
524+
{
525+
name: "vfio-pci DPDKdriver with no kernel bound driver and where vfdevID is equal to qatDevId (4941) heartbeat status bad",
526+
dpdkDriver: "vfio-pci",
527+
kernelVfDrivers: []string{"4xxxvf"},
528+
dirs: []string{
529+
"sys/bus/pci/drivers/4xxx",
530+
"sys/bus/pci/drivers/vfio-pci",
531+
"sys/devices/pci0000:02/0000:02:00.0",
532+
"sys/devices/pci0000:02/0000:02:00.0/qat",
533+
"sys/kernel/debug/qat_4xxx_0000:02:00.0",
534+
"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat",
535+
"sys/bus/pci/devices/0000:02:01.0",
536+
},
537+
files: map[string][]byte{
538+
"sys/devices/pci0000:02/0000:02:00.0/device": []byte("0x4940"),
539+
"sys/devices/pci0000:02/0000:02:00.0/qat/state": []byte("up"),
540+
"sys/devices/pci0000:02/0000:02:00.0/qat/cfg_services": []byte("sym;asym"),
541+
"sys/bus/pci/devices/0000:02:01.0/device": []byte("0x4941"),
542+
"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat/status": []byte("-1"),
543+
},
544+
symlinks: map[string]string{
545+
"sys/bus/pci/devices/0000:02:01.0/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
546+
"sys/bus/pci/devices/0000:02:01.0/physfn": "sys/devices/pci0000:02/0000:02:00.0",
547+
"sys/bus/pci/drivers/4xxx/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
548+
"sys/bus/pci/devices/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
549+
"sys/devices/pci0000:02/0000:02:00.0/virtfn0": "sys/bus/pci/devices/0000:02:01.0",
550+
"sys/devices/pci0000:02/0000:02:00.0/driver": "sys/bus/pci/drivers/4xxx",
551+
},
552+
maxDevNum: 1,
553+
expectedDevNum: 1,
554+
expectedUnhealthyNum: 1,
555+
},
556+
{
557+
name: "vfio-pci DPDKdriver with no kernel bound driver and where vfdevID is equal to qatDevId (4941) heartbeat status good",
558+
dpdkDriver: "vfio-pci",
559+
kernelVfDrivers: []string{"4xxxvf"},
560+
dirs: []string{
561+
"sys/bus/pci/drivers/4xxx",
562+
"sys/bus/pci/drivers/vfio-pci",
563+
"sys/devices/pci0000:02/0000:02:00.0",
564+
"sys/devices/pci0000:02/0000:02:00.0/qat",
565+
"sys/kernel/debug/qat_4xxx_0000:02:00.0",
566+
"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat",
567+
"sys/bus/pci/devices/0000:02:01.0",
568+
},
569+
files: map[string][]byte{
570+
"sys/devices/pci0000:02/0000:02:00.0/device": []byte("0x4940"),
571+
"sys/devices/pci0000:02/0000:02:00.0/qat/state": []byte("up"),
572+
"sys/devices/pci0000:02/0000:02:00.0/qat/cfg_services": []byte("sym;asym"),
573+
"sys/bus/pci/devices/0000:02:01.0/device": []byte("0x4941"),
574+
"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat/status": []byte("0"),
575+
},
576+
symlinks: map[string]string{
577+
"sys/bus/pci/devices/0000:02:01.0/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
578+
"sys/bus/pci/devices/0000:02:01.0/physfn": "sys/devices/pci0000:02/0000:02:00.0",
579+
"sys/bus/pci/drivers/4xxx/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
580+
"sys/bus/pci/devices/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
581+
"sys/devices/pci0000:02/0000:02:00.0/virtfn0": "sys/bus/pci/devices/0000:02:01.0",
582+
"sys/devices/pci0000:02/0000:02:00.0/driver": "sys/bus/pci/drivers/4xxx",
583+
},
584+
maxDevNum: 1,
585+
expectedDevNum: 1,
586+
expectedUnhealthyNum: 0,
587+
},
588+
{
589+
name: "vfio-pci DPDKdriver with no kernel bound driver and where vfDevID is equal to qatDevId (37c9) heartbeat status bad",
590+
dpdkDriver: "vfio-pci",
591+
kernelVfDrivers: []string{"c6xxvf"},
592+
dirs: []string{
593+
"sys/bus/pci/drivers/c6xx",
594+
"sys/bus/pci/drivers/vfio-pci",
595+
"sys/bus/pci/devices/0000:02:01.0",
596+
"sys/bus/pci/devices/0000:02:01.1",
597+
"sys/devices/pci0000:02/0000:02:00.0",
598+
"sys/kernel/debug/qat_c6xx_0000:02:00.0/heartbeat",
599+
},
600+
files: map[string][]byte{
601+
"sys/bus/pci/devices/0000:02:01.0/device": []byte("0x37c9"),
602+
"sys/bus/pci/devices/0000:02:01.1/device": []byte("0x37c9"),
603+
"sys/kernel/debug/qat_c6xx_0000:02:00.0/heartbeat/status": []byte("-1"),
604+
},
605+
symlinks: map[string]string{
606+
"sys/bus/pci/devices/0000:02:01.0/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
607+
"sys/bus/pci/devices/0000:02:01.0/physfn": "sys/devices/pci0000:02/0000:02:00.0",
608+
"sys/bus/pci/devices/0000:02:01.1/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
609+
"sys/bus/pci/devices/0000:02:01.1/physfn": "sys/devices/pci0000:02/0000:02:00.0",
610+
"sys/bus/pci/drivers/c6xx/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
611+
"sys/bus/pci/devices/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
612+
"sys/devices/pci0000:02/0000:02:00.0/virtfn0": "sys/bus/pci/devices/0000:02:01.0",
613+
"sys/devices/pci0000:02/0000:02:00.0/virtfn1": "sys/bus/pci/devices/0000:02:01.1",
614+
"sys/devices/pci0000:02/0000:02:00.0/driver": "sys/bus/pci/drivers/c6xx",
615+
},
616+
maxDevNum: 3,
617+
expectedDevNum: 2,
618+
expectedUnhealthyNum: 2,
619+
},
620+
}
621+
622+
// countUnhealthyDevices returns how many devices in tree have a "state"
// field whose formatted value equals pluginapi.Unhealthy. The field is
// looked up by name through reflection on each device entry.
countUnhealthyDevices := func(tree dpapi.DeviceTree) int {
	unhealthy := 0

	for _, devices := range tree {
		for _, info := range devices {
			state := reflect.ValueOf(info).FieldByName("state")
			if fmt.Sprintf("%+v", state) != pluginapi.Unhealthy {
				continue
			}

			unhealthy++
		}
	}

	return unhealthy
}
636+
523637
for _, tt := range tcases {
524638
t.Run(tt.name, func(t *testing.T) {
525639
tmpdir, err := os.MkdirTemp("/tmp/", "qatplugin-TestScanPrivate-*")
@@ -552,6 +666,7 @@ func TestScan(t *testing.T) {
552666
if !tt.expectedErr && err != nil {
553667
t.Errorf("got unexpected error: %+v", err)
554668
}
669+
555670
devNum := 0
556671
for _, resource := range fN.tree {
557672
devNum = devNum + len(resource)
@@ -560,6 +675,10 @@ func TestScan(t *testing.T) {
560675
t.Errorf("expected %d, but got %d devices", tt.expectedDevNum, devNum)
561676
}
562677

678+
if unhealtyNum := countUnhealthyDevices(fN.tree); unhealtyNum != tt.expectedUnhealthyNum {
679+
t.Errorf("expected %d, but got %d unhealthy devices", tt.expectedUnhealthyNum, unhealtyNum)
680+
}
681+
563682
if err = os.RemoveAll(tmpdir); err != nil {
564683
t.Fatal(err)
565684
}

0 commit comments

Comments
 (0)