From 3669d1f1f6c025e279054046adcc4dee9e6482d7 Mon Sep 17 00:00:00 2001 From: Carlos Goncalves Date: Tue, 22 Nov 2022 16:25:55 +0100 Subject: [PATCH] Retry setting VF MAC address Some NIC drivers (i.e. i40e/iavf) set their VF MAC addressing asynchronously when set administratively. This means that while the PF could already show the VF with the desired MAC address, the netdev VF may still have the original one. If in this window we issue a netdev VF MAC address set, the driver will return an error and the pod will fail to create. One way to fix this issue would be to not try to set the netdev VF MAC address, rather simply rely on the MAC address set administratively already in place. However, other NIC drivers (i.e. mlx5_core) do not propagate the MAC address down to the netdev VF so for those drivers we have to continue setting the VF MAC address the same way (via PF and netdev VF). This commit addresses the issue with a retry where it waits up to 1 second (5 retries * 200 millisecond sleep) in case driver is still working on propagating the MAC address change down to the VF. ResetVFConfig resets a VF administratively. We must run ResetVFConfig before ReleaseVF because some drivers will error out if we try to reset netdev VF with trust off. So, reset VF MAC address via PF first. Signed-off-by: Carlos Goncalves --- cmd/sriov/main.go | 12 ++++++++---- pkg/sriov/sriov.go | 31 +++++++++++++++++++++++++++++-- pkg/utils/utils.go | 14 ++++++++++++++ pkg/utils/utils_test.go | 13 +++++++++++++ 4 files changed, 64 insertions(+), 6 deletions(-) diff --git a/cmd/sriov/main.go b/cmd/sriov/main.go index 3ba469832..06dd05697 100644 --- a/cmd/sriov/main.go +++ b/cmd/sriov/main.go @@ -221,6 +221,14 @@ func cmdDel(args *skel.CmdArgs) error { sm := sriov.NewSriovManager() + /* ResetVFConfig resets a VF administratively. We must run ResetVFConfig + before ReleaseVF because some drivers will error out if we try to + reset netdev VF with trust off. So, reset VF MAC address via PF first. + */ + if err := sm.ResetVFConfig(netConf); err != nil { + return fmt.Errorf("cmdDel() error reseting VF: %q", err) + } + if !netConf.DPDKMode { netns, err := ns.GetNS(args.Netns) if err != nil { @@ -243,10 +251,6 @@ func cmdDel(args *skel.CmdArgs) error { } } - if err := sm.ResetVFConfig(netConf); err != nil { - return fmt.Errorf("cmdDel() error reseting VF: %q", err) - } - // Mark the pci address as released allocator := utils.NewPCIAllocator(config.DefaultCNIDir) if err = allocator.DeleteAllocatedPCI(netConf.DeviceID); err != nil { diff --git a/pkg/sriov/sriov.go b/pkg/sriov/sriov.go index 70160bdae..deaad0b98 100644 --- a/pkg/sriov/sriov.go +++ b/pkg/sriov/sriov.go @@ -3,6 +3,7 @@ package sriov import ( "fmt" "net" + "time" "github.com/containernetworking/plugins/pkg/ns" @@ -91,7 +92,20 @@ func (s *sriovManager) SetupVF(conf *sriovtypes.NetConf, podifName string, cid s // Save the original effective MAC address before overriding it conf.OrigVfState.EffectiveMAC = linkObj.Attrs().HardwareAddr.String() - if err = s.nLink.LinkSetHardwareAddr(linkObj, hwaddr); err != nil { + /* Some NIC drivers (i.e. i40e/iavf) set VF MAC address asynchronously + via PF. This means that while the PF could already show the VF with + the desired MAC address, the netdev VF may still have the original + one. If in this window we issue a netdev VF MAC address set, the driver + will return an error and the pod will fail to create. + Other NICs (Mellanox) require explicit netdev VF MAC address so we + cannot skip this part. + Retry up to 5 times; wait 200 milliseconds between retries + */ + err = utils.Retry(5, 200*time.Millisecond, func() error { + return s.nLink.LinkSetHardwareAddr(linkObj, hwaddr) + }) + + if err != nil { return "", fmt.Errorf("failed to set netlink MAC address to %s: %v", hwaddr, err) } macAddress = conf.MAC @@ -332,7 +346,20 @@ func (s *sriovManager) ResetVFConfig(conf *sriovtypes.NetConf) error { if err != nil { return fmt.Errorf("failed to parse original administrative MAC address %s: %v", conf.OrigVfState.AdminMAC, err) } - if err = s.nLink.LinkSetVfHardwareAddr(pfLink, conf.VFID, hwaddr); err != nil { + + /* Some NIC drivers (i.e. i40e/iavf) set VF MAC address asynchronously + via PF. This means that while the PF could already show the VF with + the desired MAC address, the netdev VF may still have the original + one. If in this window we issue a netdev VF MAC address set, the driver + will return an error and the pod will fail to create. + Other NICs (Mellanox) require explicit netdev VF MAC address so we + cannot skip this part. + Retry up to 5 times; wait 200 milliseconds between retries + */ + err = utils.Retry(5, 200*time.Millisecond, func() error { + return s.nLink.LinkSetVfHardwareAddr(pfLink, conf.VFID, hwaddr) + }) + if err != nil { return fmt.Errorf("failed to restore original administrative MAC address %s: %v", hwaddr, err) } } diff --git a/pkg/utils/utils.go b/pkg/utils/utils.go index 9d4ccae7b..24178da19 100644 --- a/pkg/utils/utils.go +++ b/pkg/utils/utils.go @@ -9,6 +9,7 @@ import ( "path/filepath" "strconv" "strings" + "time" ) var ( @@ -314,3 +315,16 @@ func IsIPv4(ip net.IP) bool { func IsIPv6(ip net.IP) bool { return ip.To4() == nil && ip.To16() != nil } + +// Retry retries a given function until no return error; times out after retries*sleep +func Retry(retries int, sleep time.Duration, f func() error) error { + err := error(nil) + for retry := 0; retry < retries; retry++ { + err = f() + if err == nil { + return nil + } + time.Sleep(sleep) + } + return err +} diff --git a/pkg/utils/utils_test.go b/pkg/utils/utils_test.go index 3896e573f..0bb159030 100644 --- a/pkg/utils/utils_test.go +++ b/pkg/utils/utils_test.go @@ -1,6 +1,9 @@ package utils import ( + "errors" + "time" + . "github.com/onsi/ginkgo" . "github.com/onsi/gomega" ) @@ -78,4 +81,14 @@ var _ = Describe("Utils", func() { Expect(err).To(HaveOccurred(), "Not existing VF should return an error") }) }) + Context("Checking Retry functon", func() { + It("Assuming calling function fails", func() { + err := Retry(5, 10*time.Millisecond, func() error { return errors.New("") }) + Expect(err).To((HaveOccurred()), "Retry should return an error") + }) + It("Assuming calling function does not fail", func() { + err := Retry(5, 10*time.Millisecond, func() error { return nil }) + Expect(err).NotTo((HaveOccurred()), "Retry should not return an error") + }) + }) })