diff --git a/pkg/nettools/nettools.go b/pkg/nettools/nettools.go index cad057c73..3f2882bc0 100644 --- a/pkg/nettools/nettools.go +++ b/pkg/nettools/nettools.go @@ -705,6 +705,10 @@ func RecoverContainerSideNetwork(csn *network.ContainerSideNetwork, nsPath strin bindDeviceToVFIO(devIdentifier) } else { ifaceType = network.InterfaceTypeTap + // Best effort: an OpenTAP error is deliberately ignored, since the device may be busy (in use by a running VM); desc.Fo is only set when the open succeeds + if fo, err := OpenTAP(link.Attrs().Name); err == nil { + desc.Fo = fo + } } if desc.Type != ifaceType { return fmt.Errorf("bad interface type for %q", desc.Name) diff --git a/pkg/tapmanager/fdserver.go b/pkg/tapmanager/fdserver.go index 90fc68cb6..d1e8d568b 100644 --- a/pkg/tapmanager/fdserver.go +++ b/pkg/tapmanager/fdserver.go @@ -106,6 +106,8 @@ type FDSource interface { // specified key. It's intended to be called after // Virtlet restart. Recover(key string, data []byte) error + // RetrieveFDs obtains the FDs for the specified key again; FDServer calls it when the FDs stored for that key are nil. + RetrieveFDs(key string) ([]int, error) // Stop stops any goroutines associated with FDSource // but doesn't release the namespaces Stop() error @@ -162,6 +164,19 @@ func (s *FDServer) getFDs(key string) ([]int, error) { if !found { return nil, fmt.Errorf("bad fd key: %q", key) } + + var err error + if fds == nil { + // The key is known but no FDs are recorded for it. According to the + // original author this happens when Virtlet was restarted and + // recoverNetworkNamespaces could not get the tap fd, and the VM is then + // restarted for some reason. Ask the source for the FDs and cache them. + fds, err = s.source.RetrieveFDs(key) + if err != nil { + return nil, err + } + s.fds[key] = fds + } return fds, nil } diff --git a/pkg/tapmanager/fdserver_test.go b/pkg/tapmanager/fdserver_test.go index 1ad64d78b..2bc1aeb82 100644 --- a/pkg/tapmanager/fdserver_test.go +++ b/pkg/tapmanager/fdserver_test.go @@ -79,6 +79,13 @@ func (s *sampleFDSource) GetFDs(key string, data []byte) ([]int, []byte, error) return []int{int(f.Fd())}, []byte("abcdef"), nil } +func (s *sampleFDSource) RetrieveFDs(key string) ([]int, error) { + if s.stopped { + 
return nil, errors.New("sampleFDSource is stopped") + } + return nil, nil +} + func (s *sampleFDSource) Recover(key string, data []byte) error { if s.stopped { return errors.New("sampleFDSource is stopped") diff --git a/pkg/tapmanager/tapfdsource.go b/pkg/tapmanager/tapfdsource.go index 912a20061..d84f21d42 100644 --- a/pkg/tapmanager/tapfdsource.go +++ b/pkg/tapmanager/tapfdsource.go @@ -341,6 +341,47 @@ func (s *TapFDSource) Recover(key string, data []byte) error { }) } +// RetrieveFDs re-runs RecoverContainerSideNetwork for the pod network stored under key and returns the file descriptor of each of its interfaces. +// It is meant for the case where the VM exited but Recover didn't populate the FDs; it fails if the key is unknown or any interface has no open file. +func (s *TapFDSource) RetrieveFDs(key string) ([]int, error) { + var podNet *podNetwork + var fds []int + func() { + s.Lock() + defer s.Unlock() + podNet = s.fdMap[key] + }() + if podNet == nil { + return nil, fmt.Errorf("bad key %q to retrieve FDs", key) + } + + netNSPath := cni.PodNetNSPath(podNet.pnd.PodID) + vmNS, err := ns.GetNS(netNSPath) + if err != nil { + return nil, fmt.Errorf("failed to open network namespace at %q: %v", netNSPath, err) + } + + if err := utils.CallInNetNSWithSysfsRemounted(vmNS, func(hostNS ns.NetNS) error { + allLinks, err := netlink.LinkList() + if err != nil { + return fmt.Errorf("error listing the links: %v", err) + } + + return nettools.RecoverContainerSideNetwork(podNet.csn, netNSPath, allLinks, hostNS) + }); err != nil { + return nil, err + } + + for _, ifDesc := range podNet.csn.Interfaces { + // A nil Fo means no file was opened for this interface; fail rather than return a partial FD list + if ifDesc.Fo == nil { + return nil, fmt.Errorf("failed to open tap interface %q", ifDesc.Name) + } + fds = append(fds, int(ifDesc.Fo.Fd())) + } + return fds, nil +} + func (s *TapFDSource) setupNetNS(key string, pnd *PodNetworkDesc, initNet func(netNSPath string, allLinks []netlink.Link, hostNS ns.NetNS) (*network.ContainerSideNetwork, error)) error { netNSPath := cni.PodNetNSPath(pnd.PodID) vmNS, err := ns.GetNS(netNSPath)