diff --git a/docs/cmd-coil-egress.md b/docs/cmd-coil-egress.md index 1a1808ae..4bdb539c 100644 --- a/docs/cmd-coil-egress.md +++ b/docs/cmd-coil-egress.md @@ -20,6 +20,7 @@ It watches client Pods and creates or deletes Foo-over-UDP tunnels. ``` Flags: --fou-port int port number for foo-over-udp tunnels (default 5555) + --enable-sport-auto enable automatic source port assignment (default false) --health-addr string bind address of health/readiness probes (default ":8081") -h, --help help for coil-egress --metrics-addr string bind address of metrics endpoint (default ":8080") diff --git a/docs/design.md b/docs/design.md index 01c28865..8019e778 100644 --- a/docs/design.md +++ b/docs/design.md @@ -203,7 +203,7 @@ This can be configured by 1) creating IPIP tunnel device with FoU encapsulation ```console $ sudo ip link add name tun1 type ipip ttl 225 \ remote 1.2.3.4 local 5.6.7.8 \ - encap fou encap-sport 5555 encap-dport 5555 + encap fou encap-sport auto encap-dport 5555 $ sudo ip fou add port 5555 ipproto 4 # 4 means IPIP protocol ``` @@ -225,13 +225,27 @@ The transmission between client pods and the SNAT router needs to be bidirection If the SNAT routers are behind Kubernetes Service, the IPIP tunnel on the client pod is configured to send packets to the Service's ClusterIP. Therefore, the FoU encapsulated packet will have the ClusterIP as the destination address. -Remember we need bidirectional tunneling. If the returning packet has the SNAT router's IP address as the source address, the packet does not match the IPIP tunnel configured for the Service's ClusterIP. So, the returning packet *must* have the ClusterIP as the source address. +Remember we need bidirectional tunneling. If the returning packet has the SNAT router's IP address as the source address, the packet does not match the IPIP tunnel configured for the Service's ClusterIP. +We setup a flow based IPIP tunnel device to receive such the returning packet as well as the IPIP tunnel device with FoU encapsulation option. Otherwise, clients will return ICMP destination unreachable packets. +This flow based IPIP tunnel devices work as catch-all fallback interfaces for the IPIP decapsulation stack. -To resolve this, we need to understand how `kube-proxy` works for ClusterIP. `kube-proxy` rewrites outgoing packets' destination addresses if they are ClusterIP. So, it works as a destination NAT (DNAT) service. +For example, a NAT client(`10.64.0.65:49944`) sends an encapsulated packet from CLusterIP `10.68.114.217:5555`, and a return packet comes from a router Pod(`10.72.49.1.59203`) to the client. +The outgoing packet will be encapsulated by the IPIP tunnel device with FoU encapsulation option, and the incoming packet will be received and decapsulated by the flow based IPIP tunnel device. -Moreover, it rewrites the incoming packet's source addresses if the packet seems like a response returned from one of the destination servers of Service. To be more precise, the incoming packet will be handled by `kube-proxy` if and only if its destination address/port was the source address/port of the outgoing packet and its source address/port was the destination address/port. +``` +10.64.0.65.49944 > 10.68.114.217.5555: UDP, length 60 +10.72.49.1.59203 > 10.64.0.65.5555: UDP, length 60 +``` + +Before coil v2.4.0, we configured a fixed source port 5555 for FoU encapsulation devices so that `kube-proxy` or `Cilium kube-proxy replacement` can do the reverse SNAT handling. +The transmit and receive sides have been separated and the communication can be asymmetric as the example above shows. We were relying on the fixed source port to handle the reverse SNAT. -To satisfy this condition, we use the port number 5555 for FoU on both client pods and SNAT router pods. +This fixed source port approach causes the following problems: + +- Traffic from NAT clients to router Pods can't be distributed when users use Coil with a proxier that selects a backend based on the flow hash such as `Cilium` +- When a router Pod is terminating, traffic from NAT clients to the route Pod cant' be switched until the Pod is finally removed. This problem happens with the Graceful termination of `Cilium kube-proxy replacement`. + +We encourage users to use `fouSourcePortAuto: true` setting to avoid these problems. ### Session persistence @@ -240,6 +254,10 @@ This can be achieved by setting Service's [`spec.sessionAffinity`](https://kuber Therefore, Coil creates a Service with `spec.sessionAffinity=ClientIP` for each NAT gateway. +It's also notable that the session persistence is not required if you use this feature in conjunction with `Cilium kube-proxy replacement`. +`Cilium` selects a backend for the service based on the flow hash, and the kernel picks source ports based on the flow hash of the encapsulated packet. +It means that the traffic belonging to the same TCP connection from a NAT client to a router service is always sent to the same Pod. + ### Auto-scaling with HPA To enable auto-scaling with horizontal pod autoscaler (HPA), `Egress` implements `scale` subresource. diff --git a/v2/api/v2/egress_types.go b/v2/api/v2/egress_types.go index b638d26d..a4ea0ef3 100644 --- a/v2/api/v2/egress_types.go +++ b/v2/api/v2/egress_types.go @@ -53,6 +53,13 @@ type EgressSpec struct { // Ref. https://pkg.go.dev/k8s.io/api/core/v1?tab=doc#ServiceSpec // +optional SessionAffinityConfig *corev1.SessionAffinityConfig `json:"sessionAffinityConfig,omitempty"` + + // FouSourcePortAuto indicates that the source port number in foo-over-udp encapsulation + // should be chosen automatically. + // If set to true, the kernel picks a flow based on the flow hash of the encapsulated packet. + // The default is false. + // +optional + FouSourcePortAuto bool `json:"fouSourcePortAuto,omitempty"` } // EgressPodTemplate defines pod template for Egress diff --git a/v2/cmd/coil-egress/sub/root.go b/v2/cmd/coil-egress/sub/root.go index 3069d641..40520768 100644 --- a/v2/cmd/coil-egress/sub/root.go +++ b/v2/cmd/coil-egress/sub/root.go @@ -12,10 +12,11 @@ import ( ) var config struct { - metricsAddr string - healthAddr string - port int - zapOpts zap.Options + metricsAddr string + healthAddr string + port int + enableSportAuto bool + zapOpts zap.Options } var rootCmd = &cobra.Command{ @@ -43,6 +44,7 @@ func init() { pf.StringVar(&config.metricsAddr, "metrics-addr", ":8080", "bind address of metrics endpoint") pf.StringVar(&config.healthAddr, "health-addr", ":8081", "bind address of health/readiness probes") pf.IntVar(&config.port, "fou-port", 5555, "port number for foo-over-udp tunnels") + pf.BoolVar(&config.enableSportAuto, "enable-sport-auto", false, "enable automatic source port assignment") goflags := flag.NewFlagSet("klog", flag.ExitOnError) klog.InitFlags(goflags) diff --git a/v2/cmd/coil-egress/sub/run.go b/v2/cmd/coil-egress/sub/run.go index 141cc6c9..4c74bc41 100644 --- a/v2/cmd/coil-egress/sub/run.go +++ b/v2/cmd/coil-egress/sub/run.go @@ -94,7 +94,7 @@ func subMain() error { return err } - if err := controllers.SetupPodWatcher(mgr, myNS, myName, ft, eg); err != nil { + if err := controllers.SetupPodWatcher(mgr, myNS, myName, ft, config.enableSportAuto, eg); err != nil { return err } diff --git a/v2/config/crd/bases/coil.cybozu.com_egresses.yaml b/v2/config/crd/bases/coil.cybozu.com_egresses.yaml index c518731c..7b40c6d2 100644 --- a/v2/config/crd/bases/coil.cybozu.com_egresses.yaml +++ b/v2/config/crd/bases/coil.cybozu.com_egresses.yaml @@ -40,6 +40,12 @@ spec: type: string minItems: 1 type: array + fouSourcePortAuto: + description: FouSourcePortAuto indicates that the source port number + in foo-over-udp encapsulation should be chosen automatically. If + set to true, the kernel picks a flow based on the flow hash of the + encapsulated packet. The default is false. + type: boolean replicas: default: 1 description: Replicas is the desired number of egress (SNAT) pods. diff --git a/v2/controllers/egress_controller.go b/v2/controllers/egress_controller.go index 5d4e510c..380d42ac 100644 --- a/v2/controllers/egress_controller.go +++ b/v2/controllers/egress_controller.go @@ -157,6 +157,9 @@ func (r *EgressReconciler) reconcilePodTemplate(eg *coilv2.Egress, depl *appsv1. } if len(egressContainer.Args) == 0 { egressContainer.Args = []string{"--zap-stacktrace-level=panic"} + if eg.Spec.FouSourcePortAuto { + egressContainer.Args = append(egressContainer.Args, "--enable-sport-auto=true") + } } egressContainer.Env = append(egressContainer.Env, corev1.EnvVar{ diff --git a/v2/controllers/mock_test.go b/v2/controllers/mock_test.go index 178f3dc1..0cbe3e3f 100644 --- a/v2/controllers/mock_test.go +++ b/v2/controllers/mock_test.go @@ -149,11 +149,11 @@ func (t *mockFoUTunnel) Init() error { panic("not implemented") } -func (t *mockFoUTunnel) AddPeer(ip net.IP) (netlink.Link, error) { +func (t *mockFoUTunnel) AddPeer(ip net.IP, sportAuto bool) (netlink.Link, error) { t.mu.Lock() defer t.mu.Unlock() - t.peers[ip.String()] = true + t.peers[ip.String()] = sportAuto return nil, nil } @@ -172,7 +172,7 @@ func (t *mockFoUTunnel) GetPeers() map[string]bool { defer t.mu.Unlock() for k := range t.peers { - m[k] = true + m[k] = t.peers[k] } return m } diff --git a/v2/controllers/pod_watcher.go b/v2/controllers/pod_watcher.go index de582659..e4c86cbc 100644 --- a/v2/controllers/pod_watcher.go +++ b/v2/controllers/pod_watcher.go @@ -40,18 +40,19 @@ func init() { // +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch // SetupPodWatcher registers pod watching reconciler to mgr. -func SetupPodWatcher(mgr ctrl.Manager, ns, name string, ft founat.FoUTunnel, eg founat.Egress) error { +func SetupPodWatcher(mgr ctrl.Manager, ns, name string, ft founat.FoUTunnel, encapSportAuto bool, eg founat.Egress) error { clientPods.Reset() r := &podWatcher{ - client: mgr.GetClient(), - myNS: ns, - myName: name, - ft: ft, - eg: eg, - metric: clientPods.WithLabelValues(ns, name), - podAddrs: make(map[string][]net.IP), - peers: make(map[string]map[string]struct{}), + client: mgr.GetClient(), + myNS: ns, + myName: name, + ft: ft, + encapSportAuto: encapSportAuto, + eg: eg, + metric: clientPods.WithLabelValues(ns, name), + podAddrs: make(map[string][]net.IP), + peers: make(map[string]map[string]struct{}), } return ctrl.NewControllerManagedBy(mgr). @@ -65,12 +66,13 @@ func SetupPodWatcher(mgr ctrl.Manager, ns, name string, ft founat.FoUTunnel, eg // this implementation can leave some tunnels as garbage. Such garbage tunnels // do no harm, though. type podWatcher struct { - client client.Client - myNS string - myName string - ft founat.FoUTunnel - eg founat.Egress - metric prometheus.Gauge + client client.Client + myNS string + myName string + ft founat.FoUTunnel + encapSportAuto bool + eg founat.Egress + metric prometheus.Gauge mu sync.Mutex podAddrs map[string][]net.IP @@ -166,7 +168,7 @@ OUTER: } } - link, err := r.ft.AddPeer(ip) + link, err := r.ft.AddPeer(ip, r.encapSportAuto) if errors.Is(err, founat.ErrIPFamilyMismatch) { logger.Info("skipping unsupported pod IP", "pod", pod.Namespace+"/"+pod.Name, "ip", ip.String()) continue diff --git a/v2/controllers/pod_watcher_test.go b/v2/controllers/pod_watcher_test.go index defeec6f..3f011fab 100644 --- a/v2/controllers/pod_watcher_test.go +++ b/v2/controllers/pod_watcher_test.go @@ -103,7 +103,7 @@ var _ = Describe("Pod watcher", func() { }) Expect(err).ToNot(HaveOccurred()) - err = SetupPodWatcher(mgr, "internet", "egress2", ft, eg) + err = SetupPodWatcher(mgr, "internet", "egress2", ft, true, eg) Expect(err).ToNot(HaveOccurred()) go func() { diff --git a/v2/e2e/coil_test.go b/v2/e2e/coil_test.go index bdf0d57a..5d9fd185 100644 --- a/v2/e2e/coil_test.go +++ b/v2/e2e/coil_test.go @@ -258,6 +258,19 @@ var _ = Describe("Coil", func() { } return int(depl.Status.ReadyReplicas) }).Should(Equal(2)) + + By("defining Egress with fouSourcePortAuto in the internet namespace") + kubectlSafe(nil, "apply", "-f", "manifests/egress-sport-auto.yaml") + + By("checking pod deployments for fouSourcePortAuto") + Eventually(func() int { + depl := &appsv1.Deployment{} + err := getResource("internet", "deployments", "egress-sport-auto", "", depl) + if err != nil { + return 0 + } + return int(depl.Status.ReadyReplicas) + }).Should(Equal(2)) }) It("should be able to run NAT client pods", func() { @@ -280,6 +293,26 @@ var _ = Describe("Coil", func() { } return nil }).Should(Succeed()) + + By("creating a NAT client pod for fouSourcePortAuto") + kubectlSafe(nil, "apply", "-f", "manifests/nat-client-sport-auto.yaml") + + By("checking the pod status for fouSourcePortAuto") + Eventually(func() error { + pod := &corev1.Pod{} + err := getResource("default", "pods", "nat-client-sport-auto", "", pod) + if err != nil { + return err + } + if len(pod.Status.ContainerStatuses) == 0 { + return errors.New("no container status") + } + cst := pod.Status.ContainerStatuses[0] + if !cst.Ready { + return errors.New("container is not ready") + } + return nil + }).Should(Succeed()) }) It("should allow NAT traffic over foo-over-udp tunnel", func() { @@ -319,5 +352,17 @@ var _ = Describe("Coil", func() { resp := kubectlSafe(data, "exec", "-i", "nat-client", "--", "curl", "-sf", "-T", "-", fakeURL) Expect(resp).To(HaveLen(1 << 20)) } + + By("sending and receiving HTTP request from nat-client-sport-auto") + data = make([]byte, 1<<20) // 1 MiB + resp = kubectlSafe(data, "exec", "-i", "nat-client-sport-auto", "--", "curl", "-sf", "-T", "-", fakeURL) + Expect(resp).To(HaveLen(1 << 20)) + + By("running the same test 100 times with nat-client-sport-auto") + for i := 0; i < 100; i++ { + time.Sleep(1 * time.Millisecond) + resp := kubectlSafe(data, "exec", "-i", "nat-client-sport-auto", "--", "curl", "-sf", "-T", "-", fakeURL) + Expect(resp).To(HaveLen(1 << 20)) + } }) }) diff --git a/v2/e2e/manifests/egress-sport-auto.yaml b/v2/e2e/manifests/egress-sport-auto.yaml new file mode 100644 index 00000000..5ef44082 --- /dev/null +++ b/v2/e2e/manifests/egress-sport-auto.yaml @@ -0,0 +1,20 @@ +apiVersion: coil.cybozu.com/v2 +kind: Egress +metadata: + name: egress-sport-auto + namespace: internet +spec: + replicas: 2 + destinations: + - 0.0.0.0/0 + - ::/0 + fouSourcePortAuto: true + template: + spec: + nodeSelector: + kubernetes.io/hostname: coil-control-plane + tolerations: + - effect: NoSchedule + operator: Exists + containers: + - name: egress diff --git a/v2/e2e/manifests/nat-client-sport-auto.yaml b/v2/e2e/manifests/nat-client-sport-auto.yaml new file mode 100644 index 00000000..7ba60421 --- /dev/null +++ b/v2/e2e/manifests/nat-client-sport-auto.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Pod +metadata: + name: nat-client-sport-auto + namespace: default + annotations: + egress.coil.cybozu.com/internet: egress-sport-auto +spec: + tolerations: + - key: test + operator: Exists + nodeSelector: + test: coil + containers: + - name: ubuntu + image: quay.io/cybozu/ubuntu:22.04 + command: ["pause"] diff --git a/v2/go.mod b/v2/go.mod index 4027534a..d12a4e4a 100644 --- a/v2/go.mod +++ b/v2/go.mod @@ -20,9 +20,9 @@ require ( github.com/prometheus/common v0.42.0 github.com/spf13/cobra v1.7.0 github.com/spf13/viper v1.15.0 - github.com/vishvananda/netlink v1.2.1-beta.2 + github.com/vishvananda/netlink v1.2.1-beta.2.0.20230807190133-6afddb37c1f0 go.uber.org/zap v1.24.0 - golang.org/x/sys v0.7.0 + golang.org/x/sys v0.10.0 google.golang.org/grpc v1.54.0 google.golang.org/protobuf v1.30.0 k8s.io/api v0.26.4 diff --git a/v2/go.sum b/v2/go.sum index 4c366dde..50a7ccca 100644 --- a/v2/go.sum +++ b/v2/go.sum @@ -308,8 +308,8 @@ github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKs github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/subosito/gotenv v1.4.2 h1:X1TuBLAMDFbaTAChgCBLu3DU3UPyELpnF2jjJ2cz/S8= github.com/subosito/gotenv v1.4.2/go.mod h1:ayKnFf/c6rvx/2iiLrJUk1e6plDbT3edrFNGqEflhK0= -github.com/vishvananda/netlink v1.2.1-beta.2 h1:Llsql0lnQEbHj0I1OuKyp8otXp0r3q0mPkuhwHfStVs= -github.com/vishvananda/netlink v1.2.1-beta.2/go.mod h1:twkDnbuQxJYemMlGd4JFIcuhgX83tXhKS2B/PRMpOho= +github.com/vishvananda/netlink v1.2.1-beta.2.0.20230807190133-6afddb37c1f0 h1:CLsXiDYQjYqJVntHkQZL2AW0R8BrvJu1K/hbs+2Q+EQ= +github.com/vishvananda/netlink v1.2.1-beta.2.0.20230807190133-6afddb37c1f0/go.mod h1:whJevzBpTrid75eZy99s3DqCmy05NfibNaF2Ol5Ox5A= github.com/vishvananda/netns v0.0.0-20200728191858-db3c7e526aae/go.mod h1:DD4vA1DwXk04H54A1oHXtwZmA0grkVMdPxx/VGLCah0= github.com/vishvananda/netns v0.0.3 h1:WxY6MpgIdDMQX50UJ7bPIRJdBCOeUV6XtW8dZZja988= github.com/vishvananda/netns v0.0.3/go.mod h1:SpkAiCQRtJ6TvvxPnOSyH3BMl6unz3xZlaprSwhNNJM= @@ -468,7 +468,6 @@ golang.org/x/sys v0.0.0-20200501052902-10377860bb8e/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20200511232937-7e40ca221e25/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200515095857-1151b9dac4a9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200523222454-059865788121/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200728102440-3e129f6d46b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200803210538-64077c9b5642/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200905004654-be1d3432aa8f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -486,8 +485,8 @@ golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.7.0 h1:3jlCCIQZPdOYu1h8BkNvLz8Kgwtae2cagcG/VamtZRU= -golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.10.0 h1:SqMFp9UcQJZa+pmYuAKjd9xq1f0j5rLcDIk0mj4qAsA= +golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.6.0 h1:clScbb1cHjoCkyRbWwBEUZ5H/tIFu5TAXIqaZD0Gcjw= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= diff --git a/v2/pkg/founat/fou.go b/v2/pkg/founat/fou.go index 4e555722..2ab3d246 100644 --- a/v2/pkg/founat/fou.go +++ b/v2/pkg/founat/fou.go @@ -2,6 +2,7 @@ package founat import ( "crypto/sha1" + "errors" "fmt" "net" "os/exec" @@ -48,7 +49,7 @@ type FoUTunnel interface { // AddPeer setups tunnel devices to the given peer and returns them. // If FoUTunnel does not setup for the IP family of the given address, // this returns ErrIPFamilyMismatch error. - AddPeer(net.IP) (netlink.Link, error) + AddPeer(net.IP, bool) (netlink.Link, error) // DelPeer deletes tunnel for the peer, if any. DelPeer(net.IP) error @@ -166,17 +167,17 @@ func (t *fouTunnel) Init() error { return netlink.LinkAdd(&netlink.Dummy{LinkAttrs: attrs}) } -func (t *fouTunnel) AddPeer(addr net.IP) (netlink.Link, error) { +func (t *fouTunnel) AddPeer(addr net.IP, sportAuto bool) (netlink.Link, error) { t.mu.Lock() defer t.mu.Unlock() if v4 := addr.To4(); v4 != nil { - return t.addPeer4(v4) + return t.addPeer4(v4, sportAuto) } - return t.addPeer6(addr) + return t.addPeer6(addr, sportAuto) } -func (t *fouTunnel) addPeer4(addr net.IP) (netlink.Link, error) { +func (t *fouTunnel) addPeer4(addr net.IP, sportAuto bool) (netlink.Link, error) { if t.local4 == nil { return nil, ErrIPFamilyMismatch } @@ -194,12 +195,16 @@ func (t *fouTunnel) addPeer4(addr net.IP) (netlink.Link, error) { attrs := netlink.NewLinkAttrs() attrs.Name = linkName attrs.Flags = net.FlagUp + encapSport := uint16(t.port) + if sportAuto { + encapSport = 0 + } link = &netlink.Iptun{ LinkAttrs: attrs, Ttl: 225, EncapType: netlink.FOU_ENCAP_DIRECT, EncapDport: uint16(t.port), - EncapSport: uint16(t.port), + EncapSport: encapSport, Remote: addr, Local: t.local4, } @@ -207,10 +212,14 @@ func (t *fouTunnel) addPeer4(addr net.IP) (netlink.Link, error) { return nil, fmt.Errorf("netlink: failed to add fou link: %w", err) } + if err := setupFlowBasedIP4TunDevice(); err != nil { + return nil, fmt.Errorf("netlink: failed to setup ipip device: %w", err) + } + return netlink.LinkByName(linkName) } -func (t *fouTunnel) addPeer6(addr net.IP) (netlink.Link, error) { +func (t *fouTunnel) addPeer6(addr net.IP, sportAuto bool) (netlink.Link, error) { if t.local6 == nil { return nil, ErrIPFamilyMismatch } @@ -228,12 +237,16 @@ func (t *fouTunnel) addPeer6(addr net.IP) (netlink.Link, error) { attrs := netlink.NewLinkAttrs() attrs.Name = linkName attrs.Flags = net.FlagUp + encapSport := uint16(t.port) + if sportAuto { + encapSport = 0 + } link = &netlink.Ip6tnl{ LinkAttrs: attrs, Ttl: 225, EncapType: netlink.FOU_ENCAP_DIRECT, EncapDport: uint16(t.port), - EncapSport: uint16(t.port), + EncapSport: encapSport, Remote: addr, Local: t.local6, } @@ -241,9 +254,126 @@ func (t *fouTunnel) addPeer6(addr net.IP) (netlink.Link, error) { return nil, fmt.Errorf("netlink: failed to add fou6 link: %w", err) } + if err := setupFlowBasedIP6TunDevice(); err != nil { + return nil, fmt.Errorf("netlink: failed to setup ipip device: %w", err) + } + return netlink.LinkByName(linkName) } +// setupFlowBasedIP[4,6]TunDevice creates an IPv4 or IPv6 tunnel device +// +// This flow based IPIP tunnel device is used to decapsulate packets from +// the router Pods. +// +// Calling this function may result in tunl0 (v4) or ip6tnl0 (v6) +// fallback interface being renamed to coil_tunl or coil_ip6tnl. +// This is to communicate to the user that this plugin has taken +// control of the encapsulation stack on the netns, as it currently +// doesn't explicitly support sharing it with other tools/CNIs. +// Fallback devices are left unused for production traffic. +// Only devices that were explicitly created are used. +// +// This fallback interface is present as a result of loading the +// ipip and ip6_tunnel kernel modules by fou tunnel interfaces. +// These are catch-all interfaces for the ipip decapsulation stack. +// By default, these interfaces will be created in new network namespaces, +// but this behavior can be disabled by setting net.core.fb_tunnels_only_for_init_net = 2. +func setupFlowBasedIP4TunDevice() error { + ipip4Device := "coil_ipip4" + // Set up IPv4 tunnel device if requested. + if err := setupDevice(&netlink.Iptun{ + LinkAttrs: netlink.LinkAttrs{Name: ipip4Device}, + FlowBased: true, + }); err != nil { + return fmt.Errorf("creating %s: %w", ipip4Device, err) + } + + // Rename fallback device created by potential kernel module load after + // creating tunnel interface. + if err := renameDevice("tunl0", "coil_tunl"); err != nil { + return fmt.Errorf("renaming fallback device %s: %w", "tunl0", err) + } + + return nil +} + +// See setupFlowBasedIP4TunDevice +func setupFlowBasedIP6TunDevice() error { + ipip6Device := "coil_ipip6" + + // Set up IPv6 tunnel device if requested. + if err := setupDevice(&netlink.Ip6tnl{ + LinkAttrs: netlink.LinkAttrs{Name: ipip6Device}, + FlowBased: true, + }); err != nil { + return fmt.Errorf("creating %s: %w", ipip6Device, err) + } + + // Rename fallback device created by potential kernel module load after + // creating tunnel interface. + if err := renameDevice("ip6tnl0", "coil_ip6tnl"); err != nil { + return fmt.Errorf("renaming fallback device %s: %w", "tunl0", err) + } + + return nil +} + +// setupDevice creates and configures a device based on the given netlink attrs. +func setupDevice(link netlink.Link) error { + name := link.Attrs().Name + + // Reuse existing tunnel interface created by previous runs. + l, err := netlink.LinkByName(name) + if err != nil { + var linkNotFoundError netlink.LinkNotFoundError + if !errors.As(err, &linkNotFoundError) { + return err + } + + if err := netlink.LinkAdd(link); err != nil { + return fmt.Errorf("netlink: failed to create device %s: %w", name, err) + } + + // Fetch the link we've just created. + l, err = netlink.LinkByName(name) + if err != nil { + return fmt.Errorf("netlink: failed to retrieve created device %s: %w", name, err) + } + } + + if err := configureDevice(l); err != nil { + return fmt.Errorf("failed to set up device %s: %w", l.Attrs().Name, err) + } + + return nil +} + +// configureDevice puts the given link into the up state +func configureDevice(link netlink.Link) error { + ifName := link.Attrs().Name + + if err := netlink.LinkSetUp(link); err != nil { + return fmt.Errorf("netlink: failed to set link %s up: %w", ifName, err) + } + return nil +} + +// renameDevice renames a network device from and to a given value. Returns nil +// if the device does not exist. +func renameDevice(from, to string) error { + link, err := netlink.LinkByName(from) + if err != nil { + return nil + } + + if err := netlink.LinkSetName(link, to); err != nil { + return fmt.Errorf("netlink: failed to rename device %s to %s: %w", from, to, err) + } + + return nil +} + func (t *fouTunnel) DelPeer(addr net.IP) error { linkName := fouName(addr) diff --git a/v2/pkg/founat/fou_test.go b/v2/pkg/founat/fou_test.go index 73e2894c..b98151f0 100644 --- a/v2/pkg/founat/fou_test.go +++ b/v2/pkg/founat/fou_test.go @@ -1,6 +1,7 @@ package founat import ( + "errors" "fmt" "net" "strings" @@ -75,7 +76,7 @@ func testFoUDual(t *testing.T) { } } - if link, err := fou.AddPeer(net.ParseIP("10.1.1.1")); err != nil { + if link, err := fou.AddPeer(net.ParseIP("10.1.1.1"), true); err != nil { return fmt.Errorf("failed to call AddPeer with 10.1.1.1: %w", err) } else { iptun, ok := link.(*netlink.Iptun) @@ -91,12 +92,24 @@ func testFoUDual(t *testing.T) { if iptun.EncapDport != 5555 { return fmt.Errorf("iptun.EncapDport is not 5555: %d", iptun.EncapDport) } - if iptun.EncapSport != 5555 { - return fmt.Errorf("iptun.EncapSport is not 5555: %d", iptun.EncapSport) + if iptun.EncapSport != 0 { + return fmt.Errorf("iptun.EncapSport is not 0: %d", iptun.EncapSport) + } + + ipip4, err := netlink.LinkByName("coil_ipip4") + if err != nil { + return fmt.Errorf("failed to get coil_ipip4: %w", err) + } + iptun, ok = ipip4.(*netlink.Iptun) + if !ok { + return fmt.Errorf("link is not Iptun: %T", link) + } + if !iptun.FlowBased { + return errors.New("coil_ipip4 is not flow based") } } - if link, err := fou.AddPeer(net.ParseIP("fd02::101")); err != nil { + if link, err := fou.AddPeer(net.ParseIP("fd02::101"), true); err != nil { return fmt.Errorf("failed to call AddPeer with fd02::101: %w", err) } else { ip6tnl, ok := link.(*netlink.Ip6tnl) @@ -112,8 +125,20 @@ func testFoUDual(t *testing.T) { if ip6tnl.EncapDport != 5555 { return fmt.Errorf("ip6tnl.EncapDport is not 5555: %d", ip6tnl.EncapDport) } - if ip6tnl.EncapSport != 5555 { - return fmt.Errorf("ip6tnl.EncapSport is not 5555: %d", ip6tnl.EncapSport) + if ip6tnl.EncapSport != 0 { + return fmt.Errorf("ip6tnl.EncapSport is not 0: %d", ip6tnl.EncapSport) + } + + ipip6, err := netlink.LinkByName("coil_ipip6") + if err != nil { + return fmt.Errorf("failed to get coil_ipip6: %w", err) + } + ip6tnl, ok = ipip6.(*netlink.Ip6tnl) + if !ok { + return fmt.Errorf("link is not Iptun: %T", link) + } + if !ip6tnl.FlowBased { + return errors.New("coil_ipip6 is not flow based") } } @@ -186,7 +211,7 @@ func testFoUV4(t *testing.T) { } } - if link, err := fou.AddPeer(net.ParseIP("10.1.1.1")); err != nil { + if link, err := fou.AddPeer(net.ParseIP("10.1.1.1"), true); err != nil { return fmt.Errorf("failed to call AddPeer with 10.1.1.1: %w", err) } else { iptun, ok := link.(*netlink.Iptun) @@ -202,12 +227,24 @@ func testFoUV4(t *testing.T) { if iptun.EncapDport != 5555 { return fmt.Errorf("iptun.EncapDport is not 5555: %d", iptun.EncapDport) } - if iptun.EncapSport != 5555 { - return fmt.Errorf("iptun.EncapSport is not 5555: %d", iptun.EncapSport) + if iptun.EncapSport != 0 { + return fmt.Errorf("iptun.EncapSport is not 0: %d", iptun.EncapSport) + } + + ipip4, err := netlink.LinkByName("coil_ipip4") + if err != nil { + return fmt.Errorf("failed to get coil_ipip4: %w", err) + } + iptun, ok = ipip4.(*netlink.Iptun) + if !ok { + return fmt.Errorf("link is not Iptun: %T", link) + } + if !iptun.FlowBased { + return errors.New("coil_ipip4 is not flow based") } } - if _, err := fou.AddPeer(net.ParseIP("fd02::101")); err != ErrIPFamilyMismatch { + if _, err := fou.AddPeer(net.ParseIP("fd02::101"), true); err != ErrIPFamilyMismatch { return fmt.Errorf("error is not ErrIPFamilyMismatch: %w", err) } @@ -267,11 +304,11 @@ func testFoUV6(t *testing.T) { } } - if _, err := fou.AddPeer(net.ParseIP("10.1.1.1")); err != ErrIPFamilyMismatch { + if _, err := fou.AddPeer(net.ParseIP("10.1.1.1"), true); err != ErrIPFamilyMismatch { return fmt.Errorf("error is not ErrIPFamilyMismatch: %w", err) } - if link, err := fou.AddPeer(net.ParseIP("fd02::101")); err != nil { + if link, err := fou.AddPeer(net.ParseIP("fd02::101"), true); err != nil { return fmt.Errorf("failed to call AddPeer with fd02::101: %w", err) } else { ip6tnl, ok := link.(*netlink.Ip6tnl) @@ -287,8 +324,20 @@ func testFoUV6(t *testing.T) { if ip6tnl.EncapDport != 5555 { return fmt.Errorf("ip6tnl.EncapDport is not 5555: %d", ip6tnl.EncapDport) } - if ip6tnl.EncapSport != 5555 { - return fmt.Errorf("ip6tnl.EncapSport is not 5555: %d", ip6tnl.EncapSport) + if ip6tnl.EncapSport != 0 { + return fmt.Errorf("ip6tnl.EncapSport is not 0: %d", ip6tnl.EncapSport) + } + + ipip6, err := netlink.LinkByName("coil_ipip6") + if err != nil { + return fmt.Errorf("failed to get coil_ipip6: %w", err) + } + ip6tnl, ok = ipip6.(*netlink.Ip6tnl) + if !ok { + return fmt.Errorf("link is not Iptun: %T", link) + } + if !ip6tnl.FlowBased { + return errors.New("coil_ipip6 is not flow based") } } diff --git a/v2/pkg/founat/nat_test.go b/v2/pkg/founat/nat_test.go index ec42253e..77060bdb 100644 --- a/v2/pkg/founat/nat_test.go +++ b/v2/pkg/founat/nat_test.go @@ -31,11 +31,11 @@ func TestNAT(t *testing.T) { return fmt.Errorf("nc.Init failed: %w", err) } - link4, err := ft.AddPeer(net.ParseIP("10.1.2.2")) + link4, err := ft.AddPeer(net.ParseIP("10.1.2.2"), true) if err != nil { return fmt.Errorf("ft.AddPeer failed for 10.1.2.2: %w", err) } - link6, err := ft.AddPeer(net.ParseIP("fd01::202")) + link6, err := ft.AddPeer(net.ParseIP("fd01::202"), true) if err != nil { return fmt.Errorf("ft.AddPeer failed for fd01::202: %w", err) } @@ -66,11 +66,11 @@ func TestNAT(t *testing.T) { return fmt.Errorf("egress.Init failed: %w", err) } - link4, err := ft.AddPeer(net.ParseIP("10.1.1.2")) + link4, err := ft.AddPeer(net.ParseIP("10.1.1.2"), true) if err != nil { return fmt.Errorf("ft.AddPeer failed for 10.1.1.2: %w", err) } - link6, err := ft.AddPeer(net.ParseIP("fd01::102")) + link6, err := ft.AddPeer(net.ParseIP("fd01::102"), true) if err != nil { return fmt.Errorf("ft.AddPeer failed for fd01::102: %w", err) } diff --git a/v2/runners/coild_server.go b/v2/runners/coild_server.go index 7f0e6a2b..7d333749 100644 --- a/v2/runners/coild_server.go +++ b/v2/runners/coild_server.go @@ -36,8 +36,9 @@ import ( // GWNets represents networks for a destination. type GWNets struct { - Gateway net.IP - Networks []*net.IPNet + Gateway net.IP + Networks []*net.IPNet + SportAuto bool } // NATSetup represents a NAT setup function for Pods. @@ -68,7 +69,7 @@ func (n natSetup) Hook(l []GWNets, log *zap.Logger) func(ipv4, ipv6 net.IP) erro } for _, gwn := range l { - link, err := ft.AddPeer(gwn.Gateway) + link, err := ft.AddPeer(gwn.Gateway, gwn.SportAuto) if errors.Is(err, founat.ErrIPFamilyMismatch) { // ignore unsupported IP family link log.Sugar().Infow("ignored unsupported gateway", "gw", gwn.Gateway) @@ -369,7 +370,7 @@ func (s *coildServer) getHook(ctx context.Context, pod *corev1.Pod) (nodenet.Set } if len(subnets) > 0 { - gwlist = append(gwlist, GWNets{Gateway: svcIP, Networks: subnets}) + gwlist = append(gwlist, GWNets{Gateway: svcIP, Networks: subnets, SportAuto: eg.Spec.FouSourcePortAuto}) } }