Skip to content

Commit

Permalink
Merge pull request #301 from elvgarrui/vswitchd_restart_split
Browse files Browse the repository at this point in the history
Fix gateway datapath disrupt on update
  • Loading branch information
openshift-merge-bot[bot] authored Jun 24, 2024
2 parents f768609 + 303cbcd commit 9f91003
Show file tree
Hide file tree
Showing 15 changed files with 168 additions and 17 deletions.
7 changes: 3 additions & 4 deletions pkg/ovncontroller/daemonset.go
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ func CreateOVSDaemonSet(
Lifecycle: &corev1.Lifecycle{
PreStop: &corev1.LifecycleHandler{
Exec: &corev1.ExecAction{
Command: []string{"/usr/share/openvswitch/scripts/ovs-ctl", "stop", "--no-ovs-vswitchd"},
Command: []string{"/usr/local/bin/container-scripts/stop-ovsdb-server.sh"},
},
},
},
Expand All @@ -199,12 +199,11 @@ func CreateOVSDaemonSet(
},
{
Name: "ovs-vswitchd",
Command: []string{"/bin/bash", "-c"},
Args: []string{"/usr/local/bin/container-scripts/net_setup.sh && /usr/sbin/ovs-vswitchd --pidfile --mlockall"},
Command: []string{"/usr/local/bin/container-scripts/start-vswitchd.sh"},
Lifecycle: &corev1.Lifecycle{
PreStop: &corev1.LifecycleHandler{
Exec: &corev1.ExecAction{
Command: []string{"/usr/share/openvswitch/scripts/ovs-ctl", "stop", "--no-ovsdb-server"},
Command: []string{"/usr/local/bin/container-scripts/stop-vswitchd.sh"},
},
},
},
Expand Down
14 changes: 14 additions & 0 deletions templates/ovncontroller/bin/functions
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,20 @@ EnableChassisAsGateway=${EnableChassisAsGateway:-true}
PhysicalNetworks=${PhysicalNetworks:-""}
OVNHostName=${OVNHostName:-""}

ovs_dir=/var/lib/openvswitch
FLOWS_RESTORE_SCRIPT=$ovs_dir/flows-script
FLOWS_RESTORE_DIR=$ovs_dir/saved-flows
SAFE_TO_STOP_OVSDB_SERVER_SEMAPHORE=$ovs_dir/is_safe_to_stop_ovsdb_server

function cleanup_ovsdb_server_semaphore() {
rm -f $SAFE_TO_STOP_OVSDB_SERVER_SEMAPHORE 2>&1 > /dev/null
}

function cleanup_flows_backup() {
rm -f $FLOWS_RESTORE_SCRIPT 2>&1 > /dev/null
rm -rf $FLOWS_RESTORE_DIR 2>&1 > /dev/null
}

function wait_for_ovsdb_server {
while true; do
/usr/bin/ovs-vsctl show
Expand Down
6 changes: 5 additions & 1 deletion templates/ovncontroller/bin/start-ovsdb-server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,15 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

set -ex
source $(dirname $0)/functions

CTL_ARGS="--system-id=random --no-ovs-vswitchd"
# Remove the obsolete semaphore file in case it still exists.
cleanup_ovsdb_server_semaphore

# Initialize or upgrade database if needed
CTL_ARGS="--system-id=random --no-ovs-vswitchd"
/usr/share/openvswitch/scripts/ovs-ctl start $CTL_ARGS
/usr/share/openvswitch/scripts/ovs-ctl stop $CTL_ARGS

Expand Down
54 changes: 54 additions & 0 deletions templates/ovncontroller/bin/start-vswitchd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/bin/sh
#
# Copyright 2024 Red Hat Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

source $(dirname $0)/functions
wait_for_ovsdb_server

# The order - first wait for db server, then set -ex - is important. Otherwise,
# wait_for_ovsdb_server interrim check would make the script exit.
set -ex

# Configure encap IP.
OVNEncapIP=$(ip -o addr show dev {{ .OVNEncapNIC }} scope global | awk '{print $4}' | cut -d/ -f1)
ovs-vsctl --no-wait set open . external-ids:ovn-encap-ip=${OVNEncapIP}

# Before starting vswitchd, block it from flushing existing datapath flows.
ovs-vsctl --no-wait set open_vswitch . other_config:flow-restore-wait=true

# It's safe to start vswitchd now. Do it.
# --detach to allow the execution to continue to restoring the flows.
/usr/sbin/ovs-vswitchd --pidfile --mlockall --detach

# Restore saved flows.
if [ -f $FLOWS_RESTORE_SCRIPT ]; then
# It's unsafe to leave these files in place if they fail once. Make sure we
# remove them if the eval fails.
trap cleanup_flows_backup EXIT
eval "$(cat $FLOWS_RESTORE_SCRIPT)"
trap - EXIT
fi

# It's also unsafe to leave these files after flow-restore-wait flag is removed
# because the backup will become stale and if a container later crashes, it may
# mistakenly try to restore from this old backup.
cleanup_flows_backup

# Now, inform vswitchd that we are done.
ovs-vsctl remove open_vswitch . other_config flow-restore-wait

# This is container command script. Block it from exiting, otherwise k8s will
# restart the container again.
sleep infinity
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/sh
#
# Copyright 2023 Red Hat Inc.
# Copyright 2024 Red Hat Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
Expand All @@ -14,11 +14,17 @@
# License for the specific language governing permissions and limitations
# under the License.

OVNEncapIP=$(ip -o addr show dev {{ .OVNEncapNIC }} scope global | awk '{print $4}' | cut -d/ -f1)

set -ex
source $(dirname $0)/functions

wait_for_ovsdb_server
# The ovs_vswitchd container has to terminate before ovsdb-server because it
# needs access to db in its preStop script. The preStop script backs up flows
# for restoration during the next startup. This semaphore ensures the vswitchd
# container is not torn down before flows are saved.
while [ ! -f $SAFE_TO_STOP_OVSDB_SERVER_SEMAPHORE ]; do
sleep 0.5
done
cleanup_ovsdb_server_semaphore

set -ex
ovs-vsctl --no-wait set open . external-ids:ovn-encap-ip=${OVNEncapIP}
# Now it's safe to stop db server. Do it.
/usr/share/openvswitch/scripts/ovs-ctl stop --no-ovs-vswitchd
36 changes: 36 additions & 0 deletions templates/ovncontroller/bin/stop-vswitchd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/sh
#
# Copyright 2024 Red Hat Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

set -ex
source $(dirname $0)/functions

# Clean up any previously created flow backups to avoid conflict with newly
# generated backup.
cleanup_flows_backup

# Passing --real to mimic what upstream startup scripts do; maybe redundant.
bridges=$(ovs-vsctl -- --real list-br)

# Saving flows to avoid disrupting gateway datapath.
mkdir $FLOWS_RESTORE_DIR
TMPDIR=$FLOWS_RESTORE_DIR /usr/share/openvswitch/scripts/ovs-save save-flows $bridges > $FLOWS_RESTORE_SCRIPT

# Once save-flows logic is complete it no longer needs ovsdb-server, this file
# unlocks the db preStop script, working as a semaphore
touch $SAFE_TO_STOP_OVSDB_SERVER_SEMAPHORE

# Finally, stop vswitchd.
/usr/share/openvswitch/scripts/ovs-ctl stop --no-ovsdb-server
12 changes: 6 additions & 6 deletions tests/functional/ovncontroller_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ var _ = Describe("OVNController controller", func() {
}, timeout, interval).Should(ContainElement("openstack.org/ovncontroller"))
})

It("should create a ConfigMap for net_setup.sh with eth0 as Interface Name", func() {
It("should create a ConfigMap for start-vswitchd.sh with eth0 as Interface Name", func() {
scriptsCM := types.NamespacedName{
Namespace: OVNControllerName.Namespace,
Name: fmt.Sprintf("%s-%s", OVNControllerName.Name, "scripts"),
Expand All @@ -74,7 +74,7 @@ var _ = Describe("OVNController controller", func() {
return *th.GetConfigMap(scriptsCM)
}, timeout, interval).ShouldNot(BeNil())

Expect(th.GetConfigMap(scriptsCM).Data["net_setup.sh"]).Should(
Expect(th.GetConfigMap(scriptsCM).Data["start-vswitchd.sh"]).Should(
ContainSubstring("addr show dev eth0"))

th.ExpectCondition(
Expand Down Expand Up @@ -179,12 +179,12 @@ var _ = Describe("OVNController controller", func() {
th.AssertJobDoesNotExist(configJobOVS)
})

It("should create a ConfigMap for net_setup.sh with eth0 as Interface Name", func() {
It("should create a ConfigMap for start-vswitchd.sh with eth0 as Interface Name", func() {
Eventually(func() corev1.ConfigMap {
return *th.GetConfigMap(scriptsCM)
}, timeout, interval).ShouldNot(BeNil())

Expect(th.GetConfigMap(scriptsCM).Data["net_setup.sh"]).Should(
Expect(th.GetConfigMap(scriptsCM).Data["start-vswitchd.sh"]).Should(
ContainSubstring("addr show dev eth0"))

th.ExpectCondition(
Expand Down Expand Up @@ -445,7 +445,7 @@ var _ = Describe("OVNController controller", func() {

}, timeout, interval).Should(Succeed())
})
It("should create a ConfigMap for net_setup.sh with nic name as Network Attachment and OwnerReferences set", func() {
It("should create a ConfigMap for start-vswitchd.sh with nic name as Network Attachment and OwnerReferences set", func() {

scriptsCM := types.NamespacedName{
Namespace: OVNControllerName.Namespace,
Expand All @@ -461,7 +461,7 @@ var _ = Describe("OVNController controller", func() {
Expect(th.GetConfigMap(scriptsCM).ObjectMeta.OwnerReferences[0].Kind).To(Equal("OVNController"))

ovncontroller := GetOVNController(OVNControllerName)
Expect(th.GetConfigMap(scriptsCM).Data["net_setup.sh"]).Should(
Expect(th.GetConfigMap(scriptsCM).Data["start-vswitchd.sh"]).Should(
ContainSubstring("addr show dev %s", ovncontroller.Spec.NetworkAttachment))
})
It("should create an external ConfigMap with expected key-value pairs and OwnerReferences set", func() {
Expand Down
1 change: 1 addition & 0 deletions tests/kuttl/tests/ovn_restart_flow/01-assert.yaml
1 change: 1 addition & 0 deletions tests/kuttl/tests/ovn_restart_flow/01-deploy-ovn.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
apiVersion: kuttl.dev/v1beta1
kind: TestStep
commands:
- script: |
node=$(oc get nodes -o name|sort|head -1| sed "s|node/||g")
controller_pod=$(oc get pod -n $NAMESPACE -l service=ovn-controller-ovs --field-selector spec.nodeName=$node -o name | head -1)
oc rsh -n $NAMESPACE --container ovs-vswitchd $controller_pod ovs-vsctl --if-exists del-br br-test-flows || exit 1
oc rsh -n $NAMESPACE --container ovs-vswitchd $controller_pod ovs-vsctl add-br br-test-flows || exit 1
oc rsh -n $NAMESPACE --container ovs-vswitchd $controller_pod ovs-ofctl add-flow br-test-flows "table=100, priority=200 action=drop" || exit 1
1 change: 1 addition & 0 deletions tests/kuttl/tests/ovn_restart_flow/03-assert.yaml
7 changes: 7 additions & 0 deletions tests/kuttl/tests/ovn_restart_flow/03-delete-pods.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
apiVersion: kuttl.dev/v1beta1
kind: TestStep
commands:
- script: |
node=$(oc get nodes -o name|sort|head -1| sed "s|node/||g")
controller_pod=$(oc get pod -n $NAMESPACE -l service=ovn-controller-ovs --field-selector spec.nodeName=$node -o name | head -1)
oc delete -n $NAMESPACE $controller_pod
17 changes: 17 additions & 0 deletions tests/kuttl/tests/ovn_restart_flow/04-assert-flows-present.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#
# Check for:
#
# - Check that the flow added to the test bridge "br-test-flows" is present
# after the ovn-controller has been restarted.
# Expected flow: "table=100, priority=200 actions=drop"

apiVersion: kuttl.dev/v1beta1
kind: TestAssert
timeout: 300
commands:
- script: |
node=$(oc get nodes -o name|sort|head -1| sed "s|node/||g")
controller_pod=$(oc get pod -n $NAMESPACE -l service=ovn-controller-ovs --field-selector spec.nodeName=$node -o name | head -1)
expected_flows="table=100, priority=200 actions=drop"
oc rsh -n $NAMESPACE --container ovs-vswitchd $controller_pod ovs-ofctl dump-flows br-test-flows --no-stats | grep -q "$expected_flows" || exit 1
oc rsh -n $NAMESPACE --container ovs-vswitchd $controller_pod ovs-vsctl --if-exists del-br br-test-flows
1 change: 1 addition & 0 deletions tests/kuttl/tests/ovn_restart_flow/06-cleanup-ovn.yaml
1 change: 1 addition & 0 deletions tests/kuttl/tests/ovn_restart_flow/06-errors.yaml

0 comments on commit 9f91003

Please sign in to comment.