Thanks to Edward Hope-Morley, who put a lot of time into recreating and debugging this, and to Dmitrii Shcherbakov for helping debug. A detailed summary of the debugging for https://bugs.launchpad.net/ubuntu/+source/neutron/+bug/1927868 follows.

In the test environment the bug was triggered with approximately 16 HA routers, but keep in mind that the number itself is not the issue. What appears to be the issue is an environment that has slowed down for one reason or another, causing eventlet to time out after more than 3 seconds (more on that later). The test environment had neutron >= 16.1.0, and the bug is triggered after running the following commands to restart the L3 agent and watch for the keepalived processes to be restored:

systemctl restart neutron-l3-agent
watch 'pgrep -alf keepalived| grep -v state| wc -l'

After a successful restart the number of processes equals 2x the number of HA routers; when the failure is hit, the count never reaches the expected value.

While debugging we noticed that we were hitting the following exception:

AssertionError("msgid should in outstanding_msgs.")

If I recall correctly, we were only able to see that exception when debugging with rpdb (it wasn't in the logs). We then added the following debug statements to /usr/lib/python3/dist-packages/oslo_privsep/comm.py (see the LOG lines prefixed with HELLO). We also added an except block to the send_recv() try block, because we noticed that the "after future.result()" log was skipped (that line of code was never reached) in the failure scenario.

    # Note: the added debug lines rely on the sys and traceback modules
    # being importable in comm.py.
    def _reader_main(self, reader):
        """This thread owns and demuxes the read channel"""
        with self.lock:
            self.running = True
        for msg in reader:
            msgid, data = msg
            if msgid is None:
                self.out_of_band(data)
            else:
                with self.lock:
                    if msgid not in self.outstanding_msgs:
                        #import rpdb
                        #rpdb.set_trace()
                        LOG.debug('HELLO - msgid={} not in self.outstanding_msgs'.format(msgid))
                        raise AssertionError("msgid should in "
                                             "outstanding_msgs.")
                    self.outstanding_msgs[msgid].set_result(data)

        # EOF. Perhaps the privileged process exited?
        # Send an IOError to any oustanding waiting readers. Assuming
        # the write direction is also closed, any new writes should
        # get an immediate similar error.
        LOG.debug('EOF on privsep read channel')
        exc = IOError(_('Premature eof waiting for privileged process'))
        with self.lock:
            for mbox in self.outstanding_msgs.values():
                mbox.set_exception(exc)
            self.running = False

    def out_of_band(self, msg):
        """Received OOB message. Subclasses might want to override this."""
        pass

    def send_recv(self, msg):
        myid = _get_thread_ident()
        future = Future(self.lock)
        with self.lock:
            if myid in self.outstanding_msgs:
                raise AssertionError("myid shoudn't be in outstanding_msgs.")
            self.outstanding_msgs[myid] = future
        try:
            LOG.debug('HELLO - writer.send myid={}'.format(myid))
            self.writer.send((myid, msg))
            LOG.debug('HELLO - before future.result() for myid={}'.format(myid))
            reply = future.result()
            LOG.debug('HELLO - after future.result() for myid={},reply={}'.format(myid, reply))
        except:
            LOG.debug("HELLO - Unexpected error: {}".format(sys.exc_info()[0]))
            LOG.debug("HELLO - traceback: {}".format(traceback.format_exc()))
            raise
        finally:
            LOG.debug('HELLO - deleting myid={}'.format(myid))
            del self.outstanding_msgs[myid]

        return reply
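To make it clearer what that assertion guards against, here is a minimal, self-contained sketch (plain Python with ordinary threads, not the actual oslo.privsep code; the names and timings are made up for illustration) of a caller that gives up waiting and removes its msgid in a finally block before the reply arrives:

import threading
import time

outstanding_msgs = {}
lock = threading.Lock()

def reader_main(msgid, reply, delay):
    # Plays the role of _reader_main(): the reply arrives later than expected.
    time.sleep(delay)
    with lock:
        if msgid not in outstanding_msgs:
            raise AssertionError("msgid should in outstanding_msgs.")
        outstanding_msgs[msgid].append(reply)

def send_recv(msgid, timeout):
    # Plays the role of send_recv(): wait for a reply, give up after `timeout`.
    with lock:
        outstanding_msgs[msgid] = []
    try:
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            with lock:
                if outstanding_msgs[msgid]:
                    return outstanding_msgs[msgid][0]
            time.sleep(0.1)
        raise TimeoutError('no reply within {} seconds'.format(timeout))
    finally:
        # Same cleanup as the real finally block: the msgid disappears
        # even though the reply has not been delivered yet.
        with lock:
            del outstanding_msgs[msgid]

reader = threading.Thread(target=reader_main, args=(1, 'reply', 2.0))
reader.start()
try:
    send_recv(1, timeout=0.5)   # times out before the reply lands
except TimeoutError as exc:
    print(exc)
reader.join()                    # reader dies with the AssertionError above

In the real agent the "give up" is not a local timeout inside send_recv() itself; as the traceback below shows, it is an eventlet.Timeout armed further up the call stack.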
With those changes enabled, restarting neutron-l3-agent resulted in the following logs (from the LOG statements added above). Note the ordering of the LOG messages in (a) (successful) versus (b) (unsuccessful), and also note the "Unexpected error", which is an eventlet.timeout.Timeout.

(a)

2021-07-28 17:48:15.660 3801 DEBUG oslo_privsep.comm [-] HELLO - writer.send myid=139706992122312 send_recv /usr/lib/python3/dist-packages/oslo_privsep/comm.py:174
2021-07-28 17:48:15.660 3801 DEBUG oslo_privsep.comm [-] HELLO - before future.result() for myid=139706992122312 send_recv /usr/lib/python3/dist-packages/oslo_privsep/comm.py:177
2021-07-28 17:48:16.635 3886 DEBUG oslo.privsep.daemon [-] privsep: reply[139706992122312]: (4, False) _call_back /usr/lib/python3/dist-packages/oslo_privsep/daemon.py:475
2021-07-28 17:48:17.035 3801 DEBUG oslo_privsep.comm [-] HELLO - after future.result() for myid=139706992122312,reply=(4, False) send_recv /usr/lib/python3/dist-packages/oslo_privsep/comm.py:179
2021-07-28 17:48:17.036 3801 DEBUG oslo_privsep.comm [-] HELLO - deleting myid=139706992122312 send_recv /usr/lib/python3/dist-packages/oslo_privsep/comm.py:189

(b)

# note: don't mind the timestamp difference on the traceback line; that debug statement was added later
2021-07-28 17:48:18.037 3801 DEBUG oslo_privsep.comm [-] HELLO - writer.send myid=139706992122312 send_recv /usr/lib/python3/dist-packages/oslo_privsep/comm.py:174
2021-07-28 17:48:18.037 3801 DEBUG oslo_privsep.comm [-] HELLO - before future.result() for myid=139706992122312 send_recv /usr/lib/python3/dist-packages/oslo_privsep/comm.py:177
2021-07-28 17:48:18.691 3801 DEBUG oslo_privsep.comm [-] HELLO - Unexpected error: send_recv /usr/lib/python3/dist-packages/oslo_privsep/comm.py:182
2021-07-28 18:14:09.430 12704 DEBUG oslo_privsep.comm [-] HELLO - traceback: Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/oslo_privsep/comm.py", line 179, in send_recv
  File "/usr/lib/python3/dist-packages/oslo_privsep/comm.py", line 111, in result
  File "/usr/lib/python3.6/threading.py", line 295, in wait
  File "/usr/lib/python3/dist-packages/eventlet/semaphore.py", line 115, in acquire
  File "/usr/lib/python3/dist-packages/eventlet/hubs/hub.py", line 298, in switch
eventlet.timeout.Timeout: 3 seconds
2021-07-28 17:48:18.691 3801 DEBUG oslo_privsep.comm [-] HELLO - deleting myid=139706992122312 send_recv /usr/lib/python3/dist-packages/oslo_privsep/comm.py:189
2021-07-28 17:48:20.716 3886 DEBUG oslo.privsep.daemon [-] privsep: reply[139706992122312]: (4, False) _call_back /usr/lib/python3/dist-packages/oslo_privsep/daemon.py:475
2021-07-28 17:48:21.461 3801 DEBUG oslo_privsep.comm [-] HELLO - msgid=139706992122312 not in self.outstanding_msgs _reader_main /usr/lib/python3/dist-packages/oslo_privsep/comm.py:144

Grepping the neutron code for eventlet and timeout pointed at wait_until_true() in neutron/common/utils.py:

def wait_until_true(predicate, timeout=60, sleep=1, exception=None):
    """Wait until callable predicate is evaluated as True

    :param predicate: Callable deciding whether waiting should continue.
    Best practice is to instantiate predicate with functools.partial()
    :param timeout: Timeout in seconds how long should function wait.
    :param sleep: Polling interval for results in seconds.
    :param exception: Exception instance to raise on timeout. If None is
                      passed (default) then WaitTimeout exception is raised.
    """
    try:
        with eventlet.Timeout(timeout):
            while not predicate():
                eventlet.sleep(sleep)
    except eventlet.Timeout:
        if exception is not None:
            # pylint: disable=raising-bad-type
            raise exception
        raise WaitTimeout(_("Timed out after %d seconds") % timeout)
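The important detail, and the reason the traceback above ends in eventlet semaphore/hub code, is that eventlet.Timeout does not just bound the polling loop: when the timer fires it raises inside whatever call the green thread happens to be blocked in at that moment. A minimal, self-contained sketch of that behaviour (the slow predicate below just sleeps; in the real agent it is a check that ends up waiting on the privsep daemon):

import eventlet

def slow_predicate():
    # Stand-in for a check that itself takes longer than the timeout,
    # e.g. a call that blocks waiting on the privsep daemon.
    eventlet.sleep(5)
    return True

try:
    with eventlet.Timeout(3):
        while not slow_predicate():
            eventlet.sleep(1)
except eventlet.Timeout:
    # The Timeout was raised *inside* slow_predicate(), not between
    # iterations of the polling loop.
    print('timed out inside the predicate')

That matches the ordering in (b): the 3 second timer fires while send_recv() is still blocked in future.result(), the finally block then deletes the msgid, and when the privsep reply arrives a couple of seconds later _reader_main() raises the AssertionError.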
Grepping for timeout=3 pointed at neutron/agent/linux/interface.py. The following patch introduced utils.wait_until_true(ns_dev.exists, timeout=3), and reverting it seems to have fixed the issue reported in https://bugs.launchpad.net/ubuntu/+source/neutron/+bug/1927868.

commit 590551dbbff5ba0b2a43772c3ef117377f783db8
Author: Rodolfo Alonso Hernandez
Date:   Thu Feb 18 11:57:45 2021 +0000

    [L3][HA] Retry when setting HA router GW status.

    When a HA router instance changes the state (active, backup), the GW
    interface is set to up or down. As reported in the bug, while
    keepalived is configuring the interface, the interface disappears and
    appears again from the kernel namespace, as seen in the udev messages.

    This patch is a workaround until the real issue is addressed (if
    possible), retrying the interface configuration for a small period of
    time.

    Related-Bug: #1916024

    Change-Id: I8ced69f4f8e7d7c73da130a57e89e9d66590390b
    (cherry picked from commit 662f483120972a373e19bde52f16392e2ccb9c82)

corey@corey-ThinkPad-T440s:~/pkg/xena/upstream/neutron$ gd 590551dbb~1 590551dbb
diff --git a/neutron/agent/l3/ha_router.py b/neutron/agent/l3/ha_router.py
index ec86f716a4..2b5c49dcb1 100644
--- a/neutron/agent/l3/ha_router.py
+++ b/neutron/agent/l3/ha_router.py
@@ -548,8 +548,11 @@ class HaRouter(router.RouterInfo):
         if ex_gw_port_id:
             interface_name = self.get_external_device_name(ex_gw_port_id)
             ns_name = self.get_gw_ns_name()
-            self.driver.set_link_status(interface_name, ns_name,
-                                        link_up=link_up)
+            if (not self.driver.set_link_status(
+                    interface_name, namespace=ns_name, link_up=link_up) and
+                    link_up):
+                LOG.error('Gateway interface for router %s was not set up; '
+                          'router will not work properly', self.router_id)
             if link_up and set_gw:
                 preserve_ips = self.get_router_preserve_ips()
                 self._external_gateway_settings(ex_gw_port, interface_name,
diff --git a/neutron/agent/linux/interface.py b/neutron/agent/linux/interface.py
index 247d8ea8f5..cdf270cb86 100644
--- a/neutron/agent/linux/interface.py
+++ b/neutron/agent/linux/interface.py
@@ -324,14 +324,20 @@ class LinuxInterfaceDriver(object):

     def set_link_status(self, device_name, namespace=None, link_up=True):
         ns_dev = ip_lib.IPWrapper(namespace=namespace).device(device_name)
-        if not ns_dev.exists():
-            LOG.debug("Device %s may concurrently be deleted.", device_name)
-            return
+        try:
+            utils.wait_until_true(ns_dev.exists, timeout=3)
+        except utils.WaitTimeout:
+            LOG.debug('Device %s may have been deleted concurrently',
+                      device_name)
+            return False
+
         if link_up:
             ns_dev.link.set_up()
         else:
             ns_dev.link.set_down()
+        return True
+

 class NullDriver(LinuxInterfaceDriver):

     def plug_new(self, network_id, port_id, device_name, mac_address,
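To spell out why one missed 3 second timeout can leave keepalived processes missing after the restart: our reading of the comm.py code above is that once _reader_main() dies on the AssertionError, nothing is left to complete the futures of later privsep calls, so those calls simply block and router processing stalls. A rough, self-contained sketch of that knock-on effect (plain Python queues and threads standing in for the privsep channel; not the real implementation):

import queue
import threading

replies = queue.Queue()      # stands in for the privsep read channel
outstanding = {}             # stands in for outstanding_msgs

def reader_main():
    # Simplified reader loop: dies as soon as it sees an unknown msgid,
    # just like the AssertionError in _reader_main().
    while True:
        msgid, data = replies.get()
        if msgid not in outstanding:
            raise AssertionError("msgid should in outstanding_msgs.")
        outstanding[msgid].put(data)

reader = threading.Thread(target=reader_main, daemon=True)
reader.start()

# A late reply for a msgid that was already cleaned up kills the reader...
replies.put((42, 'stale reply'))
reader.join(timeout=1)
print('reader alive:', reader.is_alive())      # False

# ...and every call made after that waits forever for a reply that nobody
# will ever deliver to its future.
outstanding[43] = queue.Queue()
replies.put((43, 'new reply'))
try:
    outstanding[43].get(timeout=1)
except queue.Empty:
    print('later privsep-style calls never get their replies')

That, at least, is consistent with the keepalived process count never reaching 2x the number of HA routers after the agent restart.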