Comment 5 for bug 2028338

Revision history for this message
Mohankumar (mohankumar-n) wrote (last edit ):

What works to get rid of this issue: add a check, run after the agent is revived, to the "fetch_and_sync_all_routers" method in neutron/agent/l3/agent.py:

   if r.get('_ha_state') == 'unknown':
      LOG.debug("Processing router with l3_agent in "
                "unknown state")
      self._process_added_router(r)

With that check added, the entire method looks like this:

    def fetch_and_sync_all_routers(self, context, ns_manager):
        prev_router_ids = set(self.router_info)
        curr_router_ids = set()
        timestamp = timeutils.utcnow()
        router_ids = []
        chunk = []
        is_snat_agent = (self.conf.agent_mode ==
                         lib_const.L3_AGENT_MODE_DVR_SNAT)
        try:
            router_ids = self.plugin_rpc.get_router_ids(context)
            LOG.debug("Router IDs from Neutron API: %s", router_ids)
            self._remove_orphan_routers_config(
                ha_confs_path=self.ha_confs_path,
                router_ids=set(router_ids),
            )
            # fetch routers by chunks to reduce the load on server and to
            # start router processing earlier
            for i in range(0, len(router_ids), self.sync_routers_chunk_size):
                chunk = router_ids[i:i + self.sync_routers_chunk_size]
                routers = self.plugin_rpc.get_routers(context, chunk)
                LOG.debug('Processing :%r', routers)
                for r in routers:
                    curr_router_ids.add(r['id'])
                    ns_manager.keep_router(r['id'])
                    if r.get('distributed'):
                        # need to keep fip namespaces as well
                        ext_net_id = (r['external_gateway_info'] or {}).get(
                            'network_id')
                        if ext_net_id:
                            ns_manager.keep_ext_net(ext_net_id)
                        elif is_snat_agent and not r.get('ha'):
                            ns_manager.ensure_snat_cleanup(r['id'])
                        if r.get('_ha_state') == 'unknown':
                            LOG.debug("Processing router with l3_agent in "
                                      "unknown state")
                            self._process_added_router(r)
                    update = queue.ResourceUpdate(
                        r['id'],
                        PRIORITY_SYNC_ROUTERS_TASK,
                        resource=r,
                        action=ADD_UPDATE_ROUTER,
                        timestamp=timestamp)
                    self._queue.add(update)
        except oslo_messaging.MessagingTimeout:
            if self.sync_routers_chunk_size > SYNC_ROUTERS_MIN_CHUNK_SIZE:
                self.sync_routers_chunk_size = max(
                    self.sync_routers_chunk_size // 2,
                    SYNC_ROUTERS_MIN_CHUNK_SIZE)
                LOG.error('Server failed to return info for routers in '
                          'required time, decreasing chunk size to: %s',
                          self.sync_routers_chunk_size)
            else:
                LOG.error('Server failed to return info for routers in '
                          'required time even with min chunk size: %s. '
                          'It might be under very high load or '
                          'just inoperable',
                          self.sync_routers_chunk_size)
            raise
        except oslo_messaging.MessagingException:
            failed_routers = chunk or router_ids
            LOG.exception("Failed synchronizing routers '%s' "
                          "due to RPC error", failed_routers)
            raise l3_exc.AbortSyncRouters()
        self.fullsync = False
        LOG.debug("periodic_sync_routers_task successfully completed")
        # adjust chunk size after successful sync
        if self.sync_routers_chunk_size < SYNC_ROUTERS_MAX_CHUNK_SIZE:
            self.sync_routers_chunk_size = min(
                self.sync_routers_chunk_size + SYNC_ROUTERS_MIN_CHUNK_SIZE,
                SYNC_ROUTERS_MAX_CHUNK_SIZE)
        # Delete routers that have disappeared since the last sync
        for router_id in prev_router_ids - curr_router_ids:
            ns_manager.keep_router(router_id)
            update = queue.ResourceUpdate(router_id,
                                          PRIORITY_SYNC_ROUTERS_TASK,
                                          timestamp=timestamp,
                                          action=DELETE_ROUTER)
            self._queue.add(update)