Seen with both maas 2.8 and maas 2.9; after running for a while, deployments stop working, and the rackd log has many messages like:
2021-02-05 18:13:56 provisioningserver.rpc.clusterservice: [critical] Failed to contact region. (While requesting RPC info at http://10.230.56.2:5240/MAAS).
Traceback (most recent call last):
File "/snap/maas/11322/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 460, in callback
self._startRunCallbacks(result)
File "/snap/maas/11322/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 568, in _startRunCallbacks
self._runCallbacks()
File "/snap/maas/11322/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 654, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/snap/maas/11322/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 1475, in gotResult
_inlineCallbacks(r, g, status)
--- <exception caught here> ---
File "/snap/maas/11322/lib/python3.8/site-packages/provisioningserver/rpc/clusterservice.py", line 1367, in _doUpdate
eventloops, maas_url = yield self._get_rpc_info(urls)
File "/snap/maas/11322/lib/python3.8/site-packages/provisioningserver/rpc/clusterservice.py", line 1631, in _get_rpc_info
raise config_exc
File "/snap/maas/11322/lib/python3.8/site-packages/provisioningserver/rpc/clusterservice.py", line 1602, in _get_rpc_info
eventloops, maas_url = yield self._parallel_fetch_rpc_info(urls)
File "/snap/maas/11322/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 654, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/snap/maas/11322/lib/python3.8/site-packages/provisioningserver/rpc/clusterservice.py", line 1576, in handle_responses
errors[0].raiseException()
File "/snap/maas/11322/usr/lib/python3/dist-packages/twisted/python/failure.py", line 467, in raiseException
raise self.value.with_traceback(self.tb)
File "/snap/maas/11322/lib/python3.8/site-packages/provisioningserver/rpc/clusterservice.py", line 1537, in _serial_fetch_rpc_info
raise last_exc
File "/snap/maas/11322/lib/python3.8/site-packages/provisioningserver/rpc/clusterservice.py", line 1529, in _serial_fetch_rpc_info
response = yield self._fetch_rpc_info(url, orig_url)
File "/snap/maas/11322/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 1416, in _inlineCallbacks
result = result.throwExceptionIntoGenerator(g)
File "/snap/maas/11322/usr/lib/python3/dist-packages/twisted/python/failure.py", line 491, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File "/snap/maas/11322/lib/python3.8/site-packages/provisioningserver/rpc/clusterservice.py", line 1631, in _get_rpc_info
raise config_exc
File "/snap/maas/11322/lib/python3.8/site-packages/provisioningserver/rpc/clusterservice.py", line 1602, in _get_rpc_info
eventloops, maas_url = yield self._parallel_fetch_rpc_info(urls)
File "/snap/maas/11322/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 654, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/snap/maas/11322/lib/python3.8/site-packages/provisioningserver/rpc/clusterservice.py", line 1576, in handle_responses
errors[0].raiseException()
File "/snap/maas/11322/usr/lib/python3/dist-packages/twisted/python/failure.py", line 467, in raiseException
raise self.value.with_traceback(self.tb)
File "/snap/maas/11322/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 1416, in _inlineCallbacks
result = result.throwExceptionIntoGenerator(g)
File "/snap/maas/11322/usr/lib/python3/dist-packages/twisted/python/failure.py", line 491, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File "/snap/maas/11322/lib/python3.8/site-packages/provisioningserver/rpc/clusterservice.py", line 1537, in _serial_fetch_rpc_info
raise last_exc
File "/snap/maas/11322/lib/python3.8/site-packages/provisioningserver/rpc/clusterservice.py", line 1529, in _serial_fetch_rpc_info
response = yield self._fetch_rpc_info(url, orig_url)
twisted.internet.error.ConnectingCancelledError: HostnameAddress(hostname=b'10.230.56.2', port=5240)
The region controller appears to be working fine and there are no errors in the regiond log. This deployment uses a single region and single rack, which are both located on a single VM.
To get maas working again, the system must be rebooted, or the maas snap service must be restarted. However, the problem begins occurring again after some number of hours or days.
We are resorting to rebooting maas every day just so we can avoid hitting this and having deployments fail.