Comment 3 for bug 1886429

Revision history for this message
Dan Voiculeasa (dvoicule) wrote :

Consider the case where there are multiple armada pods: one stuck in Terminating and one Ready.

Selecting the good pod (see the patch in [1] below) instead of using `pod = pods[0]` does not work, because helmv2-cli is called at a later time and fails:

  helmv2-cli
  helmv2-cli[3839105] ERROR Could not find tiller listen port.

The Kubernetes Python client does not support the `--force` flag for pod deletion.
Instead, kubectl must be called directly with `--force`:
kubectl delete pods -n armada armada-pod-hash47 --force

[1]
diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py b/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py
index 8f5f5cb0..03a07041 100644
--- a/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py
+++ b/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py
@@ -3027,6 +3027,24 @@ class ArmadaHelper(object):
                 return False
         return True

+ def _check_pod_ready_probe(self, pod):
+ """ Pod is of the form returned by self._kube.kube_get_pods_by_selector
+ Returns true if last probe shows the container is in `Ready` state
+ """
+ conditions = list(filter(lambda x: x.type == 'Ready', pod.status.conditions))
+ return conditions[0].status == 'True'
+
+ def _prefer_select_one_running_ready_pod(self, pods):
+ """ Find one running and ready pod.
+ Return found if one, otherwise first pod.
+ """
+ for pod in pods:
+ if pod.status.phase == 'Running' and \
+ self._check_pod_ready_probe(pod):
+ return pod
+
+ return pods[0]
+
     def _start_armada_service(self):
         """Armada pod is managed by Kubernetes / Helm.
            This routine checks and waits for armada to be providing service.
@@ -3058,7 +3076,7 @@ class ArmadaHelper(object):
                     "application=%s" % ARMADA_APPLICATION, "")
                 if not pods:
                     raise RuntimeError('armada pod not found')
- pod = pods[0]
+ pod = self._prefer_select_one_running_ready_pod(pods)

                 if pod and pod.status.phase != 'Running':
                     # Delete the pod, it should restart if it can
@@ -3067,7 +3085,8 @@ class ArmadaHelper(object):
                         LOG.warning("Pod %s/%s deletion unsuccessful...",
                             ARMADA_NAMESPACE, pod.metadata.name)

- if pod and pod.status.phase == 'Running':
+ if pod and pod.status.phase == 'Running' and \
+ self._check_pod_ready_probe(pod):
                     # Test that we can copy files into armada-api container
                     src = '/etc/build.info'
                     dest_dir = '{}:{}'.format(pod.metadata.name, '/tmp')
@@ -3081,6 +3100,16 @@ class ArmadaHelper(object):
                     else:
                         return True
                     return True
+ #
+ # elif pod and pod.status.phase == 'Running':
+ # LOG.warning("Pod %s/%s running but not ready",
+ # ARMADA_NAMESPACE, pod.metadata.name)
+ #
+ # # Delete the pod, it should restart if it can
+ # if not self._kube.kube_delete_pod(pod.metadata.name,
+ # ARMADA_NAMESPACE, grace_periods_seconds=0, force=None):
+ # LOG.warning("Pod %s/%s deletion unsuccessful...",
+ # ARMADA_NAMESPACE, pod.metadata.name)

             except Exception as e:
                 LOG.info("Could not get Armada service : %s " % e)
@@ -3151,7 +3180,7 @@ class ArmadaHelper(object):
                     "status.phase=Running")
                 if not pods:
                     raise RuntimeError('armada pod not found')
- armada_pod = pods[0].metadata.name
+ armada_pod = self._prefer_select_one_running_ready_pod(pods).metadata.name
                 if not self.copy_manifests_and_overrides_to_armada(armada_pod, manifest_file):
                     raise RuntimeError('could not access armada pod')