Comment 3 for bug 1682058

Mark Goddard (mgoddard) wrote:

I see this on the current master branch of magnum. It is a race condition: it does not happen on every cluster creation, but it does happen on most. I have a server with 48 cores (24 physical plus hyperthreading), so there are 48 magnum conductor worker processes. When the issue occurs I actually see one of these tracebacks from each worker, and they all appear at roughly the same time; I will raise that as a separate issue.

I added some logging to the relevant section of code:

diff --git a/magnum/drivers/heat/driver.py b/magnum/drivers/heat/driver.py
index 5aa5a8e..96563b3 100644
--- a/magnum/drivers/heat/driver.py
+++ b/magnum/drivers/heat/driver.py
@@ -73,6 +73,9 @@ class HeatDriver(driver.Driver):

     def update_cluster_status(self, context, cluster):
         stack_ctx = mag_ctx.make_cluster_context(cluster)
+        LOG.info("MG: cluster %s", dict(cluster))
         poller = HeatPoller(clients.OpenStackClients(stack_ctx), context,
                             cluster, self)
         poller.poll_and_check()

When the error occurs, we see the following output:

2017-07-04 18:58:08.384 41 INFO magnum.drivers.heat.driver [req-c48a6b51-e7c9-4898-956f-512f01ab3d68 - - - - -] MG: cluster {'updated_at': None, 'trustee_user_id': None, 'keypair': u'controller', 'node_count': 1, 'id': 22, 'trust_id': None, 'magnum_cert_ref': None, 'user_id': u'ae1a02bad3a043549b77a64d946a3ec0', 'uuid': 'b78e4f5c-02cf-4208-bc09-ecea3e7c867b', 'cluster_template': ClusterTemplate(apiserver_port=None,cluster_distro='fedora',coe='kubernetes',created_at=2017-07-03T13:07:52Z,dns_nameserver='8.8.8.8',docker_storage_driver='devicemapper',docker_volume_size=3,external_network_id='ilab',fixed_network='p3-internal',fixed_subnet='p3-internal',flavor_id='compute-A-magnum',floating_ip_enabled=True,http_proxy=None,https_proxy=None,id=4,image_id='k8s-fedora-25',insecure_registry=None,keypair_id=None,labels={},master_flavor_id='compute-A-magnum',master_lb_enabled=False,name='k8s-fedora-25',network_driver='flannel',no_proxy=None,project_id='37a2b9a3299e44f8bc3f6dc60007ef76',public=False,registry_enabled=False,server_type='bm',tls_disabled=False,updated_at=None,user_id='ae1a02bad3a043549b77a64d946a3ec0',uuid='ea0ba754-a769-493a-ade7-d25762e4669e',volume_driver=None), 'ca_cert_ref': None, 'api_address': None, 'master_addresses': [], 'create_timeout': 60, 'project_id': u'37a2b9a3299e44f8bc3f6dc60007ef76', 'status': u'CREATE_IN_PROGRESS', 'docker_volume_size': 3, 'master_count': 1, 'node_addresses': [], 'status_reason': None, 'coe_version': None, 'cluster_template_id': u'ea0ba754-a769-493a-ade7-d25762e4669e', 'name': u'k8s-xxx', 'stack_id': None, 'created_at': datetime.datetime(2017, 7, 4, 17, 58, 3, tzinfo=<iso8601.Utc>), 'container_version': None, 'trustee_password': None, 'trustee_username': None, 'discovery_url': None}

When the error does not occur, we see the following:

2017-07-04 19:00:37.368 40 INFO magnum.drivers.heat.driver [req-4cd7235d-c7f4-4497-817c-c9c0a8d57eed - - - - -] MG: cluster {'updated_at': datetime.datetime(2017, 7, 4, 17, 58, 12, tzinfo=<iso8601.Utc>), 'trustee_user_id': u'5f1fbc8933e749df8500e521fb95e746', 'keypair': u'controller', 'node_count': 1, 'id': 22, 'trust_id': u'0c73f85318b94d3dbf8a9110fbc9f983', 'magnum_cert_ref': u'http://10.60.253.128:9311/v1/containers/16b00b96-74f3-4cb5-aac4-9ace755d90c7', 'user_id': u'ae1a02bad3a043549b77a64d946a3ec0', 'uuid': 'b78e4f5c-02cf-4208-bc09-ecea3e7c867b', 'cluster_template': ClusterTemplate(apiserver_port=None,cluster_distro='fedora',coe='kubernetes',created_at=2017-07-03T13:07:52Z,dns_nameserver='8.8.8.8',docker_storage_driver='devicemapper',docker_volume_size=3,external_network_id='ilab',fixed_network='p3-internal',fixed_subnet='p3-internal',flavor_id='compute-A-magnum',floating_ip_enabled=True,http_proxy=None,https_proxy=None,id=4,image_id='k8s-fedora-25',insecure_registry=None,keypair_id=None,labels={},master_flavor_id='compute-A-magnum',master_lb_enabled=False,name='k8s-fedora-25',network_driver='flannel',no_proxy=None,project_id='37a2b9a3299e44f8bc3f6dc60007ef76',public=False,registry_enabled=False,server_type='bm',tls_disabled=False,updated_at=None,user_id='ae1a02bad3a043549b77a64d946a3ec0',uuid='ea0ba754-a769-493a-ade7-d25762e4669e',volume_driver=None), 'ca_cert_ref': u'http://10.60.253.128:9311/v1/containers/a3e7060d-4b44-4cb7-9cf1-57a7b3b226ba', 'api_address': None, 'master_addresses': [], 'create_timeout': 60, 'project_id': u'37a2b9a3299e44f8bc3f6dc60007ef76', 'status': u'CREATE_IN_PROGRESS', 'docker_volume_size': 3, 'master_count': 1, 'node_addresses': [], 'status_reason': None, 'coe_version': None, 'cluster_template_id': u'ea0ba754-a769-493a-ade7-d25762e4669e', 'name': u'k8s-xxx', 'stack_id': u'dd020bdd-0708-413a-8413-820e1e97fa0a', 'created_at': datetime.datetime(2017, 7, 4, 17, 58, 3, tzinfo=<iso8601.Utc>), 'container_version': None, 'trustee_password': u'eTu8bWhJc9LUcrae2E', 'trustee_username': u'b78e4f5c-02cf-4208-bc09-ecea3e7c867b_37a2b9a3299e44f8bc3f6dc60007ef76', 'discovery_url': u'https://discovery.etcd.io/7c9aa5538cf697554feb86c30ee5a921'}

One key difference is that the cluster's trust fields (trustee_username, trustee_password, trust_id) are all None when the error occurs, and are populated when it does not. This suggests to me that we should not attempt to poll Heat while the cluster's trust fields are still None.
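For illustration, a minimal sketch of the kind of guard I have in mind in HeatDriver.update_cluster_status (names are taken from the diff and log output above; the exact placement and log wording are just an assumption, not a finished patch):

    def update_cluster_status(self, context, cluster):
        # Assumption: the trust fields are only populated part-way through
        # cluster creation, so until they are set we cannot build a usable
        # stack context. Skip this polling cycle instead of raising.
        if not (cluster.trust_id and cluster.trustee_username and
                cluster.trustee_password):
            LOG.debug("Cluster %s has no trust yet, skipping status update",
                      cluster.uuid)
            return
        stack_ctx = mag_ctx.make_cluster_context(cluster)
        poller = HeatPoller(clients.OpenStackClients(stack_ctx), context,
                            cluster, self)
        poller.poll_and_check()

Since update_cluster_status appears to be driven periodically, returning early here should simply defer the check to the next pass rather than failing it.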