Comment 1 for bug 2044494

Revision history for this message
Haidong Pang (haidong-pang) wrote :

The request from nova-api to nova-conductor is an asynchronous RPC cast, so nova-api cannot receive or handle any exception raised in the conductor.
When nova-conductor then makes a synchronous RPC call to nova-scheduler to select a target node for the evacuation and that call times out, an `oslo_messaging.exceptions.MessagingTimeout` is raised inside nova-conductor.

```python
@profiler.trace_cls("rpc")
class ComputeTaskManager(base.Base):
    @targets_cell
    def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
                         injected_files, new_pass, orig_sys_metadata,
                         bdms, recreate, on_shared_storage,
                         preserve_ephemeral=False, host=None,
                         request_spec=None):
        ......

                try:
                    # if this is a rebuild of instance on the same host with
                    # new image.
                    if not evacuate and orig_image_ref != image_ref:
                        self._validate_image_traits_for_rebuild(context,
                                                                instance,
                                                                image_ref)
                    self._restrict_request_spec_to_cell(
                        context, instance, request_spec)
                    request_spec.ensure_project_and_user_id(instance)
                    request_spec.ensure_network_information(instance)
                    compute_utils.heal_reqspec_is_bfv(
                        context, request_spec, instance)

                    host_lists = self._schedule_instances(context,
                            request_spec, [instance.uuid],
                            return_alternates=False)
                    host_list = host_lists[0]
                    selection = host_list[0]
                    host, node, limits = (selection.service_host,
                            selection.nodename, selection.limits)

                    if recreate:
                        scheduler_utils.fill_provider_mapping(
                            request_spec, selection)

                except (exception.NoValidHost,
                        exception.UnsupportedPolicyException,
                        exception.AllocationUpdateFailed,
                        # the next two can come from fill_provider_mapping and
                        # signals a software error.
                        NotImplementedError,
                        ValueError) as ex:
                    ......

           ......

            self.compute_rpcapi.rebuild_instance(
                context,
                instance=instance,
                new_pass=new_pass,
                injected_files=injected_files,
                image_ref=image_ref,
                orig_image_ref=orig_image_ref,
                orig_sys_metadata=orig_sys_metadata,
                bdms=bdms,
                recreate=evacuate,
                on_shared_storage=on_shared_storage,
                preserve_ephemeral=preserve_ephemeral,
                migration=migration,
                host=host,
                node=node,
                limits=limits,
                request_spec=request_spec,
                accel_uuids=accel_uuids)
```

`rebuild_instance` calls `_schedule_instances`, shown below:

```python
def _schedule_instances(self, context, request_spec,
                            instance_uuids=None, return_alternates=False):
        """Ask the scheduler to select destinations and log how long it took.

        :param context: request context, forwarded to
            ``setup_instance_group`` and ``select_destinations``
        :param request_spec: request spec, forwarded to
            ``setup_instance_group`` and ``select_destinations``
        :param instance_uuids: UUIDs of the instances being scheduled;
            forwarded to ``select_destinations`` and counted for the log
            message
        :param return_alternates: forwarded unchanged to
            ``select_destinations``
        :returns: whatever ``self.query_client.select_destinations`` returns
        """
        scheduler_utils.setup_instance_group(context, request_spec)
        # Time only the scheduler query itself.
        with timeutils.StopWatch() as timer:
            # Per the surrounding report, this is the synchronous RPC call to
            # nova-scheduler that can raise MessagingTimeout. Nothing in this
            # method catches it, so it propagates to the caller.
            host_lists = self.query_client.select_destinations(
                context, request_spec, instance_uuids, return_objects=True,
                return_alternates=return_alternates)
        # NOTE(review): instance_uuids defaults to None, and len(None) raises
        # TypeError -- presumably every caller passes a list; confirm.
        LOG.debug('Took %0.2f seconds to select destinations for %s '
                  'instance(s).', timer.elapsed(), len(instance_uuids))
        return host_lists

```

To fix this, we only need to add `oslo_messaging.exceptions.MessagingTimeout` to the tuple of exceptions caught by the `except` block in `rebuild_instance`, so that a scheduler timeout reuses the existing exception-handling logic.