diff -Nru nova-17.0.13/debian/changelog nova-17.0.13/debian/changelog --- nova-17.0.13/debian/changelog 2022-01-22 03:03:12.000000000 +1000 +++ nova-17.0.13/debian/changelog 2022-06-23 15:46:08.000000000 +1000 @@ -1,3 +1,14 @@ +nova (2:17.0.13-0ubuntu4~cloud2) xenial; urgency=medium + + * Fixes API allows source compute service/node deletion while instances are + pending a resize confirm/revert (LP: #1852610). + - d/p/0001-lp1852610_Add_functional_recreate_test_for_bug_1829479_and_bug_1817833.patch + - d/p/0002-lp1852610_Add_functional_recreate_test_for_bug_1852610.patch + - d/p/0003-lp1852610_Add_functional_recreate_revert_resize_test_for_bug_1852610.patch + - d/p/0004-lp1852610_api_allows_source_compute_service.patch + + -- Brett Milford Thu, 23 Jun 2022 15:42:14 +1000 + nova (2:17.0.13-0ubuntu4~cloud1) xenial-queens; urgency=medium * New update for the Ubuntu Cloud Archive. diff -Nru nova-17.0.13/debian/patches/0001-lp1852610_Add_functional_recreate_test_for_bug_1829479_and_bug_1817833.patch nova-17.0.13/debian/patches/0001-lp1852610_Add_functional_recreate_test_for_bug_1829479_and_bug_1817833.patch --- nova-17.0.13/debian/patches/0001-lp1852610_Add_functional_recreate_test_for_bug_1829479_and_bug_1817833.patch 1970-01-01 10:00:00.000000000 +1000 +++ nova-17.0.13/debian/patches/0001-lp1852610_Add_functional_recreate_test_for_bug_1829479_and_bug_1817833.patch 2022-06-23 15:40:40.000000000 +1000 @@ -0,0 +1,123 @@ +From 23ca5e5ac9b90ff45074ae9171f63ca060ebcedd Mon Sep 17 00:00:00 2001 +From: Matt Riedemann +Date: Thu, 6 Jun 2019 13:41:09 -0400 +Subject: [PATCH] Add functional recreate test for bug 1829479 and bug 1817833 + +Change I7b8622b178d5043ed1556d7bdceaf60f47e5ac80 started deleting +the associated resource provider when a compute service is deleted. +However, the delete_resource_provider cascade=True logic only looks +for instances on the given compute service host being deleted which +will miss (1) allocations remaining from evacuated servers and +(2) unconfirmed migrations. + +Attempting to delete the resource provider results in an +ResourceProviderInUse error which delete_resource_provider ignores +for legacy reasons. This results in the compute service being +deleted but the resource provider being orphaned. What's more, +attempting to restart the now-deleted compute service will fail +because nova-compute will try to create a new resource provider +with a new uuid but with the same name (based on the hypervisor +hostname). That failure is actually reported in bug 1817833. + +Conflicts: + nova/tests/functional/integrated_helpers.py + +NOTE(mriedem): The conflict is due to not having change +Iea283322124cb35fc0bc6d25f35548621e8c8c2f in Queens so the +change to ProviderUsageBaseTestCase is made in test_servers.py +rather than integrated_helpers.py. 
+ +Change-Id: I69f52f1282c8361c9cdf90a523f3612139cb8423 +Related-Bug: #1829479 +Related-Bug: #1817833 +(cherry picked from commit 2629d65fbc15d8698f98117e0d6072810f70da03) +(cherry picked from commit b18e42d20bd7d341e713292bdb179ae8e5530d33) +(cherry picked from commit 6eda7409fff75449c97843b2d6ead0b3267a1099) +--- + nova/tests/functional/test_servers.py | 4 ++ + nova/tests/functional/wsgi/test_services.py | 57 +++++++++++++++++++++ + 2 files changed, 61 insertions(+) + +diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py +index 10354cb355..4eb412c491 100644 +--- a/nova/tests/functional/test_servers.py ++++ b/nova/tests/functional/test_servers.py +@@ -1565,6 +1565,10 @@ class ProviderUsageBaseTestCase(test.TestCase, + self.assertEqual(old_flavor['disk'] + new_flavor['disk'], + allocation['DISK_GB']) + ++ def assertFlavorMatchesUsage(self, rp_uuid, flavor): ++ usages = self._get_provider_usages(rp_uuid) ++ self.assertFlavorMatchesAllocation(flavor, usages) ++ + def get_migration_uuid_for_instance(self, instance_uuid): + # NOTE(danms): This is too much introspection for a test like this, but + # we can't see the migration uuid from the API, so we just encapsulate +diff --git a/nova/tests/functional/wsgi/test_services.py b/nova/tests/functional/wsgi/test_services.py +index 7584749cfc..a9ec833276 100644 +--- a/nova/tests/functional/wsgi/test_services.py ++++ b/nova/tests/functional/wsgi/test_services.py +@@ -116,3 +116,60 @@ class TestServicesAPI(test_servers.ProviderUsageBaseTestCase): + # and allocation information. + resp = self.placement_api.get('/resource_providers/%s' % rp_uuid) + self.assertEqual(404, resp.status) ++ ++ def test_evacuate_then_delete_compute_service(self): ++ """Tests a scenario where a server is created on a host, the host ++ goes down, the server is evacuated to another host, and then the ++ source host compute service is deleted. After that the deleted ++ compute service is restarted. Related placement resources are checked ++ throughout. ++ """ ++ # Create our source host that we will evacuate *from* later. ++ host1 = self._start_compute('host1') ++ # Create a server which will go on host1 since it is the only host. ++ flavor = self.api.get_flavors()[0] ++ server = self._boot_and_check_allocations(flavor, 'host1') ++ # Get the compute service record for host1 so we can manage it. ++ service = self.admin_api.get_services( ++ binary='nova-compute', host='host1')[0] ++ # Get the corresponding resource provider uuid for host1. ++ rp_uuid = self._get_provider_uuid_by_host(service['host']) ++ # Make sure there is a resource provider for that compute node based ++ # on the uuid. ++ resp = self.placement_api.get('/resource_providers/%s' % rp_uuid) ++ self.assertEqual(200, resp.status) ++ # Down the compute service for host1 so we can evacuate from it. ++ self.admin_api.put_service(service['id'], {'forced_down': True}) ++ host1.stop() ++ # Start another host and trigger the server evacuate to that host. ++ self._start_compute('host2') ++ self.admin_api.post_server_action(server['id'], {'evacuate': {}}) ++ # The host does not change until after the status is changed to ACTIVE ++ # so wait for both parameters. ++ self._wait_for_server_parameter( ++ self.admin_api, server, {'status': 'ACTIVE', ++ 'OS-EXT-SRV-ATTR:host': 'host2'}) ++ # Delete the compute service for host1 and check the related ++ # placement resources for that host. ++ self.admin_api.api_delete('/os-services/%s' % service['id']) ++ # Make sure the service is gone. 
++ services = self.admin_api.get_services( ++ binary='nova-compute', host='host1') ++ self.assertEqual(0, len(services), services) ++ # FIXME(mriedem): This is bug 1829479 where the compute service is ++ # deleted but the resource provider is not because there are still ++ # allocations against the provider from the evacuated server. ++ resp = self.placement_api.get('/resource_providers/%s' % rp_uuid) ++ self.assertEqual(200, resp.status) ++ self.assertFlavorMatchesUsage(rp_uuid, flavor) ++ # Try to restart the host1 compute service to create a new resource ++ # provider. ++ self.restart_compute_service(host1) ++ # FIXME(mriedem): This is bug 1817833 where restarting the now-deleted ++ # compute service attempts to create a new resource provider with a ++ # new uuid but the same name which results in a conflict. The service ++ # does not die, however, because _update_available_resource_for_node ++ # catches and logs but does not re-raise the error. ++ log_output = self.stdlog.logger.output ++ self.assertIn('Error updating resources for node host1.', log_output) ++ self.assertIn('Failed to create resource provider host1', log_output) +-- +2.25.1 + diff -Nru nova-17.0.13/debian/patches/0002-lp1852610_Add_functional_recreate_test_for_bug_1852610.patch nova-17.0.13/debian/patches/0002-lp1852610_Add_functional_recreate_test_for_bug_1852610.patch --- nova-17.0.13/debian/patches/0002-lp1852610_Add_functional_recreate_test_for_bug_1852610.patch 1970-01-01 10:00:00.000000000 +1000 +++ nova-17.0.13/debian/patches/0002-lp1852610_Add_functional_recreate_test_for_bug_1852610.patch 2022-06-23 15:40:40.000000000 +1000 @@ -0,0 +1,193 @@ +From 922098044b37c66c51e83f4879c1f37ae999f196 Mon Sep 17 00:00:00 2001 +From: Matt Riedemann +Date: Thu, 14 Nov 2019 11:38:07 -0500 +Subject: [PATCH] Add functional recreate test for bug 1852610 + +It is possible to delete a source compute service which has +pending migration-based allocations and servers in VERIFY_RESIZE +status. Doing so deletes the compute service and compute node +but orphans the source node resource provider along with its +resource allocations held by the migration record while there +is a pending resized server. + +This adds a simple cold migrate test which deletes the source +compute service while the server is in VERIFY_RESIZE status and +then tries to confirm the resize which fails. + +Conflicts: + nova/tests/functional/integrated_helpers.py + +NOTE(mriedem): The conflict is due to not having change +Iea283322124cb35fc0bc6d25f35548621e8c8c2f in Queens. As a result +the helper methods are moved from ServerMovingTests to +ProviderUsageBaseTestCase within test_servers.py. 
+ +Change-Id: I644608b4e197ddea31c5f264adb492f2c8931f04 +Related-Bug: #1852610 +(cherry picked from commit 94d3743b185d22c07504f5d878dff2f9ef42cee3) +(cherry picked from commit 28d76cc7ae5c86d251915392b5b961a975b343ae) +(cherry picked from commit 7d673872462f53d0ce5e651263253ec4057a2138) +(cherry picked from commit 1563a15c8b4bcf1a602f72a549c8d9c56ed7da4e) +--- + nova/tests/functional/test_servers.py | 94 ++++++++++----------- + nova/tests/functional/wsgi/test_services.py | 35 ++++++++ + 2 files changed, 82 insertions(+), 47 deletions(-) + +diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py +index 4eb412c491..134316e08b 100644 +--- a/nova/tests/functional/test_servers.py ++++ b/nova/tests/functional/test_servers.py +@@ -1703,6 +1703,53 @@ class ProviderUsageBaseTestCase(test.TestCase, + compute.manager.update_available_resource(ctx) + LOG.info('Finished with periodics') + ++ def _move_and_check_allocations(self, server, request, old_flavor, ++ new_flavor, source_rp_uuid, dest_rp_uuid): ++ self.api.post_server_action(server['id'], request) ++ self._wait_for_state_change(self.api, server, 'VERIFY_RESIZE') ++ ++ def _check_allocation(): ++ source_usages = self._get_provider_usages(source_rp_uuid) ++ self.assertFlavorMatchesAllocation(old_flavor, source_usages) ++ dest_usages = self._get_provider_usages(dest_rp_uuid) ++ self.assertFlavorMatchesAllocation(new_flavor, dest_usages) ++ ++ # The instance should own the new_flavor allocation against the ++ # destination host created by the scheduler ++ allocations = self._get_allocations_by_server_uuid(server['id']) ++ self.assertEqual(1, len(allocations)) ++ dest_alloc = allocations[dest_rp_uuid]['resources'] ++ self.assertFlavorMatchesAllocation(new_flavor, dest_alloc) ++ ++ # The migration should own the old_flavor allocation against the ++ # source host created by conductor ++ migration_uuid = self.get_migration_uuid_for_instance(server['id']) ++ allocations = self._get_allocations_by_server_uuid(migration_uuid) ++ source_alloc = allocations[source_rp_uuid]['resources'] ++ self.assertFlavorMatchesAllocation(old_flavor, source_alloc) ++ ++ # OK, so the move operation has run, but we have not yet confirmed or ++ # reverted the move operation. Before we run periodics, make sure ++ # that we have allocations/usages on BOTH the source and the ++ # destination hosts. ++ _check_allocation() ++ self._run_periodics() ++ _check_allocation() ++ ++ # Make sure the RequestSpec.flavor matches the new_flavor. 
++ ctxt = context.get_admin_context() ++ reqspec = objects.RequestSpec.get_by_instance_uuid(ctxt, server['id']) ++ self.assertEqual(new_flavor['id'], reqspec.flavor.flavorid) ++ ++ def _migrate_and_check_allocations(self, server, flavor, source_rp_uuid, ++ dest_rp_uuid): ++ request = { ++ 'migrate': None ++ } ++ self._move_and_check_allocations( ++ server, request=request, old_flavor=flavor, new_flavor=flavor, ++ source_rp_uuid=source_rp_uuid, dest_rp_uuid=dest_rp_uuid) ++ + + class TraitsTrackingTests(ProviderUsageBaseTestCase): + compute_driver = 'fake.SmallFakeDriver' +@@ -1781,53 +1828,6 @@ class ServerMovingTests(ProviderUsageBaseTestCase): + self.compute2.manager.update_available_resource(ctx) + LOG.info('Finished with periodics') + +- def _migrate_and_check_allocations(self, server, flavor, source_rp_uuid, +- dest_rp_uuid): +- request = { +- 'migrate': None +- } +- self._move_and_check_allocations( +- server, request=request, old_flavor=flavor, new_flavor=flavor, +- source_rp_uuid=source_rp_uuid, dest_rp_uuid=dest_rp_uuid) +- +- def _move_and_check_allocations(self, server, request, old_flavor, +- new_flavor, source_rp_uuid, dest_rp_uuid): +- self.api.post_server_action(server['id'], request) +- self._wait_for_state_change(self.api, server, 'VERIFY_RESIZE') +- +- def _check_allocation(): +- source_usages = self._get_provider_usages(source_rp_uuid) +- self.assertFlavorMatchesAllocation(old_flavor, source_usages) +- dest_usages = self._get_provider_usages(dest_rp_uuid) +- self.assertFlavorMatchesAllocation(new_flavor, dest_usages) +- +- # The instance should own the new_flavor allocation against the +- # destination host created by the scheduler +- allocations = self._get_allocations_by_server_uuid(server['id']) +- self.assertEqual(1, len(allocations)) +- dest_alloc = allocations[dest_rp_uuid]['resources'] +- self.assertFlavorMatchesAllocation(new_flavor, dest_alloc) +- +- # The migration should own the old_flavor allocation against the +- # source host created by conductor +- migration_uuid = self.get_migration_uuid_for_instance(server['id']) +- allocations = self._get_allocations_by_server_uuid(migration_uuid) +- source_alloc = allocations[source_rp_uuid]['resources'] +- self.assertFlavorMatchesAllocation(old_flavor, source_alloc) +- +- # OK, so the move operation has run, but we have not yet confirmed or +- # reverted the move operation. Before we run periodics, make sure +- # that we have allocations/usages on BOTH the source and the +- # destination hosts. +- _check_allocation() +- self._run_periodics() +- _check_allocation() +- +- # Make sure the RequestSpec.flavor matches the new_flavor. 
+- ctxt = context.get_admin_context() +- reqspec = objects.RequestSpec.get_by_instance_uuid(ctxt, server['id']) +- self.assertEqual(new_flavor['id'], reqspec.flavor.flavorid) +- + def test_resize_revert(self): + self._test_resize_revert(dest_hostname='host1') + +diff --git a/nova/tests/functional/wsgi/test_services.py b/nova/tests/functional/wsgi/test_services.py +index a9ec833276..9a2d090896 100644 +--- a/nova/tests/functional/wsgi/test_services.py ++++ b/nova/tests/functional/wsgi/test_services.py +@@ -173,3 +173,38 @@ class TestServicesAPI(test_servers.ProviderUsageBaseTestCase): + log_output = self.stdlog.logger.output + self.assertIn('Error updating resources for node host1.', log_output) + self.assertIn('Failed to create resource provider host1', log_output) ++ ++ def test_migrate_confirm_after_deleted_source_compute(self): ++ """Tests a scenario where a server is cold migrated and while in ++ VERIFY_RESIZE status the admin attempts to delete the source compute ++ and then the user tries to confirm the resize. ++ """ ++ # Start a compute service and create a server there. ++ self._start_compute('host1') ++ host1_rp_uuid = self._get_provider_uuid_by_host('host1') ++ flavor = self.api.get_flavors()[0] ++ server = self._boot_and_check_allocations(flavor, 'host1') ++ # Start a second compute service so we can cold migrate there. ++ self._start_compute('host2') ++ host2_rp_uuid = self._get_provider_uuid_by_host('host2') ++ # Cold migrate the server to host2. ++ self._migrate_and_check_allocations( ++ server, flavor, host1_rp_uuid, host2_rp_uuid) ++ # Delete the source compute service. ++ service = self.admin_api.get_services( ++ binary='nova-compute', host='host1')[0] ++ self.admin_api.api_delete('/os-services/%s' % service['id']) ++ # FIXME(mriedem): This is bug 1852610 where the compute service is ++ # deleted but the resource provider is not because there are still ++ # migration-based allocations against the source node provider. ++ resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid) ++ self.assertEqual(200, resp.status) ++ self.assertFlavorMatchesUsage(host1_rp_uuid, flavor) ++ # Now try to confirm the migration. ++ # FIXME(mriedem): This will fail until bug 1852610 is fixed and the ++ # source compute service delete is blocked while there is an ++ # in-progress migration involving the node. 
++ self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output) ++ self.api.post_server_action(server['id'], {'confirmResize': None}) ++ self._wait_for_state_change(self.api, server, 'ERROR') ++ self.assertIn('ComputeHostNotFound', self.stdlog.logger.output) +-- +2.25.1 + diff -Nru nova-17.0.13/debian/patches/0003-lp1852610_Add_functional_recreate_revert_resize_test_for_bug_1852610.patch nova-17.0.13/debian/patches/0003-lp1852610_Add_functional_recreate_revert_resize_test_for_bug_1852610.patch --- nova-17.0.13/debian/patches/0003-lp1852610_Add_functional_recreate_revert_resize_test_for_bug_1852610.patch 1970-01-01 10:00:00.000000000 +1000 +++ nova-17.0.13/debian/patches/0003-lp1852610_Add_functional_recreate_revert_resize_test_for_bug_1852610.patch 2022-06-23 15:40:40.000000000 +1000 @@ -0,0 +1,108 @@ +From 917b5d383829851c6cbf7583cd0f3640c8ba2b9a Mon Sep 17 00:00:00 2001 +From: Matt Riedemann +Date: Thu, 14 Nov 2019 12:16:53 -0500 +Subject: [PATCH] Add functional recreate revert resize test for bug 1852610 + +This builds on I644608b4e197ddea31c5f264adb492f2c8931f04 and +adds a revert resize test which deletes the source compute service +while the server is in VERIFY_RESIZE status and then reverts the +resize. The results are a bit different from the confirm scenario +because the confirm fails while the revert actually works which +is more dumb luck based on where the compute service drops the +move claim during the revert process (on the dest which still exists +rather than the source). + +Conflicts: + nova/tests/functional/integrated_helpers.py + +NOTE(mriedem): The conflict is due to not having change +Iea283322124cb35fc0bc6d25f35548621e8c8c2f in Queens. As a result +the _resize_and_check_allocations method is added to +ProviderUsageBaseTestCase within test_servers.py. 
+ +Change-Id: I2dcb1cb3e1f8ed469a4c5bf81ca5ca2fcf1fa73c +Related-Bug: #1852610 +(cherry picked from commit f7dde6054e559752d8e9be044c32c0741f8f39c5) +(cherry picked from commit 3774952410f98bfde014bd9fdc0897d4a9a6c50f) +(cherry picked from commit 9983b2462401176db0c332f21bc8b24ba1d81503) +(cherry picked from commit b6b2b3a35e1b95463fac3bbb2acd51d905328a2c) +--- + nova/tests/functional/test_servers.py | 12 ++++++ + nova/tests/functional/wsgi/test_services.py | 41 +++++++++++++++++++++ + 2 files changed, 53 insertions(+) + +diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py +index 134316e08b..161116232c 100644 +--- a/nova/tests/functional/test_servers.py ++++ b/nova/tests/functional/test_servers.py +@@ -1750,6 +1750,18 @@ class ProviderUsageBaseTestCase(test.TestCase, + server, request=request, old_flavor=flavor, new_flavor=flavor, + source_rp_uuid=source_rp_uuid, dest_rp_uuid=dest_rp_uuid) + ++ def _resize_and_check_allocations(self, server, old_flavor, new_flavor, ++ source_rp_uuid, dest_rp_uuid): ++ request = { ++ 'resize': { ++ 'flavorRef': new_flavor['id'] ++ } ++ } ++ self._move_and_check_allocations( ++ server, request=request, old_flavor=old_flavor, ++ new_flavor=new_flavor, source_rp_uuid=source_rp_uuid, ++ dest_rp_uuid=dest_rp_uuid) ++ + + class TraitsTrackingTests(ProviderUsageBaseTestCase): + compute_driver = 'fake.SmallFakeDriver' +diff --git a/nova/tests/functional/wsgi/test_services.py b/nova/tests/functional/wsgi/test_services.py +index 9a2d090896..b1d689593b 100644 +--- a/nova/tests/functional/wsgi/test_services.py ++++ b/nova/tests/functional/wsgi/test_services.py +@@ -208,3 +208,44 @@ class TestServicesAPI(test_servers.ProviderUsageBaseTestCase): + self.api.post_server_action(server['id'], {'confirmResize': None}) + self._wait_for_state_change(self.api, server, 'ERROR') + self.assertIn('ComputeHostNotFound', self.stdlog.logger.output) ++ ++ def test_resize_revert_after_deleted_source_compute(self): ++ """Tests a scenario where a server is resized and while in ++ VERIFY_RESIZE status the admin attempts to delete the source compute ++ and then the user tries to revert the resize. ++ """ ++ # Start a compute service and create a server there. ++ self._start_compute('host1') ++ host1_rp_uuid = self._get_provider_uuid_by_host('host1') ++ flavors = self.api.get_flavors() ++ flavor1 = flavors[0] ++ flavor2 = flavors[1] ++ server = self._boot_and_check_allocations(flavor1, 'host1') ++ # Start a second compute service so we can resize there. ++ self._start_compute('host2') ++ host2_rp_uuid = self._get_provider_uuid_by_host('host2') ++ # Resize the server to host2. ++ self._resize_and_check_allocations( ++ server, flavor1, flavor2, host1_rp_uuid, host2_rp_uuid) ++ # Delete the source compute service. ++ service = self.admin_api.get_services( ++ binary='nova-compute', host='host1')[0] ++ self.admin_api.api_delete('/os-services/%s' % service['id']) ++ # FIXME(mriedem): This is bug 1852610 where the compute service is ++ # deleted but the resource provider is not because there are still ++ # migration-based allocations against the source node provider. ++ resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid) ++ self.assertEqual(200, resp.status) ++ self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1) ++ # Now try to revert the resize. ++ # NOTE(mriedem): This actually works because the drop_move_claim ++ # happens in revert_resize on the dest host which still has its ++ # ComputeNode record. 
The migration-based allocations are reverted ++ # so the instance holds the allocations for the source provider and ++ # the allocations against the dest provider are dropped. ++ self.api.post_server_action(server['id'], {'revertResize': None}) ++ self._wait_for_state_change(self.api, server, 'ACTIVE') ++ self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output) ++ self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1) ++ zero_flavor = {'vcpus': 0, 'ram': 0, 'disk': 0, 'extra_specs': {}} ++ self.assertFlavorMatchesUsage(host2_rp_uuid, zero_flavor) +-- +2.25.1 + diff -Nru nova-17.0.13/debian/patches/0004-lp1852610_api_allows_source_compute_service.patch nova-17.0.13/debian/patches/0004-lp1852610_api_allows_source_compute_service.patch --- nova-17.0.13/debian/patches/0004-lp1852610_api_allows_source_compute_service.patch 1970-01-01 10:00:00.000000000 +1000 +++ nova-17.0.13/debian/patches/0004-lp1852610_api_allows_source_compute_service.patch 2022-06-23 15:41:26.000000000 +1000 @@ -0,0 +1,273 @@ +From d88f353796813bf0ad5ec79ba4714af35e04e591 Mon Sep 17 00:00:00 2001 +From: Matt Riedemann +Date: Thu, 14 Nov 2019 14:19:26 -0500 +Subject: [PATCH] Block deleting compute services with in-progress migrations + +This builds on I0bd63b655ad3d3d39af8d15c781ce0a45efc8e3a +which made DELETE /os-services/{service_id} fail with a 409 +response if the host has instances on it. This change checks +for in-progress migrations involving the nodes on the host, +either as the source or destination nodes, and returns a 409 +error response if any are found. + +Failling to do this can lead to orphaned resource providers +in placement and also failing to properly confirm or revert +a pending resize or cold migration. + +A release note is included for the (justified) behavior +change in the API. A new microversion should not be required +for this since admins should not have to opt out of broken +behavior. + +Conflicts: + nova/tests/functional/integrated_helpers.py + +NOTE(mriedem): The conflict is due to not having change +Iea283322124cb35fc0bc6d25f35548621e8c8c2f in Queens so +_revert_resize is added to ProviderUsageBaseTestCase +within test_servers.py. + +Change-Id: I70e06c607045a1c0842f13069e51fef438012a9c +Closes-Bug: #1852610 +(cherry picked from commit 92fed026103b47fa2a76ea09204a4ba24c21e191) +(cherry picked from commit a9650b3cbfc674e283964090fb64ac6297be5b78) +(cherry picked from commit a0290858b717b4cefd0d6fc17acc2b143ab12ac4) +(cherry picked from commit 30a635068512be558acf0f9c83185dc1aaad560f) +--- + api-ref/source/os-services.inc | 6 ++ + nova/api/openstack/compute/services.py | 45 ++++++++++++- + nova/tests/functional/test_servers.py | 12 ++++ + nova/tests/functional/wsgi/test_services.py | 64 ++++++++++++------- + ...lete-with-migrations-ca0565fc0b503519.yaml | 10 +++ + 5 files changed, 112 insertions(+), 25 deletions(-) + create mode 100644 releasenotes/notes/bug-1852610-service-delete-with-migrations-ca0565fc0b503519.yaml + +Index: nova-17.0.13/api-ref/source/os-services.inc +=================================================================== +--- nova-17.0.13.orig/api-ref/source/os-services.inc ++++ nova-17.0.13/api-ref/source/os-services.inc +@@ -322,6 +322,12 @@ Attempts to delete a ``nova-compute`` se + will result in a 409 HTTPConflict response. The instances will need to be + migrated or deleted before a compute service can be deleted. + ++Similarly, attempts to delete a ``nova-compute`` service which is involved in ++in-progress migrations will result in a 409 HTTPConflict response. 
The ++migrations will need to be completed, for example confirming or reverting a ++resize, or the instances will need to be deleted before the compute service can ++be deleted. ++ + .. important:: Be sure to stop the actual ``nova-compute`` process on the + physical host *before* deleting the service with this API. + Failing to do so can lead to the running service re-creating +Index: nova-17.0.13/nova/api/openstack/compute/services.py +=================================================================== +--- nova-17.0.13.orig/nova/api/openstack/compute/services.py ++++ nova-17.0.13/nova/api/openstack/compute/services.py +@@ -12,6 +12,7 @@ + # License for the specific language governing permissions and limitations + # under the License. + ++from oslo_log import log as logging + from oslo_utils import strutils + from oslo_utils import uuidutils + import webob.exc +@@ -32,6 +33,8 @@ from nova import utils + + UUID_FOR_ID_MIN_VERSION = '2.53' + ++LOG = logging.getLogger(__name__) ++ + + class ServiceController(wsgi.Controller): + +@@ -238,6 +241,16 @@ class ServiceController(wsgi.Controller) + 'is hosting instances. Migrate or ' + 'delete the instances first.')) + ++ # Similarly, check to see if the are any in-progress migrations ++ # involving this host because if there are we need to block the ++ # service delete since we could orphan resource providers and ++ # break the ability to do things like confirm/revert instances ++ # in VERIFY_RESIZE status. ++ compute_nodes = objects.ComputeNodeList.get_all_by_host( ++ context, service.host) ++ self._assert_no_in_progress_migrations( ++ context, id, compute_nodes) ++ + aggrs = self.aggregate_api.get_aggregates_by_host(context, + service.host) + for ag in aggrs: +@@ -248,8 +261,6 @@ class ServiceController(wsgi.Controller) + # placement for the compute nodes managed by this service; + # remember that an ironic compute service can manage multiple + # nodes +- compute_nodes = objects.ComputeNodeList.get_all_by_host( +- context, service.host) + for compute_node in compute_nodes: + self.placementclient.delete_resource_provider( + context, compute_node, cascade=True) +@@ -271,6 +282,36 @@ class ServiceController(wsgi.Controller) + explanation = _("Service id %s refers to multiple services.") % id + raise webob.exc.HTTPBadRequest(explanation=explanation) + ++ @staticmethod ++ def _assert_no_in_progress_migrations(context, service_id, compute_nodes): ++ """Ensures there are no in-progress migrations on the given nodes. ++ ++ :param context: nova auth RequestContext ++ :param service_id: id of the Service being deleted ++ :param compute_nodes: ComputeNodeList of nodes on a compute service ++ :raises: HTTPConflict if there are any in-progress migrations on the ++ nodes ++ """ ++ for cn in compute_nodes: ++ migrations = ( ++ objects.MigrationList.get_in_progress_by_host_and_node( ++ context, cn.host, cn.hypervisor_hostname)) ++ if migrations: ++ # Log the migrations for the operator and then raise ++ # a 409 error. ++ LOG.info('Unable to delete compute service with id %s ' ++ 'for host %s. There are %i in-progress ' ++ 'migrations involving the host. Migrations ' ++ '(uuid:status): %s', ++ service_id, cn.host, len(migrations), ++ ','.join(['%s:%s' % (mig.uuid, mig.status) ++ for mig in migrations])) ++ raise webob.exc.HTTPConflict( ++ explanation=_( ++ 'Unable to delete compute service that has ' ++ 'in-progress migrations. 
Complete the ' ++ 'migrations or delete the instances first.')) ++ + @validation.query_schema(services.index_query_schema) + @wsgi.expected_errors(()) + def index(self, req): +Index: nova-17.0.13/nova/tests/functional/test_servers.py +=================================================================== +--- nova-17.0.13.orig/nova/tests/functional/test_servers.py ++++ nova-17.0.13/nova/tests/functional/test_servers.py +@@ -1762,6 +1762,18 @@ class ProviderUsageBaseTestCase(test.Tes + new_flavor=new_flavor, source_rp_uuid=source_rp_uuid, + dest_rp_uuid=dest_rp_uuid) + ++ def _revert_resize(self, server): ++ self.api.post_server_action(server['id'], {'revertResize': None}) ++ server = self._wait_for_state_change(self.api, server, 'ACTIVE') ++ self._wait_for_migration_status(server, ['reverted']) ++ # Note that the migration status is changed to "reverted" in the ++ # dest host revert_resize method but the allocations are cleaned up ++ # in the source host finish_revert_resize method so we need to wait ++ # for the finish_revert_resize method to complete. ++ fake_notifier.wait_for_versioned_notifications( ++ 'instance.resize_revert.end') ++ return server ++ + + class TraitsTrackingTests(ProviderUsageBaseTestCase): + compute_driver = 'fake.SmallFakeDriver' +Index: nova-17.0.13/nova/tests/functional/wsgi/test_services.py +=================================================================== +--- nova-17.0.13.orig/nova/tests/functional/wsgi/test_services.py ++++ nova-17.0.13/nova/tests/functional/wsgi/test_services.py +@@ -193,21 +193,30 @@ class TestServicesAPI(test_servers.Provi + # Delete the source compute service. + service = self.admin_api.get_services( + binary='nova-compute', host='host1')[0] +- self.admin_api.api_delete('/os-services/%s' % service['id']) +- # FIXME(mriedem): This is bug 1852610 where the compute service is +- # deleted but the resource provider is not because there are still +- # migration-based allocations against the source node provider. ++ # We expect the delete request to fail with a 409 error because of the ++ # instance in VERIFY_RESIZE status even though that instance is marked ++ # as being on host2 now. ++ ex = self.assertRaises(api_client.OpenStackApiException, ++ self.admin_api.api_delete, ++ '/os-services/%s' % service['id']) ++ self.assertEqual(409, ex.response.status_code) ++ self.assertIn('Unable to delete compute service that has in-progress ' ++ 'migrations', six.text_type(ex)) ++ self.assertIn('There are 1 in-progress migrations involving the host', ++ self.stdlog.logger.output) ++ # The provider is still around because we did not delete the service. + resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid) + self.assertEqual(200, resp.status) + self.assertFlavorMatchesUsage(host1_rp_uuid, flavor) + # Now try to confirm the migration. +- # FIXME(mriedem): This will fail until bug 1852610 is fixed and the +- # source compute service delete is blocked while there is an +- # in-progress migration involving the node. +- self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output) + self.api.post_server_action(server['id'], {'confirmResize': None}) +- self._wait_for_state_change(self.api, server, 'ERROR') +- self.assertIn('ComputeHostNotFound', self.stdlog.logger.output) ++ self._wait_for_state_change(self.api, server, 'ACTIVE') ++ # Delete the host1 service since the migration is confirmed and the ++ # server is on host2. ++ self.admin_api.api_delete('/os-services/%s' % service['id']) ++ # The host1 resource provider should be gone. 
++ resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid) ++ self.assertEqual(404, resp.status) + + def test_resize_revert_after_deleted_source_compute(self): + """Tests a scenario where a server is resized and while in +@@ -230,22 +239,31 @@ class TestServicesAPI(test_servers.Provi + # Delete the source compute service. + service = self.admin_api.get_services( + binary='nova-compute', host='host1')[0] +- self.admin_api.api_delete('/os-services/%s' % service['id']) +- # FIXME(mriedem): This is bug 1852610 where the compute service is +- # deleted but the resource provider is not because there are still +- # migration-based allocations against the source node provider. ++ # We expect the delete request to fail with a 409 error because of the ++ # instance in VERIFY_RESIZE status even though that instance is marked ++ # as being on host2 now. ++ ex = self.assertRaises(api_client.OpenStackApiException, ++ self.admin_api.api_delete, ++ '/os-services/%s' % service['id']) ++ self.assertEqual(409, ex.response.status_code) ++ self.assertIn('Unable to delete compute service that has in-progress ' ++ 'migrations', six.text_type(ex)) ++ self.assertIn('There are 1 in-progress migrations involving the host', ++ self.stdlog.logger.output) ++ # The provider is still around because we did not delete the service. + resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid) + self.assertEqual(200, resp.status) + self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1) +- # Now try to revert the resize. +- # NOTE(mriedem): This actually works because the drop_move_claim +- # happens in revert_resize on the dest host which still has its +- # ComputeNode record. The migration-based allocations are reverted +- # so the instance holds the allocations for the source provider and +- # the allocations against the dest provider are dropped. +- self.api.post_server_action(server['id'], {'revertResize': None}) +- self._wait_for_state_change(self.api, server, 'ACTIVE') +- self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output) ++ # Now revert the resize. ++ self._revert_resize(server) + self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1) + zero_flavor = {'vcpus': 0, 'ram': 0, 'disk': 0, 'extra_specs': {}} + self.assertFlavorMatchesUsage(host2_rp_uuid, zero_flavor) ++ # Delete the host2 service since the migration is reverted and the ++ # server is on host1 again. ++ service2 = self.admin_api.get_services( ++ binary='nova-compute', host='host2')[0] ++ self.admin_api.api_delete('/os-services/%s' % service2['id']) ++ # The host2 resource provider should be gone. ++ resp = self.placement_api.get('/resource_providers/%s' % host2_rp_uuid) ++ self.assertEqual(404, resp.status) +Index: nova-17.0.13/releasenotes/notes/bug-1852610-service-delete-with-migrations-ca0565fc0b503519.yaml +=================================================================== +--- /dev/null ++++ nova-17.0.13/releasenotes/notes/bug-1852610-service-delete-with-migrations-ca0565fc0b503519.yaml +@@ -0,0 +1,10 @@ ++--- ++fixes: ++ - | ++ The ``DELETE /os-services/{service_id}`` compute API will now return a ++ ``409 HTTPConflict`` response when trying to delete a ``nova-compute`` ++ service which is involved in in-progress migrations. This is because doing ++ so would not only orphan the compute node resource provider in the ++ placement service on which those instances have resource allocations but ++ can also break the ability to confirm/revert a pending resize properly. 
++ See https://bugs.launchpad.net/nova/+bug/1852610 for more details. diff -Nru nova-17.0.13/debian/patches/series nova-17.0.13/debian/patches/series --- nova-17.0.13/debian/patches/series 2021-09-22 04:29:56.000000000 +1000 +++ nova-17.0.13/debian/patches/series 2022-06-23 15:40:40.000000000 +1000 @@ -7,3 +7,7 @@ 0001-Force-refresh-instance-info_cache-during-heal.patch 0002-remove-deprecated-test_list_vifs_neutron_notimplemented.patch libvirt-Ignore-DiskNotFound-during-update_available.patch +0001-lp1852610_Add_functional_recreate_test_for_bug_1829479_and_bug_1817833.patch +0002-lp1852610_Add_functional_recreate_test_for_bug_1852610.patch +0003-lp1852610_Add_functional_recreate_revert_resize_test_for_bug_1852610.patch +0004-lp1852610_api_allows_source_compute_service.patch
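
The api-ref and release-note hunks above describe the operator-visible effect of patch 0004: once these patches are applied, DELETE /os-services/{service_id} returns a 409 Conflict while any compute node of that service is involved in an in-progress migration (for example a server left in VERIFY_RESIZE). The following is a minimal, hypothetical sketch of how an external cleanup script could cope with that response by retrying after the pending resizes have been confirmed or reverted; COMPUTE_URL, TOKEN and SERVICE_ID are placeholders, and the retry loop is an illustration rather than part of this packaging change, for example::

    # Hypothetical illustration only (not part of the patches above): retry a
    # compute service delete that is blocked by the new 409 response while
    # migrations are still in progress.  COMPUTE_URL, TOKEN and SERVICE_ID are
    # placeholders for a real deployment's values.
    import time

    import requests

    COMPUTE_URL = "http://controller:8774/v2.1"  # assumed nova-api endpoint
    TOKEN = "..."                                # assumed keystone token
    SERVICE_ID = "..."                           # nova-compute service id


    def delete_compute_service(retries=10, delay=30):
        headers = {"X-Auth-Token": TOKEN}
        for _ in range(retries):
            resp = requests.delete(
                "%s/os-services/%s" % (COMPUTE_URL, SERVICE_ID),
                headers=headers)
            if resp.status_code == 409:
                # In-progress migrations (e.g. a server in VERIFY_RESIZE)
                # block the delete; confirm or revert the resize and retry.
                time.sleep(delay)
                continue
            resp.raise_for_status()
            return
        raise RuntimeError("service still has in-progress migrations")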