diff -Nru pacemaker-1.1.18/debian/changelog pacemaker-1.1.18/debian/changelog --- pacemaker-1.1.18/debian/changelog 2020-03-05 23:28:20.000000000 -0300 +++ pacemaker-1.1.18/debian/changelog 2020-08-14 17:51:39.000000000 -0400 @@ -1,3 +1,16 @@ +pacemaker (1.1.18-0ubuntu1.3) bionic; urgency=medium + + * Pacemaker fixes to disable ordered stops when + a remote connection is down or node is shutting down (LP: #1890491). + - d/p/lp1890491-Fix-libpe_status-don-t-order-implied-stops-relative.patch: + libpe_status: don't order implied stops relative to a remote connection. + - d/p/lp1890491-Fix-scheduler-remote-state-is-failed-if-node-is-shut.patch: + remote state is failed if node is shutting down with connection failure + - d/p/lp1890491-Refactor-libpe_status-add-function-for-checking-shut.patch: + add function for checking shutdown attribute. + + -- Jorge Niedbalski Fri, 14 Aug 2020 17:51:39 -0400 + pacemaker (1.1.18-0ubuntu1.2) bionic; urgency=medium * Pacemaker fixes to allow fence-agents to work correctly (LP: #1866119) diff -Nru pacemaker-1.1.18/debian/patches/0001-Fix-libpe_status-don-t-order-implied-stops-relative-.patch pacemaker-1.1.18/debian/patches/0001-Fix-libpe_status-don-t-order-implied-stops-relative-.patch --- pacemaker-1.1.18/debian/patches/0001-Fix-libpe_status-don-t-order-implied-stops-relative-.patch 1969-12-31 21:00:00.000000000 -0300 +++ pacemaker-1.1.18/debian/patches/0001-Fix-libpe_status-don-t-order-implied-stops-relative-.patch 2020-08-14 17:51:17.000000000 -0400 @@ -0,0 +1,55 @@ +From 215aceb69d501e32dde697f6352562e7444fb454 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 5 Jun 2019 16:43:19 -0500 +Subject: [PATCH 1/2] Fix: libpe_status: don't order implied stops relative to + a remote connection + +Actions behind a remote connection are ordered relative to any start or stop of +the remote connection. However, if the action is a stop implied due to fencing, +it does not require the remote connection, and the ordering should not be done. + +This avoids a delay in the remote connection recovery if it is failed, e.g. +previously the ordering would look like: + + fence remote node -> implied stop of resource on remote -> stop connection + +Now, the connection stop can proceed simultaneously with the remote node +fencing. +--- + pengine/allocate.c | 20 ++++++++------------ + 1 file changed, 8 insertions(+), 12 deletions(-) + +diff --git a/pengine/allocate.c b/pengine/allocate.c +index 98464a9b5..57707edf5 100644 +--- a/pengine/allocate.c ++++ b/pengine/allocate.c +@@ -2045,18 +2045,14 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set) + order_start_then_action(remote_rsc, action, pe_order_none, + data_set); + +- } else { +- if(state == remote_state_failed) { +- /* We would only be here if the resource is +- * running on the remote node. Since we have no +- * way to stop it, it is necessary to fence the +- * node. +- */ +- pe_fence_node(data_set, action->node, "resources are active and the connection is unrecoverable"); +- } +- +- order_action_then_stop(action, remote_rsc, +- pe_order_implies_first, data_set); ++ } else if(state == remote_state_failed) { ++ /* The resource is active on the node, but since we don't have a ++ * valid connection, the only way to stop the resource is by ++ * fencing the node. There is no need to order the stop relative ++ * to the remote connection, since the stop will become implied ++ * by the fencing. ++ */ ++ pe_fence_node(data_set, action->node, "resources are active and the connection is unrecoverable"); + } + break; + +-- +2.25.1 + diff -Nru pacemaker-1.1.18/debian/patches/0002-Fix-scheduler-remote-state-is-failed-if-node-is-shut.patch pacemaker-1.1.18/debian/patches/0002-Fix-scheduler-remote-state-is-failed-if-node-is-shut.patch --- pacemaker-1.1.18/debian/patches/0002-Fix-scheduler-remote-state-is-failed-if-node-is-shut.patch 1969-12-31 21:00:00.000000000 -0300 +++ pacemaker-1.1.18/debian/patches/0002-Fix-scheduler-remote-state-is-failed-if-node-is-shut.patch 2020-08-14 17:51:17.000000000 -0400 @@ -0,0 +1,39 @@ +From 354011b561008d1e48d1bce08255997fa8aee1b7 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 5 Jun 2019 16:37:26 -0500 +Subject: [PATCH 2/2] Fix: scheduler: remote state is failed if node is + shutting down with connection failure + +When determining remote state, if the connection resource is failed and not +being started again, we consider the state to be unknown if the connection has +a reconnect interval, because we won't know whether the connection can be +recovered until the interval expires and we re-attempt connection. + +However, if the node is shutting down at the time, we won't re-attempt +connection, so consider the state failed in that case. (Note that we check the +actual shutdown node attribute, rather than node->details->shutdown, since that +is set for remote nodes whenever the connection is stopping.) + +This avoids a situation where actions that cannot succeed can be scheduled on a +remote node that's shutting down. +--- + pengine/allocate.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/pengine/allocate.c b/pengine/allocate.c +index 57707edf5..faa82d4d0 100644 +--- a/pengine/allocate.c ++++ b/pengine/allocate.c +@@ -1949,7 +1949,8 @@ get_remote_node_state(pe_node_t *node) + + if ((remote_rsc->next_role == RSC_ROLE_STOPPED) + && remote_rsc->remote_reconnect_interval +- && node->details->remote_was_fenced) { ++ && node->details->remote_was_fenced ++ && !pe__shutdown_requested(node)) { + + /* We won't know whether the connection is recoverable until the + * reconnect interval expires and we reattempt connection. +-- +2.25.1 + diff -Nru pacemaker-1.1.18/debian/patches/lp1890491-Fix-libpe_status-don-t-order-implied-stops-relative-.patch pacemaker-1.1.18/debian/patches/lp1890491-Fix-libpe_status-don-t-order-implied-stops-relative-.patch --- pacemaker-1.1.18/debian/patches/lp1890491-Fix-libpe_status-don-t-order-implied-stops-relative-.patch 1969-12-31 21:00:00.000000000 -0300 +++ pacemaker-1.1.18/debian/patches/lp1890491-Fix-libpe_status-don-t-order-implied-stops-relative-.patch 2020-08-14 17:51:17.000000000 -0400 @@ -0,0 +1,55 @@ +From 215aceb69d501e32dde697f6352562e7444fb454 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 5 Jun 2019 16:43:19 -0500 +Subject: [PATCH 1/2] Fix: libpe_status: don't order implied stops relative to + a remote connection + +Actions behind a remote connection are ordered relative to any start or stop of +the remote connection. However, if the action is a stop implied due to fencing, +it does not require the remote connection, and the ordering should not be done. + +This avoids a delay in the remote connection recovery if it is failed, e.g. +previously the ordering would look like: + + fence remote node -> implied stop of resource on remote -> stop connection + +Now, the connection stop can proceed simultaneously with the remote node +fencing. +--- + pengine/allocate.c | 20 ++++++++------------ + 1 file changed, 8 insertions(+), 12 deletions(-) + +diff --git a/pengine/allocate.c b/pengine/allocate.c +index 98464a9b5..57707edf5 100644 +--- a/pengine/allocate.c ++++ b/pengine/allocate.c +@@ -2045,18 +2045,14 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set) + order_start_then_action(remote_rsc, action, pe_order_none, + data_set); + +- } else { +- if(state == remote_state_failed) { +- /* We would only be here if the resource is +- * running on the remote node. Since we have no +- * way to stop it, it is necessary to fence the +- * node. +- */ +- pe_fence_node(data_set, action->node, "resources are active and the connection is unrecoverable"); +- } +- +- order_action_then_stop(action, remote_rsc, +- pe_order_implies_first, data_set); ++ } else if(state == remote_state_failed) { ++ /* The resource is active on the node, but since we don't have a ++ * valid connection, the only way to stop the resource is by ++ * fencing the node. There is no need to order the stop relative ++ * to the remote connection, since the stop will become implied ++ * by the fencing. ++ */ ++ pe_fence_node(data_set, action->node, "resources are active and the connection is unrecoverable"); + } + break; + +-- +2.25.1 + diff -Nru pacemaker-1.1.18/debian/patches/lp1890491-Fix-scheduler-remote-state-is-failed-if-node-is-shut.patch pacemaker-1.1.18/debian/patches/lp1890491-Fix-scheduler-remote-state-is-failed-if-node-is-shut.patch --- pacemaker-1.1.18/debian/patches/lp1890491-Fix-scheduler-remote-state-is-failed-if-node-is-shut.patch 1969-12-31 21:00:00.000000000 -0300 +++ pacemaker-1.1.18/debian/patches/lp1890491-Fix-scheduler-remote-state-is-failed-if-node-is-shut.patch 2020-08-14 17:51:17.000000000 -0400 @@ -0,0 +1,39 @@ +From 354011b561008d1e48d1bce08255997fa8aee1b7 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 5 Jun 2019 16:37:26 -0500 +Subject: [PATCH 2/2] Fix: scheduler: remote state is failed if node is + shutting down with connection failure + +When determining remote state, if the connection resource is failed and not +being started again, we consider the state to be unknown if the connection has +a reconnect interval, because we won't know whether the connection can be +recovered until the interval expires and we re-attempt connection. + +However, if the node is shutting down at the time, we won't re-attempt +connection, so consider the state failed in that case. (Note that we check the +actual shutdown node attribute, rather than node->details->shutdown, since that +is set for remote nodes whenever the connection is stopping.) + +This avoids a situation where actions that cannot succeed can be scheduled on a +remote node that's shutting down. +--- + pengine/allocate.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/pengine/allocate.c b/pengine/allocate.c +index 57707edf5..faa82d4d0 100644 +--- a/pengine/allocate.c ++++ b/pengine/allocate.c +@@ -1949,7 +1949,8 @@ get_remote_node_state(pe_node_t *node) + + if ((remote_rsc->next_role == RSC_ROLE_STOPPED) + && remote_rsc->remote_reconnect_interval +- && node->details->remote_was_fenced) { ++ && node->details->remote_was_fenced ++ && !pe__shutdown_requested(node)) { + + /* We won't know whether the connection is recoverable until the + * reconnect interval expires and we reattempt connection. +-- +2.25.1 + diff -Nru pacemaker-1.1.18/debian/patches/lp1890491-Refactor-libpe_status-add-function-for-checking-shut.patch pacemaker-1.1.18/debian/patches/lp1890491-Refactor-libpe_status-add-function-for-checking-shut.patch --- pacemaker-1.1.18/debian/patches/lp1890491-Refactor-libpe_status-add-function-for-checking-shut.patch 1969-12-31 21:00:00.000000000 -0300 +++ pacemaker-1.1.18/debian/patches/lp1890491-Refactor-libpe_status-add-function-for-checking-shut.patch 2020-08-14 17:51:39.000000000 -0400 @@ -0,0 +1,96 @@ +From 33ff3ec614fb0b7e08f7b58fa602a08b5a39635f Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Sat, 8 Jun 2019 16:25:04 -0500 +Subject: [PATCH] Refactor: libpe_status: add function for checking shutdown + attribute + +... to reduce code duplication and allow further reuse +--- + include/crm/pengine/internal.h | 2 +- + lib/pengine/unpack.c | 8 ++------ + lib/pengine/utils.c | 20 ++++++++++++++++++++ + 3 files changed, 23 insertions(+), 7 deletions(-) + +diff --git a/include/crm/pengine/internal.h b/include/crm/pengine/internal.h +index 44aef048f..962c51db8 100644 +--- a/include/crm/pengine/internal.h ++++ b/include/crm/pengine/internal.h +@@ -324,5 +324,5 @@ bool container_fix_remote_addr(resource_t *rsc); + const char *container_fix_remote_addr_in(resource_t *rsc, xmlNode *xml, const char *field); + const char *pe_node_attribute_calculated(pe_node_t *node, const char *name, resource_t *rsc); + const char *pe_node_attribute_raw(pe_node_t *node, const char *name); +- ++bool pe__shutdown_requested(pe_node_t *node); + #endif +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index 71307790e..7e6a89395 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -1006,7 +1006,6 @@ unpack_handle_remote_attrs(node_t *this_node, xmlNode *state, pe_working_set_t * + const char *resource_discovery_enabled = NULL; + xmlNode *attrs = NULL; + resource_t *rsc = NULL; +- const char *shutdown = NULL; + + if (crm_str_eq((const char *)state->name, XML_CIB_TAG_STATE, TRUE) == FALSE) { + return; +@@ -1028,8 +1027,7 @@ unpack_handle_remote_attrs(node_t *this_node, xmlNode *state, pe_working_set_t * + attrs = find_xml_node(state, XML_TAG_TRANSIENT_NODEATTRS, FALSE); + add_node_attrs(attrs, this_node, TRUE, data_set); + +- shutdown = pe_node_attribute_raw(this_node, XML_CIB_ATTR_SHUTDOWN); +- if (shutdown != NULL && safe_str_neq("0", shutdown)) { ++ if (pe__shutdown_requested(this_node)) { + crm_info("Node %s is shutting down", this_node->details->uname); + this_node->details->shutdown = TRUE; + if (rsc) { +@@ -1489,7 +1487,6 @@ gboolean + determine_online_status(xmlNode * node_state, node_t * this_node, pe_working_set_t * data_set) + { + gboolean online = FALSE; +- const char *shutdown = NULL; + const char *exp_state = crm_element_value(node_state, XML_NODE_EXPECTED); + + if (this_node == NULL) { +@@ -1499,9 +1496,8 @@ determine_online_status(xmlNode * node_state, node_t * this_node, pe_working_set + + this_node->details->shutdown = FALSE; + this_node->details->expected_up = FALSE; +- shutdown = pe_node_attribute_raw(this_node, XML_CIB_ATTR_SHUTDOWN); + +- if (shutdown != NULL && safe_str_neq("0", shutdown)) { ++ if (pe__shutdown_requested(this_node)) { + this_node->details->shutdown = TRUE; + + } else if (safe_str_eq(exp_state, CRMD_JOINSTATE_MEMBER)) { +diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c +index 0ce0e30c9..4aaab312d 100644 +--- a/lib/pengine/utils.c ++++ b/lib/pengine/utils.c +@@ -2364,3 +2364,23 @@ void pe_action_set_reason(pe_action_t *action, const char *reason, bool overwrit + } + } + } ++ ++/*! ++ * \internal ++ * \brief Check whether shutdown has been requested for a node ++ * ++ * \param[in] node Node to check ++ * ++ * \return TRUE if node has shutdown attribute set and nonzero, FALSE otherwise ++ * \note This differs from simply using node->details->shutdown in that it can ++ * be used before that has been determined (and in fact to determine it), ++ * and it can also be used to distinguish requested shutdown from implicit ++ * shutdown of remote nodes by virtue of their connection stopping. ++ */ ++bool ++pe__shutdown_requested(pe_node_t *node) ++{ ++ const char *shutdown = pe_node_attribute_raw(node, XML_CIB_ATTR_SHUTDOWN); ++ ++ return shutdown && strcmp(shutdown, "0"); ++} +-- +2.25.1 + diff -Nru pacemaker-1.1.18/debian/patches/series pacemaker-1.1.18/debian/patches/series --- pacemaker-1.1.18/debian/patches/series 2020-03-05 23:28:20.000000000 -0300 +++ pacemaker-1.1.18/debian/patches/series 2020-08-14 17:51:39.000000000 -0400 @@ -20,3 +20,6 @@ lp1866119-Fix-crmd-avoid-double-free.patch lp1866119-Fix-pengine-unfence-before-probing.patch lp1866119-Refactor-pengine-functionize.patch +lp1890491-Fix-libpe_status-don-t-order-implied-stops-relative-.patch +lp1890491-Fix-scheduler-remote-state-is-failed-if-node-is-shut.patch +lp1890491-Refactor-libpe_status-add-function-for-checking-shut.patch