diff -Nru pacemaker-1.1.10+git20130802/debian/changelog pacemaker-1.1.10+git20130802/debian/changelog --- pacemaker-1.1.10+git20130802/debian/changelog 2014-08-08 09:56:26.000000000 -0300 +++ pacemaker-1.1.10+git20130802/debian/changelog 2014-08-08 16:10:53.000000000 -0300 @@ -1,3 +1,11 @@ +pacemaker (1.1.10+git20130802-4ubuntu3) utopic; urgency=medium + + * Fix: services: Do not allow duplicate recurring op entries - 1/3 (LP: #1353473) + * High: lrmd: Merge duplicate recurring monitor operations - 2/3 (LP: #1353473) + * Fix: lrmd: Cancel recurring operations before stop action is executed - 3/3 (LP: #1353473) + + -- Rafael David Tinoco Fri, 08 Aug 2014 16:10:26 -0300 + pacemaker (1.1.10+git20130802-4ubuntu2) utopic; urgency=high * No change rebuild against gnutls28. diff -Nru pacemaker-1.1.10+git20130802/debian/patches/Fix-lrmd-Cancel-recurring-operations-before-stop-act.patch pacemaker-1.1.10+git20130802/debian/patches/Fix-lrmd-Cancel-recurring-operations-before-stop-act.patch --- pacemaker-1.1.10+git20130802/debian/patches/Fix-lrmd-Cancel-recurring-operations-before-stop-act.patch 1969-12-31 21:00:00.000000000 -0300 +++ pacemaker-1.1.10+git20130802/debian/patches/Fix-lrmd-Cancel-recurring-operations-before-stop-act.patch 2014-08-08 16:09:40.000000000 -0300 @@ -0,0 +1,295 @@ +Description: [PATCH 3/3] Fix: lrmd: Cancel recurring operations before stop action is executed + +Origin: upstream, commit: 348bb51 +Author: David Vossel +Last-Updated: 2014-08-08 +Bug-Ubuntu: https://bugs.launchpad.net/bugs/1353473 + +--- + include/crm/common/mainloop.h | 1 + + lib/common/mainloop.c | 167 ++++++++++++++++++++++++++++-------------- + lib/services/services.c | 9 ++- + lrmd/lrmd.c | 10 +-- + 4 files changed, 127 insertions(+), 60 deletions(-) + +diff --git a/include/crm/common/mainloop.h b/include/crm/common/mainloop.h +index 0941f1b..baee2ee 100644 +--- a/include/crm/common/mainloop.h ++++ b/include/crm/common/mainloop.h +@@ -93,6 +93,7 @@ const char 
*mainloop_child_name(mainloop_child_t * child); + + pid_t mainloop_child_pid(mainloop_child_t * child); + void mainloop_clear_child_userdata(mainloop_child_t * child); ++gboolean mainloop_child_kill(pid_t pid); + + # define G_PRIORITY_MEDIUM (G_PRIORITY_HIGH/2) + +diff --git a/lib/common/mainloop.c b/lib/common/mainloop.c +index d678584..a77e90c 100644 +--- a/lib/common/mainloop.c ++++ b/lib/common/mainloop.c +@@ -855,10 +855,35 @@ mainloop_clear_child_userdata(mainloop_child_t * child) + child->privatedata = NULL; + } + ++/* good function name */ ++static void ++child_free(mainloop_child_t *child) ++{ ++ if (child->timerid != 0) { ++ crm_trace("Removing timer %d", child->timerid); ++ g_source_remove(child->timerid); ++ child->timerid = 0; ++ } ++ free(child->desc); ++ free(child); ++} ++ ++/* terrible function name */ ++static int ++child_kill_helper(mainloop_child_t *child) ++{ ++ if (kill(child->pid, SIGKILL) < 0) { ++ crm_perror(LOG_ERR, "kill(%d, KILL) failed", child->pid); ++ return -errno; ++ } ++ return 0; ++} ++ + static gboolean + child_timeout_callback(gpointer p) + { + mainloop_child_t *child = p; ++ int rc = 0; + + child->timerid = 0; + if (child->timeout) { +@@ -866,88 +891,122 @@ child_timeout_callback(gpointer p) + return FALSE; + } + ++ rc = child_kill_helper(child); ++ if (rc == -ESRCH) { /* child_kill_helper() returns -errno */ ++ /* Nothing left to do. 
pid doesn't exist */ ++ return FALSE; ++ } ++ + child->timeout = TRUE; + crm_warn("%s process (PID %d) timed out", child->desc, (int)child->pid); + +- if (kill(child->pid, SIGKILL) < 0) { +- if (errno == ESRCH) { +- /* Nothing left to do */ +- return FALSE; +- } +- crm_perror(LOG_ERR, "kill(%d, KILL) failed", child->pid); +- } +- + child->timerid = g_timeout_add(5000, child_timeout_callback, child); + return FALSE; + } + + static GListPtr child_list = NULL; + +-static void +-child_death_dispatch(int signal) ++static gboolean ++child_waitpid(mainloop_child_t *child, int flags) + { +- GListPtr iter = child_list; +- +- while(iter) { +- int rc = 0; +- int core = 0; +- int signo = 0; +- int status = 0; +- int exitcode = 0; +- +- GListPtr saved = NULL; +- mainloop_child_t *child = iter->data; ++ int rc = 0; ++ int core = 0; ++ int signo = 0; ++ int status = 0; ++ int exitcode = 0; + +- rc = waitpid(child->pid, &status, WNOHANG); +- if(rc == 0) { +- iter = iter->next; +- continue; ++ rc = waitpid(child->pid, &status, flags); ++ if(rc == 0) { ++ return FALSE; + +- } else if(rc != child->pid) { +- signo = signal; +- exitcode = 1; +- status = 1; +- crm_perror(LOG_ERR, "Call to waitpid(%d) failed", child->pid); ++ } else if(rc != child->pid) { ++ signo = SIGCHLD; ++ exitcode = 1; ++ status = 1; ++ crm_perror(LOG_ERR, "Call to waitpid(%d) failed", child->pid); + +- } else { +- crm_trace("Managed process %d exited: %p", child->pid, child); ++ } else { ++ crm_trace("Managed process %d exited: %p", child->pid, child); + +- if (WIFEXITED(status)) { +- exitcode = WEXITSTATUS(status); +- crm_trace("Managed process %d (%s) exited with rc=%d", child->pid, child->desc, exitcode); ++ if (WIFEXITED(status)) { ++ exitcode = WEXITSTATUS(status); ++ crm_trace("Managed process %d (%s) exited with rc=%d", child->pid, child->desc, exitcode); + +- } else if (WIFSIGNALED(status)) { +- signo = WTERMSIG(status); +- crm_trace("Managed process %d (%s) exited with signal=%d", child->pid, child->desc, 
signo); +- } ++ } else if (WIFSIGNALED(status)) { ++ signo = WTERMSIG(status); ++ crm_trace("Managed process %d (%s) exited with signal=%d", child->pid, child->desc, signo); ++ } + #ifdef WCOREDUMP +- if (WCOREDUMP(status)) { +- core = 1; +- crm_err("Managed process %d (%s) dumped core", child->pid, child->desc); +- } +-#endif ++ if (WCOREDUMP(status)) { ++ core = 1; ++ crm_err("Managed process %d (%s) dumped core", child->pid, child->desc); + } ++#endif ++ } + +- if (child->callback) { +- child->callback(child, child->pid, core, signo, exitcode); +- } ++ if (child->callback) { ++ child->callback(child, child->pid, core, signo, exitcode); ++ } ++ return TRUE; ++} + +- crm_trace("Removing process entry %p for %d", child, child->pid); ++static void ++child_death_dispatch(int signal) ++{ ++ GListPtr iter = child_list; ++ gboolean exited; ++ ++ while(iter) { ++ GListPtr saved = NULL; ++ mainloop_child_t *child = iter->data; ++ exited = child_waitpid(child, WNOHANG); + + saved = iter; + iter = iter->next; + ++ if (exited == FALSE) { ++ continue; ++ } ++ crm_trace("Removing process entry %p for %d", child, child->pid); ++ + child_list = g_list_remove_link(child_list, saved); + g_list_free(saved); ++ child_free(child); ++ } ++} + +- if (child->timerid != 0) { +- crm_trace("Removing timer %d", child->timerid); +- g_source_remove(child->timerid); +- child->timerid = 0; ++gboolean ++mainloop_child_kill(pid_t pid) ++{ ++ GListPtr iter; ++ mainloop_child_t *child = NULL; ++ ++ for (iter = child_list; iter != NULL; iter = iter->next) { ++ child = iter->data; ++ if (pid == child->pid) { ++ break; + } +- free(child->desc); +- free(child); + } ++ ++ if (iter == NULL) { /* no tracked child has this pid; child may still point at an unrelated entry */ ++ return FALSE; ++ } ++ ++ if (child_kill_helper(child) != 0) { ++ /* failed to terminate child process */ ++ return FALSE; ++ } ++ ++ /* It is impossible to block SIGKILL, this allows us to ++ * call waitpid without WNOHANG here */ ++ if (child_waitpid(child, 0) == FALSE) { ++ /* not much we can do if this occurs 
*/ ++ return FALSE; ++ } ++ ++ child_list = g_list_remove(child_list, child); ++ child_free(child); ++ return TRUE; + } + + /* Create/Log a new tracked process +diff --git a/lib/services/services.c b/lib/services/services.c +index a9a7fd4..ce97bd5 100644 +--- a/lib/services/services.c ++++ b/lib/services/services.c +@@ -382,8 +382,15 @@ services_action_cancel(const char *name, const char *action, int interval) + } + services_action_free(op); + } else { +- crm_info("Cancelling op: %s will occur once operation completes", id); ++ int rc; ++ crm_info("Cancelling in-flight op: performing early termination of %s", id); + op->cancel = 1; ++ rc = mainloop_child_kill(op->pid); ++ if (rc == FALSE) { ++ /* mainloop_child_kill() returns FALSE on failure. Even though the early termination failed, ++ * the op will be marked as cancelled once it completes. */ ++ crm_err("Termination of %s failed", id); ++ } + } + + return TRUE; +diff --git a/lrmd/lrmd.c b/lrmd/lrmd.c +index a3a00ab..9edc749 100644 +--- a/lrmd/lrmd.c ++++ b/lrmd/lrmd.c +@@ -314,6 +314,11 @@ schedule_lrmd_cmd(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + return; + } + ++ /* crmd expects lrmd to automatically cancel recurring ops before rsc stops. 
*/ ++ if (rsc && safe_str_eq(cmd->action, "stop")) { ++ cancel_all_recurring(rsc, NULL); ++ } ++ + rsc->pending_ops = g_list_append(rsc->pending_ops, cmd); + #ifdef HAVE_SYS_TIMEB_H + ftime(&cmd->t_queue); +@@ -502,11 +507,6 @@ cmd_finalize(lrmd_cmd_t * cmd, lrmd_rsc_t * rsc) + + send_cmd_complete_notify(cmd); + +- /* crmd expects lrmd to automatically cancel recurring ops after rsc stops */ +- if (rsc && safe_str_eq(cmd->action, "stop")) { +- cancel_all_recurring(rsc, NULL); +- } +- + if (cmd->interval && (cmd->lrmd_op_status == PCMK_LRM_OP_CANCELLED)) { + if (rsc) { + rsc->recurring_ops = g_list_remove(rsc->recurring_ops, cmd); +-- +1.9.1 + diff -Nru pacemaker-1.1.10+git20130802/debian/patches/Fix-services-Do-not-allow-duplicate-recurring-op-ent.patch pacemaker-1.1.10+git20130802/debian/patches/Fix-services-Do-not-allow-duplicate-recurring-op-ent.patch --- pacemaker-1.1.10+git20130802/debian/patches/Fix-services-Do-not-allow-duplicate-recurring-op-ent.patch 1969-12-31 21:00:00.000000000 -0300 +++ pacemaker-1.1.10+git20130802/debian/patches/Fix-services-Do-not-allow-duplicate-recurring-op-ent.patch 2014-08-08 16:09:40.000000000 -0300 @@ -0,0 +1,94 @@ +Description: [PATCH 1/3] Fix: services: Do not allow duplicate recurring op entries + +Duplicate recurring operations silently replace each +other in a way that makes the original entry impossible +to cancel. This can cause unexpected monitor failures to +occur after resources were thought to have been stopped. 
+ +Origin: upstream, commit: 48f90f6 +Author: David Vossel +Last-Updated: 2014-08-08 +Bug-Ubuntu: https://bugs.launchpad.net/bugs/1353473 + +--- + lib/services/services.c | 49 +++++++++++++++++++++++++++++++++++++++++++++---- + 1 file changed, 45 insertions(+), 4 deletions(-) + +diff --git a/lib/services/services.c b/lib/services/services.c +index adfc508..373736e 100644 +--- a/lib/services/services.c ++++ b/lib/services/services.c +@@ -304,6 +304,9 @@ services_action_free(svc_action_t * op) + return; + } + ++ if (op->opaque->repeat_timer) { ++ g_source_remove(op->opaque->repeat_timer); ++ } + if (op->opaque->stderr_gsource) { + mainloop_del_fd(op->opaque->stderr_gsource); + op->opaque->stderr_gsource = NULL; +@@ -386,6 +389,44 @@ services_action_cancel(const char *name, const char *action, int interval) + return TRUE; + } + ++/* add new recurring operation, check for duplicates. ++ * - if duplicate found, return TRUE, immediately reschedule op. ++ * - if no dup, return FALSE, inserve into recurring op list.*/ ++static gboolean ++handle_duplicate_recurring(svc_action_t * op, void (*action_callback) (svc_action_t *)) ++{ ++ svc_action_t * dup = NULL; ++ ++ if (recurring_actions == NULL) { ++ recurring_actions = g_hash_table_new_full(g_str_hash, g_str_equal, NULL, NULL); ++ return FALSE; ++ } ++ ++ /* check for duplicates */ ++ dup = g_hash_table_lookup(recurring_actions, op->id); ++ ++ if (dup && (dup != op)) { ++ /* update user data */ ++ if (op->opaque->callback) { ++ dup->opaque->callback = op->opaque->callback; ++ dup->cb_data = op->cb_data; ++ op->cb_data = NULL; ++ } ++ /* immediately execute the next interval */ ++ if (dup->pid != 0) { ++ if (op->opaque->repeat_timer) { ++ g_source_remove(op->opaque->repeat_timer); ++ } ++ recurring_action_timer(dup); ++ } ++ /* free the dup. 
*/ ++ services_action_free(op); ++ return TRUE; ++ } ++ ++ return FALSE; ++} ++ + gboolean + services_action_async(svc_action_t * op, void (*action_callback) (svc_action_t *)) + { +@@ -393,11 +434,11 @@ services_action_async(svc_action_t * op, void (*action_callback) (svc_action_t * + op->opaque->callback = action_callback; + } + +- if (recurring_actions == NULL) { +- recurring_actions = g_hash_table_new_full(g_str_hash, g_str_equal, NULL, NULL); +- } +- + if (op->interval > 0) { ++ if (handle_duplicate_recurring(op, action_callback) == TRUE) { ++ /* entry rescheduled, dup freed */ ++ return TRUE; ++ } + g_hash_table_replace(recurring_actions, op->id, op); + } + #if SUPPORT_UPSTART +-- +1.9.1 + diff -Nru pacemaker-1.1.10+git20130802/debian/patches/High-lrmd-Merge-duplicate-recurring-monitor-operatio.patch pacemaker-1.1.10+git20130802/debian/patches/High-lrmd-Merge-duplicate-recurring-monitor-operatio.patch --- pacemaker-1.1.10+git20130802/debian/patches/High-lrmd-Merge-duplicate-recurring-monitor-operatio.patch 1969-12-31 21:00:00.000000000 -0300 +++ pacemaker-1.1.10+git20130802/debian/patches/High-lrmd-Merge-duplicate-recurring-monitor-operatio.patch 2014-08-08 16:09:43.000000000 -0300 @@ -0,0 +1,230 @@ +Description: [PATCH 2/3] High: lrmd: Merge duplicate recurring monitor operations + +Never allow two instances of the same recurring monitor operation +to exist in the lrmd. + +Conflicts: + include/crm/services.h + +* This conflict was due to different functions declared in services.h. +* resources_find_service_class() does not exist in 1.1.10. 
+ +Origin: upstream, commit: c29ab27 +Author: David Vossel +Last-Updated: 2014-08-08 +Bug-Ubuntu: https://bugs.launchpad.net/bugs/1353473 +Signed-off-by: Rafael David Tinoco + +--- + include/crm/services.h | 5 +++ + lib/services/services.c | 29 +++++++++++++++ + lrmd/lrmd.c | 93 ++++++++++++++++++++++++++++++++++++++++++------- + 3 files changed, 114 insertions(+), 13 deletions(-) + +diff --git a/include/crm/services.h b/include/crm/services.h +index fb5c6b0..6c34782 100644 +--- a/include/crm/services.h ++++ b/include/crm/services.h +@@ -231,6 +231,11 @@ enum nagios_exitcode { + int timeout /* ms */ , GHashTable * params); + + /** ++ * Kick a recurring action so it is scheduled immediately for re-execution ++ */ ++ gboolean services_action_kick(const char *name, const char *action, int interval /* ms */); ++ ++/** + * Utilize services API to execute an arbitrary command. + * + * This API has useful infrastructure in place to be able to run a command +diff --git a/lib/services/services.c b/lib/services/services.c +index 373736e..a9a7fd4 100644 +--- a/lib/services/services.c ++++ b/lib/services/services.c +@@ -389,6 +389,35 @@ services_action_cancel(const char *name, const char *action, int interval) + return TRUE; + } + ++gboolean ++services_action_kick(const char *name, const char *action, int interval /* ms */) ++{ ++ svc_action_t * op = NULL; ++ char *id = NULL; ++ ++ if (asprintf(&id, "%s_%s_%d", name, action, interval) == -1) { ++ return FALSE; ++ } ++ ++ op = g_hash_table_lookup(recurring_actions, id); ++ free(id); ++ ++ if (op == NULL) { ++ return FALSE; ++ } ++ ++ if (op->pid) { ++ return TRUE; ++ } else { ++ if (op->opaque->repeat_timer) { ++ g_source_remove(op->opaque->repeat_timer); ++ } ++ recurring_action_timer(op); ++ return TRUE; ++ } ++ ++} ++ + /* add new recurring operation, check for duplicates. + * - if duplicate found, return TRUE, immediately reschedule op. 
+ * - if no dup, return FALSE, inserve into recurring op list.*/ +diff --git a/lrmd/lrmd.c b/lrmd/lrmd.c +index a4747cb..a3a00ab 100644 +--- a/lrmd/lrmd.c ++++ b/lrmd/lrmd.c +@@ -122,6 +122,17 @@ log_execute(lrmd_cmd_t * cmd) + cmd->rsc_id, cmd->action, cmd->call_id); + } + ++static const char * ++normalize_action_name(lrmd_rsc_t * rsc, const char *action) ++{ ++ if (safe_str_eq(action, "monitor") && ++ (safe_str_eq(rsc->class, "lsb") || ++ safe_str_eq(rsc->class, "service") || safe_str_eq(rsc->class, "systemd"))) { ++ return "status"; ++ } ++ return action; ++} ++ + static lrmd_rsc_t * + build_rsc_from_xml(xmlNode * msg) + { +@@ -233,13 +244,76 @@ start_delay_helper(gpointer data) + return FALSE; + } + ++static gboolean ++merge_recurring_duplicate(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) ++{ ++ GListPtr gIter = NULL; ++ lrmd_cmd_t * dup = NULL; ++ gboolean dup_pending = FALSE; ++ ++ if (cmd->interval == 0) { ++ return 0; ++ } ++ ++ for (gIter = rsc->pending_ops; gIter != NULL; gIter = gIter->next) { ++ dup = gIter->data; ++ if (safe_str_eq(cmd->action, dup->action) && cmd->interval == dup->interval) { ++ dup_pending = TRUE; ++ goto merge_dup; ++ } ++ } ++ ++ /* if dup is in recurring_ops list, that means it has already executed ++ * and is in the interval loop. we can't just remove it in this case. 
*/ ++ for (gIter = rsc->recurring_ops; gIter != NULL; gIter = gIter->next) { ++ dup = gIter->data; ++ if (safe_str_eq(cmd->action, dup->action) && cmd->interval == dup->interval) { ++ goto merge_dup; ++ } ++ } ++ ++ return FALSE; ++merge_dup: ++ ++ /* merge */ ++ dup->first_notify_sent = 0; ++ free(dup->userdata_str); ++ dup->userdata_str = cmd->userdata_str; ++ cmd->userdata_str = NULL; ++ dup->call_id = cmd->call_id; ++ ++ if (safe_str_eq(rsc->class, "stonith")) { ++ /* if we are waiting for the next interval, kick it off now */ ++ if (dup_pending == TRUE) { ++ g_source_remove(cmd->stonith_recurring_id); ++ cmd->stonith_recurring_id = 0; ++ stonith_recurring_op_helper(cmd); ++ } ++ ++ } else if (dup_pending == FALSE) { ++ /* if we've already handed this to the service lib, kick off an early execution */ ++ services_action_kick(rsc->rsc_id, normalize_action_name(rsc, dup->action), dup->interval); ++ } ++ free_lrmd_cmd(cmd); ++ ++ return TRUE; ++} ++ + static void + schedule_lrmd_cmd(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + { ++ gboolean dup_processed = FALSE; + CRM_CHECK(cmd != NULL, return); + CRM_CHECK(rsc != NULL, return); + + crm_trace("Scheduling %s on %s", cmd->action, rsc->rsc_id); ++ ++ dup_processed = merge_recurring_duplicate(rsc, cmd); ++ if (dup_processed) { ++ /* duplicate recurring cmd found, cmds merged */ ++ return; ++ } ++ + rsc->pending_ops = g_list_append(rsc->pending_ops, cmd); + #ifdef HAVE_SYS_TIMEB_H + ftime(&cmd->t_queue); +@@ -249,7 +323,6 @@ schedule_lrmd_cmd(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + if (cmd->start_delay) { + cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd); + } +- + } + + static void +@@ -809,17 +882,6 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + return rc; + } + +-static const char * +-normalize_action_name(lrmd_rsc_t * rsc, const char *action) +-{ +- if (safe_str_eq(action, "monitor") && +- (safe_str_eq(rsc->class, "lsb") || +- safe_str_eq(rsc->class, "service") || 
safe_str_eq(rsc->class, "systemd"))) { +- return "status"; +- } +- return action; +-} +- + static void + dup_attr(gpointer key, gpointer value, gpointer user_data) + { +@@ -1129,6 +1191,7 @@ process_lrmd_rsc_exec(crm_client_t * client, uint32_t id, xmlNode * request) + lrmd_cmd_t *cmd = NULL; + xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR); + const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID); ++ int call_id; + + if (!rsc_id) { + return -EINVAL; +@@ -1140,9 +1203,13 @@ process_lrmd_rsc_exec(crm_client_t * client, uint32_t id, xmlNode * request) + } + + cmd = create_lrmd_cmd(request, client); ++ call_id = cmd->call_id; ++ ++ /* Don't reference cmd after handing it off to be scheduled. ++ * The cmd could get merged and freed. */ + schedule_lrmd_cmd(rsc, cmd); + +- return cmd->call_id; ++ return call_id; + } + + static int +-- +1.9.1 + diff -Nru pacemaker-1.1.10+git20130802/debian/patches/series pacemaker-1.1.10+git20130802/debian/patches/series --- pacemaker-1.1.10+git20130802/debian/patches/series 2013-08-06 08:02:49.000000000 -0300 +++ pacemaker-1.1.10+git20130802/debian/patches/series 2014-08-08 16:10:00.000000000 -0300 @@ -5,3 +5,6 @@ cli_stop_after_assertion_failure.patch gracefully_handle_ECHILD_in_waitpid.patch fix_crm_mon_host_list.patch +Fix-services-Do-not-allow-duplicate-recurring-op-ent.patch +High-lrmd-Merge-duplicate-recurring-monitor-operatio.patch +Fix-lrmd-Cancel-recurring-operations-before-stop-act.patch