Skip to content

Fix: fencer: Refresh CIB devices on change to nodes section #3855

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 67 additions & 41 deletions daemons/fenced/fenced_cib.c
Original file line number Diff line number Diff line change
Expand Up @@ -232,70 +232,91 @@ cib_devices_update(void)
}
}

#define PRIMITIVE_ID_XP_FRAGMENT "/" PCMK_XE_PRIMITIVE "[@" PCMK_XA_ID "='"

static void
update_cib_stonith_devices(const char *event, xmlNode * msg)
update_cib_stonith_devices(const xmlNode *patchset)
{
int format = 1;
xmlNode *wrapper = pcmk__xe_first_child(msg, PCMK__XE_CIB_UPDATE_RESULT,
NULL, NULL);
xmlNode *patchset = pcmk__xe_first_child(wrapper, NULL, NULL, NULL);
char *reason = NULL;

CRM_CHECK(patchset != NULL, return);
crm_element_value_int(patchset, PCMK_XA_FORMAT, &format);

if (format != 2) {
crm_warn("Unknown patch format: %d", format);
return;
}

for (xmlNode *change = pcmk__xe_first_child(patchset, NULL, NULL, NULL);
for (const xmlNode *change = pcmk__xe_first_child(patchset, NULL, NULL,
NULL);
change != NULL; change = pcmk__xe_next(change, NULL)) {

const char *op = crm_element_value(change, PCMK_XA_OPERATION);
const char *xpath = crm_element_value(change, PCMK_XA_PATH);
const char *shortpath = NULL;
const char *primitive_xpath = NULL;

if (pcmk__str_eq(op, PCMK_VALUE_MOVE, pcmk__str_null_matches)
|| (strstr(xpath, "/" PCMK_XE_STATUS) != NULL)) {
continue;
}

if (pcmk__str_eq(op, PCMK_VALUE_DELETE, pcmk__str_none)
&& (strstr(xpath, "/" PCMK_XE_PRIMITIVE) != NULL)) {
primitive_xpath = strstr(xpath, PRIMITIVE_ID_XP_FRAGMENT);
if ((primitive_xpath != NULL)
&& pcmk__str_eq(op, PCMK_VALUE_DELETE, pcmk__str_none)) {

const char *rsc_id = NULL;
char *search = NULL;
char *mutable = NULL;
const char *end_quote = NULL;

if ((strstr(xpath, PCMK_XE_INSTANCE_ATTRIBUTES) != NULL)
|| (strstr(xpath, PCMK_XE_META_ATTRIBUTES) != NULL)) {
if ((strstr(primitive_xpath, PCMK_XE_INSTANCE_ATTRIBUTES) != NULL)
|| (strstr(primitive_xpath, PCMK_XE_META_ATTRIBUTES) != NULL)) {

reason = pcmk__str_copy("(meta) attribute deleted from "
"resource");
break;
}
mutable = pcmk__str_copy(xpath);
rsc_id = strstr(mutable, PCMK_XE_PRIMITIVE "[@" PCMK_XA_ID "=\'");
if (rsc_id != NULL) {
rsc_id += strlen(PCMK_XE_PRIMITIVE "[@" PCMK_XA_ID "=\'");
search = strchr(rsc_id, '\'');

rsc_id = primitive_xpath + sizeof(PRIMITIVE_ID_XP_FRAGMENT) - 1;
end_quote = strchr(rsc_id, '\'');

CRM_LOG_ASSERT(end_quote != NULL);
if (end_quote == NULL) {
crm_err("Bug: Malformed item in Pacemaker-generated patchset");
continue;
}
if (search != NULL) {
*search = 0;
stonith_device_remove(rsc_id, true);

if (strchr(end_quote, '/') == NULL) {
/* The primitive element itself was deleted. If this was a
* fencing resource, it's faster to remove it directly than to
* run the scheduler and update all device registrations.
*/
char *copy = strndup(rsc_id, end_quote - rsc_id);

pcmk__assert(copy != NULL);
stonith_device_remove(copy, true);

/* watchdog_device_update called afterwards
to fall back to implicit definition if needed */
} else {
crm_warn("Ignoring malformed CIB update (resource deletion)");

free(copy);
continue;
}
free(mutable);

} else if (strstr(xpath, "/" PCMK_XE_RESOURCES)
|| strstr(xpath, "/" PCMK_XE_CONSTRAINTS)
|| strstr(xpath, "/" PCMK_XE_RSC_DEFAULTS)) {
shortpath = strrchr(xpath, '/');
pcmk__assert(shortpath != NULL);
reason = crm_strdup_printf("%s %s", op, shortpath+1);
}

/* @FIXME Suppose a node gets deleted in one CIB diff and then re-added
* in another. For example, this can happen if a client deletes a node
* from the CIB when it's still part of the cluster, and then the
* controller's diff callback repopulates the nodes section. If
* responding to the deletion here causes a stonith device to get
* unregistered, then the next monitor will fail. This is because when
* the device gets re-registered after nodes are repopulated, the
* api_registered flag doesn't get set. Only the cib_registered flag
* gets set.
*
* See commit message in eaee0fc for some more info.
*
* The fencer should not refuse to run a monitor action for a resource
* that the controller knows is started.
*/
if (strstr(xpath, "/" PCMK_XE_NODES)
|| strstr(xpath, "/" PCMK_XE_RESOURCES)
|| strstr(xpath, "/" PCMK_XE_CONSTRAINTS)
|| strstr(xpath, "/" PCMK_XE_RSC_DEFAULTS)) {

const char *shortpath = strrchr(xpath, '/');

reason = crm_strdup_printf("%s %s", op, shortpath + 1);
break;
}
}
Expand Down Expand Up @@ -465,6 +486,7 @@ update_fencing_topology(const char *event, xmlNode *msg)
static void
update_cib_cache_cb(const char *event, xmlNode * msg)
{
xmlNode *patchset = NULL;
long long timeout_ms_saved = stonith_watchdog_timeout_ms;
bool need_full_refresh = false;

Expand All @@ -483,7 +505,6 @@ update_cib_cache_cb(const char *event, xmlNode * msg)
if (local_cib != NULL) {
int rc = pcmk_ok;
xmlNode *wrapper = NULL;
xmlNode *patchset = NULL;

crm_element_value_int(msg, PCMK__XA_CIB_RC, &rc);
if (rc != pcmk_ok) {
Expand All @@ -498,6 +519,11 @@ update_cib_cache_cb(const char *event, xmlNode * msg)
switch (rc) {
case pcmk_ok:
case -pcmk_err_old_data:
/* @TODO Full refresh (with or without query) in case of
* -pcmk_err_old_data? It seems wrong to call
* stonith_device_remove() based on primitive deletion in an
* old diff.
*/
break;
case -pcmk_err_diff_resync:
case -pcmk_err_diff_failed:
Expand Down Expand Up @@ -532,7 +558,7 @@ update_cib_cache_cb(const char *event, xmlNode * msg)
} else {
// Partial refresh
update_fencing_topology(event, msg);
update_cib_stonith_devices(event, msg);
update_cib_stonith_devices(patchset);
}

watchdog_device_update();
Expand Down
36 changes: 4 additions & 32 deletions daemons/fenced/fenced_scheduler.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,35 +52,6 @@ fenced_scheduler_init(void)
return pcmk_rc_ok;
}

/*!
* \internal
* \brief Set the local node name for scheduling purposes
*
* \param[in] node_name Name to set as local node name
*/
void
fenced_set_local_node(const char *node_name)
{
pcmk__assert(scheduler != NULL);

scheduler->priv->local_node_name = pcmk__str_copy(node_name);
}

/*!
* \internal
* \brief Get the local node name
*
* \return Local node name
*/
const char *
fenced_get_local_node(void)
{
if (scheduler == NULL) {
return NULL;
}
return scheduler->priv->local_node_name;
}

/*!
* \internal
* \brief Free all scheduler-related resources
Expand Down Expand Up @@ -112,14 +83,15 @@ fenced_scheduler_cleanup(void)
static pcmk_node_t *
local_node_allowed_for(const pcmk_resource_t *rsc)
{
if ((rsc != NULL) && (scheduler->priv->local_node_name != NULL)) {
const char *local_node = fenced_get_local_node();

if ((rsc != NULL) && (local_node != NULL)) {
GHashTableIter iter;
pcmk_node_t *node = NULL;

g_hash_table_iter_init(&iter, rsc->priv->allowed_nodes);
while (g_hash_table_iter_next(&iter, NULL, (void **) &node)) {
if (pcmk__str_eq(node->priv->name, scheduler->priv->local_node_name,
pcmk__str_casei)) {
if (pcmk__str_eq(node->priv->name, local_node, pcmk__str_casei)) {
return node;
}
}
Expand Down
21 changes: 19 additions & 2 deletions daemons/fenced/pacemaker-fenced.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2009-2024 the Pacemaker project contributors
* Copyright 2009-2025 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
Expand Down Expand Up @@ -40,6 +40,9 @@

#define SUMMARY "daemon for executing fencing devices in a Pacemaker cluster"

//! Local node's name in cluster
static char *local_node_name = NULL;

// @TODO This should be guint
long long stonith_watchdog_timeout_ms = 0;

Expand Down Expand Up @@ -68,6 +71,18 @@ crm_exit_t exit_code = CRM_EX_OK;

static void stonith_cleanup(void);

/*!
* \internal
* \brief Get the local node name
*
* \return Local node name
*/
const char *
fenced_get_local_node(void)
{
return local_node_name;
}

static int32_t
st_ipc_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid)
{
Expand Down Expand Up @@ -628,7 +643,7 @@ main(int argc, char **argv)
crm_crit("Cannot sign in to the cluster... terminating");
goto done;
}
fenced_set_local_node(cluster->priv->node_name);
local_node_name = pcmk__str_copy(cluster->priv->node_name);

if (!options.stand_alone) {
setup_cib();
Expand All @@ -654,6 +669,8 @@ main(int argc, char **argv)
pcmk_cluster_free(cluster);
fenced_scheduler_cleanup();

free(local_node_name);

pcmk__output_and_clear_error(&error, out);

if (out != NULL) {
Expand Down
6 changes: 3 additions & 3 deletions daemons/fenced/pacemaker-fenced.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2009-2024 the Pacemaker project contributors
* Copyright 2009-2025 the Pacemaker project contributors
*
* This source code is licensed under the GNU General Public License version 2
* or later (GPLv2+) WITHOUT ANY WARRANTY.
Expand All @@ -13,6 +13,8 @@
#include <crm/stonith-ng.h>
#include <crm/fencing/internal.h>

const char *fenced_get_local_node(void);

/*!
* \internal
* \brief Check whether target has already been fenced recently
Expand Down Expand Up @@ -296,8 +298,6 @@ void setup_cib(void);
void fenced_cib_cleanup(void);

int fenced_scheduler_init(void);
void fenced_set_local_node(const char *node_name);
const char *fenced_get_local_node(void);
void fenced_scheduler_cleanup(void);
void fenced_scheduler_run(xmlNode *cib);

Expand Down