Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 50 additions & 3 deletions src/uraft/saunafs-uraft-helper.in
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,28 @@ saunafs_demote() {
saunafs_master -o initial-personality=shadow restart
}

# Remove a leftover temporary metadata file from the SaunaFS data directory.
#
# The file ${SAUNAFS_DATA_DIR}/metadata.sfs.tmp is deleted only when it exists
# as a regular file and `fuser` reports no process using it.
#
# Returns $SAUNAFS_URAFT_ERROR when the removal fails and $SAUNAFS_URAFT_OK in
# every other case (including when the file is absent or still in use).
saunafs_cleanup_dirty_state() {
	load_config

	log "Attempting to clean up dirty metadata state"

	# Resolved after load_config, in case that call defines SAUNAFS_DATA_DIR.
	local stale_dump="${SAUNAFS_DATA_DIR}/metadata.sfs.tmp"

	# fuser exits non-zero when nothing holds the file, which is what marks it
	# as a stale leftover (e.g. from a crashed metadata dump).
	# NOTE(review): a missing fuser binary also yields a non-zero status, so the
	# file is treated as stale in that case too - confirm this is acceptable.
	if [[ -f "${stale_dump}" ]] && ! fuser "${stale_dump}" &>/dev/null; then
		log "Removing ${stale_dump}"
		if ! rm -f "${stale_dump}"; then
			log "Failed to remove metadata.sfs.tmp"
			return $SAUNAFS_URAFT_ERROR
		fi
	fi

	log "Cleanup dirty metadata completed successfully"
	return $SAUNAFS_URAFT_OK
}

saunafs_quick_stop() {
echo -n "${ADMIN_PASSWORD}" | \
saunafs_admin stop-master-without-saving-metadata "${matocl_host}" ${matocl_port}
Expand Down Expand Up @@ -292,9 +314,32 @@ saunafs_assign_ip() {

saunafs_drop_ip() {
load_config
sudo ip -f inet addr delete $ipaddr/$netmask dev $iface
if [[ "$ipaddr2" != "" ]]; then
sudo ip -f inet addr delete $ipaddr2/$netmask2 dev $iface2

# Check if floating IP exists before trying to drop it
if [[ -n "$ipaddr" ]]; then
if is_ip_present "${ipaddr}" "${iface}"; then
log "Dropping floating IP: ${ipaddr} from interface: ${iface}"
if ! sudo ip -f inet addr delete $ipaddr/$netmask dev $iface; then
log "Warning: Failed to drop floating IP: ${ipaddr}"
else
log "Successfully dropped floating IP: ${ipaddr}"
fi
else
log "Floating IP ${ipaddr} not present - nothing to drop"
fi
fi

if [[ -n "$ipaddr2" ]]; then
if is_ip_present "$ipaddr2" "$iface2"; then
log "Dropping floating IP: ${ipaddr2} from interface: ${iface2}"
if ! sudo ip -f inet addr delete $ipaddr2/$netmask2 dev $iface2; then
log "Warning: Failed to drop floating IP: ${ipaddr2}"
else
log "Successfully dropped floating IP: ${ipaddr2}"
fi
else
log "Floating IP ${ipaddr2} not present - nothing to drop"
fi
fi
}

Expand All @@ -310,6 +355,7 @@ print_help() {
echo "quick-stop"
echo "promote"
echo "demote"
echo "cleanup"
echo "assign-ip"
echo "drop-ip"
echo "dead"
Expand All @@ -325,6 +371,7 @@ case "$1" in
quick-stop) saunafs_quick_stop;;
promote) saunafs_promote;;
demote) saunafs_demote;;
cleanup) saunafs_cleanup_dirty_state;;
assign-ip) saunafs_assign_ip;;
drop-ip) saunafs_drop_ip;;
dead) saunafs_dead;;
Expand Down
30 changes: 30 additions & 0 deletions src/uraft/uraft.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

#include "uraft.h"

#include <syslog.h>

#if defined(SAUNAFS_HAVE_GETIFADDRS)
#include <sys/types.h>
#include <ifaddrs.h>
Expand Down Expand Up @@ -95,6 +97,8 @@ void uRaft::checkTerm(int /*id*/, const RpcHeader &data) {
if (data.term > state_.current_term) {
if (state_.president) {
assert(state_.type != kFollower);
syslog(LOG_NOTICE, "(%s): Higher term detected (%lu): Node %s starting new election",
__func__, data.term, nodeToString(state_.id).c_str());
state_.current_term = data.term;
electionTimeout(boost::system::error_code());
return;
Expand Down Expand Up @@ -215,6 +219,8 @@ void uRaft::electionTimeout(const boost::system::error_code &error) {
startElectionTimer();
} else {
state_.type = kLeader;
syslog(LOG_NOTICE, "(%s): Election won with quorum: Promoting node %s to Leader", __func__,
nodeToString(state_.id).c_str());
nodePromote();
}
}
Expand All @@ -240,6 +246,10 @@ void uRaft::heartbeat(const boost::system::error_code &error) {
state_.voted_for = -1;
startElectionTimer();
}

syslog(LOG_WARNING,
"(%s): Heartbeat quorum lost: Demoting current Leader %s to follower", __func__,
nodeToString(state_.id).c_str());
state_.president = false;

nodeDemote();
Expand All @@ -257,6 +267,8 @@ void uRaft::heartbeat(const boost::system::error_code &error) {
if (!state_.president) {
if (voteCount(true) >= opt_.quorum) {
state_.president = true;
syslog(LOG_NOTICE, "(%s): Heartbeat quorum confirmed: Node %s is now active Leader",
__func__, nodeToString(state_.id).c_str());
nodePromote();
}
}
Expand Down Expand Up @@ -441,6 +453,9 @@ void uRaft::demoteLeader() {
if (state_.type == kFollower) {
return;
}

syslog(LOG_NOTICE, "(%s): Manual demotion: Stepping down Leader %s to follower", __func__,
nodeToString(state_.id).c_str());
state_.type = kFollower;
state_.voted_for = -1;
state_.leader_id = -1;
Expand Down Expand Up @@ -507,3 +522,18 @@ int uRaft::scanLocalInterfaces() {
return -1;
#endif
}

/// @brief Produce a printable name for the node with the given index.
///
/// Looks up entry @p id in opt_.server and returns the text that precedes the
/// first ':' in it (presumably the host part of a "host:port" entry - confirm
/// against the configuration format). An entry without ':' is returned whole.
///
/// @param id Index into opt_.server.
/// @return The shortened entry, or the literal "Invalid node ID" when @p id is
///         negative or not smaller than the number of configured servers.
std::string uRaft::nodeToString(int id) {
	const int serverCount = static_cast<int>(opt_.server.size());
	if (id < 0 || id >= serverCount) {
		return "Invalid node ID";
	}

	const std::string &entry = opt_.server[id];

	// find() yields npos when there is no ':'; substr(0, npos) then returns
	// the complete entry, so no separate branch is needed.
	return entry.substr(0, entry.find(':'));
}
2 changes: 2 additions & 0 deletions src/uraft/uraft.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ class uRaft {
int findMatchingAddress(const boost::asio::ip::address &addr, int &id);
int scanLocalInterfaces();

std::string nodeToString(int id);

protected:
boost::asio::io_context &io_service_;
boost::asio::ip::udp::socket socket_;
Expand Down
140 changes: 101 additions & 39 deletions src/uraft/uraftcontroller.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ uRaftController::uRaftController(boost::asio::io_context &ios)
cmd_timeout_timer_(ios) {
command_pid_ = -1;
command_type_ = kCmdNone;
force_demote_ = false;
is_demote_pending_ = false;
is_promote_pending_ = false;
node_alive_ = false;

opt_.check_node_status_period = 250;
Expand Down Expand Up @@ -67,42 +68,48 @@ void uRaftController::set_options(const uRaftController::Options &opt) {
void uRaftController::nodePromote() {
syslog(LOG_NOTICE, "Starting metadata server switch to master mode");

if (command_pid_ >= 0 && command_type_ != kCmdPromote) {
syslog(LOG_ERR, "Trying to switch metadata server to master during switch to slave");
stopFloatingIpManager();
demoteLeader();
set_block_promotion(true);
return;
}
// Prevent concurrent transitions
if (command_pid_ >= 0) {
if (command_type_ == kCmdDemote) {
syslog(
LOG_WARNING,
"Promotion requested during demotion - will be started after completing demotion");
is_promote_pending_ = true;
return;
}
// Already promoting
syslog(LOG_DEBUG, "Promotion already in progress (PID %d)", command_pid_);
return;
}

setSlowCommandTimeout(opt_.promote_timeout);
if (runSlowCommand("saunafs-uraft-helper promote")) {
command_type_ = kCmdPromote;
startFloatingIpManager();
// Floating IP Manager will be started after promotion completes successfully
}
}

void uRaftController::nodeDemote() {
syslog(LOG_NOTICE, "Starting metadata server switch to slave mode");

if (command_pid_ >= 0 && command_type_ != kCmdDemote) {
syslog(LOG_ERR, "Trying to switch metadata server to slave during switch to master");
force_demote_ = true;
set_block_promotion(true);
return;
}
// Stop Floating IP Manager immediately when starting demotion
stopFloatingIpManager();

if (command_pid_ >= 0) {
if (command_type_ == kCmdPromote) {
syslog(
LOG_WARNING,
"Demotion requested during promotion - will be started after completing promotion");
is_demote_pending_ = true;
return;
}
syslog(LOG_DEBUG, "Demotion already in progress (PID %d)", command_pid_);
return;
}

setSlowCommandTimeout(opt_.demote_timeout);
if (runSlowCommand("saunafs-uraft-helper demote")) {
command_type_ = kCmdDemote;
set_block_promotion(true);
stopFloatingIpManager();
}
}

Expand Down Expand Up @@ -136,18 +143,8 @@ uint64_t uRaftController::nodeGetVersion() {
}

void uRaftController::nodeLeader(int id) {
if (id < 0) {
return;
}

std::string name = opt_.server[id];
std::string::size_type p = name.find(":");

if (p != std::string::npos) {
name = name.substr(0, p);
}

syslog(LOG_NOTICE, "Node '%s' is now a leader.", name.c_str());
auto leaderNode = nodeToString(id);
syslog(LOG_NOTICE, "Node '%s' is now a leader.", leaderNode.c_str());
}

/*! \brief Check promote/demote script status. */
Expand All @@ -157,20 +154,44 @@ void uRaftController::checkCommandStatus(const boost::system::error_code &error)
int status;
if (checkSlowCommand(status)) {
cmd_timeout_timer_.cancel();

// Check if command completed successfully
bool commandSucceeded = WIFEXITED(status) && WEXITSTATUS(status) == 0;

if (command_type_ == kCmdDemote) {
syslog(LOG_NOTICE, "Metadata server switch to slave mode done");
if (commandSucceeded) {
syslog(LOG_NOTICE, "Metadata server switch to slave mode succeeded");
} else {
syslog(LOG_ERR, "Demotion failed with exit code: %d", WEXITSTATUS(status));
}

command_type_ = kCmdNone;
command_pid_ = -1;
set_block_promotion(false);

if (is_promote_pending_) {
syslog(LOG_WARNING, "Starting pending promotion to master mode");
nodePromote();
is_promote_pending_ = false;
}
} else if (command_type_ == kCmdPromote) {
syslog(LOG_NOTICE, "Metadata server switch to master mode done");
node_alive_ = true;
if (commandSucceeded) {
syslog(LOG_NOTICE, "Metadata server switch to master mode done");
node_alive_ = true;

// Start floating IP only after successful promotion
startFloatingIpManager();
} else {
syslog(LOG_ERR, "Promotion failed with exit code: %d", WEXITSTATUS(status));
handlePromotionFailure();
}

command_type_ = kCmdNone;
command_pid_ = -1;
if (force_demote_) {
syslog(LOG_WARNING, "Staring forced switch to slave mode");

if (is_demote_pending_) {
syslog(LOG_WARNING, "Starting pending demotion to slave mode");
nodeDemote();
force_demote_ = false;
is_demote_pending_ = false;
}
} else if (command_type_ == kCmdStatusDead) {
syslog(LOG_NOTICE, "Waiting for new metadata server instance to be available");
Expand Down Expand Up @@ -211,7 +232,6 @@ void uRaftController::checkNodeStatus(const boost::system::error_code &error) {
syslog(LOG_NOTICE, "Metadata server is dead");
stopFloatingIpManager();
demoteLeader();
set_block_promotion(true);
setSlowCommandTimeout(opt_.dead_handler_timeout);
if (runSlowCommand("saunafs-uraft-helper dead")) {
command_type_ = kCmdStatusDead;
Expand All @@ -228,9 +248,20 @@ void uRaftController::checkNodeStatus(const boost::system::error_code &error) {

void uRaftController::setSlowCommandTimeout(int timeout) {
cmd_timeout_timer_.expires_from_now(boost::posix_time::millisec(timeout));
cmd_timeout_timer_.async_wait([this](const boost::system::error_code & error) {
cmd_timeout_timer_.async_wait([this, timeout](const boost::system::error_code & error) {
if (!error) {
syslog(LOG_ERR, "Metadata server mode switching timeout");
syslog(LOG_ERR, "Metadata server mode switching timeout after %d ms", timeout);

// Cleanup based on operation type
if (command_type_ == kCmdPromote) {
syslog(LOG_WARNING, "Promotion timeout - attempting cleanup");
handlePromotionFailure();
} else if (command_type_ == kCmdDemote) {
syslog(LOG_WARNING, "Demotion timeout - killing stuck command");
} else if (command_type_ == kCmdStatusDead) {
syslog(LOG_WARNING, "Dead handler timeout - killing stuck command");
}

stopSlowCommand();
}
});
Expand Down Expand Up @@ -417,3 +448,34 @@ void uRaftController::stopFloatingIpManager() {
haFloatingIpManager.reset();
}
}

/// @brief Run the helper's "cleanup" action after a promotion did not complete.
///
/// Invokes `saunafs-uraft-helper cleanup` through runCommand so that leftovers
/// of a crashed or timed-out promotion do not make later promotion attempts
/// fail.
///
/// \note The timeout handed to runCommand is 5000 (5 seconds, assuming the
///       argument is expressed in milliseconds).
/// \note A failed cleanup is only reported via syslog together with the
///       collected command output; nothing is returned to the caller.
void uRaftController::cleanupDirtyPromotion() {
	constexpr int kCleanupTimeout = 5000;

	std::vector<std::string> helperCommand{"saunafs-uraft-helper", "cleanup"};
	std::string helperOutput;

	if (runCommand(helperCommand, helperOutput, kCleanupTimeout)) {
		return;
	}

	syslog(LOG_ERR, "Failed to cleanup dirty state: %s", helperOutput.c_str());
}

/// @brief React to a promotion that failed or timed out.
///
/// A node may win the Raft election and still be unable to finish its switch
/// to SaunaFS master (for example because of corrupted metadata, exhausted
/// resources or a failing helper script). This handler lets the cluster move
/// on in that situation:
///  1. demoteLeader() gives up leadership so another node can be elected;
///  2. cleanupDirtyPromotion() removes leftovers of the aborted promotion.
///
/// Skipping this recovery could leave the cluster stuck: the failed node would
/// stay Raft leader without being able to act as master, the other nodes could
/// not win an election because of Raft term constraints, and an operator would
/// have to step in manually.
void uRaftController::handlePromotionFailure() {
	// Step down first, then tidy up the local state.
	demoteLeader();
	cleanupDirtyPromotion();
}
Loading