Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## Current develop

### Added (new features/APIs/variables/...)
- [[PR 1332]](https://github.com/parthenon-hpc-lab/parthenon/pull/1332) Add global WatchDog
- [[PR 1330]](https://github.com/parthenon-hpc-lab/parthenon/pull/1330) Add userspace mechanisms to control number of comm buffers allocated
- [[PR 1319]](https://github.com/parthenon-hpc-lab/parthenon/pull/1319) Add common scratch variable utilities
- [[PR 1326]](https://github.com/parthenon-hpc-lab/parthenon/pull/1326) Thread initial swarm pool size into userspace via metadata
Expand Down
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,7 @@ add_library(parthenon
utils/unique_id.cpp
utils/unique_id.hpp
utils/utils.hpp
utils/watchdog.cpp

argument_parser.hpp
basic_types.hpp
Expand Down
8 changes: 8 additions & 0 deletions src/argument_parser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,11 @@ class ArgParse {
std::sscanf(argv[++i], "%d:%d:%d", &wth, &wtm, &wts);
wtlim = wth * 3600 + wtm * 60 + wts;
break;
case 'w': // -w <seconds>
invalid = invalid_arg();
watchdog_enabled = true;
watchdog_timeout = static_cast<int>(std::strtol(argv[++i], nullptr, 10));
break;
case 'c':
if (Globals::my_rank == 0) ShowConfig();
return ArgStatus::error;
Expand All @@ -104,6 +109,7 @@ class ArgParse {
std::cout << " -c show configuration and quit\n";
std::cout << " -m <nproc> output mesh structure and quit\n";
std::cout << " -t hh:mm:ss wall time limit for final output\n";
std::cout << " -w ss watchdog timeout in seconds\n";
std::cout << " -h this help\n";
}
error = true;
Expand Down Expand Up @@ -140,6 +146,8 @@ class ArgParse {
char *params_regex = nullptr;
bool analysis_flag = false;
bool is_restart = false;
bool watchdog_enabled = false;
int watchdog_timeout = 0;
int param_flag = 0;
int mesh_flag = 0;
int wtlim = 0;
Expand Down
5 changes: 5 additions & 0 deletions src/driver/driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,11 @@ DriverStatus EvolutionDriver::Execute() {
while (tm.KeepGoing() && signal != OutputSignal::analysis) {
if (Globals::my_rank == 0) OutputCycleDiagnostics();

// poke the dog
if (Globals::watchdog_enabled) {
WatchDog::WatchDog(0);
}

if (pmesh->PreStepUserWorkInLoop != nullptr) {
pmesh->PreStepUserWorkInLoop(pmesh, pinput, tm);
}
Expand Down
7 changes: 4 additions & 3 deletions src/globals.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@ namespace Globals {
int nghost;

// all of these global variables are set at the start of main():
int my_rank; // MPI rank of this process
int nranks; // total number of MPI ranks
bool is_restart; // Whether this simulation is restarted from a checkpoint file
int my_rank; // MPI rank of this process
int nranks; // total number of MPI ranks
bool is_restart; // Whether this simulation is restarted from a checkpoint file
bool watchdog_enabled; // Whether the simulation uses a global watchdog

// sparse configuration values that are needed in various places
SparseConfig sparse_config;
Expand Down
1 change: 1 addition & 0 deletions src/globals.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ struct SparseConfig {

extern int my_rank, nranks, nghost;
extern bool is_restart;
extern bool watchdog_enabled;
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not super happy about adding a global variable, but given that the watchdog itself may or may not be tied to the driver, this reduces the amount of "threading through" and also should allow downstream to flexibly call the dog from any place.


extern SparseConfig sparse_config;

Expand Down
5 changes: 5 additions & 0 deletions src/parthenon_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,11 @@ ParthenonStatus ParthenonManager::ParthenonInitEnv(int argc, char *argv[]) {
} else if (arg_status == ArgStatus::complete) {
return ParthenonStatus::complete;
}

Globals::watchdog_enabled = arg.watchdog_enabled;
if (Globals::watchdog_enabled) {
parthenon::WatchDog::WatchDog(arg.watchdog_timeout);
}
// Now that the input is parsed we can pass the info to globals
Globals::is_restart = arg.is_restart;

Expand Down
4 changes: 4 additions & 0 deletions src/utils/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ constexpr auto get_array_from_tuple(tuple_t &&tuple) {
return std::apply(get_array, std::forward<tuple_t>(tuple));
}

//----------------------------------------------------------------------------------------
//! WatchDog
//  \brief global progress watchdog (definition in utils/watchdog.cpp)
//
//  Every call to WatchDog::WatchDog() records the current wall-clock time as the latest
//  sign of progress ("pokes the dog"). The first call additionally starts a background
//  thread that wakes up every `timeout` seconds and aborts the process if no call was
//  recorded since its previous check. On later calls the `timeout` argument is ignored.
namespace WatchDog {
void WatchDog(int timeout);
}

//----------------------------------------------------------------------------------------
//! SignalHandler
// \brief static data and functions that implement a simple signal handling system
Expand Down
96 changes: 96 additions & 0 deletions src/utils/watchdog.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
//========================================================================================
// Parthenon performance portable AMR framework
// Copyright(C) 2025 The Parthenon collaboration
// Licensed under the 3-clause BSD License, see LICENSE file for details
//========================================================================================
// AthenaXXX astrophysical plasma code
// Copyright(C) 2020 James M. Stone <[email protected]> and the Athena code team
// Licensed under the 3-clause BSD License (the "LICENSE")
//========================================================================================
//! \file watchdog.cpp
// \brief WatchDog implementation ported from the Einstein Toolkit and AthenaK

#include <pthread.h>
#include <unistd.h>

#include <cstdio>
#include <cstdlib>
#include <ctime>

#include "globals.hpp"
#include "utils.hpp"

namespace parthenon {
namespace WatchDog {
// Protects `timestamp`, which is written by WatchDog() and read by the patrol thread.
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
// Value of time() at the most recent WatchDog() call (whole seconds).
static time_t timestamp;
// Settings filled in by the first WatchDog() call, before the patrol thread is created,
// and only read afterwards.
static struct {
// Number of seconds the patrol thread sleeps between two progress checks.
int timeout_sec;
// Copy of Globals::my_rank; only rank 0 prints the informational messages.
int mpi_rank;
} param;

//----------------------------------------------------------------------------------------
//! \fn static void format_now(char *buf, size_t len)
//  \brief Write the current local time into buf as "Www Mmm dd hh:mm:ss yyyy".
//
//  Uses the reentrant localtime_r() because this runs on the watchdog thread while the
//  main thread may be calling localtime()/ctime() itself; the non-reentrant variants share
//  one static buffer. Falls back to a placeholder if the time cannot be converted.
static void format_now(char *buf, size_t len) {
  const time_t now = time(nullptr);
  struct tm tm_buf;
  if (localtime_r(&now, &tm_buf) == nullptr ||
      strftime(buf, len, "%a %b %e %H:%M:%S %Y", &tm_buf) == 0) {
    snprintf(buf, len, "unknown time");
  }
}

//----------------------------------------------------------------------------------------
//! \fn static void *patrol(void *)
//  \brief Thread entry point of the watchdog.
//
//  Sleeps for param.timeout_sec seconds, then compares the shared `timestamp` with the
//  value seen at the previous check. If it did not change, i.e. WatchDog() was not called
//  in a later second, the rank is reported as stuck and the process is aborted. Otherwise
//  rank 0 logs a heartbeat and the cycle repeats. This function never returns.
//
//  NOTE(review): a non-positive param.timeout_sec makes the sleep loop misbehave (0 never
//  sleeps, a negative value wraps to a huge unsigned count); a positive value is expected.
static void *patrol(void * /*args*/) {
  char tstamp[128];

  // Baseline: the poke recorded before this thread was started.
  pthread_mutex_lock(&mutex);
  time_t time_old = timestamp;
  pthread_mutex_unlock(&mutex);

  if (0 == param.mpi_rank) {
    format_now(tstamp, sizeof(tstamp));
    fprintf(stderr, "[WATCHDOG (%s)] Starting.\n", tstamp);
    fflush(stderr);
  }

  while (true) {
    // sleep() returns the unslept remainder when interrupted by a signal, so keep
    // sleeping until the full interval has elapsed.
    unsigned int left = param.timeout_sec;
    while (left > 0) {
      left = sleep(left);
    }

    pthread_mutex_lock(&mutex);
    const time_t time_new = timestamp;
    pthread_mutex_unlock(&mutex);

    format_now(tstamp, sizeof(tstamp));
    if (time_new == time_old) {
      // No poke since the last check: every rank reports for itself before dying.
      fprintf(stderr, "[WATCHDOG (%s)] Rank %d is not progressing.\n", tstamp,
              param.mpi_rank);
      fprintf(stderr, "[WATCHDOG (%s)] Terminating...\n", tstamp);
      fflush(stderr);
      std::abort();
    }

    if (0 == param.mpi_rank) {
      fprintf(stderr, "[WATCHDOG (%s)] Everything is fine.\n", tstamp);
      fflush(stderr);
    }
    time_old = time_new;
  }
}

//----------------------------------------------------------------------------------------
//! \fn void WatchDog(int timeout)
//  \brief Record progress and, on the first call, start the watchdog thread.
//
//  Every call stores the current time as the latest sign of progress. The first call also
//  captures `timeout` (seconds between checks) and Globals::my_rank and launches the
//  patrol thread; on later calls `timeout` is ignored, so callers may pass 0 to "poke".
//
//  \param timeout check interval in seconds; must be positive on the first call.
//
//  Terminates the program with EXIT_FAILURE if the first call passes a non-positive
//  timeout or if the thread cannot be created.
void WatchDog(int timeout) {
  pthread_mutex_lock(&mutex);
  timestamp = time(nullptr);
  pthread_mutex_unlock(&mutex);

  static bool first_time = true;
  if (!first_time) return;

  // A zero timeout would make the patrol thread check (and abort) without ever sleeping,
  // and a negative one wraps to a huge unsigned sleep, so reject both up front.
  if (timeout <= 0) {
    fprintf(stderr, "#### FATAL ERROR in %s at line %d\n", __FILE__, __LINE__);
    fprintf(stderr, "WatchDog timeout must be a positive number of seconds (got %d)\n",
            timeout);
    std::exit(EXIT_FAILURE);
  }

  // Publish the settings before the thread exists so it can read them without locking.
  param.timeout_sec = timeout;
  param.mpi_rank = Globals::my_rank;

  pthread_t dog;  // only needed to create and detach the thread
  const int ierr = pthread_create(&dog, nullptr, patrol, nullptr);
  if (ierr) {
    fprintf(stderr, "#### FATAL ERROR in %s at line %d\n", __FILE__, __LINE__);
    fprintf(stderr, "Could not start WatchDog thread\n");
    std::exit(EXIT_FAILURE);
  }
  // The thread is never joined; detach it so its resources are not left dangling.
  pthread_detach(dog);
  first_time = false;
}
} // namespace WatchDog
} // namespace parthenon
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing newline

Loading