Skip to content
This repository was archived by the owner on Mar 20, 2023. It is now read-only.

Enable auto checkpointing on SIGTERM #546

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 34 additions & 3 deletions coreneuron/apps/main1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
* @brief File containing main driver routine for CoreNeuron
*/

#include <cstring>
#include <climits>
#include <cstring>
#include <csignal>
#include <memory>
#include <vector>

Expand Down Expand Up @@ -114,11 +115,24 @@ char* prepare_args(int& argc, char**& argv, int use_mpi, const char* arg) {
// return actual data to be freed
return first;
}
}

} // extern "C"

namespace coreneuron {

void call_prcellstate_for_prcellgid(int prcellgid, int compute_gpu, int is_init);


/**
 * \brief Selects the path the simulation state should be restored from
 *
 * An explicitly configured corenrn_param.restorepath takes precedence. When it
 * is empty, the "_corenrn_ckpt" directory below corenrn_param.outpath is used,
 * provided it is an existing directory.
 *
 * \return The selected restore path, or an empty string if neither applies
 */
static std::string check_restore() {
    // An explicit restore path always wins over the auto-checkpoint location
    if (!corenrn_param.restorepath.empty()) {
        return corenrn_param.restorepath;
    }

    const auto fallback_dir = corenrn_param.outpath + "/_corenrn_ckpt";
    if (fs_isdir(fallback_dir.c_str())) {
        return fallback_dir;
    }
    return std::string{};
}


void nrn_init_and_load_data(int argc,
char* argv[],
bool is_mapping_needed = false,
Expand Down Expand Up @@ -168,7 +182,7 @@ void nrn_init_and_load_data(int argc,
set_globals(corenrn_param.datpath.c_str(), (corenrn_param.seed >= 0), corenrn_param.seed);

// set global variables for start time, timestep and temperature
std::string restore_path = corenrn_param.restorepath;
std::string restore_path = check_restore();
t = restore_time(restore_path.c_str());

if (corenrn_param.dt != -1000.) { // command line arg highest precedence
Expand Down Expand Up @@ -402,6 +416,20 @@ std::unique_ptr<ReportHandler> create_report_handler(ReportConfiguration& config
return report_handler;
}

/**
* \brief Installs a SIGTERM handler so that the current simulation is halted without losing data
* \note Returns nothing; a failure to install the handler is only reported on stderr
*/
static void install_sigterm_handler() {
auto sigh = [](int) {
std::cerr << "SIGTERM caught! Halting sim and dumping checkpoint" << std::endl;
coreneuron::stoprun = true;
};
Comment on lines +425 to +427
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What I often do is enable a "double SIGTERM", to really cut things off.

How long can this checkpoint take?

if (std::signal(SIGTERM, sigh) == SIG_ERR) {
std::cerr << "Could not install SIGTERM handler" << std::endl;
}
}

} // namespace coreneuron

/// The following high-level functions are marked as "extern C"
Expand Down Expand Up @@ -482,6 +510,9 @@ extern "C" int run_solve_core(int argc, char** argv) {
if (nrnmpi_myid == 0) {
mkdir_p(output_dir.c_str());
}

install_sigterm_handler();

#if NRNMPI
nrnmpi_barrier();
#endif
Expand Down
10 changes: 10 additions & 0 deletions coreneuron/io/file_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,13 @@ int mkdir_p(const char* path) {
delete[] dirpath;
return 0;
}

/// Reports whether stat() succeeds for the given path.
bool fs_exists(const char* path) {
    struct stat info;
    const int rc = stat(path, &info);
    return rc == 0;
}

/// Reports whether stat() succeeds for the given path and describes a directory.
bool fs_isdir(const char* path) {
    struct stat info;
    if (stat(path, &info) != 0) {
        // stat() failed, so the path cannot be reported as a directory
        return false;
    }
    return S_ISDIR(info.st_mode);
}
8 changes: 8 additions & 0 deletions coreneuron/io/file_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,12 @@
*/
int mkdir_p(const char* path);

/** @brief Checks an arbitrary path exists
 *  @param path Path to query, as a C string
 *  @return true if stat() succeeds on the path, false otherwise
 */
bool fs_exists(const char* path);

/** @brief Checks an arbitrary path is an existing directory
 *  @param path Path to query, as a C string
 *  @return true if stat() succeeds on the path and it is a directory, false otherwise
 */
bool fs_isdir(const char* path);

#endif /* ifndef NRN_FILE_UTILS */
2 changes: 1 addition & 1 deletion coreneuron/nrnconf.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ extern double pi;
extern double t, dt;
extern int rev_dt;
extern int secondorder;
extern bool stoprun;
extern bool volatile stoprun;
extern const char* bbcore_write_version;
#define tstopbit (1 << 15)
#define tstopset stoprun |= tstopbit
Expand Down
32 changes: 32 additions & 0 deletions coreneuron/sim/fadvance_core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
# =============================================================================.
*/

#include <cstdlib>
#include <functional>

#include "coreneuron/coreneuron.hpp"
Expand All @@ -23,12 +24,19 @@
#include "coreneuron/utils/progressbar/progressbar.h"
#include "coreneuron/utils/profile/profiler_interface.h"
#include "coreneuron/io/nrn2core_direct.h"
#include "coreneuron/io/nrn_checkpoint.hpp"

// Do an auto checkpoint only if execution lasted longer than this var (secs)
#define CHECKPOINT_MIN_RUNTIME (4 * 3600) // 4h

namespace coreneuron {

extern corenrn_parameters corenrn_param;
static void* nrn_fixed_step_thread(NrnThread*);
static void* nrn_fixed_step_group_thread(NrnThread*, int, int, int&);
static bool nrn_auto_checkpoint();
static time_t sim_start_time;


void dt2thread(double adt) { /* copied from nrnoc/fadvance.c */
if (adt != nrn_threads[0]._dt) {
Expand Down Expand Up @@ -109,6 +117,7 @@ void nrn_fixed_single_steps_minimal(int total_sim_steps, double tstop) {
#endif
nrn_fixed_step_minimal();
if (stoprun) {
nrn_auto_checkpoint();
break;
}
current_steps++;
Expand Down Expand Up @@ -141,6 +150,7 @@ void nrn_fixed_step_group_minimal(int total_sim_steps) {
nrn_flush_reports(nrn_threads[0]._t);
#endif
if (stoprun) {
nrn_auto_checkpoint();
break;
}
current_steps++;
Expand Down Expand Up @@ -377,4 +387,26 @@ void* nrn_fixed_step_lastpart(NrnThread* nth) {

return nullptr;
}

/**
 * \brief Does a checkpoint of the simulation if enough time has passed
 *
 * The checkpoint is first written to "<outpath>/_corenrn_ckpt_dirty" and only
 * then moved over "<outpath>/_corenrn_ckpt", so an interrupted write never
 * replaces a previously completed checkpoint.
 *
 * \return True if a checkpoint was written and moved into its final location.
 *         False if not enough time has elapsed (less than CHECKPOINT_MIN_RUNTIME
 *         seconds since sim_start_time) or if publishing the checkpoint failed.
 */
static bool nrn_auto_checkpoint() {
    // NOTE(review): no assignment to sim_start_time is visible in this part of the file;
    // confirm it is set when the run starts, otherwise elapsed time is measured from 0.
    const time_t cur_time = time(nullptr);
    // difftime(end, start) yields end - start; keep the result as double to avoid narrowing
    const double elapsed_secs = difftime(cur_time, sim_start_time);
    if (elapsed_secs < CHECKPOINT_MIN_RUNTIME) {
        return false;
    }
    // Write to tmp location first because allocated time may not be enough to complete
    const auto ckpt_tmp = corenrn_param.outpath + "/_corenrn_ckpt_dirty";
    const auto ckpt_dir = corenrn_param.outpath + "/_corenrn_ckpt";
    Instrumentor::phase p("Checkpointing");
    write_checkpoint(nrn_threads, nrn_nthread, ckpt_tmp.c_str());
    // Chain with && so that a failed removal does not make mv nest the new checkpoint
    // inside the stale directory
    const auto publish_cmd = "/bin/rm -rf '" + ckpt_dir + "' && /bin/mv '" + ckpt_tmp + "' '" +
                             ckpt_dir + "'";
    return system(publish_cmd.c_str()) == 0;
}


} // namespace coreneuron
2 changes: 1 addition & 1 deletion coreneuron/utils/nrnoc_aux.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#include "coreneuron/utils/nrnoc_aux.hpp"

namespace coreneuron {
bool stoprun;
bool volatile stoprun;
int v_structure_change;
int diam_changed;
#define MAXERRCOUNT 5
Expand Down