Commit eb682698 authored by Lukas Weber

be less smart about safe exit

parent b166b53f
...@@ -11,8 +11,8 @@ batchscript_claix18 = '''#!/usr/bin/env zsh ...@@ -11,8 +11,8 @@ batchscript_claix18 = '''#!/usr/bin/env zsh
#SBATCH --mem-per-cpu={mem_per_cpu} #SBATCH --mem-per-cpu={mem_per_cpu}
#SBATCH --ntasks={num_cores} #SBATCH --ntasks={num_cores}
#SBATCH --export=NONE #SBATCH --export=NONE
#SBATCH --output={jobname}.data/stdout.log #SBATCH --output={jobname}.data/stdout.%j.log
#SBATCH --error={jobname}.data/stderr.log #SBATCH --error={jobname}.data/stderr.%j.log
{custom_cmds} {custom_cmds}
{mpirun} $FLAGS_MPI_BATCH {mc_cmd} {mpirun} $FLAGS_MPI_BATCH {mc_cmd}
......
...@@ -84,7 +84,7 @@ jobinfo::jobinfo(const std::string &jobfile_name) : jobfile{jobfile_name} { ...@@ -84,7 +84,7 @@ jobinfo::jobinfo(const std::string &jobfile_name) : jobfile{jobfile_name} {
parser jobconfig{jobfile["jobconfig"]}; parser jobconfig{jobfile["jobconfig"]};
walltime = parse_duration(jobconfig.get<std::string>("mc_walltime")); runtime = parse_duration(jobconfig.get<std::string>("mc_runtime"));
checkpoint_time = parse_duration(jobconfig.get<std::string>("mc_checkpoint_time")); checkpoint_time = parse_duration(jobconfig.get<std::string>("mc_checkpoint_time"));
} }
......
...@@ -14,8 +14,8 @@ struct jobinfo { ...@@ -14,8 +14,8 @@ struct jobinfo {
std::vector<std::string> task_names; std::vector<std::string> task_names;
double checkpoint_time; double checkpoint_time{};
double walltime; double runtime{};
jobinfo(const std::string &jobfile_name); jobinfo(const std::string &jobfile_name);
......
...@@ -39,9 +39,6 @@ void mc::_do_measurement() { ...@@ -39,9 +39,6 @@ void mc::_do_measurement() {
double measurement_time = double measurement_time =
(tend.tv_sec - tstart.tv_sec) + 1e-9 * (tend.tv_nsec - tstart.tv_nsec); (tend.tv_sec - tstart.tv_sec) + 1e-9 * (tend.tv_nsec - tstart.tv_nsec);
measure.add("_ll_measurement_time", measurement_time); measure.add("_ll_measurement_time", measurement_time);
if(measurement_time > max_meas_time_) {
max_meas_time_ = measurement_time;
}
} }
void mc::_do_update() { void mc::_do_update() {
...@@ -54,9 +51,6 @@ void mc::_do_update() { ...@@ -54,9 +51,6 @@ void mc::_do_update() {
double sweep_time = (tend.tv_sec - tstart.tv_sec) + 1e-9 * (tend.tv_nsec - tstart.tv_nsec); double sweep_time = (tend.tv_sec - tstart.tv_sec) + 1e-9 * (tend.tv_nsec - tstart.tv_nsec);
measure.add("_ll_sweep_time", sweep_time); measure.add("_ll_sweep_time", sweep_time);
if(sweep_time > max_sweep_time_) {
max_sweep_time_ = sweep_time;
}
} }
void mc::_pt_update_param(int target_rank, const std::string &param_name, double new_param) { void mc::_pt_update_param(int target_rank, const std::string &param_name, double new_param) {
...@@ -88,10 +82,6 @@ void mc::_write(const std::string &dir) { ...@@ -88,10 +82,6 @@ void mc::_write(const std::string &dir) {
checkpoint_write(g.open_group("simulation")); checkpoint_write(g.open_group("simulation"));
measure.checkpoint_write(g.open_group("measurements")); measure.checkpoint_write(g.open_group("measurements"));
g.write("max_checkpoint_write_time", max_checkpoint_write_time_);
g.write("max_sweep_time", max_sweep_time_);
g.write("max_meas_time", max_meas_time_);
g.write("sweeps", sweep_); g.write("sweeps", sweep_);
} }
rename((dir + ".dump.h5.tmp").c_str(), (dir + ".dump.h5").c_str()); rename((dir + ".dump.h5.tmp").c_str(), (dir + ".dump.h5").c_str());
...@@ -100,14 +90,6 @@ void mc::_write(const std::string &dir) { ...@@ -100,14 +90,6 @@ void mc::_write(const std::string &dir) {
double checkpoint_write_time = double checkpoint_write_time =
(tend.tv_sec - tstart.tv_sec) + 1e-9 * (tend.tv_nsec - tstart.tv_nsec); (tend.tv_sec - tstart.tv_sec) + 1e-9 * (tend.tv_nsec - tstart.tv_nsec);
measure.add("_ll_checkpoint_write_time", checkpoint_write_time); measure.add("_ll_checkpoint_write_time", checkpoint_write_time);
if(checkpoint_write_time > max_checkpoint_write_time_) {
max_checkpoint_write_time_ = checkpoint_write_time;
}
}
double mc::safe_exit_interval() {
// this is more or less guesswork in an attempt to make it safe for as many cases as possible
return 2 * (max_checkpoint_write_time_ + max_sweep_time_ + max_meas_time_) + 2;
} }
bool mc::_read(const std::string &dir) { bool mc::_read(const std::string &dir) {
...@@ -128,10 +110,6 @@ bool mc::_read(const std::string &dir) { ...@@ -128,10 +110,6 @@ bool mc::_read(const std::string &dir) {
g.read("sweeps", sweep_); g.read("sweeps", sweep_);
g.read("max_checkpoint_write_time", max_checkpoint_write_time_);
g.read("max_sweep_time", max_sweep_time_);
g.read("max_meas_time", max_meas_time_);
clock_gettime(CLOCK_MONOTONIC_RAW, &tend); clock_gettime(CLOCK_MONOTONIC_RAW, &tend);
measure.add("_ll_checkpoint_read_time", measure.add("_ll_checkpoint_read_time",
(tend.tv_sec - tstart.tv_sec) + 1e-9 * (tend.tv_nsec - tstart.tv_nsec)); (tend.tv_sec - tstart.tv_sec) + 1e-9 * (tend.tv_nsec - tstart.tv_nsec));
......
...@@ -15,13 +15,6 @@ private: ...@@ -15,13 +15,6 @@ private:
int sweep_{0}; int sweep_{0};
int therm_{0}; int therm_{0};
int pt_sweeps_per_global_update_{-1}; int pt_sweeps_per_global_update_{-1};
// The following times in seconds are used to estimate a safe exit interval before walltime is
// up.
double max_checkpoint_write_time_{0};
double max_sweep_time_{0};
double max_meas_time_{0};
protected: protected:
parser param; parser param;
std::unique_ptr<random_number_generator> rng; std::unique_ptr<random_number_generator> rng;
...@@ -65,8 +58,6 @@ public: ...@@ -65,8 +58,6 @@ public:
void _pt_update_param(int target_rank, const std::string &param_name, double new_param); void _pt_update_param(int target_rank, const std::string &param_name, double new_param);
double _pt_weight_ratio(const std::string &param_name, double new_param); double _pt_weight_ratio(const std::string &param_name, double new_param);
double safe_exit_interval();
bool is_thermalized(); bool is_thermalized();
measurements measure; measurements measure;
......
...@@ -177,8 +177,7 @@ void runner_slave::start() { ...@@ -177,8 +177,7 @@ void runner_slave::start() {
if(time_is_up()) { if(time_is_up()) {
what_is_next(S_TIMEUP); what_is_next(S_TIMEUP);
job_.log(fmt::format("rank {} exits: walltime up (safety interval = {} s)", rank_, job_.log(fmt::format("rank {} exits: time up"));
sys_->safe_exit_interval()));
break; break;
} }
...@@ -195,11 +194,7 @@ bool runner_slave::is_checkpoint_time() { ...@@ -195,11 +194,7 @@ bool runner_slave::is_checkpoint_time() {
} }
bool runner_slave::time_is_up() { bool runner_slave::time_is_up() {
double safe_interval = 0; return MPI_Wtime() - time_start_ > job_.runtime;
if(sys_ != nullptr) {
safe_interval = sys_->safe_exit_interval();
}
return MPI_Wtime() - time_start_ > job_.walltime - safe_interval;
} }
int runner_slave::what_is_next(int status) { int runner_slave::what_is_next(int status) {
......
...@@ -647,8 +647,7 @@ void runner_pt_slave::start() { ...@@ -647,8 +647,7 @@ void runner_pt_slave::start() {
if(timeout == TR_TIMEUP) { if(timeout == TR_TIMEUP) {
send_status(S_TIMEUP); send_status(S_TIMEUP);
job_.log(fmt::format("rank {} exits: walltime up (safety interval = {} s)", rank_, job_.log(fmt::format("rank {} exits: time up", rank_));
sys_->safe_exit_interval()));
break; break;
} }
action = what_is_next(S_BUSY); action = what_is_next(S_BUSY);
...@@ -664,20 +663,13 @@ void runner_pt_slave::send_status(int status) { ...@@ -664,20 +663,13 @@ void runner_pt_slave::send_status(int status) {
} }
int runner_pt_slave::negotiate_timeout() { int runner_pt_slave::negotiate_timeout() {
double safe_interval = 0, max_safe_interval = 0;
if(sys_ != nullptr) {
safe_interval = sys_->safe_exit_interval();
}
MPI_Reduce(&safe_interval, &max_safe_interval, 1, MPI_DOUBLE, MPI_MAX, 0, chain_comm_);
int result = TR_CONTINUE; int result = TR_CONTINUE;
if(chain_rank_ == 0) { if(chain_rank_ == 0) {
if(MPI_Wtime() - time_last_checkpoint_ > job_.checkpoint_time) { if(MPI_Wtime() - time_last_checkpoint_ > job_.checkpoint_time) {
result = TR_CHECKPOINT; result = TR_CHECKPOINT;
} }
if(MPI_Wtime() - time_start_ > job_.walltime - max_safe_interval) { if(MPI_Wtime() - time_start_ > job_.runtime) {
result = TR_TIMEUP; result = TR_TIMEUP;
} }
} }
......
...@@ -58,7 +58,7 @@ bool runner_single::is_checkpoint_time() const { ...@@ -58,7 +58,7 @@ bool runner_single::is_checkpoint_time() const {
} }
bool runner_single::time_is_up() const { bool runner_single::time_is_up() const {
return time(nullptr) - time_start_ > job_.walltime; return time(nullptr) - time_start_ > job_.runtime;
} }
int runner_single::get_new_task_id(int old_id) { int runner_single::get_new_task_id(int old_id) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment