Commit eb682698 authored by Lukas Weber

be less smart about safe exit

parent b166b53f
......@@ -11,8 +11,8 @@ batchscript_claix18 = '''#!/usr/bin/env zsh
#SBATCH --mem-per-cpu={mem_per_cpu}
#SBATCH --ntasks={num_cores}
#SBATCH --export=NONE
#SBATCH --output={jobname}.data/stdout.log
#SBATCH --error={jobname}.data/stderr.log
#SBATCH --output={jobname}.data/stdout.%j.log
#SBATCH --error={jobname}.data/stderr.%j.log
{custom_cmds}
{mpirun} $FLAGS_MPI_BATCH {mc_cmd}
......
......@@ -84,7 +84,7 @@ jobinfo::jobinfo(const std::string &jobfile_name) : jobfile{jobfile_name} {
parser jobconfig{jobfile["jobconfig"]};
walltime = parse_duration(jobconfig.get<std::string>("mc_walltime"));
runtime = parse_duration(jobconfig.get<std::string>("mc_runtime"));
checkpoint_time = parse_duration(jobconfig.get<std::string>("mc_checkpoint_time"));
}
......
......@@ -14,8 +14,8 @@ struct jobinfo {
std::vector<std::string> task_names;
double checkpoint_time;
double walltime;
double checkpoint_time{};
double runtime{};
jobinfo(const std::string &jobfile_name);
......
......@@ -39,9 +39,6 @@ void mc::_do_measurement() {
double measurement_time =
(tend.tv_sec - tstart.tv_sec) + 1e-9 * (tend.tv_nsec - tstart.tv_nsec);
measure.add("_ll_measurement_time", measurement_time);
if(measurement_time > max_meas_time_) {
max_meas_time_ = measurement_time;
}
}
void mc::_do_update() {
......@@ -54,9 +51,6 @@ void mc::_do_update() {
double sweep_time = (tend.tv_sec - tstart.tv_sec) + 1e-9 * (tend.tv_nsec - tstart.tv_nsec);
measure.add("_ll_sweep_time", sweep_time);
if(sweep_time > max_sweep_time_) {
max_sweep_time_ = sweep_time;
}
}
void mc::_pt_update_param(int target_rank, const std::string &param_name, double new_param) {
......@@ -88,10 +82,6 @@ void mc::_write(const std::string &dir) {
checkpoint_write(g.open_group("simulation"));
measure.checkpoint_write(g.open_group("measurements"));
g.write("max_checkpoint_write_time", max_checkpoint_write_time_);
g.write("max_sweep_time", max_sweep_time_);
g.write("max_meas_time", max_meas_time_);
g.write("sweeps", sweep_);
}
rename((dir + ".dump.h5.tmp").c_str(), (dir + ".dump.h5").c_str());
......@@ -100,14 +90,6 @@ void mc::_write(const std::string &dir) {
double checkpoint_write_time =
(tend.tv_sec - tstart.tv_sec) + 1e-9 * (tend.tv_nsec - tstart.tv_nsec);
measure.add("_ll_checkpoint_write_time", checkpoint_write_time);
if(checkpoint_write_time > max_checkpoint_write_time_) {
max_checkpoint_write_time_ = checkpoint_write_time;
}
}
double mc::safe_exit_interval() {
// this is more or less guesswork in an attempt to make it safe for as many cases as possible
return 2 * (max_checkpoint_write_time_ + max_sweep_time_ + max_meas_time_) + 2;
}
bool mc::_read(const std::string &dir) {
......@@ -128,10 +110,6 @@ bool mc::_read(const std::string &dir) {
g.read("sweeps", sweep_);
g.read("max_checkpoint_write_time", max_checkpoint_write_time_);
g.read("max_sweep_time", max_sweep_time_);
g.read("max_meas_time", max_meas_time_);
clock_gettime(CLOCK_MONOTONIC_RAW, &tend);
measure.add("_ll_checkpoint_read_time",
(tend.tv_sec - tstart.tv_sec) + 1e-9 * (tend.tv_nsec - tstart.tv_nsec));
......
......@@ -15,13 +15,6 @@ private:
int sweep_{0};
int therm_{0};
int pt_sweeps_per_global_update_{-1};
// The following times in seconds are used to estimate a safe exit interval before walltime is
// up.
double max_checkpoint_write_time_{0};
double max_sweep_time_{0};
double max_meas_time_{0};
protected:
parser param;
std::unique_ptr<random_number_generator> rng;
......@@ -65,8 +58,6 @@ public:
void _pt_update_param(int target_rank, const std::string &param_name, double new_param);
double _pt_weight_ratio(const std::string &param_name, double new_param);
double safe_exit_interval();
bool is_thermalized();
measurements measure;
......
......@@ -177,8 +177,7 @@ void runner_slave::start() {
if(time_is_up()) {
what_is_next(S_TIMEUP);
job_.log(fmt::format("rank {} exits: walltime up (safety interval = {} s)", rank_,
sys_->safe_exit_interval()));
job_.log(fmt::format("rank {} exits: time up", rank_));
break;
}
......@@ -195,11 +194,7 @@ bool runner_slave::is_checkpoint_time() {
}
bool runner_slave::time_is_up() {
double safe_interval = 0;
if(sys_ != nullptr) {
safe_interval = sys_->safe_exit_interval();
}
return MPI_Wtime() - time_start_ > job_.walltime - safe_interval;
return MPI_Wtime() - time_start_ > job_.runtime;
}
int runner_slave::what_is_next(int status) {
......
......@@ -647,8 +647,7 @@ void runner_pt_slave::start() {
if(timeout == TR_TIMEUP) {
send_status(S_TIMEUP);
job_.log(fmt::format("rank {} exits: walltime up (safety interval = {} s)", rank_,
sys_->safe_exit_interval()));
job_.log(fmt::format("rank {} exits: time up", rank_));
break;
}
action = what_is_next(S_BUSY);
......@@ -664,20 +663,13 @@ void runner_pt_slave::send_status(int status) {
}
int runner_pt_slave::negotiate_timeout() {
double safe_interval = 0, max_safe_interval = 0;
if(sys_ != nullptr) {
safe_interval = sys_->safe_exit_interval();
}
MPI_Reduce(&safe_interval, &max_safe_interval, 1, MPI_DOUBLE, MPI_MAX, 0, chain_comm_);
int result = TR_CONTINUE;
if(chain_rank_ == 0) {
if(MPI_Wtime() - time_last_checkpoint_ > job_.checkpoint_time) {
result = TR_CHECKPOINT;
}
if(MPI_Wtime() - time_start_ > job_.walltime - max_safe_interval) {
if(MPI_Wtime() - time_start_ > job_.runtime) {
result = TR_TIMEUP;
}
}
......
......@@ -58,7 +58,7 @@ bool runner_single::is_checkpoint_time() const {
}
bool runner_single::time_is_up() const {
return time(nullptr) - time_start_ > job_.walltime;
return time(nullptr) - time_start_ > job_.runtime;
}
int runner_single::get_new_task_id(int old_id) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment