Commit 7875ed9b authored by Lukas Weber

parallel tempering mode: prevent inconsistent dumps

parent aa9d4818
......@@ -89,7 +89,6 @@ void mc::_write(const std::string &dir) {
g.write("thermalization_sweeps", std::min(sweep_,therm));
g.write("sweeps", std::max(0,sweep_-therm));
}
rename((dir + ".dump.h5.tmp").c_str(), (dir + ".dump.h5").c_str());
clock_gettime(CLOCK_MONOTONIC_RAW, &tend);
double checkpoint_write_time =
......@@ -97,6 +96,12 @@ void mc::_write(const std::string &dir) {
measure.add("_ll_checkpoint_write_time", checkpoint_write_time);
}
// Promotes the temporary dump "<dir>.dump.h5.tmp" to its final name "<dir>.dump.h5".
// Call this only once it is certain that the *.tmp files have been completely written.
// This matters in parallel tempering mode, where all slaves in a chain have to write
// consistent dumps.
// NOTE: the return value of rename() is not checked here.
void mc::_write_finalize(const std::string &dir) {
	const std::string final_name = dir + ".dump.h5";
	const std::string tmp_name = final_name + ".tmp";
	rename(tmp_name.c_str(), final_name.c_str());
}
bool mc::_read(const std::string &dir) {
if(!file_exists(dir + ".dump.h5")) {
return false;
......@@ -124,10 +129,6 @@ bool mc::_read(const std::string &dir) {
return true;
}
void mc::_write_output(const std::string &filename) {
write_output(filename);
}
bool mc::is_thermalized() {
int sweep = sweep_;
if(pt_mode_ && pt_sweeps_per_global_update_ > 0) {
......
......@@ -22,7 +22,6 @@ protected:
virtual void init() = 0;
virtual void checkpoint_write(const iodump::group &out) = 0;
virtual void checkpoint_read(const iodump::group &in) = 0;
virtual void write_output(const std::string &filename);
virtual void do_update() = 0;
virtual void do_measurement() = 0;
virtual void pt_update_param(const std::string & /*param_name*/, double /*new_param*/) {
......@@ -43,16 +42,16 @@ public:
int sweep() const;
virtual void register_evalables(std::vector<evalable> &evalables) = 0;
virtual void write_output(const std::string &filename);
// these functions do a little more, like taking care of the
// random number generator state, then call the child class versions.
void _init();
void _write(const std::string &dir);
// Renames the temporary dump "<dir>.dump.h5.tmp" to "<dir>.dump.h5".
// Meant to be called once the *.tmp file is known to be completely written.
void _write_finalize(const std::string &dir);
bool _read(const std::string &dir);
void _write_output(const std::string &filename);
void _do_update();
void _do_measurement();
void _pt_update_param(int target_rank, const std::string &param_name, double new_param);
......
......@@ -240,12 +240,13 @@ int runner_slave::recv_action() {
// Writes a checkpoint for the current task/run and logs it.
// The checkpoint time is recorded first; the dump is then written with _write()
// and finalized with _write_finalize() right away.
void runner_slave::checkpoint_write() {
	time_last_checkpoint_ = MPI_Wtime();

	const auto dump_dir = job_.rundir(task_id_, run_id_);
	sys_->_write(dump_dir);
	sys_->_write_finalize(dump_dir);

	job_.log(fmt::format("* rank {}: checkpoint {}", rank_, dump_dir));
}
void runner_slave::merge_measurements() {
std::string unique_filename = job_.taskdir(task_id_);
sys_->_write_output(unique_filename);
sys_->write_output(unique_filename);
std::vector<evalable> evalables;
sys_->register_evalables(evalables);
......
......@@ -750,6 +750,8 @@ int runner_pt_slave::what_is_next(int status) {
// Writes a checkpoint for the current task/run in parallel tempering mode and logs it.
// The dump is written with _write(), then every rank waits at a barrier on chain_comm_,
// and only afterwards the dump is finalized with _write_finalize().
void runner_pt_slave::checkpoint_write() {
	time_last_checkpoint_ = MPI_Wtime();

	const auto dump_dir = job_.rundir(task_id_, run_id_);
	sys_->_write(dump_dir);

	// No rank finalizes its dump before all ranks on chain_comm_ have reached this
	// point, i.e. have returned from _write().
	// NOTE(review): presumably chain_comm_ spans the slaves of one tempering chain — confirm.
	MPI_Barrier(chain_comm_);
	sys_->_write_finalize(dump_dir);

	job_.log(fmt::format("* rank {}: checkpoint {}", rank_, dump_dir));
}
......@@ -765,7 +767,7 @@ int runner_pt_slave::recv_action() {
void runner_pt_slave::merge_measurements() {
std::string unique_filename = job_.taskdir(task_id_);
sys_->_write_output(unique_filename);
sys_->write_output(unique_filename);
std::vector<evalable> evalables;
if(job_.jobfile["jobconfig"].defined("pt_parameter_optimization")) {
......
......@@ -89,12 +89,13 @@ void runner_single::read() {
// Writes a checkpoint for the current task (run id fixed to 1) and logs it.
// The checkpoint time is recorded first; the dump is then written with _write()
// and finalized with _write_finalize() right away.
void runner_single::checkpointing() {
	time_last_checkpoint_ = time(nullptr);

	const auto dump_dir = job_.rundir(task_id_, 1);
	sys_->_write(dump_dir);
	sys_->_write_finalize(dump_dir);

	job_.log(fmt::format("* checkpointing {}", dump_dir));
}
void runner_single::merge_measurements() {
std::string unique_filename = job_.taskdir(task_id_);
sys_->_write_output(unique_filename);
sys_->write_output(unique_filename);
std::vector<evalable> evalables;
sys_->register_evalables(evalables);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment