Commit 296e2f1e authored by Lukas Weber

seemingly working parallel tempering

parent 253a9209
......@@ -271,5 +271,4 @@ bool file_exists(const std::string &path) {
struct stat buf;
return stat(path.c_str(), &buf) == 0;
}
}
......@@ -3,8 +3,8 @@
#include <fmt/format.h>
#include <hdf5.h>
#include <string>
#include <vector>
#include <sys/stat.h>
#include <vector>
namespace loadl {
......@@ -114,7 +114,7 @@ public:
// TODO: once the intel compiler can do guaranteed copy elision,
// please uncomment this line! and be careful about bugs!
//iodump(iodump &) = delete;
// iodump(iodump &) = delete;
~iodump();
friend class group;
......@@ -268,5 +268,4 @@ void iodump::group::read(const std::string &name, T &value) const {
// utility
bool file_exists(const std::string &path);
}
......@@ -7,14 +7,6 @@ mc::mc(const parser &p) : param{p} {
void mc::write_output(const std::string &) {}
void mc::random_init() {
if(param.defined("seed")) {
rng.reset(new random_number_generator(param.get<uint64_t>("seed")));
} else {
rng.reset(new random_number_generator());
}
}
// Draws the next random double from the generator held in `rng`.
double mc::random01() {
	const double sample = rng->random_double();
	return sample;
}
......@@ -29,7 +21,17 @@ void mc::_init() {
measure.add_observable("_ll_checkpoint_write_time", 1);
measure.add_observable("_ll_measurement_time", 1000);
measure.add_observable("_ll_sweep_time", 1000);
random_init();
if(param.get<bool>("pt_statistics", false)) {
measure.add_observable("_ll_pt_rank", 1);
}
if(param.defined("seed")) {
rng.reset(new random_number_generator(param.get<uint64_t>("seed")));
} else {
rng.reset(new random_number_generator());
}
init();
}
......@@ -40,8 +42,9 @@ void mc::_do_measurement() {
do_measurement();
clock_gettime(CLOCK_MONOTONIC_RAW, &tend);
double measurement_time = (tend.tv_sec - tstart.tv_sec) + 1e-9 * (tend.tv_nsec - tstart.tv_nsec);
double measurement_time =
(tend.tv_sec - tstart.tv_sec) + 1e-9 * (tend.tv_nsec - tstart.tv_nsec);
measure.add("_ll_measurement_time", measurement_time);
if(measurement_time > max_meas_time_) {
max_meas_time_ = measurement_time;
......@@ -69,6 +72,12 @@ void mc::_pt_update_param(double new_param, const std::string &new_dir) {
iodump dump_file = iodump::create(new_dir + ".dump.h5.tmp");
measure.checkpoint_read(dump_file.get_root().open_group("measurements"));
}
if(param.get<bool>("pt_statistics", false)) {
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
measure.add("_ll_pt_rank", rank);
}
pt_update_param(new_param);
}
......@@ -98,10 +107,10 @@ void mc::_write(const std::string &dir) {
g.write("thermalization_sweeps", std::min(therm_, sweep_)); // only for convenience
}
rename((dir + ".dump.h5.tmp").c_str(), (dir + ".dump.h5").c_str());
clock_gettime(CLOCK_MONOTONIC_RAW, &tend);
double checkpoint_write_time = (tend.tv_sec - tstart.tv_sec) + 1e-9 * (tend.tv_nsec - tstart.tv_nsec);
double checkpoint_write_time =
(tend.tv_sec - tstart.tv_sec) + 1e-9 * (tend.tv_nsec - tstart.tv_nsec);
measure.add("_ll_checkpoint_write_time", checkpoint_write_time);
if(checkpoint_write_time > max_checkpoint_write_time_) {
max_checkpoint_write_time_ = checkpoint_write_time;
......@@ -110,10 +119,9 @@ void mc::_write(const std::string &dir) {
// Returns the safety margin to keep before the walltime runs out: twice the sum of
// max_checkpoint_write_time_, max_sweep_time_ and max_meas_time_, plus a constant 2.
// NOTE(review): the two return statements below look like the old and the reformatted
// version of the same line as rendered by the diff view; both compute the same value.
double mc::safe_exit_interval() {
// this is more or less guesswork in an attempt to make it safe for as many cases as possible
return 2*(max_checkpoint_write_time_ + max_sweep_time_ + max_meas_time_) + 2;
return 2 * (max_checkpoint_write_time_ + max_sweep_time_ + max_meas_time_) + 2;
}
bool mc::_read(const std::string &dir) {
if(!file_exists(dir + ".dump.h5")) {
return false;
......
......@@ -12,15 +12,15 @@ namespace loadl {
class mc {
private:
void random_init();
int sweep_ = 0;
int therm_ = 0;
// The following times in seconds are used to estimate a safe exit interval before walltime is up.
// The following times in seconds are used to estimate a safe exit interval before walltime is
// up.
double max_checkpoint_write_time_{0};
double max_sweep_time_{0};
double max_meas_time_{0};
protected:
parser param;
std::unique_ptr<random_number_generator> rng;
......@@ -34,6 +34,7 @@ protected:
// Default implementation of the parallel tempering parameter-update hook.
// It ignores its argument and always throws std::runtime_error, reporting that
// pt_update_param is not implemented.
virtual void pt_update_param(double /*new_param*/) {
	const char *const what =
	    "running parallel tempering, but pt_update_param not implemented";
	throw std::runtime_error{what};
}
public:
double random01();
int sweep() const;
......@@ -43,7 +44,7 @@ public:
throw std::runtime_error{"running parallel tempering, but pt_weight_ratio not implemented"};
return 1;
}
// these functions do a little more, like taking care of the
// random number generator state, then call the child class versions.
void _init();
......
......@@ -33,7 +33,8 @@ private:
// Adds `value` to the observable stored under `name` in observables_.
// Throws std::runtime_error if no observable with that name is present.
// NOTE(review): the two throw statements below appear to be the old and the
// reformatted version of the same line from the diff view; the message is identical.
template<class T>
void measurements::add(const std::string name, T value) {
if(observables_.count(name) == 0) {
throw std::runtime_error{fmt::format("tried to add to observable '{}' which was not registered!", name)};
throw std::runtime_error{
fmt::format("tried to add to observable '{}' which was not registered!", name)};
}
observables_.at(name).add(value);
}
......
......@@ -91,7 +91,8 @@ results merge(const std::vector<std::string> &filenames, const std::vector<evala
if(obs.total_sample_count <= min_bin_count) {
obs.rebinning_bin_count = obs.total_sample_count;
} else {
obs.rebinning_bin_count = min_bin_count + cbrt(obs.total_sample_count-min_bin_count);
obs.rebinning_bin_count =
min_bin_count + cbrt(obs.total_sample_count - min_bin_count);
}
} else {
obs.rebinning_bin_count = rebinning_bin_count;
......
......@@ -17,5 +17,4 @@ void rng_internal_mersenne::backend_checkpoint_read(const iodump::group &d) {
d.read("state", rand_state);
mtrand_.load(rand_state);
}
}
......@@ -77,6 +77,7 @@ public:
class rng_internal_mersenne {
private:
MTRand mtrand_;
public:
void backend_checkpoint_write(const iodump::group &dump_file);
void backend_checkpoint_read(const iodump::group &dump_file);
......@@ -85,7 +86,7 @@ public:
// Returns the result of mtrand_.randDblExc(1).
// NOTE(review): presumably a uniform draw from the open interval (0, 1) — confirm against MTRand.
double random_double() {
	const double draw = mtrand_.randDblExc(1);
	return draw;
}
// Returns the result of mtrand_.randInt(bound - 1).
// NOTE(review): presumably uniform over 0 .. bound-1 inclusive; `bound` is not
// range-checked here — confirm how MTRand treats bound <= 0.
int random_integer(int bound) {
	const int largest = bound - 1;
	return mtrand_.randInt(largest);
}
......
......@@ -30,7 +30,6 @@ struct observable_result {
std::vector<double> autocorrelation_time;
};
// results holds the means and errors merged from all the runs belonging to a task
// this includes both regular observables and evalables.
struct results {
......
#include "runner.h"
#include "merger.h"
#include "runner_pt.h"
#include <dirent.h>
#include <fmt/format.h>
#include <fstream>
#include <iomanip>
#include <regex>
#include <sys/stat.h>
#include "runner_pt.h"
namespace loadl {
enum {
......@@ -35,13 +35,13 @@ static int parse_duration(const std::string &str) {
if(idx == str.size()) {
return i1;
} else if(str[idx] == ':') {
std::string str1 = str.substr(idx+1);
std::string str1 = str.substr(idx + 1);
int i2 = std::stoi(str1, &idx, 10);
if(idx == str1.size()) {
return 60 * i1 + i2;
} else if(str[idx] == ':') {
std::string str2 = str1.substr(idx+1);
std::string str2 = str1.substr(idx + 1);
int i3 = std::stoi(str2, &idx, 10);
if(idx != str2.size()) {
throw std::runtime_error{"minutes"};
......@@ -55,8 +55,8 @@ static int parse_duration(const std::string &str) {
throw std::runtime_error{"seconds"};
}
} catch(std::exception &e) {
throw std::runtime_error{
fmt::format("'{}' does not fit time format [[hours:]minutes:]seconds: {}", str, e.what())};
throw std::runtime_error{fmt::format(
"'{}' does not fit time format [[hours:]minutes:]seconds: {}", str, e.what())};
}
}
......@@ -236,9 +236,10 @@ void runner_master::react() {
send_action(A_NEW_JOB, node);
tasks_[current_task_id_].scheduled_runs++;
int msg[3] = {current_task_id_, tasks_[current_task_id_].scheduled_runs,
tasks_[current_task_id_].target_sweeps+tasks_[current_task_id_].target_thermalization-tasks_[current_task_id_].sweeps};
MPI_Send(&msg, sizeof(msg) / sizeof(msg[0]), MPI_INT, node, T_NEW_JOB,
MPI_COMM_WORLD);
tasks_[current_task_id_].target_sweeps +
tasks_[current_task_id_].target_thermalization -
tasks_[current_task_id_].sweeps};
MPI_Send(&msg, sizeof(msg) / sizeof(msg[0]), MPI_INT, node, T_NEW_JOB, MPI_COMM_WORLD);
}
} else if(node_status == S_BUSY) {
int msg[2];
......@@ -327,10 +328,11 @@ void runner_slave::start() {
if(time_is_up()) {
what_is_next(S_TIMEUP);
job_.log(fmt::format("rank {} exits: walltime up (safety interval = {} s)", rank_, sys_->safe_exit_interval()));
job_.log(fmt::format("rank {} exits: walltime up (safety interval = {} s)", rank_,
sys_->safe_exit_interval()));
break;
}
action = what_is_next(S_BUSY);
}
......@@ -348,7 +350,7 @@ bool runner_slave::time_is_up() {
if(sys_ != nullptr) {
safe_interval = sys_->safe_exit_interval();
}
return MPI_Wtime() - time_start_ > job_.walltime-safe_interval;
return MPI_Wtime() - time_start_ > job_.walltime - safe_interval;
}
int runner_slave::what_is_next(int status) {
......
......@@ -52,6 +52,7 @@ private:
void react();
void send_action(int action, int destination);
public:
runner_master(jobinfo job);
void start();
......
This diff is collapsed.
#pragma once
#include <vector>
#include <map>
#include "runner.h"
#include <map>
#include <vector>
namespace loadl {
......@@ -22,14 +22,15 @@ struct pt_chain {
struct pt_chain_run {
private:
pt_chain_run() = default;
public:
int id;
int run_id;
pt_chain_run(const pt_chain& chain, int run_id);
static pt_chain_run checkpoint_read(const iodump::group& g);
void checkpoint_write(const iodump::group& g);
pt_chain_run(const pt_chain &chain, int run_id);
static pt_chain_run checkpoint_read(const iodump::group &g);
void checkpoint_write(const iodump::group &g);
std::vector<double> params;
std::vector<int> node_to_pos;
......@@ -52,7 +53,7 @@ private:
int chain_len_;
std::unique_ptr<random_number_generator> rng_;
std::map<int,int> node_to_chain_run_;
std::map<int, int> node_to_chain_run_;
int current_chain_id_{-1};
void construct_pt_chains();
......@@ -60,7 +61,7 @@ private:
void checkpoint_read();
int schedule_chain_run();
void pt_global_update(pt_chain& chain, pt_chain_run& chain_run);
void pt_global_update(pt_chain &chain, pt_chain_run &chain_run);
void react();
void send_action(int action, int destination);
......
......@@ -33,7 +33,6 @@ int runner_single::start() {
job_.log(fmt::format("* read {}", job_.rundir(task_id_, 1)));
}
while(!tasks_[task_id_].is_done() && !time_is_up()) {
sys_->_do_update();
tasks_[task_id_].sweeps++;
......@@ -103,6 +102,5 @@ void runner_single::merge_measurements() {
job_.log(fmt::format("merging {}", job_.taskdir(task_id_)));
job_.merge_task(task_id_, evalables);
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment