Commit 253a9209 authored by Lukas Weber

first implementation of parallel tempering

parent f6f3e95d
......@@ -266,4 +266,10 @@ iodump::h5_handle::~h5_handle() {
// Dereference operator: returns the stored hid_t (handle_) unchanged.
// NOTE(review): the handle presumably stays owned by this h5_handle (the class
// has a destructor), so callers should not close the returned id — confirm.
hid_t iodump::h5_handle::operator*() {
return handle_;
}
// Reports whether `path` can be stat()'ed.
//
// Returns true when stat() succeeds for `path` and false on any stat() error,
// which includes, but is not limited to, the path not existing.
bool file_exists(const std::string &path) {
	struct stat info;
	const int status = stat(path.c_str(), &info);
	return status == 0;
}
}
......@@ -4,6 +4,7 @@
#include <hdf5.h>
#include <string>
#include <vector>
#include <sys/stat.h>
namespace loadl {
......@@ -264,4 +265,8 @@ void iodump::group::read(const std::string &name, T &value) const {
assert(buf.size() == 1);
value = buf.at(0);
}
// utility
// Returns true if stat() succeeds on `path`, false otherwise (any stat()
// failure is reported as "does not exist").
bool file_exists(const std::string &path);
}
#include "mc.h"
#include <sys/stat.h>
namespace loadl {
mc::mc(const parser &p) : param{p} {
......@@ -65,6 +63,15 @@ void mc::_do_update() {
}
}
// Wrapper around the user-provided pt_update_param() hook.
//
// First replaces the state of `measure` with what is stored in the
// "measurements" group of the dump file belonging to `new_dir`, then forwards
// `new_param` to pt_update_param().
//
// NOTE(review): the dump is obtained through iodump::create() and immediately
// read from. If create() truncates an existing file, there is nothing to read;
// iodump::open_readonly() may have been intended — confirm. Also confirm that
// the ".dump.h5.tmp" suffix (rather than ".dump.h5") is the right file.
void mc::_pt_update_param(double new_param, const std::string &new_dir) {
	{
		// the inner scope closes the dump file before the hook runs
		const std::string dump_path = new_dir + ".dump.h5.tmp";
		iodump dump_file = iodump::create(dump_path);
		measure.checkpoint_read(dump_file.get_root().open_group("measurements"));
	}

	pt_update_param(new_param);
}
void mc::_write(const std::string &dir) {
struct timespec tstart, tend;
clock_gettime(CLOCK_MONOTONIC_RAW, &tstart);
......@@ -106,10 +113,6 @@ double mc::safe_exit_interval() {
return 2*(max_checkpoint_write_time_ + max_sweep_time_ + max_meas_time_) + 2;
}
// File-local helper: true if stat() succeeds on `path`, false on any stat()
// failure (a missing path being the usual one).
static bool file_exists(const std::string &path) {
	struct stat st;
	if(stat(path.c_str(), &st) != 0) {
		return false;
	}
	return true;
}
bool mc::_read(const std::string &dir) {
if(!file_exists(dir + ".dump.h5")) {
......
......@@ -31,13 +31,19 @@ protected:
virtual void write_output(const std::string &filename);
virtual void do_update() = 0;
virtual void do_measurement() = 0;
// Parallel tempering hook, invoked by _pt_update_param() with the new
// parameter value. The default implementation ignores the argument and throws
// std::runtime_error, so simulations that run parallel tempering have to
// override it.
virtual void pt_update_param(double /*new_param*/) {
throw std::runtime_error{"running parallel tempering, but pt_update_param not implemented"};
}
public:
double random01();
int sweep() const;
virtual void register_evalables(std::vector<evalable> &evalables) = 0;
// Parallel tempering hook. The default implementation ignores its argument
// and always throws std::runtime_error, so simulations that run parallel
// tempering have to override it.
//
// NOTE(review): an override presumably returns the weight ratio of the current
// configuration under `new_param` relative to the current parameter — confirm
// the exact convention against the caller in runner_pt.cpp.
//
// The former `return 1;` after the throw was unreachable and has been removed.
virtual double pt_weight_ratio(double /*new_param*/) {
	throw std::runtime_error{"running parallel tempering, but pt_weight_ratio not implemented"};
}
// these functions do a little more, like taking care of the
// random number generator state, then call the child class versions.
void _init();
......@@ -48,6 +54,7 @@ public:
void _do_update();
void _do_measurement();
void _pt_update_param(double new_param, const std::string &new_dir);
double safe_exit_interval();
......
......@@ -26,6 +26,7 @@ loadleveller_sources = files([
'runner.cpp',
'runner_single.cpp',
'runner_task.cpp',
'runner_pt.cpp',
])
loadleveller_headers = files([
......@@ -43,6 +44,7 @@ loadleveller_headers = files([
'runner.h',
'runner_single.h',
'runner_task.h',
'runner_pt.h',
])
libloadleveller = library('loadleveller',
......
......@@ -18,12 +18,4 @@ void rng_internal_mersenne::backend_checkpoint_read(const iodump::group &d) {
mtrand_.load(rand_state);
}
// Draws the next floating point sample from the wrapped MTRand generator.
// NOTE(review): randDblExc(1) presumably yields a value in the open interval
// (0, 1) — confirm in MersenneTwister.h.
double rng_internal_mersenne::random_double() {
return mtrand_.randDblExc(1);
}
// Draws the next integer sample from the wrapped MTRand generator, forwarding
// `bound - 1` as the argument of randInt().
// NOTE(review): randInt(n) presumably returns a value in [0, n], i.e. the
// result here lies in [0, bound - 1] — confirm in MersenneTwister.h.
int rng_internal_mersenne::random_integer(int bound) {
return mtrand_.randInt(bound - 1);
}
}
......@@ -75,16 +75,20 @@ public:
// based on a dinosaur code in the MersenneTwister.h header
class rng_internal_mersenne {
private:
MTRand mtrand_;
public:
void backend_checkpoint_write(const iodump::group &dump_file);
void backend_checkpoint_read(const iodump::group &dump_file);
void set_seed(uint64_t seed);
double random_double();
int random_integer(int bound);
private:
MTRand mtrand_;
double random_double() {
return mtrand_.randDblExc(1);
}
int random_integer(int bound) {
return mtrand_.randInt(bound - 1);
}
};
// based on the c++ stl implementation
......
......@@ -6,7 +6,7 @@
#include <iomanip>
#include <regex>
#include <sys/stat.h>
#include "runner_pt.h"
namespace loadl {
enum {
......@@ -60,8 +60,12 @@ static int parse_duration(const std::string &str) {
}
}
std::string jobinfo::jobdir() const {
return jobname + ".data";
}
std::string jobinfo::taskdir(int task_id) const {
return fmt::format("{}.data/{}", jobname, task_names.at(task_id));
return fmt::format("{}/{}", jobdir(), task_names.at(task_id));
}
std::string jobinfo::rundir(int task_id, int run_id) const {
......@@ -77,7 +81,7 @@ jobinfo::jobinfo(const std::string &jobfile_name) : jobfile{jobfile_name} {
jobname = jobfile.get<std::string>("jobname");
std::string datadir = fmt::format("{}.data", jobname);
std::string datadir = jobdir();
int rc = mkdir(datadir.c_str(), 0755);
if(rc != 0 && errno != EEXIST) {
throw std::runtime_error{
......@@ -124,6 +128,22 @@ std::vector<std::string> jobinfo::list_run_files(const std::string &taskdir,
return results;
}
int jobinfo::read_dump_progress(int task_id) const {
int sweeps = 0;
try {
for(auto &dump_name : list_run_files(taskdir(task_id), "dump\\.h5")) {
int dump_sweeps = 0;
iodump d = iodump::open_readonly(dump_name);
d.get_root().read("sweeps", dump_sweeps);
sweeps += dump_sweeps;
}
} catch(std::ios_base::failure &e) {
// might happen if the taskdir does not exist
}
return sweeps;
}
void jobinfo::concatenate_results() {
std::ofstream cat_results{fmt::format("{}.results.yml", jobname)};
for(size_t i = 0; i < task_names.size(); i++) {
......@@ -153,6 +173,11 @@ void jobinfo::log(const std::string &message) {
}
int runner_mpi_start(jobinfo job, const mc_factory &mccreator, int argc, char **argv) {
if(job.jobfile["jobconfig"].defined("parallel_tempering_parameter")) {
runner_pt_start(std::move(job), mccreator, argc, argv);
return 0;
}
MPI_Init(&argc, &argv);
int rank;
......@@ -211,7 +236,7 @@ void runner_master::react() {
send_action(A_NEW_JOB, node);
tasks_[current_task_id_].scheduled_runs++;
int msg[3] = {current_task_id_, tasks_[current_task_id_].scheduled_runs,
tasks_[current_task_id_].target_sweeps};
tasks_[current_task_id_].target_sweeps+tasks_[current_task_id_].target_thermalization-tasks_[current_task_id_].sweeps};
MPI_Send(&msg, sizeof(msg) / sizeof(msg[0]), MPI_INT, node, T_NEW_JOB,
MPI_COMM_WORLD);
}
......@@ -246,31 +271,13 @@ void runner_master::send_action(int action, int destination) {
MPI_Send(&action, 1, MPI_INT, destination, T_ACTION, MPI_COMM_WORLD);
}
int runner_master::read_dump_progress(int task_id) {
int sweeps = 0;
try {
for(auto &dump_name : jobinfo::list_run_files(job_.taskdir(task_id), "dump\\.h5")) {
int dump_sweeps = 0;
iodump d = iodump::open_readonly(dump_name);
d.get_root().read("sweeps", dump_sweeps);
sweeps += dump_sweeps;
}
} catch(iodump_exception &e) {
// okay
} catch(std::ios_base::failure &e) {
// might happen if the taskdir does not exist
}
return sweeps;
}
void runner_master::read() {
for(size_t i = 0; i < job_.task_names.size(); i++) {
auto task = job_.jobfile["tasks"][job_.task_names[i]];
int target_sweeps = task.get<int>("sweeps");
int target_thermalization = task.get<int>("thermalization");
int sweeps = read_dump_progress(i);
int sweeps = job_.read_dump_progress(i);
int scheduled_runs = 0;
tasks_.emplace_back(target_sweeps, target_thermalization, sweeps, scheduled_runs);
......@@ -293,7 +300,7 @@ void runner_slave::start() {
if(!sys_->_read(job_.rundir(task_id_, run_id_))) {
sys_->_init();
job_.log(fmt::format("* initialized {}", job_.rundir(task_id_, run_id_)));
checkpointing();
write_checkpoint();
} else {
job_.log(fmt::format("* read {}", job_.rundir(task_id_, run_id_)));
}
......@@ -316,7 +323,7 @@ void runner_slave::start() {
break;
}
}
checkpointing();
write_checkpoint();
if(time_is_up()) {
what_is_next(S_TIMEUP);
......@@ -388,7 +395,7 @@ int runner_slave::recv_action() {
return new_action;
}
void runner_slave::checkpointing() {
void runner_slave::write_checkpoint() {
time_last_checkpoint_ = MPI_Wtime();
sys_->_write(job_.rundir(task_id_, run_id_));
job_.log(fmt::format("* rank {}: checkpoint {}", rank_, job_.rundir(task_id_, run_id_)));
......
......@@ -24,11 +24,13 @@ struct jobinfo {
jobinfo(const std::string &jobfile_name);
std::string jobdir() const;
std::string rundir(int task_id, int run_id) const;
std::string taskdir(int task_id) const;
static std::vector<std::string> list_run_files(const std::string &taskdir,
const std::string &file_ending);
int read_dump_progress(int task_id) const;
void merge_task(int task_id, const std::vector<evalable> &evalables);
void concatenate_results();
void log(const std::string &message);
......@@ -46,7 +48,6 @@ private:
int current_task_id_{-1};
void read();
int read_dump_progress(int task_id);
int get_new_task_id(int old_id);
void react();
......@@ -77,7 +78,7 @@ private:
void end_of_run();
int recv_action();
int what_is_next(int);
void checkpointing();
void write_checkpoint();
void merge_measurements();
public:
......
This diff is collapsed.
#pragma once
#include <vector>
#include <map>
#include "runner.h"
namespace loadl {
// Bookkeeping record for one parallel tempering chain; runner_pt_master keeps
// a vector of these (pt_chains_). All scalar members are value-initialized.
struct pt_chain {
// NOTE(review): presumably the index of this chain within pt_chains_ — confirm.
int id{};
// NOTE(review): presumably the ids of the tasks that make up the chain — confirm.
std::vector<int> task_ids;
// NOTE(review): presumably the initial tempering parameter of each chain member — confirm.
std::vector<double> start_params;
// NOTE(review): presumably the sweeps already done, one entry per chain member — confirm.
std::vector<int> sweeps;
int target_sweeps{};
int target_thermalization{};
int scheduled_runs{};
// Defined outside this header; not const-qualified, so it cannot be called on a const pt_chain.
bool is_done();
};
// State of a single run of a parallel tempering chain; runner_pt_master keeps
// a vector of these (pt_chain_runs_). Instances are created either from a
// pt_chain plus a run id, or through checkpoint_read().
struct pt_chain_run {
private:
	// Only usable from within the struct (e.g. by checkpoint_read()). The
	// scalar members below carry initializers so that an instance built this
	// way never holds indeterminate values.
	pt_chain_run() = default;

public:
	int id{};
	int run_id{};

	pt_chain_run(const pt_chain& chain, int run_id);

	// Builds an instance from the checkpoint group `g`.
	static pt_chain_run checkpoint_read(const iodump::group& g);
	// Stores this instance in the checkpoint group `g`.
	void checkpoint_write(const iodump::group& g);

	std::vector<double> params;
	std::vector<int> node_to_pos;
	// NOTE(review): stored as int although mc::pt_weight_ratio() returns
	// double — confirm that the truncation is intended.
	std::vector<int> weight_ratios;
	// NOTE(review): presumably one flag per chain member, kept as uint8_t
	// rather than bool — confirm.
	std::vector<uint8_t> done;
};
// Entry point of the parallel tempering runner. runner_mpi_start() hands over
// to it (with the same arguments) when the jobconfig defines
// "parallel_tempering_parameter"; that caller currently discards the returned
// int.
int runner_pt_start(jobinfo job, const mc_factory &mccreator, int argc, char **argv);
// Master side of the parallel tempering runner. Owns the job description, the
// list of chains (pt_chains_), the chain runs (pt_chain_runs_), and a random
// number generator.
// NOTE(review): presumably talks to runner_pt_slave instances on the other MPI
// ranks via send_action()/react() — confirm in runner_pt.cpp.
class runner_pt_master {
private:
	jobinfo job_;
	int num_active_ranks_{0};

	double time_last_checkpoint_{0};

	bool pt_swap_odd_{};
	std::vector<pt_chain> pt_chains_;
	std::vector<pt_chain_run> pt_chain_runs_;
	// Value-initialized like the other scalar members so it is never read
	// while indeterminate.
	int chain_len_{};

	std::unique_ptr<random_number_generator> rng_;

	std::map<int,int> node_to_chain_run_;
	int current_chain_id_{-1};

	void construct_pt_chains();
	void checkpoint_write();
	void checkpoint_read();
	int schedule_chain_run();
	void pt_global_update(pt_chain& chain, pt_chain_run& chain_run);
	void react();
	void send_action(int action, int destination);
	int assign_new_chain(int node_section);

public:
	runner_pt_master(jobinfo job);
	void start();
};
// Worker side of the parallel tempering runner. Owns the job description, the
// factory used to create the simulation, and the simulation object itself
// (sys_).
// NOTE(review): presumably one instance runs on every non-master MPI rank and
// exchanges status/action messages with runner_pt_master — confirm in
// runner_pt.cpp.
class runner_pt_slave {
private:
jobinfo job_;
mc_factory mccreator_;
std::unique_ptr<mc> sys_;
double time_last_checkpoint_{0};
double time_start_{0};
int rank_{};
int sweeps_since_last_query_{};
int sweeps_before_communication_{};
int sweeps_per_global_update_{};
// -1 presumably means "nothing assigned yet" — confirm.
int task_id_{-1};
int run_id_{-1};
void pt_global_update();
bool is_checkpoint_time();
bool time_is_up();
void send_status(int status);
int recv_action();
void checkpoint_write();
void merge_measurements();
bool accept_new_chain();
int what_is_next(int status);
public:
runner_pt_slave(jobinfo job, mc_factory mccreator);
void start();
};
}
......@@ -83,12 +83,7 @@ void runner_single::read() {
int target_thermalization = task.get<int>("thermalization");
int sweeps = 0;
try {
iodump dump = iodump::open_readonly(job_.rundir(i, 1) + ".dump.h5");
dump.get_root().read("sweeps", sweeps);
} catch(iodump_exception &e) {
}
sweeps = job_.read_dump_progress(i);
tasks_.emplace_back(target_sweeps, target_thermalization, sweeps, 0);
}
}
......
......@@ -2,8 +2,6 @@
namespace loadl {
class iodump;
// used by the runner
struct runner_task {
int target_sweeps;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment