Commit 4f73681a authored by Lukas Weber

remove binsize limitations and fix bugs

parent fc8d34fd
......@@ -20,8 +20,8 @@ void mc::_init() {
// simple profiling support: measure the time spent for sweeps/measurements etc
measure.add_observable("_ll_checkpoint_read_time", 1);
measure.add_observable("_ll_checkpoint_write_time", 1);
measure.add_observable("_ll_measurement_time", pt_mode_ ? pt_sweeps_per_global_update_ : 1000);
measure.add_observable("_ll_sweep_time", pt_mode_ ? pt_sweeps_per_global_update_ : 1000);
measure.add_observable("_ll_measurement_time", 1000);
measure.add_observable("_ll_sweep_time", 1000);
if(pt_mode_) {
if(param.get<bool>("pt_statistics", false)) {
......@@ -69,23 +69,8 @@ void mc::_do_update() {
}
}
void mc::_pt_update_param(const std::string& param_name, double new_param, const std::string &new_dir) {
// take over the bins of the new target dir
{
iodump dump_file = iodump::open_readonly(new_dir + ".dump.h5");
measure.checkpoint_read(dump_file.get_root().open_group("measurements"));
}
auto unclean = measure.is_unclean();
if(unclean) {
throw std::runtime_error(
fmt::format("Unclean observable: {}\nIn parallel tempering mode you have to choose the "
"binsize for all observables so that it is commensurate with "
"pt_sweeps_per_global_update (so that all bins are empty once it happens). "
"If you don’t like this limitation, implement it properly.",
*unclean));
}
// Parallel-tempering parameter switch with another rank.
//
// First exchanges the measurement buffers with target_rank through
// measure.mpi_sendrecv, then hands the new parameter value on to
// pt_update_param.
//
// target_rank: rank to exchange the measurement buffers with
// param_name:  name of the parameter that changes
// new_param:   value the parameter takes after the switch
void mc::_pt_update_param(int target_rank, const std::string& param_name, double new_param) {
measure.mpi_sendrecv(target_rank);
pt_update_param(param_name, new_param);
}
......@@ -102,20 +87,16 @@ double mc::_pt_weight_ratio(const std::string& param_name, double new_param) {
return wr;
}
void mc::measurements_write(const std::string &dir) {
void mc::_write(const std::string &dir) {
struct timespec tstart, tend;
clock_gettime(CLOCK_MONOTONIC_RAW, &tstart);
// blocks limit scopes of the dump file handles to ensure they are closed at the right time.
{
iodump meas_file = iodump::open_readwrite(dir + ".meas.h5");
auto g = meas_file.get_root();
measure.samples_write(g);
}
}
void mc::_write(const std::string &dir) {
struct timespec tstart, tend;
clock_gettime(CLOCK_MONOTONIC_RAW, &tstart);
measurements_write(dir);
{
iodump dump_file = iodump::create(dir + ".dump.h5.tmp");
......
......@@ -55,13 +55,11 @@ public:
void _write(const std::string &dir);
bool _read(const std::string &dir);
void measurements_write(const std::string &dir);
void _write_output(const std::string &filename);
void _do_update();
void _do_measurement();
void _pt_update_param(const std::string& param_name, double new_param, const std::string &new_dir);
void _pt_update_param(int target_rank, const std::string& param_name, double new_param);
double _pt_weight_ratio(const std::string& param_name, double new_param);
void pt_measure_statistics();
......
#include "measurements.h"
#include <fmt/format.h>
#include <mpi.h>
namespace loadl {
bool measurements::observable_name_is_legal(const std::string &obs_name) {
......@@ -31,7 +31,7 @@ void measurements::checkpoint_write(const iodump::group &dump_file) {
// Restores all observables from a checkpoint group.
//
// Every entry of dump_file is taken as the name of an observable: the
// observable is registered with add_observable and then filled from the
// sub-group of the same name.
void measurements::checkpoint_read(const iodump::group &dump_file) {
	for(const auto &obs_name : dump_file) {
		add_observable(obs_name);
		// Read each observable exactly once. The name comes from the group key
		// and is passed in explicitly instead of being read back from the dump.
		observables_.at(obs_name).checkpoint_read(obs_name, dump_file.open_group(obs_name));
	}
}
......@@ -42,12 +42,49 @@ void measurements::samples_write(const iodump::group &meas_file) {
}
}
std::optional<std::string> measurements::is_unclean() const {
for(const auto &obs : observables_) {
if(!obs.second.is_clean()) {
return obs.first;
// Exchanges the contents of every registered observable with target_rank.
//
// Both partner ranks have to call this, each passing the other's rank; the
// sends and receives below only complete when matched by the partner.
// Passing the own rank is a no-op.
//
// On the first exchange with a given partner the two ranks also verify that
// they hold the same observables: the lower rank sends its observable count
// and names, the higher rank compares them against its own and throws
// std::runtime_error on a mismatch. Partners that passed are remembered in
// mpi_checked_targets_, so this check runs only once per partner.
void measurements::mpi_sendrecv(int target_rank) {
	int rank;
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	if(rank == target_rank) {
		return;
	}

	if(mpi_checked_targets_.count(target_rank) == 0) {
		if(rank < target_rank) {
			// Lower rank: announce how many observables there are, then their names.
			unsigned long obscount = observables_.size();
			MPI_Send(&obscount, 1, MPI_UNSIGNED_LONG, target_rank, 0, MPI_COMM_WORLD);
			for(const auto& [name, obs] : observables_) {
				(void)obs;
				// +1: the terminating '\0' is transmitted as well, so the
				// receiver can treat the buffer as a C string.
				int size = name.size()+1;
				MPI_Send(&size, 1, MPI_INT, target_rank, 0, MPI_COMM_WORLD);
				MPI_Send(name.c_str(), size, MPI_CHAR, target_rank, 0, MPI_COMM_WORLD);
			}
		} else {
			// Higher rank: compare what the partner announces with the local set.
			unsigned long obscount;
			MPI_Recv(&obscount, 1, MPI_UNSIGNED_LONG, target_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
			if(obscount != observables_.size()) {
				throw std::runtime_error{fmt::format("ranks {}&{} have to contain identical sets of registered observables. But they contain different amounts of observables! {} != {}.", target_rank, rank, obscount, observables_.size())};
			}

			// Names arrive in the sender's iteration order and are compared
			// one by one against the local iteration order.
			for(const auto& [name, obs] : observables_) {
				(void)obs;
				int size;
				MPI_Recv(&size, 1, MPI_INT, target_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
				std::vector<char> buf(size);
				MPI_Recv(buf.data(), size, MPI_CHAR, target_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

				const std::string received{buf.data()};
				if(received != name) {
					throw std::runtime_error{fmt::format("ranks {}&{} have to contain identical sets of registered observables. Found '{}' != '{}'.", target_rank, rank, name, received)};
				}
			}
		}
		mpi_checked_targets_.insert(target_rank);
	}

	// The actual exchange: every observable trades its state with its
	// counterpart on the partner rank.
	for(auto& [name, obs] : observables_) {
		(void)name;
		obs.mpi_sendrecv(target_rank);
	}
}
}
......@@ -6,6 +6,7 @@
#include <string>
#include <valarray>
#include <vector>
#include <set>
namespace loadl {
......@@ -26,11 +27,11 @@ public:
// should be opened in read/write mode.
void samples_write(const iodump::group &meas_file);
// returns nullopt if all observables are clean,
// otherwise the name of a non-empty observable
std::optional<std::string> is_unclean() const;
// Exchanges the contents of all observables with target_rank.
// Both ranks must hold the same set of observables; this is verified on the
// first exchange with each partner, and a mismatch raises std::runtime_error.
// Does nothing when target_rank is the calling rank.
void mpi_sendrecv(int target_rank);
private:
std::set<int> mpi_checked_targets_;
std::map<std::string, observable> observables_;
};
......
#include "observable.h"
#include <fmt/format.h>
#include <iostream>
#include <mpi.h>
namespace loadl {
observable::observable(std::string name, size_t bin_length, size_t vector_length)
......@@ -23,7 +22,6 @@ void observable::checkpoint_write(const iodump::group &dump_file) const {
// Another sanity check: the samples_ array should contain one partial bin.
assert(samples_.size() == vector_length_);
dump_file.write("name", name_);
dump_file.write("vector_length", vector_length_);
dump_file.write("bin_length", bin_length_);
dump_file.write("current_bin_filling", current_bin_filling_);
......@@ -48,8 +46,8 @@ void observable::measurement_write(const iodump::group &meas_file) {
current_bin_ = 0;
}
void observable::checkpoint_read(const iodump::group &d) {
d.read("name", name_);
void observable::checkpoint_read(const std::string& name, const iodump::group &d) {
name_ = name;
d.read("vector_length", vector_length_);
d.read("bin_length", bin_length_);
d.read("current_bin_filling", current_bin_filling_);
......@@ -57,11 +55,22 @@ void observable::checkpoint_read(const iodump::group &d) {
current_bin_ = 0;
}
bool observable::is_clean() const {
if(current_bin_filling_ != 0) {
std::cout << current_bin_filling_ << "\n";
}
return current_bin_ == 0 && current_bin_filling_ == 0;
// Swaps the complete state of this observable (bin bookkeeping and sample
// buffer) with the same observable on target_rank.
//
// Both partner ranks must call this, each passing the other's rank: every
// transfer below is a combined send/receive that is matched by the identical
// call on the partner.
void observable::mpi_sendrecv(int target_rank) {
	// Step 1: trade the four bookkeeping fields in place. This has to happen
	// first because the partner's values determine how many samples arrive
	// in step 2.
	constexpr int msg_size = 4;
	unsigned long msg[msg_size] = {current_bin_, vector_length_, bin_length_, current_bin_filling_};
	MPI_Sendrecv_replace(msg, msg_size, MPI_UNSIGNED_LONG, target_rank, 0, target_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
	current_bin_ = msg[0];
	vector_length_ = msg[1];
	bin_length_ = msg[2];
	current_bin_filling_ = msg[3];

	// Step 2: trade the samples. samples_ still holds the local data at this
	// point, while the receive buffer is sized from the partner's fields
	// received above.
	std::vector<double> recvbuf((current_bin_ + 1) * vector_length_);
	MPI_Sendrecv(samples_.data(), static_cast<int>(samples_.size()), MPI_DOUBLE, target_rank, 0, recvbuf.data(), static_cast<int>(recvbuf.size()), MPI_DOUBLE, target_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

	// Adopt the received buffer without copying it a second time.
	samples_.swap(recvbuf);
}
}
......@@ -25,11 +25,11 @@ public:
// This will empty the cache of already completed bins
void measurement_write(const iodump::group &meas_file);
void checkpoint_read(const iodump::group &dump_file);
// true if there are no samples in the bin
bool is_clean() const;
// Restores the observable from a checkpoint group. The name is supplied by
// the caller instead of being read from the dump.
void checkpoint_read(const std::string& name, const iodump::group &dump_file);
// Swaps the state of this observable (bin bookkeeping and samples) with the
// corresponding observable on target_rank.
// Used in parallel tempering mode.
void mpi_sendrecv(int target_rank);
private:
static const size_t initial_bin_length = 1000;
......
This diff is collapsed.
......@@ -9,18 +9,18 @@ struct pt_chain {
int id{};
std::vector<int> task_ids;
std::vector<double> params;
std::vector<int> nup_histogram;
std::vector<int> ndown_histogram;
int sweeps{-1};
int target_sweeps{-1};
int target_thermalization{-1};
int scheduled_runs{};
// parameter optimization
std::vector<int> nup_histogram;
std::vector<int> ndown_histogram;
int entries_before_optimization{0};
int histogram_entries{};
int scheduled_runs{};
bool is_done();
void checkpoint_read(const iodump::group &g);
void checkpoint_write(const iodump::group &g);
......@@ -36,12 +36,14 @@ private:
public:
int id{};
int run_id{};
bool swap_odd{};
pt_chain_run(const pt_chain &chain, int run_id);
static pt_chain_run checkpoint_read(const iodump::group &g);
void checkpoint_write(const iodump::group &g);
std::vector<int> rank_to_pos;
std::vector<int> switch_partners;
std::vector<double> weight_ratios;
std::vector<int> last_visited;
......@@ -57,7 +59,6 @@ private:
double time_last_checkpoint_{0};
bool use_param_optimization_{};
bool pt_swap_odd_{};
std::vector<pt_chain> pt_chains_;
std::vector<pt_chain_run> pt_chain_runs_;
int chain_len_;
......@@ -108,8 +109,8 @@ private:
void pt_global_update();
bool is_checkpoint_time();
bool time_is_up();
int negotiate_timeout();
void send_status(int status);
int recv_action();
void checkpoint_write();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment