Commit 932dcf29 authored by laochailan's avatar laochailan
Browse files

make measurement files corruption safe.

At the cost of some extra I/O (one copy of the meas file at each checkpoint),
we can now ensure that the results of the simulation are always in a valid state,
even if the program crashes while writing them out. At least in theory. Let’s see
how it works.

We also changed to the std::filesystem API… meaning there could be bugs…
and loadleveller probably runs on Windows now!
parent 1cf9afcf
#include "iodump.h"
#include <iostream>
#include <sstream>
#include <sys/file.h>
#include <typeinfo>
#include <unistd.h>
#include <filesystem>
namespace loadl {
......@@ -132,7 +131,7 @@ iodump iodump::open_readonly(const std::string &filename) {
iodump iodump::open_readwrite(const std::string &filename) {
H5Eset_auto(H5E_DEFAULT, nullptr, nullptr);
if(access(filename.c_str(), R_OK) != F_OK) {
if(!std::filesystem::exists(filename)) {
create(filename);
}
......@@ -265,9 +264,4 @@ iodump::h5_handle::~h5_handle() {
// Dereferencing an h5_handle yields the raw HDF5 identifier stored in it.
hid_t iodump::h5_handle::operator*() {
    return this->handle_;
}
// Returns true when stat() succeeds on `path`, false otherwise.
bool file_exists(const std::string &path) {
    struct stat info;
    const int status = stat(path.c_str(), &info);
    return status == 0;
}
}
......@@ -3,7 +3,7 @@
#include <fmt/format.h>
#include <hdf5.h>
#include <string>
#include <sys/stat.h>
#include <cassert>
#include <vector>
namespace loadl {
......@@ -272,6 +272,4 @@ void iodump::group::read(const std::string &name, T &value) const {
value = buf.at(0);
}
// utility
bool file_exists(const std::string &path);
}
......@@ -2,11 +2,11 @@
#include "mc.h"
#include "merger.h"
#include <ctime>
#include <dirent.h>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <regex>
#include <filesystem>
namespace loadl {
......@@ -67,20 +67,13 @@ jobinfo::jobinfo(const std::string &jobfile_name) : jobfile{jobfile_name} {
jobname = jobfile.get<std::string>("jobname");
std::string datadir = jobdir();
int rc = mkdir(datadir.c_str(), 0755);
if(rc != 0 && errno != EEXIST) {
throw std::runtime_error{
fmt::format("creation of output directory '{}' failed: {}", datadir, strerror(errno))};
}
std::error_code ec;
std::filesystem::create_directories(datadir, ec);
// perhaps a bit controversially, jobinfo tries to create the task directories. TODO: single file
// output.
for(size_t i = 0; i < task_names.size(); i++) {
int rc = mkdir(taskdir(i).c_str(), 0755);
if(rc != 0 && errno != EEXIST) {
throw std::runtime_error{fmt::format("creation of output directory '{}' failed: {}",
taskdir(i), strerror(errno))};
}
std::filesystem::create_directories(taskdir(i));
}
parser jobconfig{jobfile["jobconfig"]};
......@@ -96,19 +89,12 @@ std::vector<std::string> jobinfo::list_run_files(const std::string &taskdir,
const std::string &file_ending) {
std::regex run_filename{"^run\\d{4,}\\." + file_ending + "$"};
std::vector<std::string> results;
DIR *dir = opendir(taskdir.c_str());
if(dir == nullptr) {
throw std::ios_base::failure(
fmt::format("could not open directory '{}': {}", taskdir, strerror(errno)));
}
struct dirent *result;
while((result = readdir(dir)) != nullptr) {
std::string fname{result->d_name};
if(std::regex_search(fname, run_filename)) {
results.emplace_back(fmt::format("{}/{}", taskdir, fname));
for(const auto &p : std::filesystem::directory_iterator(taskdir)) {
if(std::regex_search(p.path().filename().string(), run_filename)) {
results.emplace_back(p.path());
}
}
closedir(dir);
return results;
}
......
#include "mc.h"
#include <filesystem>
namespace loadl {
mc::mc(const parser &p) : param{p}, measure{p.get<size_t>("binsize")} {
......@@ -67,7 +69,12 @@ void mc::_write(const std::string &dir) {
// blocks limit scopes of the dump file handles to ensure they are closed at the right time.
{
iodump meas_file = iodump::open_readwrite(dir + ".meas.h5");
std::error_code ec;
std::filesystem::copy(dir + ".meas.h5", dir + ".meas.h5.tmp", std::filesystem::copy_options::none, ec);
if(ec && ec != std::errc::no_such_file_or_directory) {
throw std::system_error(ec);
}
iodump meas_file = iodump::open_readwrite(dir + ".meas.h5.tmp");
auto g = meas_file.get_root();
measure.samples_write(g);
}
......@@ -97,11 +104,12 @@ void mc::_write(const std::string &dir) {
// This function is called if it is certain that the *.tmp files have been completely written.
// Important for parallel tempering mode where all slaves in a chain have to write consistent dumps.
//
// Moves "<dir>.dump.h5.tmp" and "<dir>.meas.h5.tmp" onto their final names
// "<dir>.dump.h5" and "<dir>.meas.h5".
void mc::_write_finalize(const std::string &dir) {
// NOTE(review): the dump file is renamed twice below (C rename, then std::filesystem::rename).
// This looks like the removed and the added line of a diff shown together -- confirm which one
// is live; if both ran, the second call would presumably find no source file left to rename.
rename((dir + ".dump.h5.tmp").c_str(), (dir + ".dump.h5").c_str());
std::filesystem::rename(dir + ".dump.h5.tmp", dir + ".dump.h5");
// Promote the temporary measurement file to its final name.
std::filesystem::rename(dir + ".meas.h5.tmp", dir + ".meas.h5");
}
bool mc::_read(const std::string &dir) {
if(!file_exists(dir + ".dump.h5")) {
if(!std::filesystem::exists(dir + ".dump.h5")) {
return false;
}
......
#include "runner_pt.h"
#include "util.h"
#include <fstream>
#include <filesystem>
namespace loadl {
......@@ -250,7 +251,7 @@ void runner_pt_master::checkpoint_read() {
construct_pt_chains();
std::string master_dump_name = job_.jobdir() + "/pt_master.dump.h5";
if(file_exists(master_dump_name)) {
if(std::filesystem::exists(master_dump_name)) {
job_.log(fmt::format("master reading dump from '{}'", master_dump_name));
iodump dump = iodump::open_readonly(master_dump_name);
auto g = dump.get_root();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment