From a5ba2e885b2759162b68da645ed9e0beef8749ab Mon Sep 17 00:00:00 2001 From: Lukas Weber Date: Wed, 28 Aug 2019 15:01:45 +0200 Subject: [PATCH] fix oddities with done jobs --- python/loadl | 2 +- src/runner.cpp | 3 +-- src/runner_pt.cpp | 7 ++++--- src/runner_task.cpp | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/loadl b/python/loadl index 5d64bc1..a582100 100755 --- a/python/loadl +++ b/python/loadl @@ -50,7 +50,7 @@ def run(): # check age of the different files binary_modtime = os.stat(job.jobconfig['mc_binary']).st_mtime try: - f = next(glob.iglob('{}.data/{}/*.h5'.format(job.jobname,job.tasks.keys()[-1]))) # only check one of the output files for speed + f = next(glob.iglob('{}.data/*/*.h5'.format(job.jobname))) # only check one of the output files for speed data_modtime = os.stat(f).st_mtime label = 'Warning' if args_run.force else 'Error' diff --git a/src/runner.cpp b/src/runner.cpp index 4d26ef2..5e63b61 100644 --- a/src/runner.cpp +++ b/src/runner.cpp @@ -77,7 +77,6 @@ void runner_master::react() { int node = stat.MPI_SOURCE; if(node_status == S_IDLE) { current_task_id_ = get_new_task_id(current_task_id_); - if(current_task_id_ < 0) { send_action(A_EXIT, node); num_active_ranks_--; @@ -85,7 +84,7 @@ void runner_master::react() { send_action(A_NEW_JOB, node); tasks_[current_task_id_].scheduled_runs++; int msg[3] = {current_task_id_, tasks_[current_task_id_].scheduled_runs, - tasks_[current_task_id_].target_sweeps - tasks_[current_task_id_].sweeps}; + std::max(1,tasks_[current_task_id_].target_sweeps - tasks_[current_task_id_].sweeps)}; MPI_Send(&msg, sizeof(msg) / sizeof(msg[0]), MPI_INT, node, T_NEW_JOB, MPI_COMM_WORLD); } } else if(node_status == S_BUSY) { diff --git a/src/runner_pt.cpp b/src/runner_pt.cpp index 21b5255..b770875 100644 --- a/src/runner_pt.cpp +++ b/src/runner_pt.cpp @@ -350,7 +350,8 @@ void runner_pt_master::start() { MPI_Comm tmp; MPI_Comm_split(MPI_COMM_WORLD, MPI_UNDEFINED, 0, &tmp); - for(int rank_section = 0; rank_section < (num_active_ranks_ - 1) / chain_len_; rank_section++) { + int chain_count = (num_active_ranks_ - 1) / chain_len_; + for(int rank_section = 0; rank_section < chain_count; rank_section++) { assign_new_chain(rank_section); } @@ -394,7 +395,6 @@ int runner_pt_master::schedule_chain_run() { int runner_pt_master::assign_new_chain(int rank_section) { int chain_run_id = schedule_chain_run(); - for(int target = 0; target < chain_len_; target++) { int msg[3] = {-1, 0, 0}; if(chain_run_id >= 0) { @@ -402,7 +402,7 @@ int runner_pt_master::assign_new_chain(int rank_section) { auto &chain = pt_chains_[chain_run.id]; msg[0] = chain.task_ids[target]; msg[1] = chain_run.run_id; - msg[2] = chain.target_sweeps + chain.sweeps; + msg[2] = std::max(1, chain.target_sweeps - chain.sweeps); } else { // this will prompt the slave to quit num_active_ranks_--; @@ -577,6 +577,7 @@ void runner_pt_slave::start() { job_.jobfile["jobconfig"].defined("pt_parameter_optimization"); if(!accept_new_chain()) { + job_.log(fmt::format("rank {} exits: out of work", rank_)); return; } diff --git a/src/runner_task.cpp b/src/runner_task.cpp index 100ed23..f313c6b 100644 --- a/src/runner_task.cpp +++ b/src/runner_task.cpp @@ -9,6 +9,6 @@ runner_task::runner_task(int target_sweeps, int sweeps, scheduled_runs{scheduled_runs} {} bool runner_task::is_done() const { - return sweeps > target_sweeps; + return sweeps >= target_sweeps; } } -- GitLab