Lukas Weber / load_leveller

Commit 78e37406, authored Oct 21, 2019 by Lukas Weber

    format and return 1 on unfinished runs

parent 6128a19d
18 changed files
python/loadleveller/clusterutils.py

@@ -16,6 +16,7 @@ batchscript_claix18 = '''#!/usr/bin/env zsh
 {custom_cmds}
 {mpirun} $FLAGS_MPI_BATCH {mc_cmd}
+{custom_post_cmds}
 '''

 batch_commands = {
@@ -31,6 +32,7 @@ def generate_batchscript_claix18(cmd, jobname, jobconfig):
     if 'project' in jobconfig:
         custom_cmds += '#SBATCH --account={}\n'.format(jobconfig['project'])
     custom_cmds += jobconfig.get('custom_cmds', '')
+    custom_post_cmds = jobconfig.get('custom_post_cmds', '')
     try:
         return template.format(
@@ -40,6 +42,7 @@ def generate_batchscript_claix18(cmd, jobname, jobconfig):
             walltime=jobconfig['mc_walltime'],
             num_cores=jobconfig['num_cores'],
             custom_cmds=custom_cmds,
+            custom_post_cmds=custom_post_cmds,
             mc_cmd=' '.join(cmd))
     except KeyError as e:
src/evalable.cpp

@@ -83,7 +83,8 @@ void evalable::jackknife(const results &res, observable_result &obs_res) const {
 	if(jacked_eval_mean.size() != jacked_eval.size()) {
 		throw std::runtime_error(
 		    fmt::format("evalable '{}': evalables must not change their dimensions depending "
-		                "on the input", name_));
+		                "on the input",
+		                name_));
 	}
 	for(size_t i = 0; i < jacked_eval.size(); i++) {
src/iodump.cpp

@@ -48,7 +48,7 @@ const char *iodump_exception::what() const noexcept {
 }

 iodump::group::group(hid_t parent, const std::string &filename, const std::string &path)
     : filename_{filename} {
 	group_ = H5Gopen(parent, path.c_str(), H5P_DEFAULT);
 	if(group_ < 0) {
 		group_ = H5Gcreate2(parent, path.c_str(), H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
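The constructor above relies on HDF5's negative return values to fall back from opening a group to creating it. A minimal standalone sketch of the same open-or-create idiom (a plain function for illustration, not the library's API):

#include <hdf5.h>

// Try to open the group at `path` under `parent`; if it does not exist yet,
// create it. Mirrors the fallback in iodump::group::group above.
static hid_t open_or_create_group(hid_t parent, const char *path) {
	hid_t group = H5Gopen(parent, path, H5P_DEFAULT);
	if(group < 0) {
		group = H5Gcreate2(parent, path, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
	}
	return group;
}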
src/jobinfo.cpp

@@ -140,7 +140,7 @@ void jobinfo::concatenate_results() {
 		std::vector<char> buf(size + 1, 0);
 		res_file.read(buf.data(), size);
 		cat_results << buf.data();
 		if(i < task_names.size() - 1) {
 			cat_results << ",";
 		}
 		cat_results << "\n";
src/mc.cpp

@@ -86,8 +86,8 @@ void mc::_write(const std::string &dir) {
 		if(pt_mode_) {
 			therm *= pt_sweeps_per_global_update_;
 		}
 		g.write("thermalization_sweeps", std::min(sweep_, therm));
 		g.write("sweeps", sweep_ - std::min(sweep_, therm));
 	}

 	clock_gettime(CLOCK_MONOTONIC_RAW, &tend);
src/mc.h

@@ -15,6 +15,7 @@ private:
 	size_t sweep_{0};
 	size_t therm_{0};
+	int pt_sweeps_per_global_update_{-1};

 protected:
 	parser param;
 	std::unique_ptr<random_number_generator> rng;
src/merger.cpp

@@ -67,10 +67,10 @@ results merge(const std::vector<std::string> &filenames, const std::vector<evala
 			    "merge: sample count is not an integer multiple of the vector length. Corrupt "
 			    "file?"};
 		}
 		sample_size /= vector_length;
 		obs.total_sample_count += sample_size - std::min(sample_size, sample_skip);
 		obs.mean.resize(vector_length);
 		obs.error.resize(vector_length);
 		obs.autocorrelation_time.resize(vector_length);
@@ -98,9 +98,9 @@ results merge(const std::vector<std::string> &filenames, const std::vector<evala
 				    min_bin_count + cbrt(obs.total_sample_count - min_bin_count);
 			}
 		} else {
 			obs.rebinning_bin_count = obs.total_sample_count / rebinning_bin_length;
 		}
 		if(obs.rebinning_bin_count == 0) {
 			continue;
 		}
@@ -124,14 +124,14 @@ results merge(const std::vector<std::string> &filenames, const std::vector<evala
 		// total_sample_count. In that case, we throw away the leftover samples.
 		//
 		size_t vector_length = obs.mean.size();
-		for(size_t i = sample_skip * vector_length;
-		    metadata[obs_name].sample_counter < obs.rebinning_bin_count * obs.rebinning_bin_length &&
-		    i < samples.size();
+		for(size_t i = sample_skip * vector_length;
+		    metadata[obs_name].sample_counter <
+		        obs.rebinning_bin_count * obs.rebinning_bin_length &&
+		    i < samples.size();
 		    i++) {
 			size_t vector_idx = i % vector_length;
 			obs.mean[vector_idx] += samples[i];
 			if(vector_idx == vector_length - 1) {
 				metadata[obs_name].sample_counter++;
@@ -140,15 +140,15 @@ results merge(const std::vector<std::string> &filenames, const std::vector<evala
 		}
 	}

 	for(auto &[obs_name, obs] : res.observables) {
-		assert(metadata[obs_name].sample_counter == obs.rebinning_bin_count * obs.rebinning_bin_length);
+		assert(metadata[obs_name].sample_counter ==
+		       obs.rebinning_bin_count * obs.rebinning_bin_length);
 		if(obs.rebinning_bin_count == 0) {
 			continue;
 		}
 		for(auto &mean : obs.mean) {
 			mean /= obs.rebinning_bin_count * obs.rebinning_bin_length;
 		}
 		metadata[obs_name].sample_counter = 0;
 	}
@@ -165,7 +165,7 @@ results merge(const std::vector<std::string> &filenames, const std::vector<evala
 		g.read(fmt::format("{}/samples", obs_name), samples);
 		for(size_t i = sample_skip * vector_length;
 		    obs_meta.current_rebin < obs.rebinning_bin_count && i < samples.size(); i++) {
 			size_t vector_idx = i % vector_length;
@@ -192,7 +192,7 @@ results merge(const std::vector<std::string> &filenames, const std::vector<evala
 				obs_meta.current_rebin_filling = 0;
 			}
 		}
 		if(vector_idx == vector_length - 1) {
 			metadata[obs_name].sample_counter++;
 		}
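The loops above only accumulate samples up to a whole number of rebinning bins and discard the leftovers. A self-contained sketch of that averaging rule (illustrative names, not the merger's API):

#include <cassert>
#include <cstddef>
#include <vector>

// Average over exactly bin_count * bin_len samples, dropping any leftover
// tail, like the sample_counter bookkeeping in merge() above.
static double rebinned_mean(const std::vector<double> &samples, std::size_t bin_count,
                            std::size_t bin_len) {
	std::size_t used = bin_count * bin_len;
	assert(used <= samples.size());
	double sum = 0;
	for(std::size_t i = 0; i < used; i++) {
		sum += samples[i];
	}
	return sum / used;
}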
src/results.cpp

@@ -10,7 +10,7 @@ void results::write_json(const std::string &filename, const std::string &taskdir
 	using json = nlohmann::json;

 	json obs_list;
 	for(auto &[obs_name, obs] : observables) {
 		double max_auto_time = 0;
 		if(obs.autocorrelation_time.size() > 0) {
@@ -19,21 +19,16 @@ void results::write_json(const std::string &filename, const std::string &taskdir
 		}
 		obs_list[obs_name] = {
 		    {"rebin_len", obs.rebinning_bin_length},
 		    {"rebin_count", obs.rebinning_bin_count},
 		    {"internal_bin_len", obs.internal_bin_length},
 		    {"autocorr_time", max_auto_time},
 		    {"mean", obs.mean},
 		    {"error", obs.error},
 		};
 	}
-	nlohmann::json out = {
-	    {"task", taskdir},
-	    {"parameters", params},
-	    {"results", obs_list}
-	};
+	nlohmann::json out = {{"task", taskdir}, {"parameters", params}, {"results", obs_list}};

 	std::ofstream file(filename);
 	file << out.dump(1);
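The collapsed one-line initializer works because nlohmann::json treats a braced list of {key, value} pairs as a JSON object, so the compact form is equivalent to the old multi-line one. A small standalone sketch (hypothetical values, not the library's output):

#include <iostream>
#include <nlohmann/json.hpp>

int main() {
	// Each inner {key, value} pair becomes an object member; nesting braces
	// nests objects, exactly as in the write_json() initializer above.
	nlohmann::json out = {{"task", "task0001"},
	                      {"parameters", {{"T", 1.0}}},
	                      {"results", {{"Energy", 1.23}}}};
	std::cout << out.dump(1) << "\n"; // dump(1): pretty-print with indent width 1
}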
src/results.h

 #pragma once

 #include <map>
-#include <vector>
 #include <nlohmann/json.hpp>
+#include <vector>

 namespace loadl {
src/runner.cpp

@@ -31,10 +31,11 @@ int runner_mpi_start(jobinfo job, const mc_factory &mccreator, int argc, char **
 	int rank;
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);

+	int rc = 0;
 	if(rank == 0) {
 		runner_master r{std::move(job)};
-		r.start();
+		rc = r.start();
 	} else {
 		runner_slave r{std::move(job), mccreator};
 		r.start();
@@ -42,12 +43,12 @@ int runner_mpi_start(jobinfo job, const mc_factory &mccreator, int argc, char **
 	MPI_Finalize();

-	return 0;
+	return rc;
 }

 runner_master::runner_master(jobinfo job) : job_{std::move(job)} {}

-void runner_master::start() {
+int runner_master::start() {
 	MPI_Comm_size(MPI_COMM_WORLD, &num_active_ranks_);

 	job_.log(fmt::format("Starting job '{}'", job_.jobname));
@@ -56,6 +57,9 @@ void runner_master::start() {
 	while(num_active_ranks_ > 1) {
 		react();
 	}
+
+	bool all_done = current_task_id_ < 0;
+	return !all_done;
 }

 int runner_master::get_new_task_id(int old_id) {
@@ -84,15 +88,20 @@ void runner_master::react() {
 		send_action(A_NEW_JOB, node);
 		tasks_[current_task_id_].scheduled_runs++;

-		size_t sweeps_until_comm = 1 + tasks_[current_task_id_].target_sweeps -
-		    std::min(tasks_[current_task_id_].target_sweeps, tasks_[current_task_id_].sweeps);
+		size_t sweeps_until_comm =
+		    1 + tasks_[current_task_id_].target_sweeps -
+		    std::min(tasks_[current_task_id_].target_sweeps, tasks_[current_task_id_].sweeps);
 		assert(current_task_id_ >= 0);
-		uint64_t msg[3] = {static_cast<uint64_t>(current_task_id_),
-		                   static_cast<uint64_t>(tasks_[current_task_id_].scheduled_runs), sweeps_until_comm};
-		MPI_Send(&msg, sizeof(msg) / sizeof(msg[0]), MPI_UINT64_T, node, T_NEW_JOB, MPI_COMM_WORLD);
+		uint64_t msg[3] = {static_cast<uint64_t>(current_task_id_),
+		                   static_cast<uint64_t>(tasks_[current_task_id_].scheduled_runs),
+		                   sweeps_until_comm};
+		MPI_Send(&msg, sizeof(msg) / sizeof(msg[0]), MPI_UINT64_T, node, T_NEW_JOB,
+		         MPI_COMM_WORLD);
 	} else if(node_status == S_BUSY) {
 		uint64_t msg[2];
-		MPI_Recv(msg, sizeof(msg) / sizeof(msg[0]), MPI_UINT64_T, node, T_STATUS, MPI_COMM_WORLD, &stat);
+		MPI_Recv(msg, sizeof(msg) / sizeof(msg[0]), MPI_UINT64_T, node, T_STATUS, MPI_COMM_WORLD,
+		         &stat);
 		int task_id = msg[0];
 		size_t completed_sweeps = msg[1];
@@ -207,7 +216,8 @@ int runner_slave::what_is_next(int status) {
 	}
 	MPI_Status stat;
 	uint64_t msg[3];
-	MPI_Recv(&msg, sizeof(msg) / sizeof(msg[0]), MPI_UINT64_T, 0, T_NEW_JOB, MPI_COMM_WORLD, &stat);
+	MPI_Recv(&msg, sizeof(msg) / sizeof(msg[0]), MPI_UINT64_T, 0, T_NEW_JOB, MPI_COMM_WORLD,
+	         &stat);
 	task_id_ = msg[0];
 	run_id_ = msg[1];
 	sweeps_before_communication_ = msg[2];
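Only rank 0 (the master) knows whether every task reached its target sweeps, so only its start() feeds the return code; the slaves still leave through the same MPI_Finalize() path. A minimal sketch of the pattern this commit introduces (not the library's API), showing how the nonzero exit status becomes visible to a scheduler or wrapper script:

#include <mpi.h>

// Rank 0 decides the job status; every rank exits with rc, so the process
// exit code (1 = unfinished runs, as in this commit) is visible to a batch
// scheduler or a rerun script.
int main(int argc, char **argv) {
	MPI_Init(&argc, &argv);
	int rank;
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);

	int rc = 0;
	if(rank == 0) {
		bool all_done = false; // placeholder for the master's task bookkeeping
		rc = !all_done;        // nonzero signals "unfinished runs"
	}
	MPI_Finalize();
	return rc;
}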
src/runner.h

@@ -32,7 +32,7 @@ private:
 public:
 	runner_master(jobinfo job);
-	void start();
+	int start();
 };

 class runner_slave {
src/runner_pt.cpp

 #include "runner_pt.h"
-#include <fstream>
 #include "util.h"
+#include <fstream>

 namespace loadl {

@@ -85,12 +85,11 @@ void pt_chain::clear_histograms() {
 	std::fill(rejection_rates.begin(), rejection_rates.end(), 0);
 }

 // https://arxiv.org/pdf/1905.02939.pdf
 std::tuple<double, double> pt_chain::optimize_params() {
 	std::vector<double> rejection_est(rejection_rates);
 	bool odd = false;
 	for(auto &r : rejection_est) {
 		r /= rejection_rate_entries[odd];
 		odd = !odd;
 		if(r == 0) {
 			// ensure the comm_barrier is invertible
@@ -100,30 +99,29 @@ std::tuple<double, double> pt_chain::optimize_params() {
 	std::vector<double> comm_barrier(params.size());

 	double sum{};
 	double efficiency{};
 	for(size_t i = 0; i < comm_barrier.size() - 1; i++) {
 		comm_barrier[i] = sum;
 		sum += rejection_est[i];
 		efficiency += rejection_est[i] / (1 - rejection_est[i]);
 	}
 	comm_barrier[comm_barrier.size() - 1] = sum;

 	monotonic_interpolator lambda{comm_barrier, params};

 	double convergence{};
 	for(size_t i = 1; i < params.size() - 1; i++) {
 		double new_param = lambda(sum * i / (params.size() - 1));
 		double d = (new_param - params[i]);
 		convergence += d * d;
 		params[i] = new_param;
 	}

 	convergence = sqrt(convergence) / params.size();

 	double round_trip_rate = (1 + sum) / (1 + efficiency);

 	return std::tie(round_trip_rate, convergence);
 }
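As a math summary of optimize_params() above (nothing beyond what the code computes), with r_i the normalized rejection estimate of link i and N = params.size():

\text{comm\_barrier}_i = \sum_{k<i} r_k, \qquad
p_i^{\mathrm{new}} = \Lambda\!\left(\frac{i}{N-1}\sum_k r_k\right)
\quad\text{with } \Lambda \text{ the monotone interpolant of } (\text{comm\_barrier}, p),

\text{round\_trip\_rate} = \frac{1 + \sum_k r_k}{1 + \sum_k r_k/(1-r_k)}, \qquad
\text{convergence} = \frac{1}{N}\sqrt{\sum_i \left(p_i^{\mathrm{new}} - p_i\right)^2}.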
@@ -137,10 +135,11 @@ int runner_pt_start(jobinfo job, const mc_factory &mccreator, int argc, char **a
 	int rank;
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);

+	int rc = 0;
 	if(rank == 0) {
 		runner_pt_master r{std::move(job)};
-		r.start();
+		rc = r.start();
 	} else {
 		runner_pt_slave r{std::move(job), mccreator};
 		r.start();
@@ -148,7 +147,7 @@ int runner_pt_start(jobinfo job, const mc_factory &mccreator, int argc, char **a
 	MPI_Finalize();

-	return 0;
+	return rc;
 }

 runner_pt_master::runner_pt_master(jobinfo job) : job_{std::move(job)} {
@@ -187,13 +186,14 @@ void runner_pt_master::construct_pt_chains() {
 		chain.task_ids.at(chain_pos) = i;

 		const char *pt_sweep_error =
-		    "chain {}: task {}: in parallel tempering mode, sweeps are measured in global updates and need to be the "
+		    "chain {}: task {}: in parallel tempering mode, sweeps are measured in global updates "
+		    "and need to be the "
 		    "same within each chain: {} = {} != {}";

 		int64_t target_sweeps = task.get<int>("sweeps");
 		if(chain.target_sweeps >= 0 && target_sweeps != chain.target_sweeps) {
 			throw std::runtime_error{fmt::format(pt_sweep_error, chain.id, i, "target_sweeps",
 			                                     chain.target_sweeps, target_sweeps)};
 		}
 		chain.target_sweeps = target_sweeps;
@@ -209,7 +209,8 @@ void runner_pt_master::construct_pt_chains() {
 		int64_t sweeps_per_global_update = task.get<int>("pt_sweeps_per_global_update");
 		int64_t sweeps = job_.read_dump_progress(i) / sweeps_per_global_update;
 		if(chain.sweeps >= 0 && sweeps != chain.sweeps) {
-			throw std::runtime_error{fmt::format(pt_sweep_error, chain.id, i, "sweeps", chain.sweeps, sweeps)};
+			throw std::runtime_error{
+			    fmt::format(pt_sweep_error, chain.id, i, "sweeps", chain.sweeps, sweeps)};
 		}
 		chain.sweeps = sweeps;
 	}
@@ -226,7 +227,7 @@ void runner_pt_master::construct_pt_chains() {
 		chain_len_ = c.task_ids.size();
 		c.rejection_rates.resize(chain_len_ - 1);
 		if(po_config_.enabled) {
 			c.entries_before_optimization = po_config_.nsamples_initial;
@@ -288,7 +289,7 @@ void runner_pt_master::write_statistics(const pt_chain_run &chain_run) {
 	auto g = stat.get_root();

 	g.write("chain_length", chain_len_);
 	auto cg = g.open_group(fmt::format("chain{:04d}_run{:04d}", chain_run.id, chain_run.run_id));
 	cg.insert_back("rank_to_pos", chain_run.rank_to_pos);
 }
@@ -299,7 +300,7 @@ void runner_pt_master::write_param_optimization_statistics() {
 	auto g = stat.get_root();
 	g.write("chain_length", chain_len_);
 	for(auto &chain : pt_chains_) {
 		auto cg = g.open_group(fmt::format("chain{:04d}", chain.id));
 		cg.insert_back("params", chain.params);
@@ -340,12 +341,10 @@ void runner_pt_master::checkpoint_write() {
 	}
 }

-void runner_pt_master::start() {
+int runner_pt_master::start() {
 	MPI_Comm_size(MPI_COMM_WORLD, &num_active_ranks_);

-	po_config_.enabled =
-	    job_.jobfile["jobconfig"].defined("pt_parameter_optimization");
+	po_config_.enabled = job_.jobfile["jobconfig"].defined("pt_parameter_optimization");
 	if(po_config_.enabled) {
 		job_.log("using feedback parameter optimization");
 		const auto &po = job_.jobfile["jobconfig"]["pt_parameter_optimization"];
@@ -380,6 +379,15 @@ void runner_pt_master::start() {
 		}
 	}

 	checkpoint_write();
+
+	bool all_done = true;
+	for(auto &c : pt_chains_) {
+		if(!c.is_done()) {
+			all_done = false;
+			break;
+		}
+	}
+	return !all_done;
 }

 int runner_pt_master::schedule_chain_run() {
@@ -430,7 +438,8 @@ int runner_pt_master::assign_new_chain(int rank_section) {
 }

 void runner_pt_master::pt_param_optimization(pt_chain &chain) {
-	if(std::min(chain.rejection_rate_entries[0], chain.rejection_rate_entries[1]) >= chain.entries_before_optimization) {
+	if(std::min(chain.rejection_rate_entries[0], chain.rejection_rate_entries[1]) >=
+	   chain.entries_before_optimization) {
 		chain.entries_before_optimization *= po_config_.nsamples_growth;

 		auto [efficiency, convergence] = chain.optimize_params();
@@ -451,7 +460,8 @@ void runner_pt_master::react() {
 	int rank = stat.MPI_SOURCE - 1;
 	if(rank_status == S_BUSY) {
 		int64_t msg[1];
-		MPI_Recv(msg, sizeof(msg) / sizeof(msg[0]), MPI_INT64_T, rank + 1, 0, MPI_COMM_WORLD, &stat);
+		MPI_Recv(msg, sizeof(msg) / sizeof(msg[0]), MPI_INT64_T, rank + 1, 0, MPI_COMM_WORLD,
+		         &stat);
 		int64_t completed_sweeps = msg[0];

 		int chain_run_id = rank_to_chain_run_[rank / chain_len_];
@@ -549,7 +559,7 @@ void runner_pt_master::pt_global_update(pt_chain &chain, pt_chain_run &chain_run
 		double w1 = chain_run.weight_ratios[i];
 		double w2 = chain_run.weight_ratios[i + 1];

 		double p = std::min(exp(w1 + w2), 1.);
 		double r = rng_->random_double();

 		chain.rejection_rates[i] += 1 - p;
@@ -592,8 +602,7 @@ void runner_pt_slave::start() {
 	MPI_Comm_rank(chain_comm_, &chain_rank_);

-	bool use_param_optimization =
-	    job_.jobfile["jobconfig"].defined("pt_parameter_optimization");
+	bool use_param_optimization = job_.jobfile["jobconfig"].defined("pt_parameter_optimization");

 	if(!accept_new_chain()) {
 		job_.log(fmt::format("rank {} exits: out of work", rank_));
@@ -704,7 +713,8 @@ void runner_pt_slave::pt_global_update() {
 bool runner_pt_slave::accept_new_chain() {
 	int64_t msg[3];
-	MPI_Recv(&msg, sizeof(msg) / sizeof(msg[0]), MPI_INT64_T, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+	MPI_Recv(&msg, sizeof(msg) / sizeof(msg[0]), MPI_INT64_T, 0, 0, MPI_COMM_WORLD,
+	         MPI_STATUS_IGNORE);
 	task_id_ = msg[0];
 	run_id_ = msg[1];
 	sweeps_before_communication_ = msg[2];
@@ -713,8 +723,8 @@ bool runner_pt_slave::accept_new_chain() {
 		return false;
 	}

 	sweeps_per_global_update_ = job_.jobfile["tasks"][job_.task_names[task_id_]].get<int64_t>(
 	    "pt_sweeps_per_global_update");

 	sys_ = std::unique_ptr<mc>{mccreator_(job_.jobfile["tasks"][job_.task_names[task_id_]])};
 	sys_->pt_mode_ = true;
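The swap probability in pt_global_update() above is the standard replica-exchange Metropolis rule; with w_1, w_2 the log weight ratios of the two neighboring replicas, the code computes

p = \min\!\left(1,\; e^{w_1 + w_2}\right),

and accumulates 1 - p into that link's rejection_rates entry, which is exactly the statistic optimize_params() later normalizes.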
src/runner_pt.h

@@ -19,7 +19,7 @@ struct pt_chain {
 	int entries_before_optimization{0};
 	std::vector<double> rejection_rates;
 	std::vector<int> rejection_rate_entries{0, 0};

 	bool is_done();
 	void checkpoint_read(const iodump::group &g);
@@ -90,7 +90,7 @@ private:
 public:
 	runner_pt_master(jobinfo job);
-	void start();
+	int start();
 };

 class runner_pt_slave {
src/runner_single.cpp

@@ -11,8 +11,7 @@ namespace loadl {
 int runner_single_start(jobinfo job, const mc_factory &mccreator, int, char **) {
 	runner_single r{std::move(job), mccreator};
-	r.start();
-	return 0;
+	return r.start();
 }

 runner_single::runner_single(jobinfo job, mc_factory mccreator)
@@ -50,7 +49,8 @@ int runner_single::start() {
 		task_id_ = get_new_task_id(task_id_);
 	}
-	return 0;
+	bool all_done = task_id_ < 0;
+	return !all_done;
 }

 bool runner_single::is_checkpoint_time() const {
src/runner_task.cpp

@@ -3,10 +3,8 @@
 namespace loadl {

-runner_task::runner_task(size_t target_sweeps, size_t sweeps,
-                         int scheduled_runs)
-    : target_sweeps{target_sweeps}, sweeps{sweeps},
-      scheduled_runs{scheduled_runs} {}
+runner_task::runner_task(size_t target_sweeps, size_t sweeps, int scheduled_runs)
+    : target_sweeps{target_sweeps}, sweeps{sweeps}, scheduled_runs{scheduled_runs} {}

 bool runner_task::is_done() const {
 	return sweeps >= target_sweeps;
src/util.cpp

@@ -4,66 +4,69 @@
 namespace loadl {

-//https://en.wikipedia.org/wiki/Monotone_cubic_interpolation
-monotonic_interpolator::monotonic_interpolator(const std::vector<double>& x,
-    const std::vector<double>& y)
-    : x_(x), y_(y), m_(x.size()) {
+// https://en.wikipedia.org/wiki/Monotone_cubic_interpolation
+monotonic_interpolator::monotonic_interpolator(const std::vector<double> &x,
+                                               const std::vector<double> &y)
+    : x_(x), y_(y), m_(x.size()) {
 	assert(x.size() == y.size());
 	assert(x.size() > 1);

 	std::vector<double> d(x.size());

 	for(size_t i = 0; i < x.size() - 1; i++) {
 		d[i] = (y[i + 1] - y[i]) / (x[i + 1] - x[i]);
 	}

 	m_[0] = d[0];
 	m_[x.size() - 1] = d[x.size() - 2];
 	for(std::size_t i = 1; i < x.size() - 1; i++) {
 		m_[i] = (d[i - 1] + d[i]) / 2;
 		if(d[i - 1] * d[i] <= 0) {
 			m_[i] = 0;
 		}
 	}

 	for(std::size_t i = 0; i < x.size() - 1; i++) {
 		double a = m_[i] / d[i];
 		double b = m_[i + 1] / d[i];
 		double r = a * a + b * b;
 		if(r > 9) {
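In the monotone cubic interpolation algorithm the comment cites (Fritsch–Carlson, per the linked Wikipedia article), the r > 9 branch rescales both endpoint slopes of the interval by 3/sqrt(a^2 + b^2) so the cubic stays monotone. A standalone sketch of that limiter step (illustrative code, not the project's implementation):

#include <cmath>

// Fritsch–Carlson slope limiter: a and b are the interval's endpoint slopes
// normalized by the secant d; if a^2 + b^2 > 9, rescale both slopes by
// tau = 3 / sqrt(a^2 + b^2) to preserve monotonicity of the interpolant.
static void limit_slopes(double &m_lo, double &m_hi, double secant) {
	double a = m_lo / secant;
	double b = m_hi / secant;
	double r = a * a + b * b;
	if(r > 9) {
		double tau = 3 / std::sqrt(r);
		m_lo = tau * a * secant;
		m_hi = tau * b * secant;
	}
}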