Commit 2c7fb9e5 authored by Julian Johannes Steinsberger-Dührßen

Added AdamW optimizer, misc bug fixes

parent cb761995
Pipeline #318608 failed with stage in 21 seconds
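The headline change is a hand-written AdamW optimizer for the Gluon backend (template Optimizer/AdamW.ftl, emitted as AdamW.py; the full file appears further down). As a rough reminder of what AdamW computes, here is the standard decoupled-weight-decay update step in plain Python; this is textbook AdamW, not code taken from this commit:

import numpy as np

def adamw_step(w, g, m, v, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8, wd=0.01):
    # Adam first- and second-moment estimates with bias correction
    m = beta1 * m + (1 - beta1) * g
    v = beta2 * v + (1 - beta2) * g * g
    m_hat = m / (1 - beta1 ** t)
    v_hat = v / (1 - beta2 ** t)
    # decoupled weight decay: wd acts on the weight directly instead of being folded into the gradient
    w = w - lr * (m_hat / (np.sqrt(v_hat) + eps) + wd * w)
    return w, m, v

The only difference from plain Adam is the last line: weight decay is applied to the parameter itself rather than added to the gradient.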
......@@ -18,7 +18,7 @@
<!-- .. SE-Libraries .................................................. -->
<CNNArch.version>0.3.5-SNAPSHOT</CNNArch.version>
-<CNNTrain.version>0.3.10-SNAPSHOT</CNNTrain.version>
+<CNNTrain.version>0.3.11-SNAPSHOT</CNNTrain.version>
<CNNArch2X.version>0.0.6-SNAPSHOT</CNNArch2X.version>
<embedded-montiarc-math-opt-generator>0.1.6</embedded-montiarc-math-opt-generator>
<EMADL2PythonWrapper.version>0.0.2-SNAPSHOT</EMADL2PythonWrapper.version>
......
......@@ -130,12 +130,25 @@ public class CNNTrain2Gluon extends CNNTrainGenerator {
Map<String, String> fileContentMap = new HashMap<>();
//Context information and optimizer for local adaptation during prediction for the replay memory layer (the second is only applicable for supervised learning)
String cnnTrainLAOptimizerTemplateContent = templateConfiguration.processTemplate(ftlContext, "CNNLAOptimizer.ftl");
fileContentMap.put("CNNLAOptimizer_" + getInstanceName() + ".h", cnnTrainLAOptimizerTemplateContent);
//AdamW optimizer if used for training
String optimizerName = configuration.getOptimizer().getName();
Optional<OptimizerSymbol> criticOptimizer = configuration.getCriticOptimizer();
String criticOptimizerName = "";
if (criticOptimizer.isPresent()){
criticOptimizerName = criticOptimizer.get().getName();
}
if (optimizerName.equals("adamw") || criticOptimizerName.equals("adamw")){
String adamWContent = templateConfiguration.processTemplate(ftlContext, "Optimizer/AdamW.ftl");
fileContentMap.put("AdamW.py", adamWContent);
}
if (configData.isSupervisedLearning()) {
String cnnTrainTrainerTemplateContent = templateConfiguration.processTemplate(ftlContext, "CNNTrainer.ftl");
fileContentMap.put("CNNTrainer_" + getInstanceName() + ".py", cnnTrainTrainerTemplateContent);
//Optimizer for local adaptation during prediction for the replay memory layer
String cnnTrainLAOptimizerTemplateContent = templateConfiguration.processTemplate(ftlContext, "CNNLAOptimizer.ftl");
fileContentMap.put("CNNLAOptimizer_" + getInstanceName() + ".h", cnnTrainLAOptimizerTemplateContent);
} else if (configData.isGan()) {
final String trainerName = "CNNTrainer_" + getInstanceName();
if (!configuration.getDiscriminatorNetwork().isPresent()) {
......@@ -192,7 +205,6 @@ public class CNNTrain2Gluon extends CNNTrainGenerator {
final String ganTrainerContent = templateConfiguration.processTemplate(ftlContext, "gan/Trainer.ftl");
fileContentMap.put(trainerName + ".py", ganTrainerContent);
} else if (configData.isReinforcementLearning()) {
final String trainerName = "CNNTrainer_" + getInstanceName();
final RLAlgorithm rlAlgorithm = configData.getRlAlgorithm();
......
......@@ -40,7 +40,7 @@ class ${tc.fileNameWithoutEnding}:
if os.path.isdir(self._model_dir_):
for file in os.listdir(self._model_dir_):
if ".params" in file and self._model_prefix_ + "_" + str(i) in file:
if ".params" in file and self._model_prefix_ + "_" + str(i) in file and not "loss" in file:
epochStr = file.replace(".params","").replace(self._model_prefix_ + "_" + str(i) + "-","")
epoch = int(epochStr)
if epoch > lastEpoch:
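The added `not "loss" in file` guard keeps separately saved loss-network parameter files from being picked up when the latest checkpoint epoch is detected. A small standalone sketch of that filename scan (the file names below are invented for illustration):

files = ["model_0-0003.params", "model_0-0010.params", "model_0_loss-0010.params"]
prefix, i, last_epoch = "model", 0, 0
for f in files:
    if ".params" in f and prefix + "_" + str(i) in f and not "loss" in f:
        # strip the suffix and the "<prefix>_<i>-" part, leaving the zero-padded epoch number
        last_epoch = max(last_epoch, int(f.replace(".params", "").replace(prefix + "_" + str(i) + "-", "")))
# last_epoch == 10; the loss checkpoint is ignored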
......@@ -69,7 +69,7 @@ class ${tc.fileNameWithoutEnding}:
for file in os.listdir(self._weights_dir_):
if ".params" in file and self._model_prefix_ + "_" + str(i) in file:
if ".params" in file and self._model_prefix_ + "_" + str(i) in file and not "loss" in file:
epochStr = file.replace(".params","").replace(self._model_prefix_ + "_" + str(i) + "-","")
epoch = int(epochStr)
if epoch > lastEpoch:
......@@ -87,9 +87,9 @@ class ${tc.fileNameWithoutEnding}:
warnings.simplefilter("ignore")
self.networks[${networkInstruction?index}].collect_params().initialize(self.weight_initializer, force_reinit=False, ctx=context)
self.networks[${networkInstruction?index}].hybridize()
-self.networks[${networkInstruction?index}](<#list tc.getStreamInputDimensions(networkInstruction.body) as dimensions><#if tc.cutDimensions(dimensions)[tc.cutDimensions(dimensions)?size-1] == "1">mx.nd.zeros((${tc.join(tc.cutDimensions(dimensions), ",")},), ctx=context[0])<#else>mx.nd.zeros((1, ${tc.join(tc.cutDimensions(dimensions), ",")},), ctx=context[0])</#if><#sep>, </#list>)
+self.networks[${networkInstruction?index}](<#list tc.getStreamInputDimensions(networkInstruction.body) as dimensions><#if tc.cutDimensions(dimensions)[tc.cutDimensions(dimensions)?size-1] == "1" && tc.cutDimensions(dimensions)?size != 1>mx.nd.zeros((${tc.join(tc.cutDimensions(dimensions), ",")},), ctx=context[0])<#else>mx.nd.zeros((1, ${tc.join(tc.cutDimensions(dimensions), ",")},), ctx=context[0])</#if><#sep>, </#list>)
<#if networkInstruction.body.episodicSubNetworks?has_content>
-self.networks[0].episodicsubnet0_(<#list tc.getStreamInputDimensions(networkInstruction.body) as dimensions><#if tc.cutDimensions(dimensions)[tc.cutDimensions(dimensions)?size-1] == "1">mx.nd.zeros((${tc.join(tc.cutDimensions(dimensions), ",")},), ctx=context[0])<#else>mx.nd.zeros((1, ${tc.join(tc.cutDimensions(dimensions), ",")},), ctx=context[0])</#if><#sep>, </#list>)
+self.networks[0].episodicsubnet0_(<#list tc.getStreamInputDimensions(networkInstruction.body) as dimensions><#if tc.cutDimensions(dimensions)[tc.cutDimensions(dimensions)?size-1] == "1" && tc.cutDimensions(dimensions)?size != 1>mx.nd.zeros((${tc.join(tc.cutDimensions(dimensions), ",")},), ctx=context[0])<#else>mx.nd.zeros((1, ${tc.join(tc.cutDimensions(dimensions), ",")},), ctx=context[0])</#if><#sep>, </#list>)
</#if>
</#list>
......
......@@ -184,16 +184,16 @@ class ${tc.fileNameWithoutEnding}:
del discriminator_optimizer_params['learning_rate_decay']
if normalize:
-self._net_creator_dis.construct(mx_context, data_mean=data_mean, data_std=data_std)
+self._net_creator_dis.construct([mx_context], data_mean=data_mean, data_std=data_std)
else:
-self._net_creator_dis.construct(mx_context)
+self._net_creator_dis.construct([mx_context])
-self._net_creator_gen.construct(mx_context)
+self._net_creator_gen.construct([mx_context])
if self.use_qnet:
-self._net_creator_qnet.construct(mx_context)
+self._net_creator_qnet.construct([mx_context])
if load_checkpoint:
-self._net_creator_qnet.load(mx_context)
+self._net_creator_qnet.load([mx_context])
else:
if os.path.isdir(self._net_creator_qnet._model_dir_):
shutil.rmtree(self._net_creator_qnet._model_dir_)
......@@ -206,8 +206,8 @@ class ${tc.fileNameWithoutEnding}:
begin_epoch = 0
if load_checkpoint:
-begin_epoch = self._net_creator_dis.load(mx_context)
-self._net_creator_gen.load(mx_context)
+begin_epoch = self._net_creator_dis.load([mx_context])
+self._net_creator_gen.load([mx_context])
else:
if os.path.isdir(self._net_creator_dis._model_dir_):
shutil.rmtree(self._net_creator_dis._model_dir_)
......@@ -255,9 +255,9 @@ class ${tc.fileNameWithoutEnding}:
gen_input, exp_qnet_output = create_generator_input(batch)
with autograd.record():
-fake_data = gen_net(*gen_input)
+fake_data = gen_net(*gen_input)[0][0]
fake_data.detach()
-discriminated_fake_dis = dis_net(fake_data, *dis_conditional_input)
+discriminated_fake_dis = dis_net(fake_data, *dis_conditional_input)[0][0]
if self.use_qnet:
discriminated_fake_dis, _ = discriminated_fake_dis
......@@ -265,7 +265,7 @@ class ${tc.fileNameWithoutEnding}:
real_labels = mx.nd.ones(discriminated_fake_dis.shape, ctx=mx_context)
loss_resultF = dis_loss(discriminated_fake_dis, fake_labels)
-discriminated_real_dis = dis_net(real_data, *dis_conditional_input)
+discriminated_real_dis = dis_net(real_data, *dis_conditional_input)[0][0]
if self.use_qnet:
discriminated_real_dis, _ = discriminated_real_dis
loss_resultR = dis_loss(discriminated_real_dis, real_labels)
......@@ -276,8 +276,8 @@ class ${tc.fileNameWithoutEnding}:
if batch_i % k_value == 0:
with autograd.record():
-fake_data = gen_net(*gen_input)
-discriminated_fake_gen = dis_net(fake_data, *dis_conditional_input)
+fake_data = gen_net(*gen_input)[0][0]
+discriminated_fake_gen = dis_net(fake_data, *dis_conditional_input)[0][0]
if self.use_qnet:
discriminated_fake_gen, features = discriminated_fake_gen
loss_resultG = dis_loss(discriminated_fake_gen, real_labels)
......@@ -285,7 +285,7 @@ class ${tc.fileNameWithoutEnding}:
condition = batch.data[traindata_to_index[generator_target_name + "_"]]
loss_resultG = loss_resultG + gen_loss_weight * generator_loss_func(fake_data, condition)
if self.use_qnet:
-qnet_discriminated = [q_net(features)]
+qnet_discriminated = [q_net(features)[0][0]]
for i, qnet_out in enumerate(qnet_discriminated):
loss_resultG = loss_resultG + qnet_losses[i](qnet_out, exp_qnet_output[i])
loss_resultG.backward()
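The `[0][0]` indexing added throughout this trainer suggests the generated networks now return a nested structure, apparently a list of per-subnetwork output lists plus auxiliary data for the episodic-memory layers, of which only the first output of the first group is used here. A trivial sketch of that assumption (placeholder values, not from this commit):

ret = ([["fake_batch"]], [])   # assumed: ([outputs of sub-net 0, ...], episodic data)
fake_data = ret[0][0]          # -> "fake_batch"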
......
......@@ -82,12 +82,12 @@ public:
loadComponent(network_json_path, network_param_path, network_symbol_list, network_param_map_list);
}else{
for(int i=0; i < num_subnets; i++){
-network_json_path = file_prefix + "_replay_sub_net_" + std::to_string(i) + "-symbol.json";
-network_param_path = file_prefix + "_replay_sub_net_" + std::to_string(i) + "-0000.params";
+network_json_path = file_prefix + "_episodic_sub_net_" + std::to_string(i) + "-symbol.json";
+network_param_path = file_prefix + "_episodic_sub_net_" + std::to_string(i) + "-0000.params";
loadComponent(network_json_path, network_param_path, network_symbol_list, network_param_map_list);
if(i >= 1){
-query_json_path = file_prefix + "_replay_query_net_" + std::to_string(i) + "-symbol.json";
-query_param_path = file_prefix + "_replay_query_net_" + std::to_string(i) + "-0000.params";
+query_json_path = file_prefix + "_episodic_query_net_" + std::to_string(i) + "-symbol.json";
+query_param_path = file_prefix + "_episodic_query_net_" + std::to_string(i) + "-0000.params";
loadComponent(query_json_path, query_param_path, query_symbol_list, query_param_map_list);
memory_path = file_prefix + "_replay_memory_" + std::to_string(i);
......
......@@ -378,34 +378,44 @@ class EpisodicMemory(EpisodicReplayMemoryInterface):
#propagate the input as the rest is only used for replay
return [args, []]
-def store_samples(self, data, y, query_network, store_prob):
+def store_samples(self, data, y, query_network, store_prob, context):
num_pus = len(data)
sub_batch_sizes = [data[i][0][0].shape[0] for i in range(num_pus)]
num_inputs = len(data[0][0])
-num_outputs = len(y[0])
+num_outputs = len(y)
+mx_context = context[0]
if len(self.key_memory) == 0:
-self.key_memory = nd.empty(0, ctx=mx.cpu())
-self.value_memory = nd.empty(0, ctx=mx.cpu())
-self.label_memory = nd.empty((num_outputs, 0), ctx=mx.cpu())
+self.key_memory = nd.empty(0, ctx=mx_context)
+self.value_memory = nd.empty(0, ctx=mx_context)
+self.label_memory = nd.empty((num_outputs, 0), ctx=mx_context)
-ind = nd.sample_multinomial(store_prob, sub_batch_sizes)
-if len(sub_batch_sizes) == 1:
-ind = nd.reshape(ind, (1,-1))
+ind = [nd.sample_multinomial(store_prob, sub_batch_sizes[i]).as_in_context(mx_context) for i in range(num_pus)]
-if(nd.max(ind)):
+max_inds = [nd.max(ind[i]) for i in range(num_pus)]
+if any(max_inds):
to_store_values = []
for i in range(num_inputs):
-tmp_values = nd.contrib.boolean_mask(data[0][0][i].as_in_context(mx.cpu()), ind[0])
-for j in range(1, num_pus):
-tmp_values = nd.concat(tmp_values, nd.contrib.boolean_mask(data[j][0][i].as_in_context(mx.cpu()), ind[j]), dim=0)
+tmp_values = []
+for j in range(0, num_pus):
+if max_inds[j]:
+#tmp = data[j][0][i].as_in_context(mx_context)
+if isinstance(tmp_values, list):
+tmp_values = nd.contrib.boolean_mask(data[j][0][i].as_in_context(mx_context), ind[j])
+else:
+tmp_values = nd.concat(tmp_values, nd.contrib.boolean_mask(data[j][0][i].as_in_context(mx_context), ind[j]), dim=0)
to_store_values.append(tmp_values)
-to_store_labels = nd.empty((num_outputs, len(to_store_values[0])), ctx=mx.cpu())
+to_store_labels = nd.empty((num_outputs, len(to_store_values[0])), ctx=mx_context)
for i in range(num_outputs):
-tmp_labels = nd.contrib.boolean_mask(y[0][i].as_in_context(mx.cpu()), ind[0])
-for j in range(1, num_pus):
-tmp_labels = nd.concat(tmp_labels, nd.contrib.boolean_mask(y[j][i].as_in_context(mx.cpu()), ind[j]), dim=0)
+tmp_labels = []
+for j in range(0, num_pus):
+if max_inds[j]:
+if isinstance(tmp_labels, list):
+tmp_labels = nd.contrib.boolean_mask(y[i][j].as_in_context(mx_context), ind[j])
+else:
+tmp_labels = nd.concat(tmp_labels, nd.contrib.boolean_mask(y[i][j].as_in_context(mx_context), ind[j]), dim=0)
to_store_labels[i] = tmp_labels
to_store_keys = query_network(*to_store_values[0:self.query_net_num_inputs])
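store_samples now draws a per-sample store/skip decision from store_prob on each device and keeps only the selected rows. A minimal, self-contained illustration of that sampling-plus-masking pattern (arrays invented for the example):

import mxnet as mx
from mxnet import nd

store_prob = nd.array([0.7, 0.3])                 # [P(skip), P(store)] per sample
batch = nd.arange(12).reshape((4, 3))             # 4 samples, 3 features each
ind = nd.sample_multinomial(store_prob, 4)        # e.g. [0, 1, 0, 1]
if nd.max(ind):
    stored = nd.contrib.boolean_mask(batch, ind)  # keeps only rows where ind == 1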
......@@ -441,7 +451,7 @@ class EpisodicMemory(EpisodicReplayMemoryInterface):
return sample_batches
-def get_query_network(self):
+def get_query_network(self, context):
lastEpoch = 0
for file in os.listdir(self.query_net_dir):
if self.query_net_prefix in file and ".json" in file:
......@@ -462,16 +472,16 @@ class EpisodicMemory(EpisodicReplayMemoryInterface):
inputNames.append("data" + str(i))
with warnings.catch_warnings():
warnings.simplefilter("ignore")
-net = mx.gluon.nn.SymbolBlock.imports(self.query_net_dir + symbolFile, inputNames, self.query_net_dir + weightFile, ctx=mx.cpu())
+net = mx.gluon.nn.SymbolBlock.imports(self.query_net_dir + symbolFile, inputNames, self.query_net_dir + weightFile, ctx=context[0])
net.hybridize()
return net
def save_memory(self, path):
mem_arr = [("keys", self.key_memory)] + [("labels", self.key_memory)] + [("values_"+str(k),v) for (k,v) in enumerate(self.value_memory)]
mem_arr = [("keys", self.key_memory)] + [("labels", self.label_memory)] + [("values_"+str(k),v) for (k,v) in enumerate(self.value_memory)]
mem_dict = {entry[0]:entry[1] for entry in mem_arr}
nd.save(path, mem_dict)
<#list tc.architecture.networkInstructions as networkInstruction>
#Stream ${networkInstruction?index}
<#list networkInstruction.body.episodicSubNetworks as elements>
......@@ -530,7 +540,7 @@ class Net_${networkInstruction?index}(gluon.HybridBlock):
<#else>
${tc.include(networkInstruction.body, "ARCHITECTURE_DEFINITION")}
</#if>
pass
pass
def hybrid_forward(self, F, ${tc.join(tc.getStreamInputNames(networkInstruction.body, false), ", ")}):
<#if networkInstruction.body.episodicSubNetworks?has_content>
......
......@@ -98,9 +98,10 @@ public:
}
}
</#if>
+NDArray input_temp;
<#list tc.getStreamInputNames(networkInstruction.body, false) as variable>
-NDArray input_temp(network_input_shapes[${variable?index}], ctx, false, dtype);
+input_temp = NDArray(network_input_shapes[${variable?index}], ctx, false, dtype);
input_temp.SyncCopyFromCPU(in_${variable}.data(), network_input_sizes[${variable?index}]);
input_temp.CopyTo(&(network_handles[0]->arg_dict()[network_input_keys[${variable?index}]]));
</#list>
......@@ -177,7 +178,8 @@ public:
}
for(mx_uint i=0; i < query_num_inputs[net_start_ind-1]; i++){
-prev_output[I].CopyTo(&(query_handle->arg_dict()["data" + std::to_string(i)]));
+prev_output[i].CopyTo(&(query_handle->arg_dict()["data" + std::to_string(i)]));
}
NDArray::WaitAll();
query_handle->Forward(false);
......@@ -417,9 +419,19 @@ ${tc.include(networkInstruction.body, "PREDICTION_PARAMETER")}
replay_memory = model_loader.GetReplayMemory();
+std::vector<mx_uint> label_memory_shape = replay_memory[0]["labels"].GetShape();
+std::vector<mx_uint> lab_shape;
+if(label_memory_shape.size() == 2){
+lab_shape = {1, 1};
+}else{
+lab_shape.push_back(1);
+for(mx_uint i=2; i<label_memory_shape.size(); i++){
+lab_shape.push_back(label_memory_shape[i]);
+}
+}
std::vector<std::vector<mx_uint>> label_shapes;
for(mx_uint i=0; i < num_outputs; i++){
-label_shapes.push_back({1, 1}); //<-----------------------------------------
+label_shapes.push_back(lab_shape);
}
Symbol loss = MakeLoss(model_loader.GetLoss());
......
......@@ -10,6 +10,11 @@ import math
import sys
import inspect
from mxnet import gluon, autograd, nd
+try:
+import AdamW
+except:
+pass
class CrossEntropyLoss(gluon.loss.Loss):
def __init__(self, axis=-1, sparse_label=True, weight=None, batch_axis=0, **kwargs):
......@@ -56,7 +61,7 @@ class SoftmaxCrossEntropyLossIgnoreIndices(gluon.loss.Loss):
loss = -(pred * label).sum(axis=self._axis, keepdims=True)
# ignore some indices for loss, e.g. <pad> tokens in NLP applications
for i in self._ignore_indices:
-loss = loss * mx.nd.logical_not(mx.nd.equal(mx.nd.argmax(pred, axis=1), mx.nd.ones_like(mx.nd.argmax(pred, axis=1))*i) * mx.nd.equal(mx.nd.argmax(pred, axis=1), label))
+loss = F.broadcast_mul(loss, F.logical_not(F.broadcast_equal(F.argmax(pred, axis=1), F.ones_like(F.argmax(pred, axis=1))*i) * F.broadcast_equal(F.argmax(pred, axis=1), label)))
return loss.mean(axis=self._batch_axis, exclude=True)
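The rewritten line multiplies each sample's loss by the negation of "prediction equals an ignored index and also equals the label", so such positions contribute nothing to the mean. A small illustration with mx.nd (values invented):

import mxnet as mx

pred  = mx.nd.array([[0.1, 0.9], [0.8, 0.2]])   # two samples, two classes
label = mx.nd.array([1, 0])
loss  = mx.nd.array([0.105, 0.223])             # per-sample losses, say
ignore_index = 0
hit = mx.nd.broadcast_equal(mx.nd.argmax(pred, axis=1), mx.nd.ones_like(label) * ignore_index) \
      * mx.nd.broadcast_equal(mx.nd.argmax(pred, axis=1), label)
masked = mx.nd.broadcast_mul(loss, mx.nd.logical_not(hit))   # second sample's loss becomes 0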
class DiceLoss(gluon.loss.Loss):
......@@ -293,6 +298,7 @@ class ${tc.fileNameWithoutEnding}:
mx_context = [mx.cpu()]
else:
logging.error("Context argument is '" + context + "'. Only 'cpu' and 'gpu are valid arguments'.")
single_pu_batch_size = int(batch_size/num_pus)
if preprocessing:
preproc_lib = "CNNPreprocessor_${tc.fileNameWithoutEnding?keep_after("CNNSupervisedTrainer_")}_executor"
......@@ -337,7 +343,10 @@ class ${tc.fileNameWithoutEnding}:
if not os.path.isdir(self._net_creator._model_dir_):
raise
-trainers = [mx.gluon.Trainer(network.collect_params(), optimizer, optimizer_params) for network in self._networks.values() if len(network.collect_params().values()) != 0]
+if optimizer == "adamw":
+trainers = [mx.gluon.Trainer(network.collect_params(), AdamW.AdamW(**optimizer_params)) for network in self._networks.values() if len(network.collect_params().values()) != 0]
+else:
+trainers = [mx.gluon.Trainer(network.collect_params(), optimizer, optimizer_params) for network in self._networks.values() if len(network.collect_params().values()) != 0]
margin = loss_params['margin'] if 'margin' in loss_params else 1.0
sparseLabel = loss_params['sparse_label'] if 'sparse_label' in loss_params else True
......@@ -408,7 +417,7 @@ class ${tc.fileNameWithoutEnding}:
layer = [param for param in inspect.getmembers(sub_net, lambda x: not(inspect.isroutine(x))) if param[0].startswith("memory")][0][1]
episodic_layers[0].append(layer)
episodic_store_buffer[${networkInstruction?index}].append([])
-episodic_query_networks[0].append(episodic_layers[0][-1].get_query_network())
+episodic_query_networks[0].append(episodic_layers[0][-1].get_query_network(mx_context))
store_prob[${networkInstruction?index}].append(nd.array([1-episodic_layers[0][-1].store_prob, episodic_layers[0][-1].store_prob], ctx=mx.cpu()))
</#if>
</#list>
......@@ -440,7 +449,12 @@ class ${tc.fileNameWithoutEnding}:
with autograd.record():
<#include "pythonExecuteTrain.ftl">
-for loss in losses:
+for i in range(num_pus):
+losses = [0]*num_pus
+for element in lossList[i]:
+losses[i] = losses[i] + element
+for loss in losses:
loss.backward()
loss_total += loss.sum().asscalar()
global_loss_train += loss.sum().asscalar()
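The loop above reduces the per-device loss terms collected in lossList before backpropagation. Condensed, the intended pattern presumably looks as follows (dummy values, and losses initialized once per batch):

from mxnet import nd

num_pus = 2
lossList = [[nd.array([0.5]), nd.array([0.2])], [nd.array([0.4])]]   # loss terms per device
losses = [0] * num_pus
for i in range(num_pus):
    for element in lossList[i]:
        losses[i] = losses[i] + element
loss_total = sum(loss.sum().asscalar() for loss in losses)           # 1.1
# in the trainer each losses[i] additionally gets loss.backward() inside autograd.record()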
......@@ -462,7 +476,7 @@ class ${tc.fileNameWithoutEnding}:
#storing samples for episodic replay
for net_i in range(len(self._networks)):
for layer_i, layer in enumerate(episodic_layers[net_i]):
-layer.store_samples(episodic_store_buffer[net_i][layer_i], labels, episodic_query_networks[net_i][layer_i], store_prob[net_i][layer_i])
+layer.store_samples(episodic_store_buffer[net_i][layer_i], labels, episodic_query_networks[net_i][layer_i], store_prob[net_i][layer_i], mx_context)
</#if>
if tic is None:
......@@ -497,16 +511,14 @@ class ${tc.fileNameWithoutEnding}:
<#include "saveAttentionImageTrain.ftl">
-for i in range(num_pus):
-predictions = []
-for output_name in outputs[i]:
-if mx.nd.shape_array(mx.nd.squeeze(output_name)).size > 1:
-predictions.append(mx.nd.argmax(output_name, axis=1))
-else:
-predictions.append(output_name)
+predictions = []
+for output_name in outputs:
+if mx.nd.shape_array(mx.nd.squeeze(output_name)).size > 1:
+predictions.append(mx.nd.argmax(output_name, axis=1))
+else:
+predictions.append(output_name)
-labels_metric = [[labels[j][i] for j in range(len(labels))] for i in range(num_pus)]
-metric.update(preds=predictions, labels=labels_metric[i])
+metric.update(preds=predictions, labels=[labels[j] for j in range(len(labels))])
train_metric_score = metric.get()[1]
else:
......@@ -524,22 +536,22 @@ class ${tc.fileNameWithoutEnding}:
<#include "saveAttentionImageTest.ftl">
-for loss in losses:
-global_loss_test += loss.sum().asscalar()
+for element in lossList:
+loss = loss + element
+global_loss_test += loss.sum().asscalar()
test_batches += 1
-for i in range(num_pus):
-predictions = []
-for output_name in outputs[i]:
-predictions.append(output_name)
+predictions = []
+for output_name in outputs:
+predictions.append(output_name)
-labels_metric = [[labels[j][i] for j in range(len(labels))] for i in range(num_pus)]
-metric.update(preds=predictions, labels=labels_metric[i])
+metric.update(preds=predictions, labels=[labels[j] for j in range(len(labels))])
test_metric_score = metric.get()[1]
-global_loss_test /= (test_batches * batch_size)
+global_loss_test /= (test_batches * single_pu_batch_size)
logging.info("Epoch[%d] Train metric: %f, Test metric: %f, Train loss: %f, Test loss: %f" % (epoch, train_metric_score, test_metric_score, global_loss_train, global_loss_test))
......@@ -547,10 +559,6 @@ class ${tc.fileNameWithoutEnding}:
for i, network in self._networks.items():
network.save_parameters(self.parameter_path(i) + '-' + str(epoch).zfill(4) + '.params')
print("--------------------------------------Speed------------------------------------------")
print(avg_speed/n)
print("--------------------------------------Speed------------------------------------------")
for i, network in self._networks.items():
network.save_parameters(self.parameter_path(i) + '-' + str(num_epoch + begin_epoch + 1).zfill(4) + '.params')
network.export(self.parameter_path(i) + '_newest', epoch=0)
......
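The following new file is the optimizer template (Optimizer/AdamW.ftl) that the generator emits as AdamW.py next to the trainer: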
import os
import numpy as np
import mxnet as mx
from mxnet import nd
from mxnet.ndarray.contrib import mp_adamw_update, adamw_update, multi_mp_adamw_update, multi_adamw_update
class AdamW(mx.optimizer.Optimizer):
def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, **kwargs):
super(AdamW, self).__init__(learning_rate=learning_rate, **kwargs)
self.beta1 = beta1
self.beta2 = beta2
self.epsilon = epsilon
self.aggregate_num = max(1, min(50, int(os.getenv('MXNET_OPTIMIZER_AGGREGATION_SIZE', '4'))))
def create_state(self, _, weight):
return (nd.zeros(weight.shape, weight.context, dtype=weight.dtype), #mean
nd.zeros(weight.shape, weight.context, dtype=weight.dtype)) #variance
def create_state_multi_precision(self, index, weight):
weight_master_copy = None
if self.multi_precision and weight.dtype == np.float16:
weight_master_copy = weight.astype(np.float32)
return (self.create_state(index, weight_master_copy), weight_master_copy)
return self.create_state(index, weight)
def update(self, index, weight, grad, state):
self._update_impl(index, weight, grad, state, multi_precision=False)
def update_multi_precision(self, index, weight, grad, state):
use_multi_precision = self.multi_precision and weight[0].dtype == np.float16
self._update_impl(index, weight, grad, state, multi_precision=use_multi_precision)
def _update_impl(self, indices, weight, grad, state, multi_precision=False):
"""update function"""
aggregate = self.aggregate_num > 1
if not isinstance(indices, (tuple, list)):
indices = [indices]
weight = [weight]
grad = [grad]
state = [state]
for w_i, g_i in zip(weight, grad):
assert(isinstance(w_i, nd.NDArray))
assert(isinstance(g_i, nd.NDArray))
aggregate = (aggregate and
w_i.stype == 'default' and
g_i.stype == 'default')
self._update_count(indices)
lrs = self._get_lrs(indices)
wds = self._get_wds(indices)
# pylint: disable=access-member-before-definition
if not isinstance(self.rescale_grad, nd.NDArray):
self.rescale_grad = nd.full(shape=(1,), val=self.rescale_grad, ctx=weight[0].context)
else:
self.rescale_grad = self.rescale_grad.as_in_context(weight[0].context)
kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon,
'rescale_grad': self.rescale_grad}
if self.clip_gradient:
kwargs['clip_gradient'] = self.clip_gradient
if aggregate:
current_index = 0
while current_index < len(indices):
sidx = current_index
eidx = min(current_index + self.aggregate_num, len(indices))
if not multi_precision:
mean, var = list(zip(*state[sidx:eidx]))
multi_adamw_update(weight[sidx:eidx],
grad[sidx:eidx],
mean, var,
out=weight[sidx:eidx],
size=len(weight[sidx:eidx]),
lrs=list(np.ones(len(weight[sidx:eidx]))),
wds=wds[sidx:eidx],
etas=lrs[sidx:eidx],
**kwargs)
else:
mean_var = list(zip(*state[sidx:eidx]))[0]
tmean_var = list(zip(*mean_var))
mean = tmean_var[0]
var = tmean_var[1]
multi_mp_adamw_update(weight[sidx:eidx],
grad[sidx:eidx],
mean, var,
list(zip(*state[sidx:eidx]))[1],
out=weight[sidx:eidx],
size=len(weight[sidx:eidx]),
lrs=list(np.ones(len(weight[sidx:eidx]))),
wds=wds[sidx:eidx],
etas=lrs[sidx:eidx],
**kwargs)
current_index += self.aggregate_num
else:
for w_i, g_i, s_i, lr, wd in zip(weight, grad, state, lrs, wds):
if not multi_precision:
mean, var = s_i
adamw_update(w_i, g_i, mean, var, out=w_i, lr=1, wd=wd, eta=lr, **kwargs)
else:
mean, var = s_i[0]
mp_adamw_update(w_i, g_i, mean, var, s_i[1], out=w_i, lr=1, wd=wd, eta=lr, **kwargs)
\ No newline at end of file
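A minimal usage sketch for the class above, assuming the generated file is importable as the AdamW module (which is how CNNSupervisedTrainer wires it in via `import AdamW`):

import mxnet as mx
from mxnet import gluon
import AdamW   # the module generated from Optimizer/AdamW.ftl

net = gluon.nn.Dense(1)
net.initialize()
# gluon.Trainer also accepts an Optimizer instance instead of a name string:
trainer = gluon.Trainer(net.collect_params(), AdamW.AdamW(learning_rate=0.001, wd=0.01))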
......@@ -18,13 +18,13 @@
<#if numInputs == "1">
inputNames = ["data"]
-zeroInputs = [nd.zeros(<#if element.element.inputTypes[0].dimensions[0] == 1>(${tc.join(element.element.inputTypes[0].dimensions?reverse, ",")})<#else>(1,${tc.join(element.element.inputTypes[0].dimensions?reverse, ",")})</#if>, ctx=mx_context[0])]
+zeroInputs = [nd.zeros(<#if element.element.inputTypes[0].dimensions[0] == 1 && tc.cutDimensionsInteger(element.element.inputTypes[0].dimensions)?size != 1>(${tc.join(element.element.inputTypes[0].dimensions?reverse, ",")})<#else>(1,${tc.join(element.element.inputTypes[0].dimensions?reverse, ",")})</#if>, ctx=mx_context[0])]
<#else>
inputNames = []
zeroInputs = []
<#list element.element.inputTypes as inType>
inputNames.append("data" + str(${inType?index}))
-zeroInputs.append(nd.zeros(<#if inType.dimensions[0] == 1>(${tc.join(tc.cutDimensionsInteger(inType.dimensions)?reverse, ",")})<#else>(1,${tc.join(tc.cutDimensionsInteger(inType.dimensions)?reverse, ",")})</#if>, ctx=mx_context[0]))
+zeroInputs.append(nd.zeros(<#if inType.dimensions[0] == 1 && tc.cutDimensionsInteger(inType.dimensions)?size != 1>(${tc.join(tc.cutDimensionsInteger(inType.dimensions)?reverse, ",")})<#else>(1,${tc.join(tc.cutDimensionsInteger(inType.dimensions)?reverse, ",")})</#if>, ctx=mx_context[0]))
</#list>
</#if>
with warnings.catch_warnings():
......
......@@ -19,8 +19,6 @@
for gradient_step in range(layer.replay_gradient_steps):
with autograd.record():
nd.waitall()
episodic_output = [self._networks[${networkInstruction?index}].episodic_sub_nets[layer_i](*(episodic_data[i]))[0] for i in range(num_pus)]
for i in range(layer_i+1, len(episodic_layers[${networkInstruction?index}])):
episodic_output = self._networks[${networkInstruction?index}].episodic_sub_nets[i](*episodic_output)[0]
......
-labels = [gluon.utils.split_and_load(batch.label[i], ctx_list=mx_context, even_split=False) for i in range(${tc.architectureOutputs?size?c})]
+labels = [gluon.utils.split_and_load(batch.label[i], ctx_list=mx_context, even_split=False)[0] for i in range(${tc.architectureOutputs?size?c})]
<#list tc.architectureInputs as input_name>
-${input_name} = gluon.utils.split_and_load(batch.data[${input_name?index}], ctx_list=mx_context, even_split=False)
+${input_name} = gluon.utils.split_and_load(batch.data[${input_name?index}], ctx_list=mx_context, even_split=False)[0]
</#list>
<#if tc.architectureOutputSymbols?size gt 1>
<#assign outputName = tc.getNameWithoutIndex(tc.getName(tc.architectureOutputSymbols[0]))>
#${outputName} = [mx.nd.zeros((batch_size, ${tc.join(tc.architectureOutputSymbols[0].ioDeclaration.type.dimensions, ", ")},), ctx=mx_context) for i in range(${tc.architectureOutputs?size?c})]
${outputName} = [mx.nd.zeros((single_pu_batch_size, ${tc.join(tc.architectureOutputSymbols[0].ioDeclaration.type.dimensions, ", ")},), ctx=mx_context[0]) for i in range(${tc.architectureOutputs?size?c})]
<#else>
<#list tc.architectureOutputSymbols as output>
#${tc.getName(output)} = mx.nd.zeros((batch_size, ${tc.join(output.ioDeclaration.type.dimensions, ", ")},), ctx=mx_context)<#sep>,
${tc.getName(output)} = mx.nd.zeros((single_pu_batch_size, ${tc.join(output.ioDeclaration.type.dimensions, ", ")},), ctx=mx_context[0])<#sep>,
</#list>
</#if>
<#list tc.getLayerVariableMembers()?keys as member>
#${member} = mx.nd.zeros((batch_size, ${tc.join(tc.cutDimensions(tc.getLayerVariableMembers()[member]), ", ")},), ctx=mx_context)
${member} = mx.nd.zeros((single_pu_batch_size, ${tc.join(tc.cutDimensions(tc.getLayerVariableMembers()[member]), ", ")},), ctx=mx_context[0])
</#list>
<#list tc.architecture.constants as constant>
#${tc.getName(constant)} = mx.nd.full((batch_size, 1,), ${constant.intValue?c}, ctx=mx_context)
${tc.getName(constant)} = mx.nd.full((single_pu_batch_size, 1,), ${constant.intValue?c}, ctx=mx_context[0])
</#list>
nd.waitall()
lossList = []
outputs = []
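On the split_and_load changes above: with even_split=False the call returns one slice per context in ctx_list, so the appended [0] keeps only the slice for the first device. A rough illustration (data invented):

import mxnet as mx
from mxnet import gluon, nd

mx_context = [mx.cpu()]
batch_data = nd.arange(8).reshape((4, 2))
parts = gluon.utils.split_and_load(batch_data, ctx_list=mx_context, even_split=False)
x = parts[0]   # NDArray on mx_context[0]; with several devices the other slices would be dropped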