Commit 139f1e98 authored by Christian Fuß

Merge branch 'beamsearch_batch' into develop

parents bcb8841a 69b85ff0
Pipeline #220734 passed with stages in 7 minutes and 50 seconds
@@ -55,7 +55,7 @@ class SoftmaxCrossEntropyLossIgnoreIndices(gluon.loss.Loss):
         loss = -(pred * label).sum(axis=self._axis, keepdims=True)
         # ignore some indices for loss, e.g. <pad> tokens in NLP applications
         for i in self._ignore_indices:
-            loss = loss * mx.nd.logical_not(mx.nd.equal(mx.nd.argmax(pred, axis=1), mx.nd.ones_like(mx.nd.argmax(pred, axis=1))*i))
+            loss = loss * mx.nd.logical_not(mx.nd.equal(mx.nd.argmax(pred, axis=1), mx.nd.ones_like(mx.nd.argmax(pred, axis=1))*i) * mx.nd.equal(mx.nd.argmax(pred, axis=1), label))
         return loss.mean(axis=self._batch_axis, exclude=True)

 @mx.metric.register
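For context, a small self-contained sketch (toy values, not part of the commit) of what the reworked mask computes: the old mask zeroed the loss wherever the ignored index was *predicted*, even when the label was a real token, while the new mask zeroes it only where the ignored index is predicted and also matches the label, so wrong predictions of `<pad>` still incur loss.

```python
import mxnet as mx

# Hypothetical toy values: 4 positions, 3 classes, ignore index 0 (<pad>)
pred = mx.nd.array([[0.9, 0.05, 0.05],   # predicts 0
                    [0.9, 0.05, 0.05],   # predicts 0
                    [0.1, 0.8,  0.1 ],   # predicts 1
                    [0.2, 0.1,  0.7 ]])  # predicts 2
label = mx.nd.array([0, 2, 1, 1])        # true class per position
i = 0                                     # index to ignore, e.g. <pad>

argmax = mx.nd.argmax(pred, axis=1)
# old mask: drop every position where the prediction is the ignored index
old_mask = mx.nd.logical_not(mx.nd.equal(argmax, mx.nd.ones_like(argmax) * i))
# new mask: drop a position only when the model predicts the ignored index
# *and* that prediction is correct; wrong predictions still incur loss
new_mask = mx.nd.logical_not(mx.nd.equal(argmax, mx.nd.ones_like(argmax) * i)
                             * mx.nd.equal(argmax, label))
print(old_mask.asnumpy())  # [0. 0. 1. 1.] - position 1 wrongly ignored
print(new_mask.asnumpy())  # [0. 1. 1. 1.] - only the correct <pad> is ignored
```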
@@ -214,7 +214,7 @@ class ${tc.fileNameWithoutEnding}:
             del optimizer_params['learning_rate_decay']

         train_batch_size = batch_size
-        test_batch_size = ${tc.hasUnrollInstructions()?then('1', 'batch_size')}
+        test_batch_size = batch_size

         train_iter, train_test_iter, test_iter, data_mean, data_std, train_images, test_images = self._data_loader.load_data(train_batch_size, test_batch_size)
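Before this change, the generated trainer pinned the test batch size to 1 whenever the network had unroll instructions (i.e. a beam-search decoder), since decoding ran sample by sample; with the search batched, the full batch size can be used everywhere. A plain-Python recreation of the old template expression (function names here are illustrative; FreeMarker's `cond?then(a, b)` picks `a` when `cond` is true, else `b`):

```python
# Hypothetical helpers mirroring ${tc.hasUnrollInstructions()?then('1', 'batch_size')}
def old_test_batch_size(has_unroll_instructions, batch_size):
    # pre-change: unrolled (beam search) networks decoded one sample at a time
    return 1 if has_unroll_instructions else batch_size

def new_test_batch_size(batch_size):
    # post-change: beam search is batched, so the test batch size is uniform
    return batch_size

print(old_test_batch_size(True, 32))   # 1
print(new_test_batch_size(32))         # 32
```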
@@ -279,6 +279,7 @@ class ${tc.fileNameWithoutEnding}:
         tic = None

         for epoch in range(begin_epoch, begin_epoch + num_epoch):
+            train_iter.reset()
             for batch_i, batch in enumerate(train_iter):
                 with autograd.record():
@@ -308,6 +309,7 @@ class ${tc.fileNameWithoutEnding}:
         tic = None

+        train_test_iter.reset()
         metric = mx.metric.create(eval_metric, **eval_metric_params)
         for batch_i, batch in enumerate(train_test_iter):
......
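Both added `reset()` calls address the same MXNet behavior: an exhausted `DataIter` yields nothing on a second pass until it is reset. A minimal sketch with a toy `NDArrayIter` (shapes illustrative):

```python
import mxnet as mx

# Toy iterator over 6 samples with batch size 2; stands in for the
# train_iter / train_test_iter created by the generated trainer.
data = mx.nd.arange(6).reshape((6, 1))
it = mx.io.NDArrayIter(data, batch_size=2)

print(sum(1 for _ in it))  # 3 batches on the first pass
print(sum(1 for _ in it))  # 0 - the exhausted iterator stays empty
it.reset()
print(sum(1 for _ in it))  # 3 again, which is why each epoch resets first
```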
@@ -30,14 +30,14 @@
         k = ${tc.getBeamSearchWidth(networkInstruction)}
<#list tc.getUnrollInputNames(networkInstruction, "1") as inputName>
<#if tc.getNameWithoutIndex(inputName) == tc.outputName>
-        sequences = [([${inputName}], 1.0)]
+        sequences = [([${inputName}], mx.nd.full((batch_size, 1,), 1.0, ctx=mx_context), [mx.nd.full((batch_size, 64,), 0.0, ctx=mx_context)])]
</#if>
</#list>
         for i in range(1, ${tc.getBeamSearchMaxLength(networkInstruction)}):
             all_candidates = []
-            for seq, score in sequences:
+            for seq, score, attention in sequences:
<#list tc.getUnrollInputNames(networkInstruction, "i") as inputName>
<#if tc.getNameWithoutIndex(inputName) == tc.outputName>
                 ${inputName} = seq[-1]
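As a quick shape check (toy sizes, outside the template), the new batched start state gives every hypothesis one score per batch entry plus a list of attention placeholders; the width 64 is the fixed placeholder size from the changed line above:

```python
import mxnet as mx

mx_context = mx.cpu()
batch_size = 4
# stands in for the generated ${inputName} tensor
input_start = mx.nd.full((batch_size, 1), 1.0, ctx=mx_context)

sequences = [([input_start],                                         # tokens so far
              mx.nd.full((batch_size, 1), 1.0, ctx=mx_context),      # one score per sample
              [mx.nd.full((batch_size, 64), 0.0, ctx=mx_context)])]  # attention placeholder

seq, score, attention = sequences[0]
print(score.shape, attention[0].shape)  # (4, 1) (4, 64)
```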
@@ -45,7 +45,6 @@
</#list>
<#if tc.isAttentionNetwork()>
                 ${tc.join(tc.getUnrollOutputNames(networkInstruction, "i"), ", ")}, attention_ = self._networks[${networkInstruction?index}](${tc.join(tc.getUnrollInputNames(networkInstruction, "i"), ", ")})
-                attentionList.append(attention_)
<#else>
                 ${tc.join(tc.getUnrollOutputNames(networkInstruction, "i"), ", ")} = self._networks[${networkInstruction?index}](${tc.join(tc.getUnrollInputNames(networkInstruction, "i"), ", ")})
</#if>
@@ -55,20 +54,47 @@
</#if>
</#list>
-                topk = out.topk(k=k)[0]
+                topk = out.topk(k=k)
-                for j in topk:
-                    candidate = (seq + [mx.nd.full((1, 1,), j, ctx=mx_context)], score * out[0][j].asscalar())
+                for top_index in range(len(topk[0])):
+                    j = mx.nd.slice_axis(topk, axis=1, begin=top_index, end=top_index+1)
+                    currentScore = mx.nd.slice_axis(out, axis=1, begin=top_index, end=top_index+1)
+                    newScore = mx.nd.expand_dims(score.squeeze() * currentScore.squeeze(), axis=1)
+<#if tc.isAttentionNetwork()>
+                    candidate = (seq + [j], newScore, attention + [attention_])
+<#else>
+                    candidate = (seq + [j], newScore, attention + [])
+</#if>
                     all_candidates.append(candidate)
-            ordered = sorted(all_candidates, key=lambda tup: tup[1])
-            sequences = ordered[:k]
+            ordered = []
+            newSequences = []
+            for batch_entry in range(batch_size):
+                ordered.append([])
+                batchCandidate = [([seq[batch_entry] for seq in candidate[0]], candidate[1][batch_entry], [attention[batch_entry].expand_dims(axis=0) for attention in candidate[2]]) for candidate in all_candidates]
+                ordered[batch_entry] = sorted(batchCandidate, key=lambda tup: tup[1].asscalar())
+                if batch_entry == 0:
+                    newSequences = ordered[batch_entry]
+                else:
+                    newSequences = [([mx.nd.concat(newSequences[sequenceIndex][0][seqIndex], ordered[batch_entry][sequenceIndex][0][seqIndex], dim=0) for seqIndex in range(len(newSequences[sequenceIndex][0]))],
+                                     mx.nd.concat(newSequences[sequenceIndex][1], ordered[batch_entry][sequenceIndex][1], dim=0),
+                                     [mx.nd.concat(newSequences[sequenceIndex][2][attentionIndex], ordered[batch_entry][sequenceIndex][2][attentionIndex], dim=0) for attentionIndex in range(len(newSequences[sequenceIndex][2]))])
+                                    for sequenceIndex in range(len(newSequences))]
+            newSequences = [([newSequences[sequenceIndex][0][seqIndex].expand_dims(axis=1) for seqIndex in range(len(newSequences[sequenceIndex][0]))],
+                             newSequences[sequenceIndex][1].expand_dims(axis=1), [newSequences[sequenceIndex][2][attentionIndex] for attentionIndex in range(len(newSequences[sequenceIndex][2]))])
+                            for sequenceIndex in range(len(newSequences))]
+            sequences = newSequences[:][:k]

         for i in range(1, ${tc.getBeamSearchMaxLength(networkInstruction)}):
<#list tc.getUnrollOutputNames(networkInstruction, "i") as outputName>
<#if tc.getNameWithoutIndex(outputName) == tc.outputName>
             ${outputName} = sequences[0][0][i]
             outputs.append(${outputName})
+<#if tc.isAttentionNetwork()>
+            attentionList.append(sequences[0][2][i])
+</#if>
</#if>
</#list>
<#else>
......
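Condensed into plain, self-contained Python, the batched expansion-and-ranking step works roughly as sketched below. This is a simplified rendering, not the generated code: the decoder is faked with one softmax output, attention is omitted, `batch_size`, `vocab_size` and `k` are toy values, the chosen token's probability is fetched with `mx.nd.pick`, and the per-entry ranking keeps the k highest scores.

```python
import mxnet as mx

mx_context = mx.cpu()
batch_size, vocab_size, k = 2, 5, 3

# One live hypothesis: (token sequence, per-sample score), mirroring the
# (seq, score, attention) tuples in the template but without attention.
start = mx.nd.full((batch_size, 1), 1.0, ctx=mx_context)
sequences = [([start], mx.nd.full((batch_size, 1), 1.0, ctx=mx_context))]

# Stand-in for one decoder step: (batch_size, vocab_size) token probabilities
out = mx.nd.softmax(mx.nd.random.uniform(shape=(batch_size, vocab_size)))

all_candidates = []
for seq, score in sequences:
    topk = out.topk(k=k)  # (batch_size, k) token indices, one row per sample
    for top_index in range(k):
        # One candidate token per batch entry, kept batched as a (batch, 1) slice
        j = mx.nd.slice_axis(topk, axis=1, begin=top_index, end=top_index + 1)
        # Probability of exactly that token for every batch entry
        token_prob = mx.nd.pick(out, j.reshape((-1,)), axis=1)
        new_score = mx.nd.expand_dims(score.squeeze() * token_prob, axis=1)
        all_candidates.append((seq + [j], new_score))

# Scores now differ per sample, so candidates are ranked per batch entry;
# a beam keeps the k highest-scoring hypotheses.
for batch_entry in range(batch_size):
    ranked = sorted(all_candidates,
                    key=lambda c: c[1][batch_entry].asscalar(), reverse=True)
    print([c[1][batch_entry].asscalar() for c in ranked[:k]])
```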
@@ -54,7 +54,7 @@ class SoftmaxCrossEntropyLossIgnoreIndices(gluon.loss.Loss):
         loss = -(pred * label).sum(axis=self._axis, keepdims=True)
         # ignore some indices for loss, e.g. <pad> tokens in NLP applications
         for i in self._ignore_indices:
-            loss = loss * mx.nd.logical_not(mx.nd.equal(mx.nd.argmax(pred, axis=1), mx.nd.ones_like(mx.nd.argmax(pred, axis=1))*i))
+            loss = loss * mx.nd.logical_not(mx.nd.equal(mx.nd.argmax(pred, axis=1), mx.nd.ones_like(mx.nd.argmax(pred, axis=1))*i) * mx.nd.equal(mx.nd.argmax(pred, axis=1), label))
         return loss.mean(axis=self._batch_axis, exclude=True)

 @mx.metric.register
......