Commits (44)
......@@ -86,6 +86,7 @@ class Dataset(object):
window=1, context_window=None, chunking=None,
seq_ordering='default', partition_epoch=None, repeat_epoch=None,
seq_list_filter_file=None, unique_seq_tags=False,
shuffle_frames_of_nseqs=0, min_chunk_size=0, chunking_variance=0,
......@@ -102,6 +103,7 @@ class Dataset(object):
:param str|None seq_list_filter_file: defines a subset of sequences (by tag) to use
:param bool unique_seq_tags: uniquify seqs with same seq tags in seq order
:param str|None seq_order_seq_lens_file: for seq order, use the seq length given by this file
:param int shuffle_frames_of_nseqs: shuffles the frames. not always supported
:param None|int estimated_num_seqs: for progress reporting in case the real num_seqs is unknown
......@@ -116,6 +118,8 @@ class Dataset(object):
self.repeat_epoch = repeat_epoch or 1
self.seq_tags_filter = set(self._load_seq_list_file(seq_list_filter_file)) if seq_list_filter_file else None
self.unique_seq_tags = unique_seq_tags
self._seq_order_seq_lens_file = seq_order_seq_lens_file
self._seq_order_seq_lens_by_idx = None
# There is probably no use case for combining the two, so avoid potential misconfiguration.
assert self.partition_epoch == 1 or self.repeat_epoch == 1, (
"Combining partition_epoch and repeat_epoch is prohibited.")
......@@ -337,6 +341,24 @@ class Dataset(object):
raise NotImplementedError
def _get_seq_order_seq_lens_by_idx(self, seq_idx):
:param int seq_idx:
:rtype: int
if not self._seq_order_seq_lens_by_idx:
assert self._seq_order_seq_lens_file
if self._seq_order_seq_lens_file.endswith(".gz"):
import gzip
raw = gzip.GzipFile(self._seq_order_seq_lens_file, "rb").read()
raw = open(self._seq_order_seq_lens_file, "rb").read()
seq_lens = eval(raw)
assert isinstance(seq_lens, dict)
all_tags = self.get_all_tags()
self._seq_order_seq_lens_by_idx = [seq_lens[tag] for tag in all_tags]
return self._seq_order_seq_lens_by_idx[seq_idx]
def get_seq_order_for_epoch(self, epoch, num_seqs, get_seq_len=None):
Returns the order of the given epoch.
......@@ -358,6 +380,8 @@ class Dataset(object):
full_epoch = (epoch - 1) // partition_epoch + 1
assert num_seqs > 0
seq_index = list(range(num_seqs)) # type: typing.List[int] # the real seq idx after sorting
if self._seq_order_seq_lens_file:
get_seq_len = self._get_seq_order_seq_lens_by_idx
if self.seq_ordering == 'default':
pass # Keep order as-is.
elif self.seq_ordering.startswith("default_every_n:"):
......@@ -377,6 +401,38 @@ class Dataset(object):
elif self.seq_ordering == "sorted_reverse":
assert get_seq_len
seq_index.sort(key=get_seq_len, reverse=True) # sort by length, in reverse, starting with longest
elif self.seq_ordering.startswith('sort_bin_shuffle'):
# Shuffle seqs, sort by length, and shuffle bins (then shuffle seqs within each bin if sort_bin_shuffle_x2).
assert get_seq_len
tmp = self.seq_ordering.split(':')[1:]
# Keep this deterministic! Use fixed seed.
if len(tmp) <= 1:
nth = 1
nth = int(tmp[1])
rnd_seed = ((full_epoch - 1) // nth + 1) if full_epoch else 1
rnd = Random(rnd_seed)
rnd.shuffle(seq_index) # Shuffle sequences.
seq_index.sort(key=get_seq_len) # Sort by length, starting with shortest.
if len(tmp) == 0:
bins = 2
if tmp[0].startswith("."): # starting with "." -> approx chunk size (num of seqs in one bin)
bins = max(num_seqs // int(tmp[0][1:]), 2)
else: # the number of bins
bins = int(tmp[0])
bin_ids = list(range(bins))
rnd.shuffle(bin_ids) # Shuffle bins.
out_index = []
for i in bin_ids:
if i == bins - 1:
part = seq_index[i * len(seq_index) // bins:][:]
part = seq_index[i * len(seq_index) // bins:(i + 1) * len(seq_index) // bins][:]
if self.seq_ordering.startswith('sort_bin_shuffle_x2'):
rnd.shuffle(part) # Shuffle within the bin.
out_index += part
seq_index = out_index
elif self.seq_ordering.startswith('laplace'):
assert get_seq_len
tmp = self.seq_ordering.split(':')[1:]
......@@ -2566,7 +2566,24 @@ class LibriSpeechCorpus(CachedDataset2):
class OggZipDataset(CachedDataset2):
Generic dataset which reads a Zip file containing Ogg files for each sequence.
Generic dataset which reads a Zip file containing Ogg files for each sequence and a text document.
The feature extraction settings are determined by the ``audio`` option, which is passed to :class:`ExtractAudioFeatures`.
Does also support Wav files, and might even support other file formats readable by the 'soundfile'
library (not tested). By setting ``audio`` or ``targets`` to ``None``, the dataset can be used in
text only or audio only mode. The content of the zip file is:
- a .txt file with the same name as the zipfile, containing a python list of dictionaries
- a subfolder with the same name as the zipfile, containing the audio files
The dictionaries in the .txt file must have the following structure:
.. code::
[{'seq_name': 'arbitrary_sequence_name', 'text': 'some utterance text', 'duration': 2.3, 'file': 'sequence0.wav'}, ...]
If ``seq_name`` is not included, the seq_tag will be the name of the file. ``duration`` is mandatory, as this information
is needed for the sequence sorting.
def __init__(self, path, audio, targets,
......@@ -2654,7 +2671,12 @@ class OggZipDataset(CachedDataset2):
assert isinstance(first_entry, dict)
assert isinstance(first_entry["text"], str)
assert isinstance(first_entry["duration"], float)
assert isinstance(first_entry["file"], str)
# when 'audio' is None and sequence names are given, this dataset can be used in text-only mode
if "file" in first_entry:
assert isinstance(first_entry["file"], str)
assert self.feature_extractor, "feature extraction is enabled, but no audio files are specified"
assert isinstance(first_entry["seq_name"], str)
return data
def _filter_fixed_random_subset(self, fixed_random_subset):
......@@ -100,14 +100,33 @@ class HDFDataset(CachedDataset):
self.timestamps = fin[attr_times][...]
self.timestamps = numpy.concatenate([self.timestamps, fin[attr_times][...]], axis=0)
prev_target_keys = None
if len(self.files) >= 2:
prev_target_keys = self.target_keys
if 'targets' in fin:
self.target_keys = sorted(fin['targets/labels'].keys())
self.target_keys = sorted(
set(fin['targets/labels'].keys()) |
set(fin['targets/data'].keys()) |
self.target_keys = ['classes']
seq_lengths = fin[attr_seqLengths][...]
seq_lengths = fin[attr_seqLengths][...] # shape (num_seqs,num_target_keys + 1)
if len(seq_lengths.shape) == 1:
seq_lengths = numpy.array(zip(*[seq_lengths.tolist() for _ in range(len(self.target_keys)+1)]))
assert seq_lengths.ndim == 2 and seq_lengths.shape[1] == len(self.target_keys) + 1
if prev_target_keys is not None and prev_target_keys != self.target_keys:
print("Warning: %s: loaded prev files %s, which defined target keys %s. Now loaded %s and got target keys %s." % (
self, self.files[:-1], prev_target_keys, filename, self.target_keys), file=log.v2)
# This can happen for multiple reasons. E.g. just different files. Or saved with different RETURNN versions.
# We currently support this by removing all the new additional targets, which only works if the prev targets
# were a subset (so the order in which you load the files matters).
assert all([key in self.target_keys for key in prev_target_keys]) # check if subset
# Filter out the relevant seq lengths
seq_lengths = seq_lengths[:, [0] + [self.target_keys.index(key) + 1 for key in prev_target_keys]]
assert seq_lengths.shape[1] == len(prev_target_keys) + 1
self.target_keys = prev_target_keys
seq_start = numpy.zeros((seq_lengths.shape[0] + 1, seq_lengths.shape[1]), dtype="int64")
numpy.cumsum(seq_lengths, axis=0, dtype="int64", out=seq_start[1:])
......@@ -141,7 +160,7 @@ class HDFDataset(CachedDataset):
filename, self.num_inputs, num_inputs[0])
if 'targets/size' in fin:
num_outputs = {}
for k in fin['targets/size'].attrs:
for k in self.target_keys:
if numpy.isscalar(fin['targets/size'].attrs[k]):
num_outputs[k] = (int(fin['targets/size'].attrs[k]), len(fin['targets/data'][k].shape))
else: # hdf_dump will give directly as tuple
......@@ -166,7 +185,7 @@ class HDFDataset(CachedDataset):
self.ctc_targets = numpy.concatenate((self.ctc_targets, tmp))
self.num_running_chars = numpy.sum(self.ctc_targets != -1)
if 'targets' in fin:
for name in fin['targets/data']:
for name in self.target_keys:
self.data_dtype[str(name)] = str(fin['targets/data'][name].dtype)
self.targets[str(name)] = None
if str(name) not in self.num_outputs:
......@@ -950,6 +969,9 @@ class SimpleHDFWriter:
shape = [None] * ndim # type: typing.List[typing.Optional[int]]
if ndim >= 2:
shape[-1] = dim
if dtype == "string":
# noinspection PyUnresolvedReferences
dtype = h5py.special_dtype(vlen=str)
self._datasets[data_key] = self._file['targets/data'].create_dataset(
data_key, shape=[d if d else 0 for d in shape], dtype=dtype, maxshape=shape)
self._file['targets/size'].attrs[data_key] = [dim or 1, ndim]
......@@ -1007,13 +1029,18 @@ class SimpleHDFWriter:
assert self._file.attrs['numSeqs'] > 0 and self._seq_lengths.shape[0] > 0
seq_idx = self._file.attrs['numSeqs'] - 1
if self._prepare_extra({data_key: (dim, raw_data.ndim, raw_data.dtype)}):
if raw_data.dtype == numpy.object:
# Is this a string?
assert isinstance(raw_data.flat[0], (str, bytes))
dtype = "string"
dtype = raw_data.dtype.name
if self._prepare_extra({data_key: (dim, raw_data.ndim, dtype)}):
# We added it now. Maybe other extra data keys were added before. The data_key_idx is different now.
# Thus, make sure that for all other already existing extra data keys, the offsets are the same.
assert seq_idx == 0 and all([
num == raw_data.shape[0]
for (key, num) in self._extra_num_time_steps.items()
if key != data_key])
# Thus, seq_lengths might have become invalid. Reinit them.
assert seq_idx == 0 # We can only do that in the beginning.
for data_key_idx_0, data_key_ in enumerate(sorted(self._prepared_extra)):
self._seq_lengths[seq_idx, data_key_idx_0 + 1] = self._extra_num_time_steps[data_key_]
self._extra_num_time_steps[data_key] += raw_data.shape[0]
self._datasets[data_key].resize(self._extra_num_time_steps[data_key], axis=0)
......@@ -346,6 +346,8 @@ class MetaDataset(CachedDataset2):
:rtype: int
return self._seq_lens[self.seq_list_original[self.default_dataset_key][s]]["data"]
elif self._seq_order_seq_lens_file:
get_seq_len = self._get_seq_order_seq_lens_by_idx
self.orig_seq_order_is_initialized = False
get_seq_len = self._get_dataset_seq_length
......@@ -360,6 +362,13 @@ class MetaDataset(CachedDataset2):
dataset.init_seq_order(epoch=epoch, seq_list=self.seq_list_ordered[dataset_key])
return True
def get_all_tags(self):
:return: list of all seq tags, of the whole dataset, without partition epoch
:rtype: list[str]
return self.seq_list_original[self.default_dataset_key]
def finish_epoch(self):
This would get called at the end of the epoch.
......@@ -1024,7 +1033,7 @@ class ConcatSeqsDataset(CachedDataset2):
def __init__(self, dataset, seq_list_file, seq_len_file, seq_tag_delim=";", remove_in_between_postfix=None,
use_cache_manager=False, epoch_wise_filter=None, **kwargs):
:param dict[str] dataset: kwargs for init_dataset
:param dict[str]|str|Dataset dataset: kwargs for init_dataset
:param str seq_list_file: filename. line-separated. seq_tag_delim.join(seq_tags) for concatenated seqs
:param str seq_len_file: file with Python dict, (single) seg_name -> len, which is used for sorting
:param str seq_tag_delim:
......@@ -1036,8 +1045,9 @@ class ConcatSeqsDataset(CachedDataset2):
self.seq_tag_delim = seq_tag_delim
self.remove_in_between_postfix = remove_in_between_postfix or {}
self.epoch_wise_filter = EpochWiseFilter(epoch_wise_filter) if epoch_wise_filter else None
dataset = dataset.copy()
dataset.setdefault("name", "%s_subdataset" % self.name)
if isinstance(dataset, dict):
dataset = dataset.copy()
dataset.setdefault("name", "%s_subdataset" % self.name)
self.sub_dataset = init_dataset(dataset)
self.num_outputs = self.sub_dataset.num_outputs
self.num_inputs = self.sub_dataset.num_inputs
......@@ -811,6 +811,16 @@ class ExternSprintDataset(SprintDatasetBase):
"--*.corpus.segments.file=%s" % self.seq_list_file,
"--*.corpus.segment-order=%s" % self.seq_list_file]
if self.seq_tags_filter is not None:
assert not self.predefined_seq_list_order
import tempfile
self.seq_list_file = tempfile.mktemp(prefix="crnn-sprint-predefined-seq-filter")
with open(self.seq_list_file, "w") as f:
for tag in self.seq_tags_filter:
args += ["--*.corpus.segments.file=%s" % self.seq_list_file]
return args
def _read_next_raw(self):
......@@ -789,13 +789,17 @@ class TFNetwork(object):
used_data_keys = used_data_keys.difference(self.extern_data.extra_added_keys)
return used_data_keys
def get_seq_tags(self, mark_data_key_as_used=True):
def get_seq_tags(self, mark_data_key_as_used=True, beam=None):
:param bool mark_data_key_as_used: for extern_data
:param TFUtil.SearchBeam|None beam:
:return: tensor of shape (batch,) of dtype string, via extern_data
:rtype: tf.Tensor
return self.get_extern_data(key="seq_tag", mark_data_key_as_used=mark_data_key_as_used).placeholder
data = self.get_extern_data(key="seq_tag", mark_data_key_as_used=mark_data_key_as_used)
if beam:
data = data.copy_extend_with_beam(beam)
return data.placeholder
def get_losses_initialized(self, reduce_func=None, with_total=False):
......@@ -1373,6 +1377,8 @@ class TFNetwork(object):
import os
filename = os.path.abspath(filename) # TF needs absolute path
from Util import maybe_make_dirs
if not self.saver:
# We add some extra logic to try again for DiskQuota and other errors.
......@@ -1467,7 +1473,8 @@ class TFNetwork(object):
from TFUtil import cond
return cond(self.train_flag, fn_train, fn_eval)
def get_search_choices(self, sources=None, src=None, base_search_choice=None, _visited=None, debug_stream=None):
def get_search_choices(self, sources=None, src=None, base_search_choice=None, _layer_to_search_choices=None,
Recursively searches through all sources,
and if there is a :class:`ChoiceLayer` / any layer with search_choices, returns it.
......@@ -1478,7 +1485,7 @@ class TFNetwork(object):
:param LayerBase|None src:
:param LayerBase|None base_search_choice:
:param list[LayerBase]|None sources:
:param dict[LayerBase]|None _visited: keep track about visited layers in case there are circular deps
:param dict[LayerBase]|None _layer_to_search_choices: keep track about visited layers in case there are circular deps
:param typing.TextIO|None debug_stream: if given, will print additional debug info into it
:return: (direct or indirect) source LayerBase which has search_choices, or None
:rtype: LayerBase|None
......@@ -1490,10 +1497,12 @@ class TFNetwork(object):
from TFNetworkLayer import SearchChoices
from functools import cmp_to_key
from pprint import pformat
if _visited is None:
_visited = {} # type: typing.Dict[LayerBase,typing.List[LayerBase]]
if _layer_to_search_choices is None:
_layer_to_search_choices = {} # type: typing.Dict[LayerBase,typing.List[LayerBase]]
normalized_to_layer = {} # type: typing.Dict[LayerBase,LayerBase]
layers = self._get_all_search_choices(
sources=sources, src=src, base_search_choice=base_search_choice, _visited=_visited)
sources=sources, src=src, base_search_choice=base_search_choice,
_layer_to_search_choices=_layer_to_search_choices, _normalized_to_layer=normalized_to_layer)
def full_trace_for_layer(layer, _layer_trace=None):
......@@ -1509,9 +1518,11 @@ class TFNetwork(object):
return _layer_trace
if layer not in _visited:
self._get_all_search_choices(base_search_choice=layer, _visited=_visited)
for dep in _visited[layer]:
if layer not in _layer_to_search_choices: # happens if layer is a choice
_layer_to_search_choices=_layer_to_search_choices, _normalized_to_layer=normalized_to_layer)
for dep in _layer_to_search_choices[layer]:
full_trace_for_layer(dep, _layer_trace=_layer_trace)
return _layer_trace
......@@ -1520,7 +1531,7 @@ class TFNetwork(object):
:rtype: dict[str,list[str]]
relevant_map = {}
for key, values in _visited.items():
for key, values in _layer_to_search_choices.items():
relevant_map[key.get_absolute_name()] = [value.get_absolute_name() for value in values]
return relevant_map
......@@ -1569,7 +1580,8 @@ class TFNetwork(object):
layers = sorted(layers, key=cmp_to_key(compare_layer))
return layers[-1]
def _get_all_search_choices(self, sources=None, src=None, base_search_choice=None, _visited=None):
def _get_all_search_choices(self, sources=None, src=None, base_search_choice=None,
_layer_to_search_choices=None, _normalized_to_layer=None):
Recursively searches through all sources,
and if there is a :class:`ChoiceLayer` / any layer with search_choices, returns it.
......@@ -1580,14 +1592,21 @@ class TFNetwork(object):
:param LayerBase|None src:
:param LayerBase|None base_search_choice:
:param list[LayerBase]|None sources:
:param dict[LayerBase,list[LayerBase]]|None _visited: tracks visited layers in case there are circular deps
:return: (direct or indirect) source LayerBase which has search_choices, or None
:param dict[LayerBase,list[LayerBase]]|None _layer_to_search_choices:
tracks visited layers in case there are circular deps
:param dict[LayerBase,LayerBase]|None _normalized_to_layer:
:return: (direct or indirect) sources LayerBase which has search_choices
:rtype: list[LayerBase]
if _visited is None:
_visited = {} # type: typing.Dict[LayerBase,typing.List[LayerBase]]
if _layer_to_search_choices is None:
_layer_to_search_choices = {} # type: typing.Dict[LayerBase,typing.List[LayerBase]]
if _normalized_to_layer is None:
_normalized_to_layer = {} # type: typing.Dict[LayerBase,LayerBase]
if src is not None:
assert isinstance(src, LayerBase)
normalized_src = src.get_normalized_layer()
if normalized_src != src:
assert _normalized_to_layer.setdefault(normalized_src, src) == src # Currently expecting that this is unique.
if src.search_choices:
if src.search_choices.is_decided:
return []
......@@ -1595,19 +1614,24 @@ class TFNetwork(object):
assert base_search_choice is None
base_search_choice = src
if base_search_choice is not None:
if base_search_choice in _visited:
return _visited[base_search_choice]
if base_search_choice in _layer_to_search_choices:
return _layer_to_search_choices[base_search_choice]
_visited[base_search_choice] = [] # we visit it now
_layer_to_search_choices[base_search_choice] = [] # we visit it now
normalized_base = base_search_choice.get_normalized_layer()
if normalized_base != base_search_choice:
# Currently expecting that this is unique.
assert _normalized_to_layer.setdefault(normalized_base, base_search_choice) == base_search_choice
assert sources is None
sources = base_search_choice.get_dep_layers()
assert sources is not None
layers = [] # type: typing.List[LayerBase]
for src_ in sources:
src_choice_layers = self._get_all_search_choices(src=src_, _visited=_visited)
src_choice_layers = self._get_all_search_choices(
src=src_, _layer_to_search_choices=_layer_to_search_choices, _normalized_to_layer=_normalized_to_layer)
for layer in src_choice_layers:
if base_search_choice and layer not in _visited[base_search_choice]:
if base_search_choice and layer not in _layer_to_search_choices[base_search_choice]:
if layer not in layers:
if not layers:
......@@ -1615,6 +1639,25 @@ class TFNetwork(object):
# noinspection PyProtectedMember
return self.parent_layer.network._get_all_search_choices(sources=self.parent_layer.get_dep_layers())
return []
if base_search_choice is not None:
normalized_base = base_search_choice.get_normalized_layer()
if normalized_base != base_search_choice:
# Just make sure we visit these as well.
normalized_choices = self._get_all_search_choices(
_layer_to_search_choices=_layer_to_search_choices, _normalized_to_layer=_normalized_to_layer)
if any([l.get_normalized_layer() == l for l in normalized_choices]):
# Filter any "prev:..." layers away. This should always be correct.
# Also, this is important to have the correct choice resolution for the prev layer (base_search_choice).
normalized_choices = [l for l in normalized_choices if l.get_normalized_layer() == l]
# Get corresponding "prev:..." layers.
from pprint import pformat
assert all([l in _normalized_to_layer for l in normalized_choices]), "\n".join([
"No cur -> prev mapping for some layers.", "Base: %s" % base_search_choice,
"Prev choices:", pformat(layers),
"Cur choices:", pformat(normalized_choices), "Mapping:", pformat(_normalized_to_layer)])
layers = [_normalized_to_layer[l] for l in normalized_choices]
_layer_to_search_choices[base_search_choice] = layers
return layers
def debug_search_choices(self, base_search_choice):
......@@ -1644,7 +1687,7 @@ class TFNetwork(object):
super(Visitor, self).__setitem__(key, value)
search_choices = self.get_search_choices(
base_search_choice=base_search_choice, _visited=Visitor(), debug_stream=sys.stdout)
base_search_choice=base_search_choice, _layer_to_search_choices=Visitor(), debug_stream=sys.stdout)
print("-> search choices:", search_choices)
def get_data_batch_dim(self):
This diff is collapsed.
This diff is collapsed.
......@@ -2767,6 +2767,8 @@ def get_valid_scope_name_from_str(s):
# NOTE: Be careful changing this logic. Try to never change the behavior for existing cases,
# because this name is used e.g. for layers, and you might introduce incompatibility by changes here.
s = s.replace(":", "__")
s = s.replace("(", "__")
s = s.replace(")", "__")
if s[:1] in "_-\\/": # invalid first chars
s = (".%i." % ord(s[0])) + s[1:]
return s
......@@ -5723,7 +5725,7 @@ def slice_nd(x, start, size):
:param tf.Tensor x: shape (B, T, ...)
:param tf.Tensor start: shape (B,), int32
:param int size:
:param int|tf.Tensor size: scalar
:return: [x[start_1:size], x[start_2:size], ..., x[start_B:size]], shape (B, size, ...)
Like :func:`slice_pad_zeros`, the size in the first axis will always be ``size``,
and we will pad with zeros.
......@@ -5736,11 +5738,11 @@ def slice_nd(x, start, size):
batch_idxs = expand_dims_unbroadcast(tf.range(n_batch), 1, size) # (n_batch, size)
batch_idxs = tf.reshape(batch_idxs, (-1,)) # (n_batch*size,)
window_pos = tf.expand_dims(start, 1) + tf.range(size) # (n_batch, size)
window_pos = tf.expand_dims(start, 1) + tf.range(size)[None, :] # (n_batch, size)
window_pos = tf.reshape(window_pos, (-1,)) # (n_batch*size,)
# build mask for zero-padding
mask = window_pos > shape[1]-1 # (n_batch*size,) tf.bool
mask = tf.logical_or(window_pos > shape[1]-1, window_pos < 0) # (n_batch*size,) tf.bool
# clip indices so that gather_nd doesn't fail, will zero-pad later
clip_time_idx = tf.clip_by_value(window_pos, 0, shape[1]-1)
......@@ -6188,7 +6190,7 @@ def minimum_with_identity_grad(x, y):
:param tf.Tensor x:
:param tf.Tensor y:
:return: tf.maximum(x, y) where each will receive the gradient
:return: tf.minimum(x, y) where each will receive the gradient
:rtype: tf.Tensor
with tf.name_scope("minimum_with_identity_grad"):
......@@ -8065,7 +8067,7 @@ def _tensor_array_select_src_beams(ta, src_beams):
return ta_new
def beam_search(scores, beam_size, keep_beams=False, cheating_gold_targets=None):
def beam_search(scores, beam_size, keep_beams=False, cheating_gold_targets=None, cheating_src_beam_idx=None):
This is mostly a higher-level wrapper around :func:`tf.nn.top_k`.
......@@ -8075,6 +8077,7 @@ def beam_search(scores, beam_size, keep_beams=False, cheating_gold_targets=None)
:param bool keep_beams: specifies that we keep the beam_in entries,
i.e. we just expand, i.e. we just search on the dim. beam_size must be a multiple of beam_in.
:param tf.Tensor|None cheating_gold_targets: (batch,), int32
:param tf.Tensor|None cheating_src_beam_idx: (batch,), int32. If not given, assumes beam_in - 1. See code below.
:rtype: (tf.Tensor,tf.Tensor,tf.Tensor)
:return: src_beams, labels, beam_scores.
src_beams: (batch, beam) -> beam_in idx (int32),
......@@ -8087,8 +8090,11 @@ def beam_search(scores, beam_size, keep_beams=False, cheating_gold_targets=None)
scores = tf.reshape(scores, [batch_dim * beam_in, 1, in_dim])
if cheating_gold_targets is not None:
cheating_gold_targets = tile_transposed(cheating_gold_targets, axis=0, multiples=beam_in)
if cheating_src_beam_idx is not None and cheating_src_beam_idx.shape.ndims > 0:
cheating_src_beam_idx = tile_transposed(cheating_src_beam_idx, axis=0, multiples=beam_in)
_, labels, beam_scores = beam_search(
scores=scores, beam_size=beam_size // beam_in, cheating_gold_targets=cheating_gold_targets)
scores=scores, beam_size=beam_size // beam_in,
cheating_gold_targets=cheating_gold_targets, cheating_src_beam_idx=cheating_src_beam_idx)
src_beams = tf.zeros([batch_dim, beam_in, beam_size // beam_in], dtype=tf.int32)
src_beams += tf.range(beam_in)[None, :, None]
src_beams = tf.reshape(src_beams, [batch_dim, beam_size])
......@@ -8111,13 +8117,21 @@ def beam_search(scores, beam_size, keep_beams=False, cheating_gold_targets=None)
# We replace them by the true labels.
cheating_gold_targets = tf.clip_by_value(
cheating_gold_targets, 0, in_dim - 1) # safety, for invalid values...
gold_beam_in_idx = beam_in - 1 # also assume last index
gold_labels = gold_beam_in_idx * in_dim + cheating_gold_targets # (batch,)
if cheating_src_beam_idx is None:
# We also assume that the last choice also has the cheating target in the last beam index.
cheating_src_beam_idx = beam_in - 1
cheating_src_beam_idx = tf.clip_by_value(cheating_src_beam_idx, 0, beam_in - 1) # safety
gold_labels = cheating_src_beam_idx * in_dim + cheating_gold_targets # (batch,)
gold_labels_bc = tf.expand_dims(gold_labels, axis=1) # (batch,1)
labels = tf.concat([labels[:, :beam_size - 1], gold_labels_bc], axis=1) # (batch,beam)
if cheating_src_beam_idx.shape.ndims == 0:
gold_scores = scores[:, cheating_src_beam_idx] # (batch,in_dim)
assert cheating_src_beam_idx.shape.ndims == 1
gold_scores = tf.gather_nd(scores, indices=nd_indices(cheating_src_beam_idx)) # (batch,in_dim)
# Note: In case the seq ended, we assume that the gold_targets are all 0, such that we get the right score.
gold_scores = tf.gather_nd(
scores[:, gold_beam_in_idx], indices=nd_indices(cheating_gold_targets)) # (batch,)
gold_scores = tf.gather_nd(gold_scores, indices=nd_indices(cheating_gold_targets)) # (batch,)
gold_scores_bc = tf.expand_dims(gold_scores, axis=1) # (batch,1)
beam_scores = tf.concat([beam_scores[:, :beam_size - 1], gold_scores_bc], axis=1) # (batch,beam)
src_beams = labels // in_dim # (batch, beam) -> beam_in idx
......@@ -2975,6 +2975,23 @@ def get_utc_start_time_filename_part():
return time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime(start_time))
def maybe_make_dirs(dirname):
Creates the directory if it does not yet exist.
:param str dirname: The path of the directory
import os
if not os.path.exists(dirname):
except Exception as exc:
print("maybe_create_folder: exception creating dir:", exc)
# Maybe a concurrent process, e.g. tf.summary.FileWriter created it in the mean-while,
# so then it would be ok now if it exists, but fail if it does not exist.
assert os.path.exists(dirname)
def log_runtime_info_to_dir(path, config):
This will write multiple logging information into the path.
......@@ -3000,14 +3017,7 @@ def log_runtime_info_to_dir(path, config):
"TensorFlow: %s" % (describe_tensorflow_version(),),
"Config files: %s" % (config.files,),
if not os.path.exists(path):
except Exception as exc:
print("log_runtime_info_to_dir: exception creating dir:", exc)
# Maybe a concurrent process, e.g. tf.summary.FileWriter created it in the mean-while,
# so then it would be ok now if it exists, but fail if it does not exist.
assert os.path.exists(path)
log_fn = "%s/returnn.%s.%s.%i.log" % (path, get_utc_start_time_filename_part(), hostname, os.getpid())
if not os.path.exists(log_fn):
with open(log_fn, "w") as f:
......@@ -4,6 +4,14 @@
Model Loading
If a path to a valid model is provided (for TF models without ``.meta`` extension),
use this to initialize the weights for training. If you do not want to start a new training, see ``load``.
If a path to a valid model is provided (for TF models without ``.meta`` extension),
use this to load the specified model and training state. The training is continued from the last position.
Specifies the epoch index to load, based on the prefix given in ``model``.
If not set, RETURNN will use the latest epoch.
......@@ -13,3 +21,4 @@ preload_from_files
For all layers in your network whose layer name starts with prefix, it will load the parameters from
the checkpoint specified by filename
(It will look for the corresponding layer name without the prefix in the given checkpoint).
......@@ -30,6 +30,10 @@ cleanup_old_models
- ``keep``: list or set of integers defining which checkpoints to keep
A dict with string:integer pairs. The string must be a valid data key,
and the integer specifies the upper bound for this data object. Batches, where the specified data object exceeds
the upper bound are discarded. Note that some datasets (e.g ``OggZipDataset``) load and process the data
to determine the length, so even for discarded sequences data processing might be performed.
An integer specifying the upper limit of sequences in a batch (can be used in addition to ``batch_size``).
......@@ -40,6 +44,11 @@ num_epochs
An integer specifying after how many epochs the model is saved.
An integer or string specifying the epoch to start the training at. The default is 'auto'.
If set to ``False``, the training will not be interupted if a single update step has a loss with NaN of Inf
......@@ -8,3 +8,9 @@ Extern Sprint Dataset
.. automodule:: SprintDataset.ExternSprintDataset
Ogg Zip Dataset
.. automodule:: GeneratingDataset.OggZipDataset
......@@ -28,6 +28,11 @@ You can run RETURNN via::
ipython --pdb rnn.py your-config.py
That will give you the IPython debugger shell once you hit an unhandled exception.
You can summon the interactive shell by explicitly calling the following from the
source code or from the config::
import Debug
Debug.debug_shell(user_ns=locals(), user_global_ns=globals(), exit_afterwards=False)
Shapes and :class:`Data`
......@@ -48,6 +48,8 @@ but another layer in the network, add 'layer:' as prefix.
**spatial_smoothing** [:class:`float`] if specified, add spatial-smoothing loss of the layer output with the given factor to the total constraints.
**register_as_extern_data** [:class:`str`] register the output of the layer as an accessable entry of extern_data.
.. _connecting:
Connecting Layers
......@@ -1020,6 +1020,149 @@ def test_attention_search_in_train_then_search():
def check_train_and_search_two_targets(net_dict):
Tests training and search for network architectures having two targets ("classes_0", "classes_1")
and two corresponding output layers ("decision_0", "decision_1").
from MetaDataset import MetaDataset
from TFUtil import DimensionTag
from test_HDFDataset import generate_hdf_from_other
n_data_dim = 2
n_classes_dim_0 = 7
n_classes_dim_1 = 8
data_0 = {"class": "DummyDataset", "input_dim": n_data_dim, "output_dim": n_classes_dim_0,
"num_seqs": 2, "seq_len": 5}
data_0 = generate_hdf_from_other(data_0)
data_1 = {"class": "DummyDataset", "input_dim": n_data_dim, "output_dim": n_classes_dim_1,
"num_seqs": 2, "seq_len": 5}
data_1 = generate_hdf_from_other(data_1)
data = MetaDataset(datasets={"data_0": data_0, "data_1": data_1},
"data": ("data_1", "data"),
"classes_0": ("data_0", "classes"),
"data_1": ("data_1", "data"),
"classes_1": ("data_1", "classes")},
dec_time = DimensionTag(kind=DimensionTag.Types.Spatial, description="dec time")
config = Config()
"model": "%s/model" % _get_tmp_dir(),
"batch_size": 5000,
"max_seqs": 2,
"extern_data": {"data": {"dim": n_data_dim, "sparse": False},
"classes_0": {"dim": n_classes_dim_0, "sparse": True, "same_dim_tags_as": {"t": dec_time}},
"classes_1": {"dim": n_classes_dim_1, "sparse": True, "same_dim_tags_as": {"t": dec_time}},
"num_epochs": 1,
"network": net_dict,
"debug_print_layer_output_template": True,
engine = Engine(config=config)
engine.init_train_from_config(config=config, train_data=data, dev_data=None)
engine.use_search_flag = True
engine.use_dynamic_train_flag = False
engine.search(dataset=data, output_layer_names=["decision_0", "decision_1"])
assert engine.network.total_objective is not None
assert "decision_0" in engine.network.losses_dict
assert "decision_1" in engine.network.losses_dict
def test_attention_two_targets():
Tests training and search when using a ChoiceLayer with two targets.
net_dict = {
"encoder": {"class": "linear", "activation": "tanh", "n_out": 5},
"output": {
"class": "rec",
"from": [],
"target": "classes_1", "max_seq_len": 10,
"unit": {
"end": {"class": "compare", "from": ["output_0"], "value": 0},
"orth_embed_0": {'class': 'linear', 'activation': None, 'from': ['output_0'], "n_out": 7},
"orth_embed_1": {'class': 'linear', 'activation': None, 'from': ['output_1'], "n_out": 7},
"orth_embed": {"class": "copy", "from": ["orth_embed_0", "orth_embed_1"]},
"s": {"class": "rnn_cell", "unit": "LSTMBlock", "from": ["prev:c", "prev:orth_embed"], "n_out": 7},
"c_in": {"class": "linear", "activation": "tanh", "from": ["s", "prev:orth_embed"], "n_out": 5},
"c": {"class": "dot_attention", "from": ["c_in"], "base": "base:encoder", "base_ctx": "base:encoder"},
"output_prob_0": {"class": "softmax", "from": ["prev:s", "c"], "target": "classes_0", "loss": "ce"},
"output_prob_1": {"class": "softmax", "from": ["prev:s", "c"], "target": "classes_1", "loss": "ce"},
"output": {'class': 'choice', 'target': ['classes_0', "classes_1"], 'beam_size': 4,
'from': ["output_prob_0", "output_prob_1"], "source_beam_sizes": [2, 6]},
"output_0": {"class": "copy", "from": ["output/out_0"], "is_output_layer": True},
"output_1": {"class": "copy", "from": ["output/out_1"], "is_output_layer": True},
"output_0": {"class": "copy", "from": ["output/output_0"], "target": "classes_0"},
"output_1": {"class": "copy", "from": ["output/output_1"], "target": "classes_1"},
"decision_0": {"class": "decide", "from": ["output_0"], "loss": "edit_distance", "target": "classes_0"},
"decision_1": {"class": "decide", "from": ["output_1"], "loss": "edit_distance", "target": "classes_1"},
def test_attention_two_dependent_targets():
Tests training and search when having two ChoiceLayers in the loop that depend on each other.
Note, there will be different beam sizes in different parts of the recurrent unit.
beam_size_0 = 5
beam_size_1 = 3
net_dict = {
"encoder": {"class": "linear", "activation": "tanh", "n_out": 5},
"output": {
"class": "rec",
"from": [],
"target": "classes_1", "max_seq_len": 10,
"unit": {
"end": {"class": "compare", "from": ["output_0"], "value": 0},
"orth_embed_0": {'class': 'linear', 'activation': None, 'from': ['output_0'], "n_out": 7},
"orth_embed_1": {'class': 'linear', 'activation': None, 'from': ['output_1'], "n_out": 7},
"orth_embed": {"class": "copy", "from": ["orth_embed_0", "orth_embed_1"]},
"s": {"class": "rnn_cell", "unit": "LSTMBlock", "from": ["prev:c", "prev:orth_embed"], "n_out": 7},
"c_in": {"class": "linear", "activation": "tanh", "from": ["s", "prev:orth_embed"], "n_out": 5},
"c": {"class": "dot_attention", "from": ["c_in"], "base": "base:encoder", "base_ctx": "base:encoder"},
"output_prob_0": {"class": "softmax", "from": ["prev:s", "c"], "target": "classes_0", "loss": "ce"},
"output_prob_1": {"class": "softmax", "from": ["prev:s", "c", "orth_embed_0"],
"target": "classes_1", "loss": "ce"},
# Important for real experiments: apply length normalization only once (in last choice layer).
"output_0": {'class': 'choice', 'target': 'classes_0', 'beam_size': beam_size_0, 'from': "output_prob_0",
"is_output_layer": True, "length_normalization": False},
"output_1": {'class': 'choice', 'target': 'classes_1', 'beam_size': beam_size_1, 'from': "output_prob_1",
"is_output_layer": True},
"output": {"class": "copy", "from": "output_1"},
"output_0": {"class": "copy", "from": ["output/output_0"], "target": "classes_0"},
"output_1": {"class": "copy", "from": ["output/output_1"], "target": "classes_1"},
"decision_0": {"class": "decide", "from": ["output_0"], "loss": "edit_distance", "target": "classes_0"},
"decision_1": {"class": "decide", "from": ["output_1"], "loss": "edit_distance", "target": "classes_1"},
def test_rec_optim_all_out():
from GeneratingDataset import DummyDataset
from TFNetworkRecLayer import RecLayer, _SubnetworkRecCell
This diff is collapsed.
......@@ -1845,6 +1845,164 @@ def test_search_multi_choice_simple_keep_beams():
assert numpy.any(output_value[:, 0] != output_value[:, 1])
def test_rec_layer_multi_choice_search_resolve():
AttNumHeads = 1
EncKeyTotalDim = 10
beam_size = 3
target = "classes"
net_dict = {
"lstm0_pool": {"class": "pool", "mode": "max", "padding": "same", "pool_size": (3,), "from": "data"},
"encoder0": {"class": "copy", "from": "lstm0_pool"},
"encoder": {"class": "copy", "from": "encoder0"},
"enc_ctx": {"class": "linear", "activation": None, "with_bias": True, "from": ["encoder"], "n_out": EncKeyTotalDim},
"enc_value": {"class": "copy", "from": "encoder"}, # (B, enc-T, D)
"inv_fertility": {"class": "linear", "activation": "sigmoid", "with_bias": False, "from": ["encoder"],
"n_out": AttNumHeads},
"enc_seq_len": {"class": "length", "from": "encoder", "sparse": True},
"output": {"class": "rec", "from": [], "back_prop": False, "unit": {
"weight_feedback": {"class": "linear", "activation": None, "with_bias": False,
"from": ["prev:accum_att_weights"], "n_out": EncKeyTotalDim},
"s_transformed": {"class": "linear", "activation": None, "with_bias": False, "from": ["s"],
"n_out": EncKeyTotalDim},
"energy_in": {"class": "combine", "kind": "add",
"from": ["base:enc_ctx", "s_transformed", "weight_feedback"], "n_out": EncKeyTotalDim},
"energy_tanh": {"class": "activation", "activation": "tanh", "from": "energy_in"},
"energy": {"class": "linear", "activation": None, "with_bias": False, "from": ["energy_tanh"],
"n_out": AttNumHeads}, # (B, enc-T, H)
"energy1": {"class": "squeeze", "axis": "f", "from": "energy"}, # (B, enc-T)
"energy2": {"class": "reinterpret_data", "from": "energy1", "set_axes": {"t": "stag:lstm"}},
# Segment boundaries:
# - t0/t1/t is the right side (inclusive)
# - prev:t is the left side (exclusive)
# - t_start/prev_t_plus1 is the left side (inclusive)
"prev_t_plus1": {"class": "eval", "from": "prev:t", "eval": "source(0) + 1"},
"t_start": {
"class": "eval", "from": ["prev_t_plus1", "base:enc_seq_len"],
"eval": "tf.minimum(source(0), source(1) - 1)"}, # to avoid nans
"t_weights": {
"class": "softmax_over_spatial", "from": "energy2", "axis": "stag:lstm",
"start": "t_start"},
"t_weights1": {
# ChoiceLayer works on the feature axis.
"class": "reinterpret_data", "from": "t_weights", "set_axes": {"f": "stag:lstm"},
"t0": {
"class": "choice", "from": "t_weights1", "target": None,
"beam_size": beam_size * 4,
"keep_beams": True,
"length_normalization": False, "initial_output": -1}, # (B,)
# Note: If beam-size > enc_seq_len, we end up with invalid t in the beam. Fix that.
"t1": {
"class": "eval", "from": ["t0", "t_start", "base:enc_seq_len"],
"eval": "tf.clip_by_value(source(0), source(1), source(2) - 1)"},
"t": {
"class": "copy",
"from": "t1", "initial_output": -1, "is_output_layer": True},
"window_start": {"class": "eval", "from": "t", "eval": "source(0) - 5"},
"att_weights": {
"class": "softmax_over_spatial", "from": "energy2", "axis": "stag:lstm",
"window_start": "window_start",
"window_size": 10}, # (B, enc-T)
"att_soft": {"class": "generic_attention", "weights": "att_weights", "base": "base:enc_value"}, # (B, V)
"att": {"class": "copy", "from": "att_soft"},
"accum_att_weights": {"class": "eval",
"from": ["prev:accum_att_weights", "att_weights", "base:inv_fertility"],
"eval": "source(0) + source(1) * source(2) * 0.5",
"out_type": {"dim": AttNumHeads, "shape": (None, AttNumHeads)}},
"s": {"class": "rec", "unit": "nativelstm2", "from": ["prev:target_embed", "prev:att"], "n_out": 1000,
"dropout": 0.2},
"readout_in": {"class": "linear", "from": ["s", "prev:target_embed", "att"], "activation": None,
"n_out": 1000, "dropout": 0.3},
"readout": {"class": "reduce_out", "mode": "max", "num_pieces": 2, "from": ["readout_in"]},
"output_prob": {"class": "softmax", "from": ["readout"], "dropout": 0.3, "target": target, "loss": None},
'target_embed': {'class': 'linear', 'activation': None, "with_bias": False, 'from': ['output'],
"n_out": 621, "initial_output": "var"},
'output': {
'class': 'choice', 'target': target, 'beam_size': beam_size, 'from': ["output_prob"], "initial_output": 0,
'search': True, "length_normalization": True},
# "end": {"class": "compare", "from": ["t", "base:enc_seq_len"], "kind": "greater"},
"end": {"class": "compare", "from": "output", "value": 0},
"target": [target],
"max_seq_len": "max_len_from('base:encoder0')"}
config = Config({
"debug_print_layer_output_template": True,
print("Create network")
n_src_dim = 7
n_tgt_dim = 11
from GeneratingDataset import StaticDataset
from TFDataPipeline import FeedDictDataProvider
from EngineBatch import Batch, BatchSetGenerator
dataset = StaticDataset(
{"data": numpy.random.normal(size=(11, n_src_dim)).astype("float32"),
target: numpy.array([3, 6, 0])},
{"data": numpy.random.normal(size=(13, n_src_dim)).astype("float32"),
target: numpy.array([3, 6, 2, 1, 4, 5, 0])}],
output_dim={"data": [n_src_dim, 2], "classes": [n_tgt_dim, 1]})
batch = Batch()
batch.add_sequence_as_slice(seq_idx=0, seq_start_frame=0, length=dataset.get_seq_length(0))
batch.add_sequence_as_slice(seq_idx=1, seq_start_frame=0, length=dataset.get_seq_length(1))
print("batch:", batch, "num frames:", batch.get_total_num_frames())
print("batch dims:", batch.max_num_frames_per_slice * batch.num_slices)
batch_generator = iter([batch])
batches = BatchSetGenerator(dataset, generator=batch_generator)
with make_scope() as session:
extern_data = ExternData({
"data": {"dim": n_src_dim},
target: {"dim": n_tgt_dim, "sparse": True, "available_for_inference": False}})
net = TFNetwork(extern_data=extern_data, search_flag=True, train_flag=False, config=config)
out_layer = net.layers["output"]
assert isinstance(out_layer, RecLayer)
data_provider = FeedDictDataProvider(
tf_session=session, extern_data=extern_data,
dataset=dataset, batches=batches)
feed_dict, meta_step_info = data_provider.get_feed_dict(single_threaded=True)
out = session.run(out_layer.output.placeholder, feed_dict=feed_dict)
except Exception as exc:
print("EXCEPTION " + "-" * 60)
print("Exception happened:", str(exc).splitlines()[0])
print("TF debug info:")
exception=exc, fetches=out_layer.output.placeholder, feed_dict=feed_dict, meta_step_info=meta_step_info,
def test_rec_layer_move_out_of_loop():
from TFNetworkRecLayer import _SubnetworkRecCell
from TFUtil import get_global_train_flag_placeholder
#!/usr/bin/env python3
Goes through a dataset (similar as dump-dataset.py), but does the training batch creation,
and calculate statistics about how much seqs we have per batch, how much zero padding there is, etc.
from __future__ import print_function, division
import os
import sys
import time
import typing
import argparse
import numpy
from pprint import pformat
my_dir = os.path.dirname(os.path.abspath(__file__))
returnn_dir = os.path.dirname(my_dir)
import rnn
from Log import log
import Util
from Util import Stats, hms, NumbersDict
from Dataset import Batch, Dataset, init_dataset
from Config import Config
config = None # type: typing.Optional[Config]
dataset = None # type: typing.Optional[Dataset]
def analyze_dataset(options):
:param options: argparse.Namespace
print("Epoch: %i" % options.epoch, file=log.v3)
print("Dataset keys:", dataset.get_data_keys(), file=log.v3)
print("Dataset target keys:", dataset.get_target_list(), file=log.v3)
assert options.key in dataset.get_data_keys()
terminal_width, _ = Util.terminal_size()
show_interactive_process_bar = (log.verbose[3] and (not log.verbose[5]) and terminal_width >= 0)
start_time = time.time()
num_seqs_stats = Stats()
if options.endseq < 0:
options.endseq = float("inf")
recurrent = True
used_data_keys = dataset.get_data_keys()
batch_size = config.typed_value('batch_size', 1)
max_seqs = config.int('max_seqs', -1)
seq_drop = config.float('seq_drop', 0.0)
max_seq_length = config.typed_value('max_seq_length', None) or config.float('max_seq_length', 0)
batches = dataset.generate_batches(
step = 0
total_num_seqs = 0
total_num_frames = NumbersDict()
total_num_used_frames = NumbersDict()
while batches.has_more():
# See FeedDictDataProvider.
batch, = batches.peek_next_n(1)
assert isinstance(batch, Batch)
if batch.start_seq > options.endseq:
dataset.load_seqs(batch.start_seq, batch.end_seq)
complete_frac = batches.completed_frac()
start_elapsed = time.time() - start_time
num_seqs_s = str(dataset.num_seqs)
except NotImplementedError:
num_seqs_s = "~%i" % dataset.estimated_num_seqs
except TypeError: # a number is required, not NoneType
num_seqs_s = "?"
progress_prefix = "%i/%s" % (batch.start_seq, num_seqs_s)
progress = "%s (%.02f%%)" % (progress_prefix, complete_frac * 100)
if complete_frac > 0:
total_time_estimated = start_elapsed / complete_frac
remaining_estimated = total_time_estimated - start_elapsed
progress += " (%s)" % hms(remaining_estimated)
batch_max_time = NumbersDict.max([seq.frame_length for seq in batch.seqs]) * len(batch.seqs)
batch_num_used_frames = sum([seq.frame_length for seq in batch.seqs], NumbersDict())
total_num_seqs += len(batch.seqs)
total_num_frames += batch_max_time
total_num_used_frames += batch_num_used_frames
"%s, batch %i, num seqs %i, frames %s, used %s (%s)" % (
progress, step, len(batch.seqs),
batch_max_time, batch_num_used_frames, batch_num_used_frames / batch_max_time),
if show_interactive_process_bar:
Util.progress_bar_with_time(complete_frac, prefix=progress_prefix)
step += 1
print("Done. Total time %s. More seqs which we did not dumped: %s" % (
hms(time.time() - start_time), batches.has_more()), file=log.v2)
print("Num batches (steps): %i" % step, file=log.v1)
print("Num seqs: %i" % total_num_seqs, file=log.v1)
num_seqs_stats.dump(stream=log.v1, stream_prefix="Batch num seqs ")
for key in used_data_keys:
print("Data key %r:" % key, file=log.v1)
print(" Num frames: %s" % total_num_frames[key], file=log.v1)
print(" Num used frames: %s" % total_num_used_frames[key], file=log.v1)
print(" Fraction used frames: %s" % (total_num_used_frames / total_num_frames)[key], file=log.v1)
def init(config_str, config_dataset, use_pretrain, epoch, verbosity):
:param str config_str: either filename to config-file, or dict for dataset
:param str|None config_dataset:
:param bool use_pretrain: might overwrite config options, or even the dataset
:param int epoch:
:param int verbosity:
dataset_opts = None
config_filename = None
if config_str.strip().startswith("{"):
print("Using dataset %s." % config_str)
dataset_opts = eval(config_str.strip())
elif config_str.endswith(".hdf"):
dataset_opts = {"class": "HDFDataset", "files": [config_str]}
print("Using dataset %r." % dataset_opts)
assert os.path.exists(config_str)
config_filename = config_str
print("Using config file %r." % config_filename)
assert os.path.exists(config_filename)
rnn.init_config(config_filename=config_filename, default_config={"cache_size": "0"})
global config
config = rnn.config
config.set("log", None)
config.set("log_verbosity", verbosity)
print("Returnn dump-dataset starting up.", file=log.v2)
if not dataset_opts:
if config_dataset:
dataset_opts = "config:%s" % config_dataset
dataset_opts = "config:train"
if use_pretrain:
from Pretrain import pretrain_from_config
pretrain = pretrain_from_config(config)
if pretrain:
print("Using pretrain %s, epoch %i" % (pretrain, epoch), file=log.v2)
net_dict = pretrain.get_network_json_for_epoch(epoch=epoch)
if "#config" in net_dict:
config_overwrites = net_dict["#config"]
print("Pretrain overwrites these config options:", file=log.v2)
assert isinstance(config_overwrites, dict)
for key, value in sorted(config_overwrites.items()):
assert isinstance(key, str)
orig_value = config.typed_dict.get(key, None)
if isinstance(orig_value, dict) and isinstance(value, dict):
diff_str = "\n" + Util.dict_diff_str(orig_value, value)
elif isinstance(value, dict):
diff_str = "\n%r ->\n%s" % (orig_value, pformat(value))
diff_str = " %r -> %r" % (orig_value, value)
print("Config key %r for epoch %i:%s" % (key, epoch, diff_str), file=log.v2)
config.set(key, value)
print("No config overwrites for this epoch.", file=log.v2)
print("No pretraining used.", file=log.v2)
elif config.typed_dict.get("pretrain", None):
print("Not using pretrain.", file=log.v2)
dataset_default_opts = {}
Dataset.kwargs_update_from_config(config, dataset_default_opts)
print("Using dataset:", dataset_opts, file=log.v2)
global dataset
dataset = init_dataset(dataset_opts, default_kwargs=dataset_default_opts)
assert isinstance(dataset, Dataset)
def main():
argparser = argparse.ArgumentParser(description='Anaylize dataset batches.')
argparser.add_argument('crnn_config', help="either filename to config-file, or dict for dataset")
argparser.add_argument("--dataset", help="if given the config, specifies the dataset. e.g. 'dev'")
argparser.add_argument('--epoch', type=int, default=1)
argparser.add_argument('--endseq', type=int, default=-1, help='end seq idx (inclusive) or -1 (default: 10)')
argparser.add_argument("--verbosity", type=int, default=5, help="overwrites log_verbosity (default: 4)")
argparser.add_argument("--key", default="data", help="data-key, e.g. 'data' or 'classes'. (default: 'data')")
argparser.add_argument("--use_pretrain", action="store_true")
args = argparser.parse_args()
config_str=args.crnn_config, config_dataset=args.dataset, epoch=args.epoch, use_pretrain=args.use_pretrain,
except KeyboardInterrupt:
if __name__ == '__main__':
#!/usr/bin/env python3
from __future__ import print_function
import os
import sys
from argparse import ArgumentParser
import gzip
from xml.etree import ElementTree
import itertools
my_dir = os.path.dirname(os.path.abspath(__file__))
returnn_dir = os.path.dirname(my_dir)
sys.path.insert(0, returnn_dir)
class BlissItem:
  """
  Represents a single segment entry of a Bliss corpus XML file
  (one ``<segment>`` inside a ``<recording>``).
  """

  def __init__(self, segment_name, recording_filename, start_time, end_time, orth):
    """
    :param str segment_name: full hierarchical segment name (corpus/recording/segment path)
    :param str recording_filename: audio filename of the enclosing recording
    :param float start_time: segment start time within the recording, in seconds
    :param float end_time: segment end time within the recording, in seconds
    :param str orth: orthography (transcription) of the segment
    """
    self.segment_name = segment_name
    self.recording_filename = recording_filename
    self.start_time = start_time
    self.end_time = end_time
    self.orth = orth

  def __repr__(self):
    keys = ["segment_name", "recording_filename", "start_time", "end_time", "orth"]
    return "BlissItem(%s)" % ", ".join(["%s=%r" % (key, getattr(self, key)) for key in keys])

  def delta_time(self):
    """
    :return: duration of the segment (end_time - start_time), in seconds
    :rtype: float
    """
    return self.end_time - self.start_time
def iter_bliss(filename):
:param str filename:
:return: yields BlissItem
:rtype: list[BlissItem]
corpus_file = open(filename, 'rb')
if filename.endswith(".gz"):
corpus_file = gzip.GzipFile(fileobj=corpus_file)
context = iter(ElementTree.iterparse(corpus_file, events=('start', 'end')))
_, root = next(context) # get root element
name_tree = [root.attrib["name"]]
elem_tree = [root]
count_tree = [0]
recording_filename = None
for event, elem in context:
if elem.tag == "recording":
recording_filename = elem.attrib["audio"] if event == "start" else None
if event == 'end' and elem.tag == "segment":
elem_orth = elem.find("orth")
orth_raw = elem_orth.text # should be unicode
orth_split = orth_raw.split()
orth = " ".join(orth_split)
segment_name = "/".join(name_tree)
yield BlissItem(