discrete fixed

parent b5619f23
@@ -1335,22 +1335,13 @@ class TranslationDataset(CachedDataset2):
       cur_seq_difficulty = [x + y for x, y in zip(source_diff, target_diff)]
     elif mode == 'word_rarity_only_source':
       cur_seq_difficulty = self.get_word_rarity('source', curriculum_learning)
-      #TODO delete print
-      print('cur_seq_difficulty')
-      print(cur_seq_difficulty)
     elif mode == 'word_rarity_only_target':
       cur_seq_difficulty = self.get_word_rarity('target', curriculum_learning)
-      #TODO delete print
-      print('cur_seq_difficulty')
-      print(cur_seq_difficulty)
     elif mode == 'word_rarity':
       source_seq_difficulty = self.get_word_rarity('source', curriculum_learning)
       target_seq_difficulty = self.get_word_rarity('target', curriculum_learning)
       # just add them; they will be normalized later on
       cur_seq_difficulty = [x + y for x, y in zip(source_seq_difficulty, target_seq_difficulty)]
-      #TODO delete print
-      print('cur_seq_difficulty')
-      print(cur_seq_difficulty)
     elif mode == 'neg_log_likelihood':
       # neg log likelihood should be 0 <= x < infinity; the bigger x, the easier the sentence -> reverse order
       with open(curriculum_learning['neg_log_likelihood_file'], "r") as f:
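For orientation, a hypothetical `curriculum_learning` config dict covering the options that appear in this diff; every key name below occurs in the code, but the concrete values are made up:

```python
# Hypothetical example config; key names are taken from this diff, values are illustrative only.
curriculum_learning = {
  'difficulty': 'kmeans',                   # or 'word_rarity', 'word_rarity_only_source', 'word_rarity_only_target', 'neg_log_likelihood'
  'neg_log_likelihood_file': 'scores.txt',  # one float score per line, aligned with the training corpus
  'number_of_clusters': 4,                  # used by the kmeans difficulty mode
  'competence': 'discrete',                 # discrete competence is asserted below to require kmeans difficulty
  'total_number_of_iterations': 8,          # scales the discrete competence per epoch
  'norm': 'no_norm',                        # forced to 'no_norm' anyway when competence is discrete
  'norm_by_len': True,                      # length-normalize word-rarity scores
}
```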
@@ -1364,20 +1355,6 @@ class TranslationDataset(CachedDataset2):
         curriculum_learning['reverse'] = False
       else:
         curriculum_learning['reverse'] = True
-    elif mode == 'kmeans_only_source':
-      # TODO delete this mode
-      print("kmeans_only_source computes difficulty")
-      # needs to be an hdf file
-      source_embed = h5py.File(curriculum_learning['source_embed_file'], 'r')
-      source_embed = source_embed.get('inputs').value  # is now an nd-array
-      kmeans = KMeans(n_clusters=curriculum_learning['number_of_clusters'], random_state=0).fit(source_embed)
-      print("kmeans")
-      print(kmeans)
-      cur_seq_difficulty = kmeans.labels
-      # align the right values to the sequences
-      source_tags = h5py.File(curriculum_learning['source_embed_file'], 'r')
-      source_tags = source_tags.get('inputs').value  # is now an nd-array
-      source_tags = [int(''.join([s for s in tag if s.isdigit()])) for tag in source_tags]
     elif mode == 'kmeans':
       # sequence length
       source_diff = [len(seq) for seq in self._data[self._main_data_key]]
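The deleted `kmeans_only_source` mode clustered precomputed sentence embeddings stored in HDF. For reference, a minimal self-contained sketch of that idea, assuming a hypothetical file `embeddings.hdf` whose `inputs` dataset holds one embedding vector per sequence; note that the scikit-learn attribute is `labels_`, which the deleted code misspelled as `labels`:

```python
import h5py
from sklearn.cluster import KMeans

# Hypothetical HDF file: dataset 'inputs' of shape (num_seqs, embed_dim).
with h5py.File('embeddings.hdf', 'r') as f:
  source_embed = f['inputs'][...]  # read the whole dataset into an ndarray

kmeans = KMeans(n_clusters=4, random_state=0).fit(source_embed)
cur_seq_difficulty = kmeans.labels_  # one cluster id in [0, n_clusters) per sequence
```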
@@ -1392,24 +1369,15 @@ class TranslationDataset(CachedDataset2):
         f1 = f.readlines()
         f2 = [float(x) for x in f1]
         neg_log_likeli = f2
-      # TODO uncomment and delete prints
-      # assert self._num_seqs == len(neg_log_likeli), "We expect the train_data and the score_file to have the" \
-      #                                               " same length, but {} vs {}".format(self._num_seqs,
-      #                                                                                   len(neg_log_likeli))
+      assert self._num_seqs == len(neg_log_likeli), "We expect the train_data and the score_file to have the" \
+                                                    " same length, but {} vs {}".format(self._num_seqs,
+                                                                                        len(neg_log_likeli))
       difficulty_arr = [[x, y, z] for x, y, z in zip(seq_len, word_rarity, neg_log_likeli)]
-      # print(difficulty_arr)
       kmeans = KMeans(n_clusters=curriculum_learning['number_of_clusters'], random_state=0).fit(difficulty_arr)
-      print("kmeans")
-      print(kmeans)
-      print('kmeans.cluster_centers_')
-      print(kmeans.cluster_centers_)
-      print('kmeans.labels_')
-      print(kmeans.labels_)
-      print(kmeans.labels_.shape)
       # cluster labels range from 0 to n_clusters - 1
       cur_seq_difficulty = kmeans.labels_
       if curriculum_learning['competence'] == 'discrete':
         curriculum_learning['norm'] = 'no_norm'
     else:
       raise NotImplementedError("This difficulty mode is not implemented.")
     # norm it between 0 and 1
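A standalone sketch of the `kmeans` difficulty above: each sequence is described by the triple (length, word rarity, negative log-likelihood) and its cluster label is used as difficulty. One caveat the diff leaves open: raw k-means labels are in arbitrary order, so treating them as increasing difficulty only makes sense after reordering the clusters. The relabeling by centroid norm below is an assumption, not part of the original code, and scaling the features first (e.g. with sklearn's StandardScaler) would likely help, since they live on very different scales:

```python
import numpy
from sklearn.cluster import KMeans

def kmeans_difficulty(seq_len, word_rarity, neg_log_likeli, n_clusters=4):
  """Cluster sequences on three difficulty features; returns one label in [0, n_clusters) per sequence."""
  features = numpy.stack([seq_len, word_rarity, neg_log_likeli], axis=1)
  kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(features)
  # Assumption: reorder cluster ids by centroid norm, so that label 0 is the "easiest" cluster.
  order = numpy.argsort(numpy.linalg.norm(kmeans.cluster_centers_, axis=1))
  relabel = numpy.empty(n_clusters, dtype=int)
  relabel[order] = numpy.arange(n_clusters)
  return relabel[kmeans.labels_]
```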
@@ -1422,8 +1390,6 @@ class TranslationDataset(CachedDataset2):
       idx_sorted = numpy.argsort(cur_seq_difficulty)
       max_seq = numpy.amax(idx_sorted)
       cur_seq_difficulty = idx_sorted / max_seq
-      print('max(cur_seq_difficulty)')
-      print(max(cur_seq_difficulty))
     elif curriculum_learning['norm'] == 'no_norm':
       pass
     else:
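A note on the rank normalization above: `numpy.argsort` returns the sorting permutation, not each sequence's rank, so `idx_sorted / max_seq` assigns sequence i the normalized value of whichever position holds the i-th smallest score, not sequence i's own rank. If per-sequence ranks in [0, 1] are the intent, applying argsort twice yields them; a small sketch:

```python
import numpy

difficulty = numpy.array([0.9, 0.1, 0.5])
perm = numpy.argsort(difficulty)  # [1, 2, 0]: positions of the sorted values
ranks = numpy.argsort(perm)       # [2, 0, 1]: rank of each sequence
normed = ranks / ranks.max()      # [1.0, 0.0, 0.5]: one value per sequence, hardest = 1.0
```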
@@ -1457,9 +1423,8 @@ class TranslationDataset(CachedDataset2):
       cur_seq = self._get_data(key=data_key, line_nr=i)
       score = 0
       for word in cur_seq:
-        #TODO do scores make sense
         score -= math.log(num_words[word])
-      if 'norm_by_len' in curriculum_learning and curriculum_learning['norm_by_len']:
+      if not 'norm_by_len' in curriculum_learning or curriculum_learning['norm_by_len']:
         score = score / len(cur_seq)
       cur_seq_difficulty.append(score)
     return cur_seq_difficulty
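The commit also flips the `norm_by_len` check, so length normalization is now on unless explicitly disabled. A self-contained sketch of the rarity score itself, assuming (as the code suggests) that `num_words` maps each word to its relative corpus frequency, so that the negated log makes rare words contribute large positive amounts:

```python
import math
from collections import Counter

def word_rarity(corpus, norm_by_len=True):
  """corpus: list of token lists. Returns one rarity score per sentence (higher = rarer = harder)."""
  counts = Counter(word for seq in corpus for word in seq)
  total = sum(counts.values())
  scores = []
  for seq in corpus:
    # -log of the relative word frequency: rare words add a lot, frequent words little.
    score = -sum(math.log(counts[w] / total) for w in seq)
    if norm_by_len:
      score /= len(seq)  # average rarity, so long sentences are not penalized per se
    scores.append(score)
  return scores

print(word_rarity([['the', 'cat'], ['the', 'the'], ['axolotl', 'sings']]))
```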
@@ -1483,10 +1448,9 @@ class TranslationDataset(CachedDataset2):
       p = curriculum_learning['p']
       competence = min(1, (self.epoch * ((1 - c0 ** p) / T) + c0 ** p) ** (1. / p))
     elif mode == 'discrete':
+      assert curriculum_learning['difficulty'] == 'kmeans', "Only use discrete competence with kmeans"
       competence = self.epoch * curriculum_learning['number_of_clusters'] \
         / curriculum_learning['total_number_of_iterations']
-      print('competence')
-      print(self.epoch * curriculum_learning['number_of_clusters'] / curriculum_learning['total_number_of_iterations'])
     else:
       raise NotImplementedError("This competence mode is not implemented.")
     return competence
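The continuous branch appears to implement the generalized competence schedule from competence-based curriculum learning, c(t) = min(1, (t * (1 - c0^p) / T + c0^p)^(1/p)), where c0 is the initial competence, T the number of curriculum steps and p the exponent (p = 2 gives the square-root schedule); the new assert ties the discrete branch to k-means difficulties, whose integer labels it is compared against. A sketch with made-up numbers:

```python
def continuous_competence(epoch, c0=0.01, T=100, p=2):
  # p = 2 reproduces the square-root competence schedule.
  return min(1., (epoch * ((1 - c0 ** p) / T) + c0 ** p) ** (1. / p))

def discrete_competence(epoch, number_of_clusters, total_number_of_iterations):
  # Grows linearly with the epoch; compared against integer cluster labels 0..k-1.
  return epoch * number_of_clusters / total_number_of_iterations

print([round(continuous_competence(e), 3) for e in (0, 25, 50, 100)])  # ~[0.01, 0.5, 0.707, 1.0]
print([discrete_competence(e, 4, 8) for e in (1, 2, 8)])               # [0.5, 1.0, 4.0]
```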
@@ -1498,12 +1462,7 @@ class TranslationDataset(CachedDataset2):
             + str(self.epoch) + ".")
       quit(2222)
     self._seq_order = [i for i in range(len(self._data[self._main_data_key])) if (self.seq_difficulty[i] <= competence)]
-    print("end of seq order")
-    print(self._seq_order[10:])
-    print(self._seq_order[-10:])
-    print(competence)
-    print(self.epoch)
     self._seq_order = [i for i in range(len(self._data[self._main_data_key])) if (self.seq_difficulty[i] < competence)]
     self._num_seqs = len(self._seq_order)
     if 'seq_order_len' in curriculum_learning:
       if self._num_seqs > curriculum_learning['seq_order_len']:
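As it stands above, the `<=` selection is immediately overwritten by the strict `<` one, so only the latter takes effect. The distinction matters for discrete competence, where difficulties are the integer cluster labels 0..k-1 and competence grows in steps of k/total_number_of_iterations: with `<`, a cluster is admitted only once competence strictly exceeds its label. A toy example with made-up numbers:

```python
seq_difficulty = [0, 1, 1, 2, 3]  # k-means labels of 5 sequences, k = 4
k, total_iterations = 4, 8

for epoch in (1, 2, 4):
  competence = epoch * k / total_iterations  # 0.5, 1.0, 2.0
  admitted = [i for i, d in enumerate(seq_difficulty) if d < competence]
  print(epoch, competence, admitted)
# epoch 1, competence 0.5 -> [0]; epoch 2, competence 1.0 -> [0] (with <= this would
# already admit cluster 1); epoch 4, competence 2.0 -> [0, 1, 2]
```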
@@ -1512,44 +1471,11 @@ class TranslationDataset(CachedDataset2):
         rnd.shuffle(self._seq_order)
         self._seq_order = self._seq_order[:curriculum_learning['seq_order_len']]
         self._num_seqs = len(self._seq_order)
-    # TODO does it make a difference if I change epochsplit in the config? see ntimespresplit/*split
     if 'n_times_per_slice' in curriculum_learning:
       self._seq_order = numpy.tile(self._seq_order, curriculum_learning['n_times_per_slice'])
       self._num_seqs = len(self._seq_order)
     return True
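`numpy.tile` simply repeats the selected order end to end, so each admitted sequence is visited `n_times_per_slice` times per slice:

```python
import numpy

seq_order = numpy.array([3, 0, 2])
print(numpy.tile(seq_order, 2))  # -> [3 0 2 3 0 2]
```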
-  #
-  # def get_part_of_data(self, max_difficulty):
-  #   data = copy.copy(self)
-  #   data._data = {}
-  #   for key in self._data:
-  #     data._data[key] = copy.deepcopy(
-  #       [self._data[key][i] for i in range(len(self._data[self._main_data_key])) if (data.seq_difficulty[i] < max_difficulty)])
-  #   data._data_len = len(data._data[data._main_data_key])
-  #   # rnd_seed = (self.epoch - 1) / max_difficulty + 1
-  #   # rnd = Random(rnd_seed)
-  #   # rnd.shuffle()
-  #   # data.init_seq_order()
-  #   data._num_seqs = data._data_len
-  #   # data.num_seqs = data._data_len
-  #   # data.estimated_num_seqs = data._num_seqs
-  #   data._seq_order = [i for i in range(self._data_len) if (data.seq_difficulty[i] < max_difficulty)]
-  #   # data._seq_order = copy.deepcopy(self._seq_order[:data._data_len])
-  #   return data
-  #
-  # def unused_get_part_of_data(self, max_seq_nr):
-  #   data = copy.copy(self)
-  #   data._data = {}
-  #   for key in self._data:
-  #     data._data[key] = copy.deepcopy(
-  #       [self._data[key][i] for i in range(0, data._data_len) if (data._seq_order[i] < max_seq_nr)])
-  #   data._data_len = max_seq_nr
-  #   # data._num_seqs = end_seq - start_seq
-  #   # data.num_seqs = data._num_seqs
-  #   # data.estimated_num_seqs = data._num_seqs
-  #   # data.seq_order = copy.deepcopy(self._seq_order[start_seq:end_seq])
-  #   return data
   def _collect_single_seq(self, seq_idx):
     if seq_idx >= self._num_seqs:
       return None