Commit cfc09b4d authored by Christian Fuß

Changed create_dataset.py to save images in channels-first format. Added a Show, Attend and Tell model that uses images as input instead of features.


Former-commit-id: 870fb676
parent fa92c358
......@@ -6,8 +6,8 @@ configuration Show_attend_tell{
loss:softmax_cross_entropy
save_attention_image:true
optimizer:adam{
-learning_rate:0.1
-learning_rate_decay:0.8
+learning_rate:0.005
+learning_rate_decay:0.9
step_size:1000
weight_decay:0.0001
}
......
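For reference, a minimal sketch of what this step schedule computes, assuming the decay is applied once every step_size training steps (an assumption based on the parameter names, not on the generator's source):

# hypothetical helper illustrating the adam schedule above
def lr_at(step, base=0.005, decay=0.9, step_size=1000):
    return base * decay ** (step // step_size)

lr_at(0)     # 0.005
lr_at(3000)  # 0.005 * 0.9**3 = 0.003645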
......@@ -2,11 +2,11 @@ package showAttendTell;
component Show_attend_tell{
ports in Z(-oo:oo)^{64,2048} data,
in Z(0:255)^{3,224,224} images,
out Z(0:37758)^{1} target[25];
implementation CNN{
layer LSTM(units=512) encoder;
layer LSTM(units=512) decoder;
layer FullyConnected(units = 256) features;
......@@ -14,11 +14,7 @@ component Show_attend_tell{
1 -> target[0];
-data -> encoder -> FullyConnected(units=256) ->
-Relu() ->
-features;
-encoder.state -> decoder.state;
+data -> features;
timed <t> GreedySearch(max_length=25){
(
......@@ -44,8 +40,7 @@ component Show_attend_tell{
ExpandDims(axis=0)
|
target[t-1] ->
-Embedding(output_dim=256) ->
-Dropout(p=0.25)
+Embedding(output_dim=256)
) ->
Concatenate(axis=1) ->
decoder ->
......@@ -59,3 +54,4 @@ component Show_attend_tell{
};
}
}
package showAttendTell;
component Show_attend_tell_images_as_input{
ports in Z(-oo:oo)^{64,2048} data,
in Z(0:255)^{3,224,224} images,
out Z(0:37758)^{1} target[25];
implementation CNN{
layer LSTM(units=512) decoder;
layer FullyConnected(units = 256) features;
layer FullyConnected(units = 1, flatten=false) attention;
1 -> target[0];
images ->
Convolution(kernel=(7,7), channels=64, stride=(7,7), padding="valid") ->
Convolution(kernel=(4,4), channels=64, stride=(4,4), padding="valid") ->
GlobalPooling(pool_type="max") ->
features;
timed <t> GreedySearch(max_length=25){
(
(
(
features.output ->
FullyConnected(units=512, flatten=false)
|
decoder.state[0] ->
FullyConnected(units=512, flatten=false)
) ->
BroadcastAdd() ->
Tanh() ->
FullyConnected(units=1, flatten=false) ->
Softmax(axis=0) ->
Dropout(p=0.25) ->
attention
|
features.output
)->
BroadcastMultiply() ->
ReduceSum(axis=0) ->
ExpandDims(axis=0)
|
target[t-1] ->
Embedding(output_dim=256)
) ->
Concatenate(axis=1) ->
decoder ->
Relu() ->
FullyConnected(units=37758) ->
Relu() ->
Dropout(p=0.25) ->
Softmax() ->
ArgMax() ->
target[t]
};
}
}
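The block inside GreedySearch wires up additive (Bahdanau-style) soft attention over the image feature vectors. A minimal NumPy sketch of the same computation; the weight matrices W_f, W_s, v are hypothetical stand-ins for the three FullyConnected layers, and the Dropout(p=0.25) on the attention weights is omitted:

import numpy as np

def softmax(x, axis=0):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def soft_attention(features, state, W_f, W_s, v):
    # features: (n, d) image feature vectors; state: (s,) current decoder hidden state
    scores = np.tanh(features @ W_f + state @ W_s)  # FullyConnected x2, BroadcastAdd, Tanh -> (n, 512)
    alpha = softmax(scores @ v, axis=0)             # FullyConnected(units=1), Softmax(axis=0) -> (n, 1)
    context = (alpha * features).sum(axis=0)        # BroadcastMultiply, ReduceSum(axis=0)
    return context[np.newaxis, :], alpha            # ExpandDims(axis=0); alpha is the attention map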
......@@ -17,7 +17,7 @@ if __name__ == "__main__":
num_epoch=11,
batch_size=64,
context='gpu',
-eval_metric='accuracy',
+eval_metric='accuracy{{}}',
opt_type='adam',
epsilon=1.0E-8,
weight_decay=0.001,
......
......@@ -143,7 +143,7 @@ class Net_0(gluon.HybridBlock):
relu2_ = self.relu2_(fc2_)
fc3_ = self.fc3_(relu2_)
softmax3_ = F.softmax(fc3_, axis=-1)
-predictions_ = F.identity(softmax3_)
+predictions_ = softmax3_
return predictions_
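This pattern repeats in the generated networks below: F.identity is MXNet's no-op pass-through, so binding the output name directly to the previous tensor is equivalent and drops a redundant node from the hybridized graph. A quick check:

import mxnet as mx
x = mx.nd.array([0.1, 0.9])
assert (mx.nd.identity(x) == x).all()  # identity just copies its input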
......@@ -320,7 +320,7 @@ class CNNSupervisedTrainer_mnist_mnistClassifier_net:
train_test_iter.reset()
metric = mx.metric.create(eval_metric, **eval_metric_params)
for batch_i, batch in enumerate(train_test_iter):
-if True:
+if True:
labels = [batch.label[i].as_in_context(mx_context) for i in range(1)]
image_ = batch.data[0].as_in_context(mx_context)
......@@ -354,7 +354,7 @@ class CNNSupervisedTrainer_mnist_mnistClassifier_net:
attention_resized = np.resize(attention.asnumpy(), (8, 8))
ax = fig.add_subplot(max_length//3, max_length//4, l+1)
ax.set_title(dict[int(labels[l+1][0].asscalar())])
-img = ax.imshow(train_images[0+test_batch_size*(batch_i)])
+img = ax.imshow(train_images[0+test_batch_size*(batch_i)].transpose(1,2,0))
ax.imshow(attention_resized, cmap='gray', alpha=0.6, extent=img.get_extent())
......@@ -378,7 +378,7 @@ class CNNSupervisedTrainer_mnist_mnistClassifier_net:
test_iter.reset()
metric = mx.metric.create(eval_metric, **eval_metric_params)
for batch_i, batch in enumerate(test_iter):
-if True:
+if True:
labels = [batch.label[i].as_in_context(mx_context) for i in range(1)]
image_ = batch.data[0].as_in_context(mx_context)
......@@ -406,7 +406,7 @@ class CNNSupervisedTrainer_mnist_mnistClassifier_net:
attention_resized = np.resize(attention.asnumpy(), (8, 8))
ax = fig.add_subplot(max_length//3, max_length//4, l+1)
ax.set_title(dict[int(mx.nd.slice_axis(mx.nd.argmax(outputs[l+1], axis=1), axis=0, begin=0, end=1).asscalar())])
-img = ax.imshow(test_images[0+test_batch_size*(batch_i)])
+img = ax.imshow(test_images[0+test_batch_size*(batch_i)].transpose(1,2,0))
ax.imshow(attention_resized, cmap='gray', alpha=0.6, extent=img.get_extent())
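Both visualization paths need the .transpose(1,2,0) because the images are now stored channels-first, while matplotlib's imshow expects height-width-channel arrays. A minimal sketch:

import numpy as np
chw = np.zeros((3, 224, 224), dtype=np.uint8)  # layout stored in the .h5 files
hwc = chw.transpose(1, 2, 0)                   # (224, 224, 3), what imshow expects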
......@@ -437,4 +437,4 @@ class CNNSupervisedTrainer_mnist_mnistClassifier_net:
network.export(self.parameter_path(i) + '_newest', epoch=0)
def parameter_path(self, index):
-return self._net_creator._model_dir_ + self._net_creator._model_prefix_ + '_' + str(index)
\ No newline at end of file
+return self._net_creator._model_dir_ + self._net_creator._model_prefix_ + '_' + str(index)
......@@ -8,9 +8,9 @@ class ZScoreNormalization(gluon.HybridBlock):
super(ZScoreNormalization, self).__init__(**kwargs)
with self.name_scope():
self.data_mean = self.params.get('data_mean', shape=data_mean.shape,
-init=mx.init.Constant(data_mean.asnumpy().tolist()), differentiable=False)
+init=mx.init.Constant(data_mean.asnumpy().tolist()), differentiable=False)
self.data_std = self.params.get('data_std', shape=data_mean.shape,
-init=mx.init.Constant(data_std.asnumpy().tolist()), differentiable=False)
+init=mx.init.Constant(data_std.asnumpy().tolist()), differentiable=False)
def hybrid_forward(self, F, x, data_mean, data_std):
x = F.broadcast_sub(x, data_mean)
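The hunk cuts off here; presumably the forward pass finishes the z-score in the usual way (a sketch of the continuation, not part of this diff):

x = F.broadcast_div(x, data_std)  # completes (x - data_mean) / data_std
return x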
......@@ -26,9 +26,9 @@ class Padding(gluon.HybridBlock):
def hybrid_forward(self, F, x):
x = F.pad(data=x,
-mode='constant',
-pad_width=self.pad_width,
-constant_value=0)
+mode='constant',
+pad_width=self.pad_width,
+constant_value=0)
return x
......@@ -93,7 +93,7 @@ class Net_0(gluon.HybridBlock):
if data_mean:
assert(data_std)
self.input_normalization_state_ = ZScoreNormalization(data_mean=data_mean['state_'],
-data_std=data_std['state_'])
+data_std=data_std['state_'])
else:
self.input_normalization_state_ = NoNormalization()
......@@ -118,7 +118,7 @@ class Net_0(gluon.HybridBlock):
fc2_ = self.fc2_(tanh1_)
tanh2_ = self.tanh2_(fc2_)
fc3_ = self.fc3_(tanh2_)
-qvalues_ = F.identity(fc3_)
+qvalues_ = fc3_
return qvalues_
......@@ -120,7 +120,7 @@ class Net_0(gluon.HybridBlock):
relu2_ = self.relu2_(fc2_)
fc3_ = self.fc3_(relu2_)
tanh3_ = self.tanh3_(fc3_)
-action_ = F.identity(tanh3_)
+action_ = tanh3_
return action_
......@@ -131,7 +131,7 @@ class Net_0(gluon.HybridBlock):
add4_ = fc3_1_ + fc2_2_
relu4_ = self.relu4_(add4_)
fc4_ = self.fc4_(relu4_)
-qvalues_ = F.identity(fc4_)
+qvalues_ = fc4_
return qvalues_
......@@ -118,7 +118,7 @@ class Net_0(gluon.HybridBlock):
fc2_ = self.fc2_(tanh1_)
tanh2_ = self.tanh2_(fc2_)
fc3_ = self.fc3_(tanh2_)
-qvalues_ = F.identity(fc3_)
+qvalues_ = fc3_
return qvalues_
......@@ -120,7 +120,7 @@ class Net_0(gluon.HybridBlock):
relu2_ = self.relu2_(fc2_)
fc3_ = self.fc3_(relu2_)
tanh3_ = self.tanh3_(fc3_)
-commands_ = F.identity(tanh3_)
+commands_ = tanh3_
return commands_
......@@ -127,7 +127,7 @@ class Net_0(gluon.HybridBlock):
fc4_ = self.fc4_(relu3_)
relu4_ = self.relu4_(fc4_)
fc5_ = self.fc5_(relu4_)
-qvalues_ = F.identity(fc5_)
+qvalues_ = fc5_
return qvalues_
......@@ -8,60 +8,71 @@ import pickle
import cv2
import h5py
import string, re
+import random
-# half of this goes into train.h5, the other half into test.h5
-number_of_images = 1000
+# Half of this goes into train.h5, the other half into test.h5
+number_of_images = 20000
-# max length of sentences
-max_length=25
+# Max length of sentences
+max_length = 25
-datasets_to_create = ["train_2.h5", "test_2.h5"]
+# Shuffle data?
+shuffle = True
+datasets_to_create = ["train.h5", "test.h5"]
# Download COCO dataset
-annotation_zip = tf.keras.utils.get_file('captions.zip',
-cache_subdir=os.path.abspath('.'),
-origin = 'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',
+annotation_file = "./annotations/captions_train2014.json"
+annotation_zip = tf.keras.utils.get_file("captions.zip",
+cache_subdir=os.path.abspath("."),
+origin = "http://images.cocodataset.org/annotations/annotations_trainval2014.zip",
extract = True)
-annotation_file = './annotations/captions_train2014.json'
-name_of_zip = 'train2014.zip'
-if not os.path.exists(os.path.abspath('.') + '/' + name_of_zip):
+name_of_zip = "train2014.zip"
+if not os.path.exists(os.path.abspath(".") + "/" + name_of_zip):
image_zip = tf.keras.utils.get_file(name_of_zip,
-cache_subdir=os.path.abspath('.'),
-origin = 'http://images.cocodataset.org/zips/train2014.zip',
+cache_subdir=os.path.abspath("."),
+origin = "http://images.cocodataset.org/zips/train2014.zip",
extract = True)
-PATH = os.path.dirname(image_zip)+'/train2014/'
+PATH = os.path.dirname(image_zip)+"/train2014/"
else:
-PATH = os.path.abspath('.')+'/train2014/'
+PATH = os.path.abspath(".")+"/train2014/"
-with open(annotation_file, 'r') as f:
+with open(annotation_file, "r") as f:
annotations = json.load(f)
+# Shuffle
+annotations = list(annotations["annotations"])
+if shuffle:
+random.shuffle(annotations)
all_captions = []
all_img_name_vector = []
+all_ids = set()
# Load captions and images
-for annot in annotations['annotations']:
-caption = '<start> ' + annot['caption'] + ' <end>'
-image_id = annot['image_id']
+for annot in annotations:
+caption = "<start> " + annot["caption"] + " <end>"
+image_id = annot["image_id"]
+if image_id not in all_ids:
+all_ids.add(image_id)
-full_coco_image_path = PATH + 'COCO_train2014_' + '%012d.jpg' % (image_id)
+full_coco_image_path = PATH + "COCO_train2014_" + "%012d.jpg" % (image_id)
all_img_name_vector.append(full_coco_image_path)
all_captions.append(caption)
# Get the amount of data we want
captions = all_captions[:number_of_images]
img_name_vector = all_img_name_vector[:number_of_images]
# Load Inception V3 model
-image_model = tf.keras.applications.InceptionV3(include_top=False, weights='imagenet')
+image_model = tf.keras.applications.InceptionV3(include_top=False, weights="imagenet")
new_input = image_model.input
hidden_layer = image_model.layers[-1].output
image_features_extract_model = tf.keras.Model(new_input, hidden_layer)
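For context, InceptionV3 with include_top=False emits an 8x8x2048 feature map per image; flattening the spatial grid presumably yields the (64, 2048) data tensors, which matches the Z(-oo:oo)^{64,2048} port and the 8x8 attention maps resized in the trainer above. A sketch of that assumed reshape:

import numpy as np
fmap = np.zeros((8, 8, 2048), dtype=np.float32)  # one image's InceptionV3 feature map
flat = fmap.reshape(-1, 2048)                    # (64, 2048): one vector per spatial position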
......@@ -75,7 +86,7 @@ def load_image(image_path):
return img, image_path
-# Map images to features using pretrained Inception V3)
+# Map images to features using pretrained Inception V3
image_dataset = tf.data.Dataset.from_tensor_slices(img_name_vector)
image_dataset = image_dataset.map(
load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(16)
......@@ -90,17 +101,28 @@ for img, path in image_dataset:
for i in range(batch_features.shape[0]):
feature_tensors.append(batch_features[i][:][:].numpy())
+#eliminate dots and commas with this regex
+regex = re.compile("[%s]" % re.escape(string.punctuation))
-# Create vocabulary from all words used in captions
-vocabulary = {'<start>': 0, '<end>': 1, '<pad>': 2}
+# Create vocabulary from all words used in captions, and create labels
+vocabulary = {"<start>": 0, "<end>": 1, "<pad>": 2}
+labels = []
for entry in captions:
-for word in entry.split(' '):
+sentence = []
+for word in entry.split(" "):
+if word not in ["<start>", "<end>", "<pad>"]:
+word = regex.sub("", word)
if word.lower() not in vocabulary:
vocabulary[word.lower()] = len(vocabulary);
-string = str(len(vocabulary)-1) + ': ' + word.lower()
+string = str(len(vocabulary)-1) + ": " + word.lower()
+sentence.append(vocabulary[word.lower()])
+labels.append(sentence)
-labels = [[vocabulary[word.lower()] for word in k.split(' ')] for k in captions]
+# Save dict
+VocabularyInverse = dict((v,k) for k,v in vocabulary.items())
+with open("dict.pkl", "wb") as f:
+pickle.dump(VocabularyInverse, f, 2)
# Pad to max length
for sentence in labels:
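The loop body is cut off by the hunk; presumably each label sequence is filled up to max_length with the <pad> index (a sketch, assuming the index 2 defined in the vocabulary above):

while len(sentence) < max_length:
    sentence.append(vocabulary["<pad>"])  # pad index 2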
......@@ -113,13 +135,8 @@ for i in range(len(img_name_vector)):
img = cv2.imread(img_name_vector[i])
res = cv2.resize(img, dsize=(224, 224), interpolation=cv2.INTER_CUBIC)
res = cv2.cvtColor(res, cv2.COLOR_BGR2RGB)
+res = res.transpose(2,0,1)
images.append(res)
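cv2 returns height-width-channel arrays, so the added transpose stores the images channels-first, matching the Z(0:255)^{3,224,224} port; it is the inverse of the transpose(1,2,0) used for plotting in the trainer. A standalone sketch of the conversion (synthetic array in place of a real photo):

import numpy as np
hwc = np.zeros((224, 224, 3), dtype=np.uint8)  # what cv2.resize/cvtColor return
chw = hwc.transpose(2, 0, 1)                   # (3, 224, 224): NCHW minus the batch axis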
-# Save dict
-VocabularyInverse = dict((v,k) for k,v in vocabulary.items())
-with open("dict_2.pkl", "wb") as f:
-pickle.dump(VocabularyInverse, f, 2)
# Save .h5 files
......@@ -137,9 +154,9 @@ for index, file in enumerate(datasets_to_create):
images_temp = images[len(labels)//2:len(labels)]
train_shape = (len(captions_temp), 64,2048)
-image_shape = (len(captions_temp), 224,224, 3)
+image_shape = (len(captions_temp), 3,224,224)
-hdf5_file = h5py.File(file, mode='w')
+hdf5_file = h5py.File(file, mode="w")
hdf5_file.create_dataset("data", train_shape, np.float32)
hdf5_file.create_dataset("images", image_shape, np.uint8)
......@@ -149,7 +166,7 @@ for index, file in enumerate(datasets_to_create):
for i in range(len(captions_temp)):
if i % 1000 == 0 and i > 1:
-print('Processed: {}/{}'.format(i, len(captions_temp)))
+print("Processed: {}/{}".format(i, len(captions_temp)))
hdf5_file["data"][i, ...] = features[i]
hdf5_file["images"][i, ...] = images_temp[i]
hdf5_file.close()
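A quick way to sanity-check the resulting files under these assumptions (hypothetical snippet, not part of the commit):

import h5py
with h5py.File("train.h5", "r") as f:
    print(f["data"].shape)    # expected (N, 64, 2048)
    print(f["images"].shape)  # expected (N, 3, 224, 224)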
......