Commit ba74e6af authored by Evgeny Kusmenko

Merge branch 'dataLoaderFix' into 'master'

added batchwise data loading to python template

See merge request !49
parents 6ad5fa7e bf659541
Pipeline #566758 passed with stages in 2 minutes and 8 seconds
......@@ -9,7 +9,7 @@
     <groupId>de.monticore.lang.monticar</groupId>
     <artifactId>cnnarch-gluon-generator</artifactId>
-    <version>0.4.11-SNAPSHOT</version>
+    <version>0.4.12-SNAPSHOT</version>
     <!-- == PROJECT DEPENDENCIES ============================================= -->
     <properties>
......
......@@ -5,7 +5,6 @@ import de.monticore.lang.monticar.cnntrain.annotations.RewardFunctionParameter;
 import de.monticore.lang.monticar.generator.pythonwrapper.symbolservices.data.ComponentPortInformation;
 import de.monticore.lang.monticar.generator.pythonwrapper.symbolservices.data.EmadlType;
 import de.monticore.lang.monticar.generator.pythonwrapper.symbolservices.data.PortVariable;
-import jdk.nashorn.internal.runtime.options.Option;
 import java.util.List;
 import java.util.Optional;
......
......@@ -25,10 +25,27 @@ class ${tc.fileNameWithoutEnding}:
         data_std = {}
         train_images = {}
         for input_name in self._input_names_:
             train_data[input_name] = train_h5[input_name]
-            data_mean[input_name + '_'] = nd.array(train_h5[input_name][:].mean(axis=0))
-            data_std[input_name + '_'] = nd.array(train_h5[input_name][:].std(axis=0) + 1e-5)
+            train_dataset = train_h5[input_name]
+            train_dataset_shape = train_data[input_name].shape
+            # slice_size limits the memory consumption, by only loading slices of size <500MB into memory
+            slice_size = min(train_dataset_shape[0] - 1, int(500e6 / (train_h5[input_name][0].size * \
+                             train_h5[input_name][0].itemsize)))
+            num_slices = max(1, int(train_h5[input_name].shape[0] / slice_size))
+            mean = np.zeros(train_dataset_shape[1: ])
+            std = np.zeros(train_dataset_shape[1: ])
+            for i in range(int(train_dataset_shape[0] / slice_size)):
+                mean += train_dataset[i * slice_size: (i + 1) * slice_size].mean(axis=0) / num_slices
+                std += train_dataset[i * slice_size: (i + 1) * slice_size].std(axis=0) / num_slices
+            if slice_size > train_dataset_shape[0] - 1:
+                mean += train_dataset[num_slices * slice_size: ].mean(axis=0) / (slice_size - num_slices % slice_size)
+                std += train_dataset[num_slices * slice_size: ].std(axis=0) / (slice_size - num_slices % slice_size)
+            std += 1e-5
+            data_mean[input_name + '_'] = nd.array(mean)
+            data_std[input_name + '_'] = nd.array(std)
         if 'images' in train_h5:
             train_images = train_h5['images']
......
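
The template change above (mirrored in every regenerated test resource below) replaces the one-shot mean()/std() over the full HDF5 dataset with a batchwise computation: the dataset is read in slices of at most roughly 500 MB, per-slice statistics are accumulated, and a small epsilon keeps the standard deviation away from zero. A minimal standalone sketch of the same idea follows; the helper name slicewise_mean_std, the file name 'train.h5', and the dataset key 'data' are illustrative assumptions, not part of the generated code, and averaging per-slice standard deviations is the same approximation the template uses.

    import h5py
    import numpy as np

    def slicewise_mean_std(h5_path, key, budget_bytes=500e6):
        # Sketch (assumed helper, not the generated API): compute per-feature
        # mean/std of an HDF5 dataset without loading it into memory at once.
        with h5py.File(h5_path, 'r') as train_h5:
            dataset = train_h5[key]
            sample_bytes = dataset[0].size * dataset[0].itemsize
            # cap each read at roughly budget_bytes, but never below one sample
            slice_size = max(1, min(dataset.shape[0], int(budget_bytes / sample_bytes)))
            num_slices = int(np.ceil(dataset.shape[0] / float(slice_size)))
            mean = np.zeros(dataset.shape[1:])
            std = np.zeros(dataset.shape[1:])
            for i in range(num_slices):
                chunk = dataset[i * slice_size:(i + 1) * slice_size]
                weight = chunk.shape[0] / float(dataset.shape[0])
                mean += chunk.mean(axis=0) * weight
                # averaging per-slice std values approximates the global std,
                # matching the approach taken in the template
                std += chunk.std(axis=0) * weight
            return mean, std + 1e-5  # epsilon avoids division by zero downstream

    # hypothetical usage:
    # mean, std = slicewise_mean_std('train.h5', 'data')

For scale: with 3x224x224 float32 inputs each sample is about 0.6 MB, so a 500 MB budget reads roughly 830 samples per slice.
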
......@@ -25,8 +25,25 @@ class CNNDataLoader_Alexnet:
         for input_name in self._input_names_:
             train_data[input_name] = train_h5[input_name]
-            data_mean[input_name + '_'] = nd.array(train_h5[input_name][:].mean(axis=0))
-            data_std[input_name + '_'] = nd.array(train_h5[input_name][:].std(axis=0) + 1e-5)
+            train_dataset = train_h5[input_name]
+            train_dataset_shape = train_data[input_name].shape
+            # slice_size limits the memory consumption, by only loading slices of size <500MB into memory
+            slice_size = min(train_dataset_shape[0] - 1, int(500e6 / (train_h5[input_name][0].size * \
+                             train_h5[input_name][0].itemsize)))
+            num_slices = max(1, int(train_h5[input_name].shape[0] / slice_size))
+            mean = np.zeros(train_dataset_shape[1: ])
+            std = np.zeros(train_dataset_shape[1: ])
+            for i in range(int(train_dataset_shape[0] / slice_size)):
+                mean += train_dataset[i * slice_size: (i + 1) * slice_size].mean(axis=0) / num_slices
+                std += train_dataset[i * slice_size: (i + 1) * slice_size].std(axis=0) / num_slices
+            if slice_size > train_dataset_shape[0] - 1:
+                mean += train_dataset[num_slices * slice_size: ].mean(axis=0) / (slice_size - num_slices % slice_size)
+                std += train_dataset[num_slices * slice_size: ].std(axis=0) / (slice_size - num_slices % slice_size)
+            std += 1e-5
+            data_mean[input_name + '_'] = nd.array(mean)
+            data_std[input_name + '_'] = nd.array(std)
         if 'images' in train_h5:
             train_images = train_h5['images']
......
......@@ -25,8 +25,25 @@ class CNNDataLoader_CifarClassifierNetwork:
         for input_name in self._input_names_:
             train_data[input_name] = train_h5[input_name]
-            data_mean[input_name + '_'] = nd.array(train_h5[input_name][:].mean(axis=0))
-            data_std[input_name + '_'] = nd.array(train_h5[input_name][:].std(axis=0) + 1e-5)
+            train_dataset = train_h5[input_name]
+            train_dataset_shape = train_data[input_name].shape
+            # slice_size limits the memory consumption, by only loading slices of size <500MB into memory
+            slice_size = min(train_dataset_shape[0] - 1, int(500e6 / (train_h5[input_name][0].size * \
+                             train_h5[input_name][0].itemsize)))
+            num_slices = max(1, int(train_h5[input_name].shape[0] / slice_size))
+            mean = np.zeros(train_dataset_shape[1: ])
+            std = np.zeros(train_dataset_shape[1: ])
+            for i in range(int(train_dataset_shape[0] / slice_size)):
+                mean += train_dataset[i * slice_size: (i + 1) * slice_size].mean(axis=0) / num_slices
+                std += train_dataset[i * slice_size: (i + 1) * slice_size].std(axis=0) / num_slices
+            if slice_size > train_dataset_shape[0] - 1:
+                mean += train_dataset[num_slices * slice_size: ].mean(axis=0) / (slice_size - num_slices % slice_size)
+                std += train_dataset[num_slices * slice_size: ].std(axis=0) / (slice_size - num_slices % slice_size)
+            std += 1e-5
+            data_mean[input_name + '_'] = nd.array(mean)
+            data_std[input_name + '_'] = nd.array(std)
         if 'images' in train_h5:
             train_images = train_h5['images']
......
......@@ -25,8 +25,25 @@ class CNNDataLoader_EpisodicMemoryNetwork:
         for input_name in self._input_names_:
             train_data[input_name] = train_h5[input_name]
-            data_mean[input_name + '_'] = nd.array(train_h5[input_name][:].mean(axis=0))
-            data_std[input_name + '_'] = nd.array(train_h5[input_name][:].std(axis=0) + 1e-5)
+            train_dataset = train_h5[input_name]
+            train_dataset_shape = train_data[input_name].shape
+            # slice_size limits the memory consumption, by only loading slices of size <500MB into memory
+            slice_size = min(train_dataset_shape[0] - 1, int(500e6 / (train_h5[input_name][0].size * \
+                             train_h5[input_name][0].itemsize)))
+            num_slices = max(1, int(train_h5[input_name].shape[0] / slice_size))
+            mean = np.zeros(train_dataset_shape[1: ])
+            std = np.zeros(train_dataset_shape[1: ])
+            for i in range(int(train_dataset_shape[0] / slice_size)):
+                mean += train_dataset[i * slice_size: (i + 1) * slice_size].mean(axis=0) / num_slices
+                std += train_dataset[i * slice_size: (i + 1) * slice_size].std(axis=0) / num_slices
+            if slice_size > train_dataset_shape[0] - 1:
+                mean += train_dataset[num_slices * slice_size: ].mean(axis=0) / (slice_size - num_slices % slice_size)
+                std += train_dataset[num_slices * slice_size: ].std(axis=0) / (slice_size - num_slices % slice_size)
+            std += 1e-5
+            data_mean[input_name + '_'] = nd.array(mean)
+            data_std[input_name + '_'] = nd.array(std)
         if 'images' in train_h5:
             train_images = train_h5['images']
......
......@@ -26,8 +26,25 @@ class CNNDataLoader_Invariant:
         for input_name in self._input_names_:
             train_data[input_name] = train_h5[input_name]
-            data_mean[input_name + '_'] = nd.array(train_h5[input_name][:].mean(axis=0))
-            data_std[input_name + '_'] = nd.array(train_h5[input_name][:].std(axis=0) + 1e-5)
+            train_dataset = train_h5[input_name]
+            train_dataset_shape = train_data[input_name].shape
+            # slice_size limits the memory consumption, by only loading slices of size <500MB into memory
+            slice_size = min(train_dataset_shape[0] - 1, int(500e6 / (train_h5[input_name][0].size * \
+                             train_h5[input_name][0].itemsize)))
+            num_slices = max(1, int(train_h5[input_name].shape[0] / slice_size))
+            mean = np.zeros(train_dataset_shape[1: ])
+            std = np.zeros(train_dataset_shape[1: ])
+            for i in range(int(train_dataset_shape[0] / slice_size)):
+                mean += train_dataset[i * slice_size: (i + 1) * slice_size].mean(axis=0) / num_slices
+                std += train_dataset[i * slice_size: (i + 1) * slice_size].std(axis=0) / num_slices
+            if slice_size > train_dataset_shape[0] - 1:
+                mean += train_dataset[num_slices * slice_size: ].mean(axis=0) / (slice_size - num_slices % slice_size)
+                std += train_dataset[num_slices * slice_size: ].std(axis=0) / (slice_size - num_slices % slice_size)
+            std += 1e-5
+            data_mean[input_name + '_'] = nd.array(mean)
+            data_std[input_name + '_'] = nd.array(std)
         if 'images' in train_h5:
             train_images = train_h5['images']
......
......@@ -23,13 +23,30 @@ class CNNDataLoader_LoadNetworkTest:
         data_std = {}
         train_images = {}
-        for input_name in self._input_names_:
-            train_data[input_name] = train_h5[input_name]
-            data_mean[input_name + '_'] = nd.array(train_h5[input_name][:].mean(axis=0))
-            data_std[input_name + '_'] = nd.array(train_h5[input_name][:].std(axis=0) + 1e-5)
-        if 'images' in train_h5:
-            train_images = train_h5['images']
+        for input_name in self._input_names_:
+            train_data[input_name] = train_h5[input_name]
+            train_dataset = train_h5[input_name]
+            train_dataset_shape = train_data[input_name].shape
+            # slice_size limits the memory consumption, by only loading slices of size <500MB into memory
+            slice_size = min(train_dataset_shape[0] - 1, int(500e6 / (train_h5[input_name][0].size * \
+                             train_h5[input_name][0].itemsize)))
+            num_slices = max(1, int(train_h5[input_name].shape[0] / slice_size))
+            mean = np.zeros(train_dataset_shape[1: ])
+            std = np.zeros(train_dataset_shape[1: ])
+            for i in range(int(train_dataset_shape[0] / slice_size)):
+                mean += train_dataset[i * slice_size: (i + 1) * slice_size].mean(axis=0) / num_slices
+                std += train_dataset[i * slice_size: (i + 1) * slice_size].std(axis=0) / num_slices
+            if slice_size > train_dataset_shape[0] - 1:
+                mean += train_dataset[num_slices * slice_size: ].mean(axis=0) / (slice_size - num_slices % slice_size)
+                std += train_dataset[num_slices * slice_size: ].std(axis=0) / (slice_size - num_slices % slice_size)
+            std += 1e-5
+            data_mean[input_name + '_'] = nd.array(mean)
+            data_std[input_name + '_'] = nd.array(std)
+        if 'images' in train_h5:
+            train_images = train_h5['images']
         train_label = {}
         index = 0
......
......@@ -26,8 +26,25 @@ class CNNDataLoader_MultipleStreams:
         for input_name in self._input_names_:
             train_data[input_name] = train_h5[input_name]
-            data_mean[input_name + '_'] = nd.array(train_h5[input_name][:].mean(axis=0))
-            data_std[input_name + '_'] = nd.array(train_h5[input_name][:].std(axis=0) + 1e-5)
+            train_dataset = train_h5[input_name]
+            train_dataset_shape = train_data[input_name].shape
+            # slice_size limits the memory consumption, by only loading slices of size <500MB into memory
+            slice_size = min(train_dataset_shape[0] - 1, int(500e6 / (train_h5[input_name][0].size * \
+                             train_h5[input_name][0].itemsize)))
+            num_slices = max(1, int(train_h5[input_name].shape[0] / slice_size))
+            mean = np.zeros(train_dataset_shape[1: ])
+            std = np.zeros(train_dataset_shape[1: ])
+            for i in range(int(train_dataset_shape[0] / slice_size)):
+                mean += train_dataset[i * slice_size: (i + 1) * slice_size].mean(axis=0) / num_slices
+                std += train_dataset[i * slice_size: (i + 1) * slice_size].std(axis=0) / num_slices
+            if slice_size > train_dataset_shape[0] - 1:
+                mean += train_dataset[num_slices * slice_size: ].mean(axis=0) / (slice_size - num_slices % slice_size)
+                std += train_dataset[num_slices * slice_size: ].std(axis=0) / (slice_size - num_slices % slice_size)
+            std += 1e-5
+            data_mean[input_name + '_'] = nd.array(mean)
+            data_std[input_name + '_'] = nd.array(std)
         if 'images' in train_h5:
             train_images = train_h5['images']
......
......@@ -26,8 +26,25 @@ class CNNDataLoader_RNNencdec:
         for input_name in self._input_names_:
             train_data[input_name] = train_h5[input_name]
-            data_mean[input_name + '_'] = nd.array(train_h5[input_name][:].mean(axis=0))
-            data_std[input_name + '_'] = nd.array(train_h5[input_name][:].std(axis=0) + 1e-5)
+            train_dataset = train_h5[input_name]
+            train_dataset_shape = train_data[input_name].shape
+            # slice_size limits the memory consumption, by only loading slices of size <500MB into memory
+            slice_size = min(train_dataset_shape[0] - 1, int(500e6 / (train_h5[input_name][0].size * \
+                             train_h5[input_name][0].itemsize)))
+            num_slices = max(1, int(train_h5[input_name].shape[0] / slice_size))
+            mean = np.zeros(train_dataset_shape[1: ])
+            std = np.zeros(train_dataset_shape[1: ])
+            for i in range(int(train_dataset_shape[0] / slice_size)):
+                mean += train_dataset[i * slice_size: (i + 1) * slice_size].mean(axis=0) / num_slices
+                std += train_dataset[i * slice_size: (i + 1) * slice_size].std(axis=0) / num_slices
+            if slice_size > train_dataset_shape[0] - 1:
+                mean += train_dataset[num_slices * slice_size: ].mean(axis=0) / (slice_size - num_slices % slice_size)
+                std += train_dataset[num_slices * slice_size: ].std(axis=0) / (slice_size - num_slices % slice_size)
+            std += 1e-5
+            data_mean[input_name + '_'] = nd.array(mean)
+            data_std[input_name + '_'] = nd.array(std)
         if 'images' in train_h5:
             train_images = train_h5['images']
......
......@@ -26,12 +26,30 @@ class CNNDataLoader_RNNsearch:
         for input_name in self._input_names_:
             train_data[input_name] = train_h5[input_name]
-            data_mean[input_name + '_'] = nd.array(train_h5[input_name][:].mean(axis=0))
-            data_std[input_name + '_'] = nd.array(train_h5[input_name][:].std(axis=0) + 1e-5)
+            train_dataset = train_h5[input_name]
+            train_dataset_shape = train_data[input_name].shape
+            # slice_size limits the memory consumption, by only loading slices of size <500MB into memory
+            slice_size = min(train_dataset_shape[0] - 1, int(500e6 / (train_h5[input_name][0].size * \
+                             train_h5[input_name][0].itemsize)))
+            num_slices = max(1, int(train_h5[input_name].shape[0] / slice_size))
+            mean = np.zeros(train_dataset_shape[1: ])
+            std = np.zeros(train_dataset_shape[1: ])
+            for i in range(int(train_dataset_shape[0] / slice_size)):
+                mean += train_dataset[i * slice_size: (i + 1) * slice_size].mean(axis=0) / num_slices
+                std += train_dataset[i * slice_size: (i + 1) * slice_size].std(axis=0) / num_slices
+            if slice_size > train_dataset_shape[0] - 1:
+                mean += train_dataset[num_slices * slice_size: ].mean(axis=0) / (slice_size - num_slices % slice_size)
+                std += train_dataset[num_slices * slice_size: ].std(axis=0) / (slice_size - num_slices % slice_size)
+            std += 1e-5
+            data_mean[input_name + '_'] = nd.array(mean)
+            data_std[input_name + '_'] = nd.array(std)
         if 'images' in train_h5:
             train_images = train_h5['images']
         train_label = {}
         index = 0
         for output_name in self._output_names_:
......
......@@ -26,8 +26,25 @@ class CNNDataLoader_RNNtest:
         for input_name in self._input_names_:
             train_data[input_name] = train_h5[input_name]
-            data_mean[input_name + '_'] = nd.array(train_h5[input_name][:].mean(axis=0))
-            data_std[input_name + '_'] = nd.array(train_h5[input_name][:].std(axis=0) + 1e-5)
+            train_dataset = train_h5[input_name]
+            train_dataset_shape = train_data[input_name].shape
+            # slice_size limits the memory consumption, by only loading slices of size <500MB into memory
+            slice_size = min(train_dataset_shape[0] - 1, int(500e6 / (train_h5[input_name][0].size * \
+                             train_h5[input_name][0].itemsize)))
+            num_slices = max(1, int(train_h5[input_name].shape[0] / slice_size))
+            mean = np.zeros(train_dataset_shape[1: ])
+            std = np.zeros(train_dataset_shape[1: ])
+            for i in range(int(train_dataset_shape[0] / slice_size)):
+                mean += train_dataset[i * slice_size: (i + 1) * slice_size].mean(axis=0) / num_slices
+                std += train_dataset[i * slice_size: (i + 1) * slice_size].std(axis=0) / num_slices
+            if slice_size > train_dataset_shape[0] - 1:
+                mean += train_dataset[num_slices * slice_size: ].mean(axis=0) / (slice_size - num_slices % slice_size)
+                std += train_dataset[num_slices * slice_size: ].std(axis=0) / (slice_size - num_slices % slice_size)
+            std += 1e-5
+            data_mean[input_name + '_'] = nd.array(mean)
+            data_std[input_name + '_'] = nd.array(std)
         if 'images' in train_h5:
             train_images = train_h5['images']
......
......@@ -24,13 +24,30 @@ class CNNDataLoader_ResNeXt50:
         data_std = {}
         train_images = {}
-        for input_name in self._input_names_:
-            train_data[input_name] = train_h5[input_name]
-            data_mean[input_name + '_'] = nd.array(train_h5[input_name][:].mean(axis=0))
-            data_std[input_name + '_'] = nd.array(train_h5[input_name][:].std(axis=0) + 1e-5)
-        if 'images' in train_h5:
-            train_images = train_h5['images']
+        for input_name in self._input_names_:
+            train_data[input_name] = train_h5[input_name]
+            train_dataset = train_h5[input_name]
+            train_dataset_shape = train_data[input_name].shape
+            # slice_size limits the memory consumption, by only loading slices of size <500MB into memory
+            slice_size = min(train_dataset_shape[0] - 1, int(500e6 / (train_h5[input_name][0].size * \
+                             train_h5[input_name][0].itemsize)))
+            num_slices = max(1, int(train_h5[input_name].shape[0] / slice_size))
+            mean = np.zeros(train_dataset_shape[1: ])
+            std = np.zeros(train_dataset_shape[1: ])
+            for i in range(int(train_dataset_shape[0] / slice_size)):
+                mean += train_dataset[i * slice_size: (i + 1) * slice_size].mean(axis=0) / num_slices
+                std += train_dataset[i * slice_size: (i + 1) * slice_size].std(axis=0) / num_slices
+            if slice_size > train_dataset_shape[0] - 1:
+                mean += train_dataset[num_slices * slice_size: ].mean(axis=0) / (slice_size - num_slices % slice_size)
+                std += train_dataset[num_slices * slice_size: ].std(axis=0) / (slice_size - num_slices % slice_size)
+            std += 1e-5
+            data_mean[input_name + '_'] = nd.array(mean)
+            data_std[input_name + '_'] = nd.array(std)
+        if 'images' in train_h5:
+            train_images = train_h5['images']
         train_label = {}
         index = 0
......
......@@ -26,8 +26,25 @@ class CNNDataLoader_Show_attend_tell:
         for input_name in self._input_names_:
             train_data[input_name] = train_h5[input_name]
-            data_mean[input_name + '_'] = nd.array(train_h5[input_name][:].mean(axis=0))
-            data_std[input_name + '_'] = nd.array(train_h5[input_name][:].std(axis=0) + 1e-5)
+            train_dataset = train_h5[input_name]
+            train_dataset_shape = train_data[input_name].shape
+            # slice_size limits the memory consumption, by only loading slices of size <500MB into memory
+            slice_size = min(train_dataset_shape[0] - 1, int(500e6 / (train_h5[input_name][0].size * \
+                             train_h5[input_name][0].itemsize)))
+            num_slices = max(1, int(train_h5[input_name].shape[0] / slice_size))
+            mean = np.zeros(train_dataset_shape[1: ])
+            std = np.zeros(train_dataset_shape[1: ])
+            for i in range(int(train_dataset_shape[0] / slice_size)):
+                mean += train_dataset[i * slice_size: (i + 1) * slice_size].mean(axis=0) / num_slices
+                std += train_dataset[i * slice_size: (i + 1) * slice_size].std(axis=0) / num_slices
+            if slice_size > train_dataset_shape[0] - 1:
+                mean += train_dataset[num_slices * slice_size: ].mean(axis=0) / (slice_size - num_slices % slice_size)
+                std += train_dataset[num_slices * slice_size: ].std(axis=0) / (slice_size - num_slices % slice_size)
+            std += 1e-5
+            data_mean[input_name + '_'] = nd.array(mean)
+            data_std[input_name + '_'] = nd.array(std)
         if 'images' in train_h5:
             train_images = train_h5['images']
......
......@@ -26,11 +26,29 @@ class CNNDataLoader_ThreeInputCNN_M14:
         for input_name in self._input_names_:
             train_data[input_name] = train_h5[input_name]
-            data_mean[input_name + '_'] = nd.array(train_h5[input_name][:].mean(axis=0))
-            data_std[input_name + '_'] = nd.array(train_h5[input_name][:].std(axis=0) + 1e-5)
+            train_dataset = train_h5[input_name]
+            train_dataset_shape = train_data[input_name].shape
+            # slice_size limits the memory consumption, by only loading slices of size <500MB into memory
+            slice_size = min(train_dataset_shape[0] - 1, int(500e6 / (train_h5[input_name][0].size * \
+                             train_h5[input_name][0].itemsize)))
+            num_slices = max(1, int(train_h5[input_name].shape[0] / slice_size))
+            mean = np.zeros(train_dataset_shape[1: ])
+            std = np.zeros(train_dataset_shape[1: ])
+            for i in range(int(train_dataset_shape[0] / slice_size)):
+                mean += train_dataset[i * slice_size: (i + 1) * slice_size].mean(axis=0) / num_slices
+                std += train_dataset[i * slice_size: (i + 1) * slice_size].std(axis=0) / num_slices
+            if slice_size > train_dataset_shape[0] - 1:
+                mean += train_dataset[num_slices * slice_size: ].mean(axis=0) / (slice_size - num_slices % slice_size)
+                std += train_dataset[num_slices * slice_size: ].std(axis=0) / (slice_size - num_slices % slice_size)
+            std += 1e-5
+            data_mean[input_name + '_'] = nd.array(mean)
+            data_std[input_name + '_'] = nd.array(std)
         if 'images' in train_h5:
             train_images = train_h5['images']
         train_label = {}
         index = 0
......
......@@ -25,8 +25,25 @@ class CNNDataLoader_VGG16:
         for input_name in self._input_names_:
             train_data[input_name] = train_h5[input_name]
-            data_mean[input_name + '_'] = nd.array(train_h5[input_name][:].mean(axis=0))
-            data_std[input_name + '_'] = nd.array(train_h5[input_name][:].std(axis=0) + 1e-5)
+            train_dataset = train_h5[input_name]
+            train_dataset_shape = train_data[input_name].shape
+            # slice_size limits the memory consumption, by only loading slices of size <500MB into memory
+            slice_size = min(train_dataset_shape[0] - 1, int(500e6 / (train_h5[input_name][0].size * \
+                             train_h5[input_name][0].itemsize)))
+            num_slices = max(1, int(train_h5[input_name].shape[0] / slice_size))
+            mean = np.zeros(train_dataset_shape[1: ])
+            std = np.zeros(train_dataset_shape[1: ])
+            for i in range(int(train_dataset_shape[0] / slice_size)):
+                mean += train_dataset[i * slice_size: (i + 1) * slice_size].mean(axis=0) / num_slices
+                std += train_dataset[i * slice_size: (i + 1) * slice_size].std(axis=0) / num_slices
+            if slice_size > train_dataset_shape[0] - 1:
+                mean += train_dataset[num_slices * slice_size: ].mean(axis=0) / (slice_size - num_slices % slice_size)
+                std += train_dataset[num_slices * slice_size: ].std(axis=0) / (slice_size - num_slices % slice_size)
+            std += 1e-5
+            data_mean[input_name + '_'] = nd.array(mean)
+            data_std[input_name + '_'] = nd.array(std)
         if 'images' in train_h5:
             train_images = train_h5['images']
......