The material for the Workflow module will appear on Friday (March 15, 2024).
# Steps of the example deep learning workflow
* [File Transfer] create a directory on the cluster where the scripts will be located
* [File Transfer] transfer the scripts to the cluster (`rsync`) or clone the repository with the scripts on the cluster (`git clone`)
* [File Transfer] Download the data:
```
wget https://hessenbox.tu-darmstadt.de/dl/fi7C8UBvBLLMwLFcsk7UkRWC/hkhlr_dl_workflow_sample_data.tar.gz
```
* [General Preprocessing] untar the data into a directory: ``tar -xzf hkhlr_dl_workflow_sample_data.tar.gz``
* [General Preprocessing] set up the environment (at least ``python 3.9``):
```
python3 -m venv ./venv
source venv/bin/activate
pip install --upgrade pip
pip install --upgrade keras-cv tensorflow pandas matplotlib scipy
```
* [General Preprocessing] start a SMALL test run to check that the setup is correct
  * use ``--not_use_all_data`` as a command line argument to perform only a small run
  * Other important command line arguments of the ``cnn_tf.py`` script:
    * ``--data_dir``: path to where the data was extracted
    * ``--result_dir``: directory where the result files will be written
* [Run Specific Preprocessing] Prepare parameters for multiple runs:
  * run the ``param_generator.py`` script; the location of the parameter files can be specified with the ``--output_dir`` argument
* [Run Specific Preprocessing] Review ``job_script.sh`` and adjust settings if necessary
  * one may need to adjust the paths to the environment/data/result directories
* [Run Specific Preprocessing] run a SMALL test job to check the setup
  * remember to use ``--not_use_all_data``
  * do not launch the full SLURM job array yet; this step is only for testing whether the script was set up correctly
  * ``job_script.sh`` has some errors (hint: check the Python version)
  * be mindful of efficiency: does ``job_script.sh`` use the CPU resources efficiently?
* [Run calculations] run all jobs by launching the full SLURM job array
* [General Preprocessing/Aggregation] run ``visualization.py``. This script tries to read all subdirectories of ``--input_dir`` and collects the results into one big plot.
* [File Transfer] download the resulting image (see the command sketch below for a possible end-to-end session)
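
For orientation, the steps above can be strung together into a single terminal session. This is only a rough sketch under assumptions: the cluster hostname `cluster.example.org` and the remote directory `~/dl_workflow` are placeholders, and `--data_dir` may need to point to wherever the archive actually extracts to.

```
# [File Transfer] copy the scripts to the cluster (alternatively: git clone on the cluster)
rsync -av ./scripts/ user@cluster.example.org:~/dl_workflow/

# --- on the cluster ---
cd ~/dl_workflow
wget https://hessenbox.tu-darmstadt.de/dl/fi7C8UBvBLLMwLFcsk7UkRWC/hkhlr_dl_workflow_sample_data.tar.gz
tar -xzf hkhlr_dl_workflow_sample_data.tar.gz

# set up the environment
python3 -m venv ./venv
source venv/bin/activate
pip install --upgrade pip
pip install --upgrade keras-cv tensorflow pandas matplotlib scipy

# SMALL test run (adjust --data_dir to the directory the archive extracted to)
python3 cnn_tf.py --not_use_all_data --data_dir=./DATA --result_dir=./results/test_run

# prepare parameters and launch the job array
python3 param_generator.py -n 6 --output_dir ./parameter
sbatch job_script.sh

# after all jobs have finished: aggregate the results into one plot
python3 visualization.py --input_dir ./results --output plot.pdf

# --- back on your machine ---
# [File Transfer] download the resulting image
rsync -av user@cluster.example.org:~/dl_workflow/plot.pdf .
```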
Note:
The original data can be found here:
http://ufldl.stanford.edu/housenumbers/
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten
import pandas as pd
import os
from input_data import *
from helper import parse_command_line_args
import matplotlib.pyplot as plt

os.environ["KERAS_BACKEND"] = "tensorflow"


def define_cnn_model(input_shape, num_units, num_filters,
                     kernel_size=(2, 2), pool_size=(2, 2), conv_layers=1, dense_layers=1):
    model = tf.keras.models.Sequential()
    # Feature extraction region of the model
    model.add(Conv2D(filters=num_filters, kernel_size=kernel_size,
                     activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=pool_size))
    for i in range(1, conv_layers):
        model.add(Conv2D(filters=num_filters, kernel_size=kernel_size,
                         activation='relu'))
        model.add(MaxPooling2D(pool_size=pool_size))
    # Classification region of the model
    model.add(Flatten())
    for i in range(dense_layers):
        model.add(Dense(units=num_units, activation='relu'))
    model.add(Dense(units=11, activation='softmax'))
    model.summary()
    return model


def main():
    # Parse command line arguments
    ARGS = parse_command_line_args()

    # Get the data
    train_data, val_data, test_data = \
        get_input_data(ARGS.data_dir, ARGS.not_use_all_data, ARGS.validation_split_size)

    # Define the model
    model = define_cnn_model(input_shape=IMAGE_SIZE,
                             num_units=ARGS.num_units,
                             num_filters=ARGS.num_filters,
                             conv_layers=ARGS.conv_layers,
                             dense_layers=ARGS.dense_layers)

    # Set up the model
    opt = tf.keras.optimizers.SGD(learning_rate=ARGS.learning_rate)
    model.compile(optimizer=opt,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    keras_callbacks = []
    if ARGS.early_stopping:
        keras_callbacks.append(tf.keras.callbacks.EarlyStopping(monitor='loss',
                                                                patience=ARGS.early_stopping_patience))

    if ARGS.augment_data:
        train_data = train_data.map(
            augment_tf,
            num_parallel_calls=tf.data.experimental.AUTOTUNE, deterministic=False)
        # tf.py_function can execute arbitrary python code:
        # train_data = train_data.map(
        #     lambda x, y: (tf.py_function(augment_py, [x], [tf.float32]), y),
        #     num_parallel_calls=tf.data.experimental.AUTOTUNE, deterministic=False)
        # num_parallel_calls=tf.data.experimental.AUTOTUNE
        # autotune = number of CPUs available
        # need to set the shape, as tf cannot infer it (an augmentation may result in a different image size)
        train_data = train_data.map(set_shape)

    if ARGS.augment_test:
        test_data = test_data.repeat(3).map(augment_tf, num_parallel_calls=tf.data.experimental.AUTOTUNE,
                                            deterministic=False).map(set_shape)

    if ARGS.show_image:
        subset = train_data.take(3)
        print(subset)
        for image, label in subset:
            print(image.shape)
            # show the first images of the training dataset
            plt.imshow(image)
            plt.show()
        exit()

    train_data = train_data.shuffle(buffer_size=10000, reshuffle_each_iteration=True).batch(ARGS.batch_size)
    train_data = train_data.prefetch(20)
    val_data = val_data.batch(ARGS.batch_size)

    # Train the model using the train dataset
    train_history = model.fit(
        train_data,
        shuffle=True,
        validation_data=val_data,
        epochs=ARGS.num_epochs,
        verbose=1,
        callbacks=keras_callbacks
    )

    # Evaluate model performance on the test dataset
    test_data = test_data.batch(ARGS.batch_size)
    test_loss, test_accuracy = model.evaluate(test_data, verbose=True)

    # Write the results to the result directory
    os.makedirs(ARGS.result_dir, exist_ok=True)
    with open(ARGS.result_dir + '/' + 'test_accuracy.txt', 'w') as f:
        print("Test accuracy : {:.2f}%".format(test_accuracy * 100.0), file=f)
    pd.DataFrame(data=train_history.history,
                 index=range(1, len(train_history.history['loss']) + 1)).to_csv(
        ARGS.result_dir + '/' + 'train_hist.csv',
        index_label='epoch'
    )


if __name__ == "__main__":
    main()
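
The training script above is driven entirely by the command line flags defined in `parse_command_line_args` (see `helper.py` below). As a quick sanity check, one might run something like the following; the paths and the epoch count are assumptions, not prescribed values.

```
source venv/bin/activate
# short smoke test on the reduced dataset
python3 cnn_tf.py \
    --not_use_all_data \
    --num_epochs 2 \
    --data_dir ./DATA \
    --result_dir ./results/smoke_test
# on success, ./results/smoke_test/ contains test_accuracy.txt and train_hist.csv
```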
import argparse
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import os


# helper functions used in the exercises
def parse_command_line_args():
    parser = argparse.ArgumentParser(
        description='train a basic Convolutional Neural Network')
    parser.add_argument("--num_epochs", type=int, default=1,
                        required=False, action='store',
                        help="Number of epochs to train the NN.")
    parser.add_argument("--batch_size", type=int, default=32,
                        required=False, action='store',
                        help="Number of examples in a batch.")
    parser.add_argument("--learning_rate", type=float, default=1e-2,
                        required=False, action='store',
                        help="Learning rate used for gradient descent.")
    parser.add_argument("--validation_split_size", action='store',
                        type=float, required=False, default=0.2)
    parser.add_argument("--conv_layers", type=int, default=1,
                        required=False, action='store',
                        help="Number of conv layers.")
    parser.add_argument("--dense_layers", type=int, default=1,
                        required=False, action='store',
                        help="Number of dense layers.")
    parser.add_argument("--early_stopping", action='store_true',
                        default=False, required=False,
                        help="Use early stopping.")
    parser.add_argument("--early_stopping_patience", type=int, action='store',
                        default=3, required=False,
                        help="Patience to use for early stopping.")
    parser.add_argument("--show_image", action='store_true',
                        default=False, required=False,
                        help="Show an image of the training dataset and exit.")
    parser.add_argument("--not_use_all_data", action='store_true',
                        default=False, required=False,
                        help="Do not use all available training data.")
    parser.add_argument("--result_dir", type=str, default='./result',
                        required=False, action='store',
                        help="Where to store the results (result = recorded history of loss and accuracy).")
    parser.add_argument("--data_dir", type=str, default='./DATA',
                        required=False, action='store',
                        help="Path to the data location.")
    parser.add_argument("--augment_data", required=False, default=False,
                        action='store_true',
                        help="Activate data augmentation by rotating and shifting images.")
    parser.add_argument("--augment_test", required=False, default=False,
                        action='store_true',
                        help="Activate test data augmentation by rotating and shifting images.")
    parser.add_argument("--num_filters", type=int, default=16,
                        required=False, action='store')
    parser.add_argument("--num_units", type=int, default=64,
                        required=False, action='store')
    args = parser.parse_args()
    return args


def plot_lossacc_vs_epochs(histdict, export_to_pdf=False):
    train_acc = histdict['accuracy']
    train_loss = histdict['loss']
    try:
        val_loss = histdict['val_loss']
        val_acc = histdict['val_accuracy']
    except KeyError:
        pass
    epochs = range(1, len(train_acc) + 1)
    fig = plt.figure(figsize=(10, 5))

    # Loss
    ax1 = fig.add_subplot(1, 2, 1)
    ax1.set_title('Training and validation loss')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.minorticks_on()
    ax1.plot(epochs, train_loss, 'bo-', label='Train loss')
    ax1.plot(epochs, val_loss, 'rd--', label='Val loss')
    ax1.legend()

    # Accuracy
    ax2 = fig.add_subplot(1, 2, 2)
    ax2.set_title('Training and validation accuracy')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy')
    ax2.minorticks_on()
    ax2.plot(epochs, train_acc, 'bo-', label='Train acc')
    ax2.plot(epochs, val_acc, 'rd--', label='Val acc')
    ax2.legend()

    if export_to_pdf:
        plt.savefig('results_lossacc_vs_epochs.pdf',
                    bbox_inches='tight')
    else:
        plt.show()
import tensorflow as tf
import keras_cv
import scipy.io
import numpy as np
import os

# Handles the loading of the input data

# Constants
IMAGE_SIZE = (32, 32, 3)


# to specify the shape of our data, as it cannot be inferred by tf automatically if we use augmentation
@tf.function
def set_shape(x, y):
    x = tf.reshape(x, (32, 32, 3))
    y.set_shape([11])
    return x, y


@tf.function
def normalize(image, label):
    return image / 255.0, label


def get_input_data(DATA_PATH, not_use_extra=True, val_percent=0.1):
    # load the SVHN data from the specified folder
    fname = DATA_PATH + '/train_32x32.mat'
    assert os.path.isfile(fname), "Did not find input data at the specified location"
    input = scipy.io.loadmat(fname)
    x_train = input["X"][:]
    y_train = input["y"][:].flatten()
    # the .mat file stores the batch dimension last; move it to the front
    x_train = np.moveaxis(x_train, -1, 0)

    if not not_use_extra:
        fname = DATA_PATH + '/extra_32x32.mat'
        assert os.path.isfile(fname), "Did not find input data at the specified location"
        test = scipy.io.loadmat(fname)
        x_extra = test["X"][:, :, :, :]
        y_extra = test["y"][:].flatten()
        x_extra = np.moveaxis(x_extra, -1, 0)
        # just append the extra data at the end (tf will shuffle anyway)
        x_train = np.concatenate((x_train, x_extra))
        y_train = np.concatenate((y_train, y_extra))

    # use some part of the data as validation data
    val_samples = int(len(y_train) * val_percent)
    x_val = x_train[:val_samples]
    y_val = y_train[:val_samples]
    x_train = x_train[val_samples:]
    y_train = y_train[val_samples:]

    fname = DATA_PATH + '/test_32x32.mat'
    assert os.path.isfile(fname), "Did not find input data at the specified location"
    test = scipy.io.loadmat(fname)
    x_test = test["X"][:, :, :, :]
    y_test = test["y"][:].flatten()

    y_train = tf.keras.utils.to_categorical(y_train, num_classes=11)
    y_val = tf.keras.utils.to_categorical(y_val, num_classes=11)
    y_test = tf.keras.utils.to_categorical(y_test, num_classes=11)
    x_test = np.moveaxis(x_test, -1, 0)

    x_train = x_train.astype(np.float32)
    x_val = x_val.astype(np.float32)
    x_test = x_test.astype(np.float32)

    # also works as a sanity check that len(y) == len(x)
    train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train)).map(normalize)
    val_data = tf.data.Dataset.from_tensor_slices((x_val, y_val)).map(normalize)
    test_data = tf.data.Dataset.from_tensor_slices((x_test, y_test)).map(normalize)
    return train_data, val_data, test_data


# ranges for a manual augmentation (not used by the RandAugment-based augmenter below)
min_angle = -45
max_angle = 45
min_translate = -5
max_translate = +5

augmenter = keras_cv.layers.Augmenter([keras_cv.layers.RandAugment(value_range=(0, 1), magnitude=0.25)])


@tf.function
def augment_tf(image, label):
    return augmenter(image), label
#!/bin/bash
#SBATCH --ntasks 1
#SBATCH --cpus-per-task 8
#SBATCH --mem-per-cpu=3800
#SBATCH --time 00:30:00
# these parameters determine the size of our job,
# i.e. the amount of resources needed to process it
#SBATCH --array 1-6
# This is a job Array of 6 jobs
#SBATCH --job-name Tensorflow
# A name for this job to be displayed in the queue
#SBATCH --output job_%A_%a.out
# filename where the terminal output of this job goes
# deactivate any current environment
if [ ! -z "$VIRTUAL_ENV" ]; then
    deactivate
fi
# load the required modules
module purge
module load gcc python/3.9
# activate the correct environment
source ./venv/bin/activate
echo $(cat parameter/$SLURM_ARRAY_TASK_ID.params)
python3 cnn_tf.py $(cat ./parameter/$SLURM_ARRAY_TASK_ID.params) --data_dir=./DATA --result_dir=./results/result_$SLURM_ARRAY_TASK_ID
# so that we know which parameters were used when viewing the output
cp parameter/$SLURM_ARRAY_TASK_ID.params results/result_$SLURM_ARRAY_TASK_ID/input_parameter.params
deactivate
#!/bin/bash
#SBATCH --ntasks 1
#SBATCH --cpus-per-task 8
#SBATCH --mem-per-cpu=3800
#SBATCH --time 00:30:00
# these parameters determine the size of our job,
# i.e. the amount of resources needed to process it
#SBATCH --array 1-6
# This is a job Array of 6 jobs
#SBATCH --job-name Tensorflow
# A name for this job to be displayed in the queue
#SBATCH --output job_%A_%a.out
# filename where the terminal output of this job goes
# change to where the files are located
EXPERIMENT_DIR=./
# deactivate any current environment
if [ ! -z "$VIRTUAL_ENV" ]; then
    deactivate
fi
# load the required modules
module purge
module load gcc/8.5 python/3.10
# activate the correct environment
source $EXPERIMENT_DIR/venv/bin/activate
echo $(cat $EXPERIMENT_DIR/parameter/$SLURM_ARRAY_TASK_ID.params)
python3 $EXPERIMENT_DIR/cnn_tf.py $(cat $EXPERIMENT_DIR/parameter/$SLURM_ARRAY_TASK_ID.params) --data_dir=$EXPERIMENT_DIR/DATA --result_dir=$EXPERIMENT_DIR/results/result_$SLURM_ARRAY_TASK_ID
# so that we know which parameters were used when viewing the output
cp $EXPERIMENT_DIR/parameter/$SLURM_ARRAY_TASK_ID.params $EXPERIMENT_DIR/results/result_$SLURM_ARRAY_TASK_ID/input_parameter.params
deactivate
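
Before launching the full parameter scan, it can help to submit only a single array task and inspect its output. The commands below are a sketch of that workflow on a typical SLURM cluster; the `--array=1` override and the monitoring commands are common SLURM usage, not part of the provided material, and the output file name follows from the `--output job_%A_%a.out` directive above.

```
# submit only the first array task as a test (command line options override #SBATCH directives)
sbatch --array=1 job_script.sh

# check the queue and, once the job has finished, its output
squeue -u $USER
cat job_*_1.out

# if the test run looks fine, submit the complete array
sbatch job_script.sh
```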
import argparse
import random
import os

## definition of the search space
min_batch_size = 16
max_batch_size = 1024
min_lr = 0.0001
max_lr = 0.5
min_dense_layers = 1
max_dense_layers = 5
min_conv_layers = 1
max_conv_layers = 3
min_num_filters = 2
max_num_filters = 32
min_num_units = 11
max_num_units = 128

## settings for all parameter sets
use_augmentation = True
num_epochs = 10


def write_parameter_file(
        job_number=1,
        num_epochs=20,
        batch_size=128,
        lr=0.01,
        num_units=30,
        dense_layers=1,
        num_filters=16,
        conv_layers=1,
        augment=False,
        path="./parameter/",
):
    if augment:
        augment_str = "--augment_data"
    else:
        augment_str = ""
    parameters = (
        "--num_epochs %d --batch_size %d --learning_rate %f"
        " --num_units %d --dense_layers %d --conv_layers %d"
        " --num_filters %d %s\n"
        % (
            # " --early_stopping --trainfile_suffix %d --num_filters %d %s\n" % (
            num_epochs,
            batch_size,
            lr,
            num_units,
            dense_layers,
            conv_layers,
            num_filters,
            augment_str,
        )
    )
    with open(path + '/' + str(job_number) + ".params", "w") as file:
        file.write(parameters)


def main():
    parser = argparse.ArgumentParser(
        description='Generates parameters for a random parameter scan. The parameter ranges can be set up in the script.')
    parser.add_argument("--output_dir", type=str, default="./parameter",
                        required=False, action='store',
                        help="The directory where the output should be stored")
    parser.add_argument("-n", type=int, default=5,
                        required=False, action='store',
                        help="Number of parameter sets to generate")
    ARGS = parser.parse_args()

    os.makedirs(ARGS.output_dir, exist_ok=True)
    # using range(n + 1) leads to fewer errors when setting the SLURM array boundary
    for i in range(ARGS.n + 1):
        # choose random parameters
        write_parameter_file(
            job_number=i,
            num_epochs=num_epochs,
            batch_size=random.randint(min_batch_size, max_batch_size),
            lr=random.uniform(min_lr, max_lr),
            num_units=random.randint(min_num_units, max_num_units),
            dense_layers=random.randint(min_dense_layers, max_dense_layers),
            num_filters=random.randint(min_num_filters, max_num_filters),
            conv_layers=random.randint(min_conv_layers, max_conv_layers),
            augment=use_augmentation,
            path=ARGS.output_dir
        )


if __name__ == "__main__":
    main()
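
A possible invocation of the parameter generator above; `-n 6` matches the `--array 1-6` range in the job scripts (the script also writes a `0.params` file, which the array simply never uses). The file listing is illustrative.

```
python3 param_generator.py -n 6 --output_dir ./parameter
# inspect what was generated
ls parameter/            # 0.params 1.params ... 6.params
cat parameter/1.params
```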
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import argparse

# which parameter to use for the captions and the ordering of the subplots
caption_regex = re.compile(r"--learning_rate (\d+\.\d+)")


def main():
    parser = argparse.ArgumentParser(
        description='Visualizes the results of a parameter scan')
    parser.add_argument("--input_dir", type=str, default="results",
                        required=True, action='store',
                        help="Input directory")
    parser.add_argument("--num_rows", type=int, default=4,
                        required=False, action='store',
                        help="Plot layout: number of rows")
    parser.add_argument("--max_loss", type=float, default=0,
                        required=False, action='store',
                        help="Maximum loss to display (y scale of the plot)")
    parser.add_argument("--output", type=str, default="plot.pdf",
                        required=False, action='store',
                        help="Output filename")
    ARGS = parser.parse_args()

    # collect all result directories that contain the expected files
    cases = []
    for path, directories, files in os.walk(ARGS.input_dir):
        for dir in directories:
            if (os.path.isfile(os.path.join(path, dir, "train_hist.csv")) and
                    os.path.isfile(os.path.join(path, dir, "test_accuracy.txt")) and
                    os.path.isfile(os.path.join(path, dir, "input_parameter.params"))):
                cases.append(os.path.join(path, dir))
            else:
                print("%s does not have the expected files" % dir)

    num_results = len(cases)
    if num_results == 0:
        print("Nothing to visualize")
        exit()

    data = {}
    for case in cases:
        param_string = "unknown"
        # read the first line from the parameter file
        with open(os.path.join(case, "input_parameter.params"), 'r') as file:
            param_string = file.readline()
        train_hist = pd.read_csv(os.path.join(case, "train_hist.csv"))
        title = caption_regex.findall(param_string)[0]
        data[title] = train_hist

    num_cols = int(num_results / ARGS.num_rows)
    if num_results % ARGS.num_rows != 0:
        num_cols += 1

    fig, axes = plt.subplots(ARGS.num_rows, num_cols, sharex=True, sharey=True)
    if ARGS.max_loss != 0:
        plt.ylim([0, ARGS.max_loss])

    i = 0
    for title in sorted(data):
        train_hist = data[title]
        row = int(i / num_cols)
        col = i % num_cols
        if num_cols == 1:
            this_plot = axes[row]
        elif ARGS.num_rows == 1:
            this_plot = axes[col]
        else:
            this_plot = axes[row, col]
        this_plot.set_title(title)
        this_plot.plot(train_hist['epoch'], train_hist['loss'], 'bo-', label='Train loss')
        this_plot.plot(train_hist['epoch'], train_hist['val_loss'], 'rd--', label='Val loss')
        this_plot.set_xlabel('Epoch')
        this_plot.set_ylabel('Loss')
        this_plot.legend()
        i += 1

    # save the figure before showing it, so closing the window cannot leave an empty plot file
    plt.savefig(ARGS.output)
    plt.show()


if __name__ == "__main__":
    main()
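
Once the job array has finished, the visualization script above can be run as sketched below; `--input_dir ./results` matches the result layout produced by the job scripts, while the `--num_rows` and `--max_loss` values are just examples.

```
source venv/bin/activate
python3 visualization.py --input_dir ./results --num_rows 2 --max_loss 3.0 --output plot.pdf
```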