The material for the Workflow module will appear on Friday (March 15, 2024).
# Steps of the example deep learning workflow
* [File Transfer] create a directory on the cluster where the scripts will be located
* [File Transfer] transfer the scripts to the cluster (`rsync`) or clone the repository with the scripts on the cluster (`git clone`)
* [File Transfer] Download the data:
```
wget https://hessenbox.tu-darmstadt.de/dl/fi7C8UBvBLLMwLFcsk7UkRWC/hkhlr_dl_workflow_sample_data.tar.gz
```
* [General Preprocessing] untar the data into a directory: ``tar -xzf hkhlr_dl_workflow_sample_data.tar.gz``
* [General Preprocessing] set up the environment (at least ``python 3.9``):
```
python3 -m venv ./venv
source venv/bin/activate
pip install --upgrade pip
pip install --upgrade keras-cv tensorflow pandas matplotlib scipy
```
* [General Preprocessing] start a SMALL test run to check that the setup is correct
  * use ``--not_use_all_data`` as a command line argument to perform only a small run
  * Other important command line arguments of the ``cnn_tf.py`` script:
    * ``--data_dir``: path to where the data was extracted
    * ``--result_dir``: directory where the result files will be written
* [Run Specific Preprocessing] Prepare parameters for multiple runs:
  * run the ``param_generator.py`` script; the location of the parameter files can be specified with the ``--output_dir`` argument
* [Run Specific Preprocessing] Review ``job_script.sh`` and adjust settings if necessary
  * one may need to adjust the paths to the environment/data/result directories
* [Run Specific Preprocessing] run a SMALL test job to check the setup
  * remember to use ``--not_use_all_data``
  * do not launch the full SLURM job array yet; this step is only for testing whether the script was set up correctly
  * ``job_script.sh`` has some errors (hint: check the Python version)
  * be mindful of efficiency: does ``job_script.sh`` use the CPU resources efficiently?
* [Run calculations] run all jobs by launching the full SLURM job array
* [General Preprocessing/Aggregation] run ``visualization.py``. This script tries to read all subdirectories of ``--input_dir`` and collects the results into one big plot.
* [File Transfer] download the resulting image (see the command sketch below for a possible end-to-end session)
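
For orientation, the steps above can be strung together into a single terminal session. This is only a rough sketch under assumptions: the cluster hostname `cluster.example.org` and the remote directory `~/dl_workflow` are placeholders, and `--data_dir` may need to point to wherever the archive actually extracts to.

```
# [File Transfer] copy the scripts to the cluster (alternatively: git clone on the cluster)
rsync -av ./scripts/ user@cluster.example.org:~/dl_workflow/

# --- on the cluster ---
cd ~/dl_workflow
wget https://hessenbox.tu-darmstadt.de/dl/fi7C8UBvBLLMwLFcsk7UkRWC/hkhlr_dl_workflow_sample_data.tar.gz
tar -xzf hkhlr_dl_workflow_sample_data.tar.gz

# set up the environment
python3 -m venv ./venv
source venv/bin/activate
pip install --upgrade pip
pip install --upgrade keras-cv tensorflow pandas matplotlib scipy

# SMALL test run (adjust --data_dir to the directory the archive extracted to)
python3 cnn_tf.py --not_use_all_data --data_dir=./DATA --result_dir=./results/test_run

# prepare parameters and launch the job array
python3 param_generator.py -n 6 --output_dir ./parameter
sbatch job_script.sh

# after all jobs have finished: aggregate the results into one plot
python3 visualization.py --input_dir ./results --output plot.pdf

# --- back on your machine ---
# [File Transfer] download the resulting image
rsync -av user@cluster.example.org:~/dl_workflow/plot.pdf .
```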
Note:
The original data can be found here:
http://ufldl.stanford.edu/housenumbers/
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten
import pandas as pd
import os
from input_data import *
from helper import parse_command_line_args
import matplotlib.pyplot as plt

os.environ["KERAS_BACKEND"] = "tensorflow"


def define_cnn_model(input_shape, num_units, num_filters,
                     kernel_size=(2, 2), pool_size=(2, 2), conv_layers=1, dense_layers=1):
    model = tf.keras.models.Sequential()
    # Feature extraction region of the model
    model.add(Conv2D(filters=num_filters, kernel_size=kernel_size,
                     activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=pool_size))
    for i in range(1, conv_layers):
        model.add(Conv2D(filters=num_filters, kernel_size=kernel_size,
                         activation='relu'))
        model.add(MaxPooling2D(pool_size=pool_size))
    # Classification region of the model
    model.add(Flatten())
    for i in range(dense_layers):
        model.add(Dense(units=num_units, activation='relu'))
    model.add(Dense(units=11, activation='softmax'))
    model.summary()
    return model


def main():
    # Parse command line arguments
    ARGS = parse_command_line_args()

    # Get the data
    train_data, val_data, test_data = \
        get_input_data(ARGS.data_dir, ARGS.not_use_all_data, ARGS.validation_split_size)

    # Define the model
    model = define_cnn_model(input_shape=IMAGE_SIZE,
                             num_units=ARGS.num_units,
                             num_filters=ARGS.num_filters,
                             conv_layers=ARGS.conv_layers,
                             dense_layers=ARGS.dense_layers)

    # Set up the model
    opt = tf.keras.optimizers.SGD(learning_rate=ARGS.learning_rate)
    model.compile(optimizer=opt,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    keras_callbacks = []
    if ARGS.early_stopping:
        keras_callbacks.append(tf.keras.callbacks.EarlyStopping(monitor='loss',
                                                                patience=ARGS.early_stopping_patience))

    if ARGS.augment_data:
        train_data = train_data.map(
            augment_tf,
            num_parallel_calls=tf.data.experimental.AUTOTUNE, deterministic=False)
        # tf.py_function can execute arbitrary python code:
        # train_data = train_data.map(
        #     lambda x, y: (tf.py_function(augment_py, [x], [tf.float32]), y),
        #     num_parallel_calls=tf.data.experimental.AUTOTUNE, deterministic=False)
        # num_parallel_calls=tf.data.experimental.AUTOTUNE
        # autotune = number of CPUs available
        # need to set the shape, as tf cannot infer it (an augmentation may result in a different image size)
        train_data = train_data.map(set_shape)

    if ARGS.augment_test:
        test_data = test_data.repeat(3).map(augment_tf, num_parallel_calls=tf.data.experimental.AUTOTUNE,
                                            deterministic=False).map(set_shape)

    if ARGS.show_image:
        subset = train_data.take(3)
        print(subset)
        for image, label in subset:
            print(image.shape)
            # show the first images of the training dataset
            plt.imshow(image)
            plt.show()
        exit()

    train_data = train_data.shuffle(buffer_size=10000, reshuffle_each_iteration=True).batch(ARGS.batch_size)
    train_data = train_data.prefetch(20)
    val_data = val_data.batch(ARGS.batch_size)

    # Train the model using the train dataset
    train_history = model.fit(
        train_data,
        shuffle=True,
        validation_data=val_data,
        epochs=ARGS.num_epochs,
        verbose=1,
        callbacks=keras_callbacks
    )

    # Evaluate model performance on the test dataset
    test_data = test_data.batch(ARGS.batch_size)
    test_loss, test_accuracy = model.evaluate(test_data, verbose=True)

    # Write the results to the result directory
    os.makedirs(ARGS.result_dir, exist_ok=True)
    with open(ARGS.result_dir + '/' + 'test_accuracy.txt', 'w') as f:
        print("Test accuracy : {:.2f}%".format(test_accuracy * 100.0), file=f)
    pd.DataFrame(data=train_history.history,
                 index=range(1, len(train_history.history['loss']) + 1)).to_csv(
        ARGS.result_dir + '/' + 'train_hist.csv',
        index_label='epoch'
    )


if __name__ == "__main__":
    main()
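
The training script above is driven entirely by the command line flags defined in `parse_command_line_args` (see `helper.py` below). As a quick sanity check, one might run something like the following; the paths and the epoch count are assumptions, not prescribed values.

```
source venv/bin/activate
# short smoke test on the reduced dataset
python3 cnn_tf.py \
    --not_use_all_data \
    --num_epochs 2 \
    --data_dir ./DATA \
    --result_dir ./results/smoke_test
# on success, ./results/smoke_test/ contains test_accuracy.txt and train_hist.csv
```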
import argparse
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import os


# helper functions used in the exercises
def parse_command_line_args():
    parser = argparse.ArgumentParser(
        description='train a basic Convolutional Neural Network')
    parser.add_argument("--num_epochs", type=int, default=1,
                        required=False, action='store',
                        help="Number of epochs to train the NN.")
    parser.add_argument("--batch_size", type=int, default=32,
                        required=False, action='store',
                        help="Number of examples in a batch.")
    parser.add_argument("--learning_rate", type=float, default=1e-2,
                        required=False, action='store',
                        help="Learning rate used for gradient descent.")
    parser.add_argument("--validation_split_size", action='store',
                        type=float, required=False, default=0.2)
    parser.add_argument("--conv_layers", type=int, default=1,
                        required=False, action='store',
                        help="Number of conv layers.")
    parser.add_argument("--dense_layers", type=int, default=1,
                        required=False, action='store',
                        help="Number of dense layers.")
    parser.add_argument("--early_stopping", action='store_true',
                        default=False, required=False,
                        help="Use early stopping.")
    parser.add_argument("--early_stopping_patience", type=int, action='store',
                        default=3, required=False,
                        help="Patience to use for early stopping.")
    parser.add_argument("--show_image", action='store_true',
                        default=False, required=False,
                        help="Show an image of the training dataset and exit.")
    parser.add_argument("--not_use_all_data", action='store_true',
                        default=False, required=False,
                        help="Do not use all available training data.")
    parser.add_argument("--result_dir", type=str, default='./result',
                        required=False, action='store',
                        help="Where to store the results (result = recorded history of loss and accuracy).")
    parser.add_argument("--data_dir", type=str, default='./DATA',
                        required=False, action='store',
                        help="Path to the data location.")
    parser.add_argument("--augment_data", required=False, default=False,
                        action='store_true',
                        help="Activate data augmentation by rotating and shifting images.")
    parser.add_argument("--augment_test", required=False, default=False,
                        action='store_true',
                        help="Activate test data augmentation by rotating and shifting images.")
    parser.add_argument("--num_filters", type=int, default=16,
                        required=False, action='store')
    parser.add_argument("--num_units", type=int, default=64,
                        required=False, action='store')
    args = parser.parse_args()
    return args


def plot_lossacc_vs_epochs(histdict, export_to_pdf=False):
    train_acc = histdict['accuracy']
    train_loss = histdict['loss']
    try:
        val_loss = histdict['val_loss']
        val_acc = histdict['val_accuracy']
    except KeyError:
        pass
    epochs = range(1, len(train_acc) + 1)
    fig = plt.figure(figsize=(10, 5))

    # Loss
    ax1 = fig.add_subplot(1, 2, 1)
    ax1.set_title('Training and validation loss')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.minorticks_on()
    ax1.plot(epochs, train_loss, 'bo-', label='Train loss')
    ax1.plot(epochs, val_loss, 'rd--', label='Val loss')
    ax1.legend()

    # Accuracy
    ax2 = fig.add_subplot(1, 2, 2)
    ax2.set_title('Training and validation accuracy')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy')
    ax2.minorticks_on()
    ax2.plot(epochs, train_acc, 'bo-', label='Train acc')
    ax2.plot(epochs, val_acc, 'rd--', label='Val acc')
    ax2.legend()

    if export_to_pdf:
        plt.savefig('results_lossacc_vs_epochs.pdf',
                    bbox_inches='tight')
    else:
        plt.show()
import tensorflow as tf
import keras_cv
import scipy.io
import numpy as np
import os

# Handles the loading of the input data

# Constants
IMAGE_SIZE = (32, 32, 3)


# to specify the shape of our data, as it cannot be inferred by tf automatically if we use augmentation
@tf.function
def set_shape(x, y):
    x = tf.reshape(x, (32, 32, 3))
    y.set_shape([11])
    return x, y


@tf.function
def normalize(image, label):
    return image / 255.0, label


def get_input_data(DATA_PATH, not_use_extra=True, val_percent=0.1):
    # load the SVHN data from the specified folder
    fname = DATA_PATH + '/train_32x32.mat'
    assert os.path.isfile(fname), "Did not find input data at the specified location"
    input = scipy.io.loadmat(fname)
    x_train = input["X"][:]
    y_train = input["y"][:].flatten()
    # the .mat file stores the batch dimension last; move it to the front
    x_train = np.moveaxis(x_train, -1, 0)

    if not not_use_extra:
        fname = DATA_PATH + '/extra_32x32.mat'
        assert os.path.isfile(fname), "Did not find input data at the specified location"
        test = scipy.io.loadmat(fname)
        x_extra = test["X"][:, :, :, :]
        y_extra = test["y"][:].flatten()
        x_extra = np.moveaxis(x_extra, -1, 0)
        # just append the extra data at the end (tf will shuffle anyway)
        x_train = np.concatenate((x_train, x_extra))
        y_train = np.concatenate((y_train, y_extra))

    # use some part of the data as validation data
    val_samples = int(len(y_train) * val_percent)
    x_val = x_train[:val_samples]
    y_val = y_train[:val_samples]
    x_train = x_train[val_samples:]
    y_train = y_train[val_samples:]

    fname = DATA_PATH + '/test_32x32.mat'
    assert os.path.isfile(fname), "Did not find input data at the specified location"
    test = scipy.io.loadmat(fname)
    x_test = test["X"][:, :, :, :]
    y_test = test["y"][:].flatten()

    y_train = tf.keras.utils.to_categorical(y_train, num_classes=11)
    y_val = tf.keras.utils.to_categorical(y_val, num_classes=11)
    y_test = tf.keras.utils.to_categorical(y_test, num_classes=11)
    x_test = np.moveaxis(x_test, -1, 0)

    x_train = x_train.astype(np.float32)
    x_val = x_val.astype(np.float32)
    x_test = x_test.astype(np.float32)

    # also works as a sanity check that len(y) == len(x)
    train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train)).map(normalize)
    val_data = tf.data.Dataset.from_tensor_slices((x_val, y_val)).map(normalize)
    test_data = tf.data.Dataset.from_tensor_slices((x_test, y_test)).map(normalize)
    return train_data, val_data, test_data


# ranges for a manual augmentation (not used by the RandAugment-based augmenter below)
min_angle = -45
max_angle = 45
min_translate = -5
max_translate = +5

augmenter = keras_cv.layers.Augmenter([keras_cv.layers.RandAugment(value_range=(0, 1), magnitude=0.25)])


@tf.function
def augment_tf(image, label):
    return augmenter(image), label
#!/bin/bash
#SBATCH --ntasks 1
#SBATCH --cpus-per-task 8
#SBATCH --mem-per-cpu=3800
#SBATCH --time 00:30:00
# these parameters determine the size of our job,
# i.e. the amount of resources needed to process it
#SBATCH --array 1-6
# This is a job Array of 6 jobs
#SBATCH --job-name Tensorflow
# A name for this job to be displayed in the queue
#SBATCH --output job_%A_%a.out
# filename where the terminal output of this job goes
# deactivate any current environment
if [ ! -z "$VIRTUAL_ENV" ]; then
    deactivate
fi
# load the required modules
module purge
module load gcc python/3.9
# activate the correct environment
source ./venv/bin/activate
echo $(cat parameter/$SLURM_ARRAY_TASK_ID.params)
python3 cnn_tf.py $(cat ./parameter/$SLURM_ARRAY_TASK_ID.params) --data_dir=./DATA --result_dir=./results/result_$SLURM_ARRAY_TASK_ID
# so that we know which parameters were used when viewing the output
cp parameter/$SLURM_ARRAY_TASK_ID.params results/result_$SLURM_ARRAY_TASK_ID/input_parameter.params
deactivate
#!/bin/bash
#SBATCH --ntasks 1
#SBATCH --cpus-per-task 8
#SBATCH --mem-per-cpu=3800
#SBATCH --time 00:30:00
# these parameters determine the size of our job,
# i.e. the amount of resources needed to process it
#SBATCH --array 1-6
# This is a job Array of 6 jobs
#SBATCH --job-name Tensorflow
# A name for this job to be displayed in the queue
#SBATCH --output job_%A_%a.out
# filename where the terminal output of this job goes
# change to where the files are located
EXPERIMENT_DIR=./
# deactivate any current environment
if [ ! -z "$VIRTUAL_ENV" ]; then
    deactivate
fi
# load the required modules
module purge
module load gcc/8.5 python/3.10
# activate the correct environment
source $EXPERIMENT_DIR/venv/bin/activate
echo $(cat $EXPERIMENT_DIR/parameter/$SLURM_ARRAY_TASK_ID.params)
python3 $EXPERIMENT_DIR/cnn_tf.py $(cat $EXPERIMENT_DIR/parameter/$SLURM_ARRAY_TASK_ID.params) --data_dir=$EXPERIMENT_DIR/DATA --result_dir=$EXPERIMENT_DIR/results/result_$SLURM_ARRAY_TASK_ID
# so that we know which parameters were used when viewing the output
cp $EXPERIMENT_DIR/parameter/$SLURM_ARRAY_TASK_ID.params $EXPERIMENT_DIR/results/result_$SLURM_ARRAY_TASK_ID/input_parameter.params
deactivate
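
Before launching the full parameter scan, it can help to submit only a single array task and inspect its output. The commands below are a sketch of that workflow on a typical SLURM cluster; the `--array=1` override and the monitoring commands are common SLURM usage, not part of the provided material, and the output file name follows from the `--output job_%A_%a.out` directive above.

```
# submit only the first array task as a test (command line options override #SBATCH directives)
sbatch --array=1 job_script.sh

# check the queue and, once the job has finished, its output
squeue -u $USER
cat job_*_1.out

# if the test run looks fine, submit the complete array
sbatch job_script.sh
```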
import argparse
import random
import os

## definition of the search space
min_batch_size = 16
max_batch_size = 1024
min_lr = 0.0001
max_lr = 0.5
min_dense_layers = 1
max_dense_layers = 5
min_conv_layers = 1
max_conv_layers = 3
min_num_filters = 2
max_num_filters = 32
min_num_units = 11
max_num_units = 128

## settings for all parameter sets
use_augmentation = True
num_epochs = 10


def write_parameter_file(
        job_number=1,
        num_epochs=20,
        batch_size=128,
        lr=0.01,
        num_units=30,
        dense_layers=1,
        num_filters=16,
        conv_layers=1,
        augment=False,
        path="./parameter/",
):
    if augment:
        augment_str = "--augment_data"
    else:
        augment_str = ""
    parameters = (
        "--num_epochs %d --batch_size %d --learning_rate %f"
        " --num_units %d --dense_layers %d --conv_layers %d"
        " --num_filters %d %s\n"
        % (
            # " --early_stopping --trainfile_suffix %d --num_filters %d %s\n" % (
            num_epochs,
            batch_size,
            lr,
            num_units,
            dense_layers,
            conv_layers,
            num_filters,
            augment_str,
        )
    )
    with open(path + '/' + str(job_number) + ".params", "w") as file:
        file.write(parameters)


def main():
    parser = argparse.ArgumentParser(
        description='Generates parameters for a random parameter scan. The parameter ranges can be set up in the script.')
    parser.add_argument("--output_dir", type=str, default="./parameter",
                        required=False, action='store',
                        help="The directory where the output should be stored")
    parser.add_argument("-n", type=int, default=5,
                        required=False, action='store',
                        help="Number of parameter sets to generate")
    ARGS = parser.parse_args()

    os.makedirs(ARGS.output_dir, exist_ok=True)
    # using range(n + 1) leads to fewer errors when setting the SLURM array boundary
    for i in range(ARGS.n + 1):
        # choose random parameters
        write_parameter_file(
            job_number=i,
            num_epochs=num_epochs,
            batch_size=random.randint(min_batch_size, max_batch_size),
            lr=random.uniform(min_lr, max_lr),
            num_units=random.randint(min_num_units, max_num_units),
            dense_layers=random.randint(min_dense_layers, max_dense_layers),
            num_filters=random.randint(min_num_filters, max_num_filters),
            conv_layers=random.randint(min_conv_layers, max_conv_layers),
            augment=use_augmentation,
            path=ARGS.output_dir
        )


if __name__ == "__main__":
    main()
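
A possible invocation of the parameter generator above; `-n 6` matches the `--array 1-6` range in the job scripts (the script also writes a `0.params` file, which the array simply never uses). The file listing is illustrative.

```
python3 param_generator.py -n 6 --output_dir ./parameter
# inspect what was generated
ls parameter/            # 0.params 1.params ... 6.params
cat parameter/1.params
```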
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import argparse

# which parameter to use for the captions and the ordering of the subplots
caption_regex = re.compile(r"--learning_rate (\d+\.\d+)")


def main():
    parser = argparse.ArgumentParser(
        description='Visualizes the results of a parameter scan')
    parser.add_argument("--input_dir", type=str, default="results",
                        required=True, action='store',
                        help="Input directory")
    parser.add_argument("--num_rows", type=int, default=4,
                        required=False, action='store',
                        help="Plot layout: number of rows")
    parser.add_argument("--max_loss", type=float, default=0,
                        required=False, action='store',
                        help="Maximum loss to display (y scale of the plot)")
    parser.add_argument("--output", type=str, default="plot.pdf",
                        required=False, action='store',
                        help="Output filename")
    ARGS = parser.parse_args()

    # collect all result directories that contain the expected files
    cases = []
    for path, directories, files in os.walk(ARGS.input_dir):
        for dir in directories:
            if (os.path.isfile(os.path.join(path, dir, "train_hist.csv")) and
                    os.path.isfile(os.path.join(path, dir, "test_accuracy.txt")) and
                    os.path.isfile(os.path.join(path, dir, "input_parameter.params"))):
                cases.append(os.path.join(path, dir))
            else:
                print("%s does not have the expected files" % dir)

    num_results = len(cases)
    if num_results == 0:
        print("Nothing to visualize")
        exit()

    data = {}
    for case in cases:
        param_string = "unknown"
        # read the first line from the parameter file
        with open(os.path.join(case, "input_parameter.params"), 'r') as file:
            param_string = file.readline()
        train_hist = pd.read_csv(os.path.join(case, "train_hist.csv"))
        title = caption_regex.findall(param_string)[0]
        data[title] = train_hist

    num_cols = int(num_results / ARGS.num_rows)
    if num_results % ARGS.num_rows != 0:
        num_cols += 1

    fig, axes = plt.subplots(ARGS.num_rows, num_cols, sharex=True, sharey=True)
    if ARGS.max_loss != 0:
        plt.ylim([0, ARGS.max_loss])

    i = 0
    for title in sorted(data):
        train_hist = data[title]
        row = int(i / num_cols)
        col = i % num_cols
        if num_cols == 1:
            this_plot = axes[row]
        elif ARGS.num_rows == 1:
            this_plot = axes[col]
        else:
            this_plot = axes[row, col]
        this_plot.set_title(title)
        this_plot.plot(train_hist['epoch'], train_hist['loss'], 'bo-', label='Train loss')
        this_plot.plot(train_hist['epoch'], train_hist['val_loss'], 'rd--', label='Val loss')
        this_plot.set_xlabel('Epoch')
        this_plot.set_ylabel('Loss')
        this_plot.legend()
        i += 1

    # save the figure before showing it, so closing the window cannot leave an empty plot file
    plt.savefig(ARGS.output)
    plt.show()


if __name__ == "__main__":
    main()
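
Once the job array has finished, the visualization script above can be run as sketched below; `--input_dir ./results` matches the result layout produced by the job scripts, while the `--num_rows` and `--max_loss` values are just examples.

```
source venv/bin/activate
python3 visualization.py --input_dir ./results --num_rows 2 --max_loss 3.0 --output plot.pdf
```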