From 33f04293da513fa6f125513dda5ec4bdd8a7b3b4 Mon Sep 17 00:00:00 2001
From: ssibirtsev <sibi_ballad@gmx.de>
Date: Wed, 15 Nov 2023 14:49:31 +0100
Subject: [PATCH] Upload New File

---
 ...and_cross_validation_training_template.job | 105 ++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100644 manual/cluster_job_GPU_and_cross_validation_training_template.job

diff --git a/manual/cluster_job_GPU_and_cross_validation_training_template.job b/manual/cluster_job_GPU_and_cross_validation_training_template.job
new file mode 100644
index 0000000..66cd301
--- /dev/null
+++ b/manual/cluster_job_GPU_and_cross_validation_training_template.job
@@ -0,0 +1,105 @@
+#!/usr/bin/zsh
+
+##############################################
+##### Batch script for the MRCNN training ####
+##############################################
+
+#### CREATE SBATCH ENTRIES ####
+#### Paths and parameters must be adapted accordingly. 
+
+#### job name 
+#SBATCH --job-name=<JobName>
+
+#### Path and name of the output file of the job execution 
+#SBATCH --output=/home/<UserID>/.../<JobOutputFolderName>/%x_%J_output.txt
+
+#### Job runtime determined by testing jobs on the GPU node (see manual). 
+#### Multiply the computing time per epoch resulting from the test by the number of epochs to be trained. 
+#### Add a safety factor, e.g. multiply by 1.2. 
+#SBATCH --time=0-00:00:00
+
+#### Memory requirement per GPU determined by testing jobs on the GPU node (see manual). 
+#### Add a safety factor, e.g. multiply by 1.2. 
+#### For example: resulting value is 5GB --> --mem-per-gpu=5G 
+#SBATCH --mem-per-gpu=5G
+
+#### E-mail address 
+#SBATCH --mail-user=<EmailAdress>
+
+#### E-mails to be received 
+#SBATCH --mail-type=ALL
+
+#### Number of tasks to be performed 
+#SBATCH --ntasks=1
+
+#### Number of GPUs required per node 
+#SBATCH --gres=gpu:1
+
+#### Definition of the job array starting at 0. ###
+#### This parameter is only required if you want to perform several jobs in parallel 
+#### from one job script, e.g. 5-fold cross-validation. 
+#### In this example we perform a 5-fold cross-validation.
+#### Thus, we will run 5 jobs in parallel from one job script --> array=0-4 
+#SBATCH --array=0-4
+
+#### CREATE TERMINAL ENTRIES ####
+#### Paths and parameters must be adapted accordingly 
+
+#### Definition of the job parameter, which is varied 
+#### if several jobs are executed in parallel from one job script. 
+#### This job parameter is only required if you have specified the #SBATCH parameter --array above. 
+#### In this example, we perform a 5-fold cross-validation.
+#### Thus, we will run 5 jobs in parallel from one job script: 
+#### the parameter kfoldval corresponds to the validation fold number of the current training, 
+#### which is varied for each job.
+kfoldval="$SLURM_ARRAY_TASK_ID" # fold index of this array task (0-4, set by SLURM)
+
+#### Loading the Cuda module 
+module load cuda/10.0
+
+#### Export path in which Anaconda is located 
+export PATH=$PATH:/home/<UserID>/anaconda3/bin
+
+#### Activate environment 
+source activate env_mrcnn_gpu
+
+#### Navigate to the path where the droplet.py script is located
+cd /home/<UserID>/.../samples/droplet/
+
+#### Run the droplet.py script. 
+#### These are the required training parameters to be specified
+#### with additional parameters required for the execution of parallel jobs from one job script. 
+#### In this example, we perform a 5-fold cross-validation.
+#### Thus, 5 jobs are executed in parallel (#SBATCH --array=0-4).
+#### In each job the job parameter kfoldval is varied, starting with 0 and ending with 4.
+#### First, we need to assign the job parameter kfoldval to the training parameter k_fold_val (--k_fold_val=$kfoldval). 
+#### Moreover, we need 5 output weights folders defined by the training parameter --new_weights_path.  
+#### Optional training parameters can be found below. 
+#### For descriptions and default settings of all training parameters, see the manual. 
+python train_droplet.py --dataset_path=<InputFolderName> --new_weights_path=<WeightsFolderName>_"$kfoldval" --file_format=<FileFormat> --image_max=<Integer> --images_gpu=<Integer> --device=True --cross_validation=True --k_fold=5 --k_fold_val=$kfoldval
+
+#### Optional training parameters: 
+#### --name_result_file  
+#### --base_weights 
+#### --train_all_layers 
+#### --masks 
+#### --dataset_quantity 
+#### --epochs 
+#### --early_stopping 
+#### --early_loss 
+#### --use_wandb 
+#### --wandb_entity 
+#### --wandb_project 
+#### --wandb_group 
+#### --wandb_run 
+#### --backbone_type 
+#### --learning
+#### --momentum
+#### --w_decay
+#### --augmentation 
+#### --flip
+#### --cropandpad
+#### --rotate
+#### --noise
+#### --gamma
+#### --contrast 
\ No newline at end of file
-- 
GitLab