From 33f04293da513fa6f125513dda5ec4bdd8a7b3b4 Mon Sep 17 00:00:00 2001 From: ssibirtsev <sibi_ballad@gmx.de> Date: Wed, 15 Nov 2023 14:49:31 +0100 Subject: [PATCH] Upload New File --- ...and_cross_validation_training_template.job | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 manual/cluster_job_GPU_and_cross_validation_training_template.job diff --git a/manual/cluster_job_GPU_and_cross_validation_training_template.job b/manual/cluster_job_GPU_and_cross_validation_training_template.job new file mode 100644 index 0000000..66cd301 --- /dev/null +++ b/manual/cluster_job_GPU_and_cross_validation_training_template.job @@ -0,0 +1,105 @@ +#!/usr/bin/zsh + +############################################## +##### Batch script for the MRCNN training #### +############################################## + +#### CREATE SBATCH ENTRIES #### +#### Paths and parameters must be adapted accordingly. + +#### job name +#SBATCH --job-name=<JobName> + +#### Path and name of the output file of the job execution +#SBATCH --output=/home/<UserID>/.../<JobOutputFolderName>/%x_%J_output.txt + +#### Job runtime determined by testing jobs on the GPU node (see manual). +#### Multiply the computing time per epoch resulting from the test by the number of epochs to be trained. +#### Add a safety factor, e.g. multiply with 1.2 +#SBATCH --time=0-00:00:00 + +#### Memory requirement per GPU determined by testing jobs on the GPU node (see manual). +#### Add a safety factor, e.g. multiply with 1.2. +#### For example: resulting value is 5GB --> --mem-per-gpu=5G +#SBATCH --mem-per-gpu=5G + +#### E-mail address +#SBATCH --mail-user=<EmailAdress> + +#### E-mails to be received +#SBATCH --mail-type=ALL + +#### Number of tasks to be performed +#SBATCH --ntasks=1 + +#### Number of GPUs required per node +#SBATCH --gres=gpu:1 + +#### Definition of the job array starting at 0. 
####
#### This parameter is only required if you want to perform several jobs in parallel
#### from one job script, e.g. 5-fold cross-validation.
#### In this example, we perform a 5-fold cross-validation.
#### Thus, we will run 5 jobs in parallel from one job script --> array=0-4
#SBATCH --array=0-4

#### CREATE TERMINAL ENTRIES ####
#### Paths and parameters must be adapted accordingly

#### Definition of the job parameter, which is varied
#### if several jobs are executed in parallel from one job script.
#### This job parameter is only required if you have specified the #SBATCH parameter --array above.
#### In this example, we perform a 5-fold cross-validation.
#### Thus, we will run 5 jobs in parallel from one job script:
#### the parameter kfoldval corresponds to the validation fold number of the current training,
#### which is varied for each job.
kfoldval="$SLURM_ARRAY_TASK_ID"

#### Load the CUDA module
module load cuda/10.0

#### Export the path in which Anaconda is located
export PATH=$PATH:/home/<UserID>/anaconda3/bin

#### Activate environment
source activate env_mrcnn_gpu

#### Navigate to the path where the droplet.py script is located
cd /home/<UserID>/.../samples/droplet/

#### Run the droplet.py script.
#### These are the required training parameters to be specified
#### with additional parameters required for the execution of parallel jobs from one job script.
#### In this example, we perform a 5-fold cross-validation.
#### Thus, 5 jobs are executed in parallel (#SBATCH --array=0-4).
#### In each job the job parameter kfoldval is varied, starting with 0 and ending with 4.
#### First, we need to assign the job parameter kfoldval to the training parameter k_fold_val (--k_fold_val=$kfoldval).
#### Moreover, we need 5 output weights folders defined by the training parameter --new_weights_path.
#### Optional training parameters can be found below.

#### Descriptions and default settings of all training parameters can be found in the manual.
python train_droplet.py --dataset_path=<InputFolderName> --new_weights_path=<WeightsFolderName>_"$kfoldval" --file_format=<FileFormat> --image_max=<Integer> --images_gpu=<Integer> --device=True --cross_validation=True --k_fold=5 --k_fold_val=$kfoldval

#### Optional training parameters:
#### --name_result_file
#### --base_weights
#### --train_all_layers
#### --masks
#### --dataset_quantity
#### --epochs
#### --early_stopping
#### --early_loss
#### --use_wandb
#### --wandb_entity
#### --wandb_project
#### --wandb_group
#### --wandb_run
#### --backbone_type
#### --learning
#### --momentum
#### --w_decay
#### --augmentation
#### --flip
#### --cropandpad
#### --rotate
#### --noise
#### --gamma
#### --contrast \ No newline at end of file -- GitLab