Commit 578a3833 authored by Tobias Seibel

minor bug fixes

parent ce99d71b
@@ -2,3 +2,5 @@
*/__pycache__
*/trained_ddpm
root
+experiments
+trainer/__pycache__
\ No newline at end of file
@@ -62,7 +62,7 @@ class UnconditionalDataset(Dataset):
        return len(self.df)

    def __getitem__(self,idx):
-        path = self.df.iloc[idx].Filepaths
+        path = self.df.iloc[idx].Filepath
        img = Image.open(path)
        return self.transform(img),0
...
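For context on the one-character fix above: the dataset's DataFrame exposes a `Filepath` column, so the old attribute access `.Filepaths` raised an `AttributeError` on the first `__getitem__` call. A minimal reproduction (the DataFrame construction here is assumed for illustration):

``` python
import pandas as pd

df = pd.DataFrame({"Filepath": ["img_0001.png", "img_0002.png"]})
row = df.iloc[0]
print(row.Filepath)  # 'img_0001.png' -- matches the real column name
# row.Filepaths      # AttributeError: 'Series' object has no attribute 'Filepaths'
```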
%% Cell type:code id: tags:
``` python
from trainer.train import *
from dataloader.load import *
from models.Framework import *
from models.all_unets import *
import torch
from torch import nn
```
%% Cell type:markdown id: tags:
# Prepare experiment
1. Choose hyperparameter settings
2. Run the notebook on the local machine to generate the experiment folder with the JSON files containing the settings
3. scp the experiment folder to the HPC
4. Run the pipeline by adding the following to the batch file (a hypothetical sketch of the dispatch these commands imply follows this list):
- Train Model: &emsp;&emsp;&emsp;&emsp;&emsp; `python main.py train "<absolute path of experiment folder in hpc>"`
- Sample Images: &emsp;&emsp;&emsp; `python main.py sample "<absolute path of experiment folder in hpc>"`
- Evaluate Model: &emsp;&emsp;&emsp; `python main.py evaluate "<absolute path of experiment folder in hpc>"`
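The three commands imply that `main.py` dispatches on its first argument. The sketch below is hypothetical (`main.py` itself is not part of this diff) and shows only the argument handling these calls assume:

``` python
import sys

def main() -> None:
    # Expected invocation: python main.py <train|sample|evaluate> "<experiment folder>"
    if len(sys.argv) != 3:
        raise SystemExit("usage: python main.py <train|sample|evaluate> <experiment folder>")
    mode, experiment_dir = sys.argv[1], sys.argv[2]
    if mode not in ("train", "sample", "evaluate"):
        raise SystemExit(f"unknown mode {mode!r}")
    # The real main.py presumably loads the JSON settings from
    # <experiment folder>/settings and runs ddpm_trainer / ddpm_sampler /
    # ddpm_evaluator (names taken from meta_setting below).
    print(f"would run {mode} on {experiment_dir}")

if __name__ == "__main__":
    main()
```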
%% Cell type:code id: tags:
``` python
import torch

####
# Settings
####

# Dataset path
datapath = "/work/lect0100/lhq_256"

# Experiment setup
-run_name = 'batch_timesteps' # WANDB and experiment folder name!
+run_name = 'main_test0' # WANDB and experiment folder name!
checkpoint = None #'model_epoch_8.pth' # Name of checkpoint pth file or None
-experiment_path = '/work/lect0100/experiments_gonzalo/' + run_name + '/'
+experiment_path = "/work/lect0100/main_experiment/" + run_name + '/'

# Path to save generated experiment folder on local machine
-local_path = "/Users/gonzalo/Desktop/" + run_name + '/settings'
+local_path = "experiments/" + run_name + '/settings'

# Diffusion Model Settings
-diffusion_steps = 200
+diffusion_steps = 500
-image_size = 64
+image_size = 128
channels = 3

# Training
batchsize = 32
-epochs = 30
+epochs = 20
-store_iter = 1
+store_iter = 5
-eval_iter = 500
+eval_iter = 2
learning_rate = 0.0001
optimizername = "torch.optim.AdamW"
optimizer_params = None
verbose = True
# checkpoint = None # (if no checkpoint, training starts from random weights)

# Sampling
-sample_size = 10
+sample_size = 20
intermediate = False # True if you want to sample one image and all its intermediate latents

# Evaluating
...

###
# Advanced Settings Dictionaries
###
meta_setting = dict(modelname = "UNet_Res",
                    dataset = "UnconditionalDataset",
                    framework = "DDPM",
                    trainloop_function = "ddpm_trainer",
                    sampling_function = 'ddpm_sampler',
                    evaluation_function = 'ddpm_evaluator',
                    batchsize = batchsize
                    )

dataset_setting = dict(fpath = datapath,
                       img_size = image_size,
                       frac = 0.8,
                       skip_first_n = 0,
                       ext = ".png",
                       transform = True
                       )

model_setting = dict(n_channels=64,
                     fctr = [1,2,4,4,8],
                     time_dim=256,
-                    attention = True,
                     )
"""
outdated
model_setting = dict( channels_in=channels,
                      channels_out=channels,
                      activation='relu', # activation function. Options: {'relu', 'leakyrelu', 'selu', 'gelu', 'silu'/'swish'}
                      weight_init='he', # weight initialization. Options: {'he', 'torch'}
                      projection_features=64, # number of image features after first convolution layer
                      time_dim=batchsize, # don't change!!!
                      time_channels=diffusion_steps, # number of time channels #TODO same as diffusion steps?
                      num_stages=4, # number of stages in contracting/expansive path
                      stage_list=None, # specify number of features produced by stages
                      num_blocks=1, # number of ConvResBlock in each contracting/expansive path
                      num_groupnorm_groups=32, # number of groups used in Group Normalization inside a ConvResBlock
                      dropout=0.1, # drop-out to be applied inside a ConvResBlock
                      attention_list=None, # specify MHA pattern across stages
                      num_attention_heads=1,
                      )
"""
framework_setting = dict(
    diffusion_steps = diffusion_steps, # don't change!!
    out_shape = (channels, image_size, image_size), # don't change!!
    noise_schedule = 'linear',
    beta_1 = 1e-4,
    beta_T = 0.02,
    alpha_bar_lower_bound = 0.9,
    var_schedule = 'same',
    kl_loss = 'simplified',
    recon_loss = 'nll',
    )
training_setting = dict(
    epochs = epochs,
    store_iter = store_iter,
    eval_iter = eval_iter,
    optimizer_class = optimizername,
    optimizer_params = optimizer_params,
    #optimizer_params = dict(lr=learning_rate), # don't change!
    learning_rate = learning_rate,
    run_name = run_name,
    checkpoint = checkpoint,
    experiment_path = experiment_path,
    verbose = verbose,
-   T_max = 5*10000, # cosine lr param
+   T_max = 0.8*90000/32*150, # cosine lr param: steps per epoch (len(train_ds)/batchsize) * total epochs
-   eta_min = 1e-5, # cosine lr param
+   eta_min = 1e-10, # cosine lr param
    )
sampling_setting = dict(
    checkpoint = checkpoint,
    experiment_path = experiment_path,
    batch_size = sample_size,
    intermediate = intermediate
    )
# TODO
evaluation_setting = dict(
    checkpoint = checkpoint,
    experiment_path = experiment_path,
    )
```
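Two of the changed settings are worth unpacking. The new `T_max` is the total number of optimizer steps for the cosine schedule, presumably consumed by `torch.optim.lr_scheduler.CosineAnnealingLR`: with the 0.8 train split of the ~90k-image LHQ set and batch size 32, `0.8 * 90000 / 32 = 2250` steps per epoch, times 150 epochs, gives `T_max = 337500` (note this spans more epochs than the `epochs = 20` of a single run, so one run traverses only part of the cosine). A minimal sketch with a stand-in model:

``` python
import torch

model = torch.nn.Linear(8, 8)  # stand-in for the U-Net
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=int(0.8 * 90000 / 32 * 150),  # = 337500 steps, as in training_setting
    eta_min=1e-10,                      # floor of the annealed learning rate
)
optimizer.step()
scheduler.step()  # one scheduler step per optimizer step
```

`framework_setting` names the standard DDPM linear noise schedule; a sketch of the textbook construction under those values (the repo's `DDPM` framework class may differ in details):

``` python
diffusion_steps, beta_1, beta_T = 500, 1e-4, 0.02
betas = torch.linspace(beta_1, beta_T, diffusion_steps)  # 'linear' schedule
alphas = 1.0 - betas
alpha_bar = torch.cumprod(alphas, dim=0)  # bar(alpha)_t, drives the closed-form forward noising
```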
%% Cell type:code id: tags:
``` python
import os
import json

f = local_path
if os.path.exists(f):
    print("path already exists, pick a new name!")
    print("break")
else:
    print("create folder")
    #os.mkdir(f)
    os.makedirs(f, exist_ok=True)
    print("folder created ")

    with open(f+"/meta_setting.json","w+") as fp:
        json.dump(meta_setting,fp)
    with open(f+"/dataset_setting.json","w+") as fp:
        json.dump(dataset_setting,fp)
    with open(f+"/model_setting.json","w+") as fp:
        json.dump(model_setting,fp)
    with open(f+"/framework_setting.json","w+") as fp:
        json.dump(framework_setting,fp)
    with open(f+"/training_setting.json","w+") as fp:
        json.dump(training_setting,fp)
    with open(f+"/sampling_setting.json","w+") as fp:
        json.dump(sampling_setting,fp)
    with open(f+"/evaluation_setting.json","w+") as fp:
        json.dump(evaluation_setting,fp)
    print("stored json files in folder")

print(meta_setting)
print(dataset_setting)
print(model_setting)
print(framework_setting)
print(training_setting)
print(sampling_setting)
print(evaluation_setting)
```
%% Output
create folder
folder created
stored json files in folder
-{'modelname': 'UNet_Unconditional_Diffusion_Bottleneck_Variant', 'dataset': 'UnconditionalDataset', 'framework': 'DDPM', 'trainloop_function': 'ddpm_trainer', 'sampling_function': 'ddpm_sampler', 'evaluation_function': 'ddpm_evaluator', 'batchsize': 32}
+{'modelname': 'UNet_Res', 'dataset': 'UnconditionalDataset', 'framework': 'DDPM', 'trainloop_function': 'ddpm_trainer', 'sampling_function': 'ddpm_sampler', 'evaluation_function': 'ddpm_evaluator', 'batchsize': 32}
-{'fpath': '/work/lect0100/lhq_256', 'img_size': 64, 'frac': 0.8, 'skip_first_n': 0, 'ext': '.png', 'transform': True}
+{'fpath': '/work/lect0100/lhq_256', 'img_size': 128, 'frac': 0.8, 'skip_first_n': 0, 'ext': '.png', 'transform': True}
-{'channels_in': 3, 'channels_out': 3, 'activation': 'relu', 'weight_init': 'he', 'projection_features': 64, 'time_dim': 32, 'time_channels': 200, 'num_stages': 4, 'stage_list': None, 'num_blocks': 1, 'num_groupnorm_groups': 32, 'dropout': 0.1, 'attention_list': None, 'num_attention_heads': 1}
+{'n_channels': 64, 'fctr': [1, 2, 4, 4, 8], 'time_dim': 256}
-{'diffusion_steps': 200, 'out_shape': (3, 64, 64), 'noise_schedule': 'linear', 'beta_1': 0.0001, 'beta_T': 0.02, 'alpha_bar_lower_bound': 0.9, 'var_schedule': 'same', 'kl_loss': 'simplified', 'recon_loss': 'nll'}
+{'diffusion_steps': 500, 'out_shape': (3, 128, 128), 'noise_schedule': 'linear', 'beta_1': 0.0001, 'beta_T': 0.02, 'alpha_bar_lower_bound': 0.9, 'var_schedule': 'same', 'kl_loss': 'simplified', 'recon_loss': 'nll'}
-{'epochs': 30, 'store_iter': 1, 'eval_iter': 500, 'optimizer_class': 'torch.optim.AdamW', 'optimizer_params': None, 'learning_rate': 0.0001, 'run_name': 'batch_timesteps', 'checkpoint': None, 'experiment_path': '/work/lect0100/experiments_gonzalo/batch_timesteps/', 'verbose': True}
+{'epochs': 10, 'store_iter': 2, 'eval_iter': 2, 'optimizer_class': 'torch.optim.AdamW', 'optimizer_params': None, 'learning_rate': 0.0001, 'run_name': 'main_testing', 'checkpoint': None, 'experiment_path': '/work/lect0100/tobi/main_test/main_testing/', 'verbose': True, 'T_max': 9000000, 'eta_min': 1e-10}
-{'checkpoint': None, 'experiment_path': '/work/lect0100/experiments_gonzalo/batch_timesteps/', 'batch_size': 10, 'intermediate': False}
+{'checkpoint': None, 'experiment_path': '/work/lect0100/tobi/main_test/main_testing/', 'batch_size': 10, 'intermediate': False}
-{'checkpoint': None, 'experiment_path': '/work/lect0100/experiments_gonzalo/batch_timesteps/'}
+{'checkpoint': None, 'experiment_path': '/work/lect0100/tobi/main_test/main_testing/'}
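The reading side is not part of this diff; below is a hypothetical sketch of how `main.py` could reload the settings written above (the function name and layout are assumptions; note that `json.dump` has already turned the `out_shape` tuple into a JSON list, so the loader gets `[3, 128, 128]` back):

``` python
import json
import os

def load_settings(settings_dir):
    """Reload the seven *_setting.json files written by the cell above."""
    settings = {}
    for name in ["meta", "dataset", "model", "framework",
                 "training", "sampling", "evaluation"]:
        with open(os.path.join(settings_dir, f"{name}_setting.json")) as fp:
            settings[name] = json.load(fp)
    return settings
```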
-%% Cell type:code id: tags:
-``` python
-```
%% Cell type:code id: tags:
``` python
```
...
This diff is collapsed.
@@ -174,7 +174,7 @@ def ddpm_trainer(model,
    ema = ModelEmaV2(model, decay=decay, device = model.device)

    # Using W&B
-    with wandb.init(project='test-project', name=run_name, entity='gonzalomartingarcia0', id=run_name, resume=True) as run:
+    with wandb.init(project='Unconditional Landscapes', name=run_name, entity='deep-lab-', id=run_name, resume=True) as run:

        # Log some info
        run.config.learning_rate = learning_rate
...
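Only the W&B project and entity change here, but the pattern is worth a note: passing a fixed `id` together with `resume=True` means a job restarted from a checkpoint should continue logging into the same W&B run rather than opening a new one. A minimal sketch of that pattern (the project name is a placeholder):

``` python
import wandb

run_name = "main_test0"  # the trainer passes its run_name as both name and id
with wandb.init(project="my-project", name=run_name, id=run_name, resume=True) as run:
    run.config.learning_rate = 1e-4
    run.log({"train_loss": 0.123})
```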