Skip to content
Snippets Groups Projects
Commit 89776614 authored by Gonzalo Martin Garcia's avatar Gonzalo Martin Garcia
Browse files

More intuitive creator notebook. Experiment folders have the name of the wandb...

More intuitive creator notebook. Experiment folders have the name of the wandb run; samples are associated with the model epoch they were generated from. Other small changes
parent 8f2613b1
Branches
No related tags found
No related merge requests found
......@@ -3,15 +3,15 @@ from evaluation.sample import ddpm_sampler
def ddpm_evaluator(model,
device,
dataloader,
checkpoint_path,
checkpoint,
experiment_path
):
'''
Takes a trained diffusion model from 'checkpoint_path' and evaluates its performance on the test
Takes a trained diffusion model from 'checkpoint' and evaluates its performance on the test
dataset 'dataloader'. ....
model: Properly initialized DDPM model
checkpoint_path: Path to the checkpoint of the trained diffusion model weights and biases
checkpoint: Name of the saved .pth file containing the trained weights and biases
experiment_path: Path to the experiment folder where the evaluation results will be stored
testloader: Loads the test dataset
TODO ...
......
import os
import torch
from torchvision import transforms
import re
def ddpm_sampler(model, checkpoint_path, experiment_path, device, intermediate=False, batch_size=15):
def ddpm_sampler(model, checkpoint, experiment_path, device, intermediate=False, batch_size=15):
'''
Samples a tensor of 'batch_size' images from a trained diffusion model on 'checkpoint_path'. The generated
images are stored in the directory 'experiment_path/samples/sample_{j}. Where j is an index
Samples a tensor of 'batch_size' images from a trained diffusion model with 'checkpoint'. The generated
images are stored in the directory 'experiment_path/samples/epoch_{e}/sample_{j}', where e is the epoch
of the model checkpoint we are sampling from and j is an index separating images from each call of the
sampling function for the given model.
model: Diffusion model
checkpoint_path: Path to the .pth checkpoint containing the trained weights and biases
checkpoint: Name of the saved .pth file containing the trained weights and biases
experiment_path: Path to the experiment directory where the samples will be saved under the directory 'samples'
batch_size: The number of images to sample
intermediate: Bool value. If False the sampling function will draw a batch of images, else it will just
......@@ -17,6 +20,7 @@ def ddpm_sampler(model, checkpoint_path, experiment_path, device, intermediate=F
# load model
try:
checkpoint_path = f'{experiment_path}trained_ddpm/{checkpoint}'
checkpoint = torch.load(checkpoint_path)
# load weights and biases of the U-Net
net_state_dict = checkpoint['model']
......@@ -25,15 +29,26 @@ def ddpm_sampler(model, checkpoint_path, experiment_path, device, intermediate=F
except Exception as e:
print("Error loading checkpoint. Exception:", e)
# create samples directory (if first time sampling images)
output_dir = f'{experiment_path}/samples/'
# create samples directory for the complete experiment (if first time sampling images)
output_dir = f'{experiment_path}samples/'
#output_dir = os.path.join(experiment_path,'/samples/')
os.makedirs(output_dir, exist_ok=True)
# create the sample directory for this sampling run
sample_dir_list = [d for d in os.listdir(output_dir) if os.path.isdir(os.path.join(output_dir, d))]
# create sample directory for the current version of the trained model
model_name = os.path.basename(checkpoint_path)
# the epoch is taken to be the first run of digits in the checkpoint file name
epoch = re.findall(r'\d+', model_name)
if epoch:
    e = int(epoch[0])
else:
    # report the offending file name ('filename' was never defined here and
    # would have raised a NameError instead of this ValueError)
    raise ValueError(f"No digit found in the filename: {model_name}")
model_dir = os.path.join(output_dir, f'epoch_{e}')
os.makedirs(model_dir, exist_ok=True)
# create the sample directory for this sampling run for the current version of the model
sample_dir_list = [d for d in os.listdir(model_dir) if os.path.isdir(os.path.join(model_dir, d))]
indx_list = [int(d.split('_')[1]) for d in sample_dir_list if d.startswith('sample_')]
j = max(indx_list, default=-1) + 1
sample_dir = os.path.join(output_dir, f'sample_{j}')
sample_dir = os.path.join(model_dir, f'sample_{j}')
os.makedirs(sample_dir, exist_ok=True)
# transform
......
%% Cell type:code id: tags:
``` python
from trainer.train import *
from dataloader.load import *
from models.Framework import *
from models.unet_unconditional_diffusion import *
import torch
from torch import nn
```
%% Cell type:markdown id: tags:
# Prepare experiment
1. Choose Hyperparameter Settings
2. Run notebook on local machine to generate experiment folder with the JSON files containing the settings
3. scp experiment folder to the HPC
4. Run Pipeline by adding following to batch file:
- Train Model: &emsp;&emsp;&emsp;&emsp;&emsp; `python main.py train "<absolute path of folder in hpc>"`
- Generate Images: &emsp;&emsp; `python main.py generate "<absolute path of folder in hpc>"`
- Evaluate Model: &emsp;&emsp;&emsp; `python main.py evaluate "<absolute path of folder in hpc>"`
- Train Model: &emsp;&emsp;&emsp;&emsp;&emsp; `python main.py train "<absolute path of experiment folder in hpc>"`
- Sample Images: &emsp;&emsp;&emsp; `python main.py sample "<absolute path of experiment folder in hpc>"`
- Evaluate Model: &emsp;&emsp;&emsp; `python main.py evaluate "<absolute path of experiment folder in hpc>"`
%% Cell type:code id: tags:
``` python
import torch
####
# Settings
####
# Dataset path
datapath = "/work/lect0100/lhq_256"
# Experiment setup
experiment_name = "exp_0"
model_name = 'model_epoch_20.pth'
checkpoint_path = '/work/lect0100/experiments_gonzalo/'+ experiment_name + '/trained_ddpm/' + model_name
experiment_path = '/work/lect0100/experiments_gonzalo/'+ experiment_name +'/'
run_name = 'big_diffusers_fin' # WANDB and experiment folder Name!
checkpoint = 'model_epoch_8.pth' # Name of checkpoint pth file or None
experiment_path = '/work/lect0100/experiments_gonzalo/'+ run_name +'/'
# Path to save generated experiment folder on local machine
local_path ="/Users/gonzalo/Desktop/experiments/" + experiment_name + '/'
local_path ="/Users/gonzalo/Desktop/" + run_name + '/settings'
# Diffusion Model Settings
diffusion_steps = 200
image_size = 64
image_size = 128
channels = 3
# Training
batchsize = 64
epochs = 5
store_iter = 6
eval_iter = 0
batchsize = 8
epochs = 30
store_iter = 3
eval_iter = 500
learning_rate = 0.0001
lr_schedule = False
optimizername = "torch.optim.AdamW"
optimizer_params = None
run_name = 'diffusers_recon_20'# id for WANDB
verbose = True
# checkpoint_path = None #(If no checkpoint training, ie. random weights)
# checkpoint = None #(If no checkpoint training, ie. random weights)
# Sampling
sample_size = 10
intermediate = False # True if you want to sample one image and all its intermediate latents
# Evaluating
...
###
# Advanced Settings Dictionaries
###
meta_setting = dict(modelname = "UNet_Unconditional_Diffusion_Bottleneck_Variant",
dataset = "UnconditionalDataset",
framework = "DDPM",
trainloop_function = "ddpm_trainer",
sampling_function = 'ddpm_sampler',
evaluation_function = 'ddpm_evaluator',
batchsize = batchsize
)
dataset_setting = dict(fpath = datapath,
img_size = image_size,
frac =0.8,
skip_first_n = 0,
ext = ".png",
transform=True
)
model_setting = dict( channels_in=channels,
channels_out =channels ,
activation='relu', # activation function. Options: {'relu', 'leakyrelu', 'selu', 'gelu', 'silu'/'swish'}
weight_init='he', # weight initialization. Options: {'he', 'torch'}
projection_features=64, # number of image features after first convolution layer
time_dim=batchsize, #don't change!!!
time_channels=diffusion_steps, # number of time channels #TODO same as diffusion steps?
num_stages=4, # number of stages in contracting/expansive path
stage_list=None, # specify number of features produced by stages
num_blocks=1, # number of ConvResBlock in each contracting/expansive path
num_groupnorm_groups=32, # number of groups used in Group Normalization inside a ConvResBlock
dropout=0.1, # drop-out to be applied inside a ConvResBlock
attention_list=None, # specify MHA pattern across stages
num_attention_heads=1,
)
framework_setting = dict(
diffusion_steps = diffusion_steps, # dont change!!
out_shape = (channels,image_size,image_size), # dont change!!
noise_schedule = 'linear',
beta_1 = 1e-4,
beta_T = 0.02,
alpha_bar_lower_bound = 0.9,
var_schedule = 'same',
kl_loss = 'weighted',
kl_loss = 'simplified',
recon_loss = 'nll',
)
training_setting = dict(
epochs = epochs,
store_iter = store_iter,
eval_iter = eval_iter,
optimizer_class=optimizername,
optimizer_params = optimizer_params,
#optimizer_params=dict(lr=learning_rate), # don't change!
learning_rate = learning_rate,
lr_schedule = lr_schedule,
run_name=run_name,
checkpoint_path= checkpoint_path,
checkpoint= checkpoint,
experiment_path = experiment_path,
verbose = verbose,
)
sampling_setting = dict(
checkpoint_path = checkpoint_path ,
checkpoint = checkpoint,
experiment_path = experiment_path,
batch_size = sample_size,
intermediate = intermediate
)
# TODO
evaluation_setting = dict(
checkpoint_path = checkpoint_path,
checkpoint = checkpoint,
experiment_path = experiment_path,
)
```
%% Cell type:code id: tags:
``` python
import os
import json
f = local_path
if os.path.exists(f):
print("path already exists, pick a new name!")
print("break")
else:
print("create folder")
os.mkdir(f)
#os.mkdir(f)
os.makedirs(f, exist_ok=True)
print("folder created ")
with open(f+"/meta_setting.json","w+") as fp:
json.dump(meta_setting,fp)
with open(f+"/dataset_setting.json","w+") as fp:
json.dump(dataset_setting,fp)
with open(f+"/model_setting.json","w+") as fp:
json.dump(model_setting,fp)
with open(f+"/framework_setting.json","w+") as fp:
json.dump(framework_setting,fp)
with open(f+"/training_setting.json","w+") as fp:
json.dump(training_setting,fp)
with open(f+"/sampling_setting.json","w+") as fp:
json.dump(sampling_setting,fp)
with open(f+"/evaluation_setting.json","w+") as fp:
json.dump(evaluation_setting,fp)
print("stored json files in folder")
print(meta_setting)
print(dataset_setting)
print(model_setting)
print(framework_setting)
print(training_setting)
print(sampling_setting)
print(evaluation_setting)
```
%% Output
create folder
folder created
stored json files in folder
{'modelname': 'UNet_Unconditional_Diffusion_Bottleneck_Variant', 'dataset': 'UnconditionalDataset', 'framework': 'DDPM', 'trainloop_function': 'ddpm_trainer', 'sampling_function': 'ddpm_sampler', 'evaluation_function': 'ddpm_evaluator', 'batchsize': 64}
{'fpath': '/work/lect0100/lhq_256', 'img_size': 64, 'frac': 0.8, 'skip_first_n': 0, 'ext': '.png', 'transform': True}
{'channels_in': 3, 'channels_out': 3, 'activation': 'relu', 'weight_init': 'he', 'projection_features': 64, 'time_dim': 64, 'time_channels': 200, 'num_stages': 4, 'stage_list': None, 'num_blocks': 1, 'num_groupnorm_groups': 32, 'dropout': 0.1, 'attention_list': None, 'num_attention_heads': 1}
{'diffusion_steps': 200, 'out_shape': (3, 64, 64), 'noise_schedule': 'linear', 'beta_1': 0.0001, 'beta_T': 0.02, 'alpha_bar_lower_bound': 0.9, 'var_schedule': 'same', 'kl_loss': 'weighted', 'recon_loss': 'nll'}
{'epochs': 5, 'store_iter': 6, 'eval_iter': 0, 'optimizer_class': 'torch.optim.AdamW', 'optimizer_params': None, 'learning_rate': 0.0001, 'lr_schedule': False, 'run_name': 'diffusers_recon_20', 'checkpoint_path': '/work/lect0100/experiments_377031/exp_0/trained_ddpm/model_epoch_20.pth', 'verbose': True}
{'checkpoint_path': '/work/lect0100/experiments_377031/exp_0/trained_ddpm/model_epoch_20.pth', 'experiment_path': '/work/lect0100/experiments_377031/exp_0/', 'batch_size': 10, 'intermediate': False}
{'checkpoint_path': '/work/lect0100/experiments_377031/exp_0/trained_ddpm/model_epoch_20.pth', 'experiment_path': '/work/lect0100/experiments_377031/exp_0/'}
{'modelname': 'UNet_Unconditional_Diffusion_Bottleneck_Variant', 'dataset': 'UnconditionalDataset', 'framework': 'DDPM', 'trainloop_function': 'ddpm_trainer', 'sampling_function': 'ddpm_sampler', 'evaluation_function': 'ddpm_evaluator', 'batchsize': 8}
{'fpath': '/work/lect0100/lhq_256', 'img_size': 128, 'frac': 0.8, 'skip_first_n': 0, 'ext': '.png', 'transform': True}
{'channels_in': 3, 'channels_out': 3, 'activation': 'relu', 'weight_init': 'he', 'projection_features': 64, 'time_dim': 8, 'time_channels': 200, 'num_stages': 4, 'stage_list': None, 'num_blocks': 1, 'num_groupnorm_groups': 32, 'dropout': 0.1, 'attention_list': None, 'num_attention_heads': 1}
{'diffusion_steps': 200, 'out_shape': (3, 128, 128), 'noise_schedule': 'linear', 'beta_1': 0.0001, 'beta_T': 0.02, 'alpha_bar_lower_bound': 0.9, 'var_schedule': 'same', 'kl_loss': 'simplified', 'recon_loss': 'nll'}
{'epochs': 30, 'store_iter': 3, 'eval_iter': 500, 'optimizer_class': 'torch.optim.AdamW', 'optimizer_params': None, 'learning_rate': 0.0001, 'lr_schedule': False, 'run_name': 'big_diffusers_fin', 'checkpoint': 'model_epoch_8.pth', 'experiment_path': '/work/lect0100/experiments_gonzalo/big_diffusers_fin/', 'verbose': True}
{'checkpoint': 'model_epoch_8.pth', 'experiment_path': '/work/lect0100/experiments_gonzalo/big_diffusers_fin/', 'batch_size': 10, 'intermediate': False}
{'checkpoint': 'model_epoch_8.pth', 'experiment_path': '/work/lect0100/experiments_gonzalo/big_diffusers_fin/'}
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
......
......@@ -73,7 +73,7 @@ def train_func(f):
def generate_func(f):
def sample_func(f):
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"device: {device}\n\n")
print(f"folderpath: {f}\n\n")
......@@ -224,7 +224,7 @@ if __name__ == '__main__':
from torch import nn
print(sys.argv)
functions = {'train': train_func,'generate': generate_func,'evaluate': evaluate_func,"hello":hello}
functions = {'train': train_func,'sample': sample_func,'evaluate': evaluate_func,"hello":hello}
functions[sys.argv[1]](sys.argv[2])
......
......@@ -371,8 +371,9 @@ class DDPM(nn.Module):
x (tensor): Contains the self.diffusion_steps+1 denoised image tensors
'''
# start with an image of pure noise and store it as part of the output
x_t_1 = torch.randn(self.out_shape, device=self.device)
x = torch.empty((self.diffusion_steps+1,) + self.out_shape, device=self.device)
x_t_1 = torch.randn((1,) + tuple(self.out_shape), device=self.device)
#x_t_1 = torch.randn(self.out_shape, device=self.device)
x = torch.empty((self.diffusion_steps+1,) + tuple(self.out_shape), device=self.device)
x[-1] = x_t_1
# apply reverse trajectory
for t in reversed(range(1, self.diffusion_steps+1)):
......@@ -387,7 +388,9 @@ class DDPM(nn.Module):
x_t_1 = mean + std*noise
# store noised image
x[t-1] = x_t_1
return x
x_sq = x.squeeze(1)
return x_sq
#return x
# Loss functions
......
......@@ -67,7 +67,7 @@ def ddpm_trainer(model,
lr_schedule = False,
verbose = False,
run_name=None,
checkpoint_path= None,
checkpoint= None,
experiment_path = None,
**args
):
......@@ -86,7 +86,7 @@ def ddpm_trainer(model,
'run_name' FOR THE DATA TO BE LOGGED ON THE SAME WANDB RUN!
trainloader: Loads the train dataset
testloader: Loads the test dataset
checkpoint_path: Path to where the checkpoint of a trained ddpm to resume training
checkpoint: Name of the saved .pth file containing the trained weights and biases
'''
# set optimizer parameters and learning rate
......@@ -98,8 +98,9 @@ def ddpm_trainer(model,
# if checkpoint path is given, load the model from checkpoint
last_epoch = -1
if checkpoint_path:
if checkpoint:
try:
checkpoint_path = f'{experiment_path}trained_ddpm/{checkpoint}'
# Load the checkpoint
checkpoint = torch.load(checkpoint_path)
# update last_epoch
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment