updated some comments

910ae792 · Gonzalo Martin Garcia · 1f5e2446 · 910ae792 · 910ae792 · 910ae792
Commit 910ae792 authored Jul 4, 2023 by Gonzalo Martin Garcia
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ The pipeline is initiated via the experiment_creator.ipynb notebook, which is se

 Upon execution, the notebook generates individual JSON files, encapsulating all the hyperparameter information. When running the model on the HPC, we can choose between the operations 'train', 'sample', and 'evaluate'. These operations automatically extract the necessary hyperparameters from the JSON files and perform their respective tasks. This process is managed by the main.py file. The remaining files contain all the necessary functions optimized for HPC to perform the aforementioned tasks.

-Every uniquely trained diffusion model has its own experiment folder, given by its WANDB run name. It holds four different directories: settings, trained_ddpm, samples, and evaluations. The settings folder holds the JSON files specifying the diffusion model's configurations as well as the arguments for the training, sampling, and evaluation functions. The trained_ddpm folder contains .pth files storing the weights and biases of this experiment's diffusion model, which have been saved at different epoch milestones while training. Upon resuming training, the pipeline takes the specified model in trained_ddpm and continues training from there. When sampling images from these trained diffusion models, the samples are stored in different directories for the milestones under the names epoch_{i}. This is done so we know what epoch i version of the diffusion model was used to generate these samples.
+Every uniquely trained diffusion model has its own experiment folder, given by its WANDB run name. It holds four different directories: settings, trained_ddpm, samples, and evaluations. The settings folder holds the JSON files specifying the diffusion model's configurations as well as the arguments for the training, sampling, and evaluation functions. The trained_ddpm folder contains .pth files storing the weights and biases of this experiment's diffusion model, which have been saved at different epoch milestones while training. Upon resuming training, the pipeline takes the specified model in trained_ddpm and continues training from there. When sampling images from these trained diffusion models, the samples are stored in different directories for the milestones under the names epoch_{i}. This is done so we know which epoch-i version of the diffusion model was used to generate these samples. Evaluation results are stored analogously, in per-milestone epoch_{i} directories.

 

--- a/evaluation/evaluate.py
+++ b/evaluation/evaluate.py
@@ -17,12 +17,13 @@ def ddpm_evaluator(model,
                  ):
  '''
  Takes a trained diffusion model from 'checkpoint' and evaluates its performance on the test 
-  dataset 'dataloader'. ....
+  dataset 'dataloader' w.r.t. the three most important performance metrics: FID, IS, and KID. Work on the
+  evaluation function for the LDM upscaler is ongoing, so this function may be updated accordingly.
    
-  model:           Properly initialized DDPM model
  checkpoint:      Name of the saved .pth file containing the trained weights and biases  
  experiment_path: Path to the experiment folder where the evaluation results will be stored  
-  testloader:      Loads the test dataset
+  dataloader:      Loads the test dataset for evaluation
+  sample_idx:      Integer that denotes which sample directory sample_{sample_idx} from the checkpoint model shall be used for evaluation
  '''

  checkpoint_path = f'{experiment_path}trained_ddpm/{checkpoint}'
@@ -30,7 +31,7 @@ def ddpm_evaluator(model,
  output_dir = f'{experiment_path}evaluations/'
  os.makedirs(output_dir, exist_ok=True)

-  # create sample directory for the current version of the trained model
+  # create evaluation directory for the current version of the trained model
  model_name = os.path.basename(checkpoint_path)
  epoch = re.findall(r'\d+', model_name)
  if epoch:
@@ -40,14 +41,14 @@ def ddpm_evaluator(model,
  model_dir = os.path.join(output_dir,f'epoch_{e}')
  os.makedirs(model_dir, exist_ok=True)

-  # create the sample directory for this sampling run for the current version of the model    
+  # create the evaluation directory for this evaluation run for the current version of the model    
  eval_dir_list = [d for d in os.listdir(model_dir) if os.path.isdir(os.path.join(model_dir, d))]
  indx_list = [int(d.split('_')[1]) for d in eval_dir_list if d.startswith('evaluation_')]
  j = max(indx_list, default=-1) + 1
  eval_dir = os.path.join(model_dir, f'evaluation_{j}')
  os.makedirs(eval_dir, exist_ok=True)
  
-  # Compute FID SCORE  
+  # Compute metrics  
  eval_path = os.path.join(eval_dir, 'eval.txt')
 
  # get sampled images
@@ -61,17 +62,18 @@ def ddpm_evaluator(model,
        img = Image.open(os.path.join(sample_path, samplename))
        img = transform(img)
        images.append(img)
+  # split them into batches for GPU memory
  generated = torch.stack(images).to(device)
  generated_batches = torch.split(generated, dataloader.batch_size)
  nr_generated_batches = len(generated_batches)
  nr_real_batches = len(dataloader)

-  # Init FID and IS
+  # Init FID, IS and KID scores
  fid = FrechetInceptionDistance(normalize = False).to(device)
  iscore = InceptionScore(normalize=False).to(device)
  kid = KernelInceptionDistance(normalize=False, subset_size=32).to(device)

-  # Update FID score for full testing dataset and the sampled batch
+  # Update scores for the full testing dataset w.r.t. the sampled batches
  for idx,(data, _) in enumerate(dataloader):
    data = data.to(device)
    fid.update(data, real=True)
@@ -82,19 +84,19 @@ def ddpm_evaluator(model,
        kid.update(gen, real=False)
        iscore.update(gen)

-  # If there are more generated images left, add them too
+  # If there are sampled images left, add them too
  for idx in range(nr_real_batches, nr_generated_batches):
    gen = generated_batches[idx].to(device)
    fid.update(gen, real=False)
    kid.update(gen, real=False)
    iscore.update(gen)
 
-  # compute total FID and IS 
+  # compute total FID, IS and KID 
  fid_score = fid.compute()
  i_score = iscore.compute()
  kid_score = kid.compute()

-  # store in txt file
+  # store results in txt file
  with open(str(eval_path), 'a') as txt:
    result = f'FID_epoch_{e}_sample_{sample_idx}:'
    txt.write(result + str(fid_score.item()) + '\n')

--- a/evaluation/sample.py
+++ b/evaluation/sample.py
@@ -16,9 +16,12 @@ def ddpm_sampler(model, checkpoint, experiment_path, device, intermediate=False,
    batch_size:      The number of images to sample
    intermediate:    Bool value. If False the sampling function will draw a batch of images, else it will just 
                     sample a single image, but store all the intermediate noised latents along the reverse chain
+    sample_all:      If True, samples a batch of images for the given model at every stored checkpoint at once
+    n_times:         Integer denoting how many times a batch of 'batch_size' images is drawn. For example, to draw
+                     roughly 10k images, the GPU samples batches of 512 images 20 times.
    '''

-
+    # If we want to sample from every checkpoint of the current model, recursively call this function for all checkpoints
    if sample_all:
        f = f'{experiment_path}trained_ddpm/'
        checkpoint_list = [checkpoint_i for checkpoint_i in os.listdir(f) if checkpoint_i.endswith(".pth")]
@@ -26,6 +29,7 @@ def ddpm_sampler(model, checkpoint, experiment_path, device, intermediate=False,
            if checkpoint_i.endswith(".pth"):
                ddpm_sampler(model, checkpoint_i, experiment_path, device, sample_all=False)
        return 0
+ 
    # load model
    try:
        checkpoint_path = f'{experiment_path}trained_ddpm/{checkpoint}'
@@ -62,7 +66,6 @@ def ddpm_sampler(model, checkpoint, experiment_path, device, intermediate=False,
    # transform
    back2pil = transforms.Compose([transforms.Normalize(mean=(-1,-1,-1),std=(2,2,2)),transforms.ToPILImage()])
        
-    
    n = n_times
    for k in range(n):
        # generate batch_size images

--- a/trainer/train.py
+++ b/trainer/train.py
-
 import numpy as np
 import copy
 import torch