Skip to content
Snippets Groups Projects
Commit 910ae792 authored by Gonzalo Martin Garcia's avatar Gonzalo Martin Garcia
Browse files

updated some comments

parent 1f5e2446
No related branches found
No related tags found
No related merge requests found
......@@ -8,7 +8,7 @@ The pipeline is initiated via the experiment_creator.ipynb notebook, which is se
Upon execution, the notebook generates individual JSON files, encapsulating all the hyperparameter information. When running the model on the HPC, we can choose between the operations 'train', 'sample', and 'evaluate'. These operations automatically extract the necessary hyperparameters from the JSON files and perform their respective tasks. This process is managed by the main.py file. The remaining files contain all the necessary functions optimized for HPC to perform the aforementioned tasks.
Every uniquely trained diffusion model has its own experiment folder, given by its WANDB run name. It holds four different directories: settings, trained_ddpm, samples, and evaluations. The settings folder holds the JSON files specifying the diffusion model's configurations as well as the arguments for the training, sampling, and evaluation functions. The trained_ddpm folder contains .pth files storing the weights and biases of this experiment's diffusion model, which have been saved at different epoch milestones while training. Upon resuming training, the pipeline takes the specified model in trained_ddpm and continues training from there. When sampling images from these trained diffusion models, the samples are stored in different directories for the milestones under the names epoch_{i}. This is done so we know what epoch i version of the diffusion model was used to generate these samples.
Every uniquely trained diffusion model has its own experiment folder, given by its WANDB run name. It holds four different directories: settings, trained_ddpm, samples, and evaluations. The settings folder holds the JSON files specifying the diffusion model's configurations as well as the arguments for the training, sampling, and evaluation functions. The trained_ddpm folder contains .pth files storing the weights and biases of this experiment's diffusion model, which have been saved at different epoch milestones while training. Upon resuming training, the pipeline takes the specified model in trained_ddpm and continues training from there. When sampling images from these trained diffusion models, the samples are stored in different directories for the milestones under the names epoch_{i}. This is done so we know which epoch-i version of the diffusion model was used to generate these samples. The evaluation results are organized analogously.
......@@ -17,12 +17,13 @@ def ddpm_evaluator(model,
):
'''
Takes a trained diffusion model from 'checkpoint' and evaluates its performance on the test
dataset 'dataloader'. ....
dataset 'dataloader' w.r.t. the three most important performance metrics: FID, IS, and KID. We are still
developing our evaluation function for the LDM upscaler and may update this function accordingly.
model: Properly initialized DDPM model
checkpoint: Name of the saved .pth file containing the trained weights and biases
experiment_path: Path to the experiment folder where the evaluation results will be stored
testloader: Loads the test dataset
dataloader: Loads the test dataset for evaluation
sample_idx: Integer that denotes which sample directory sample_{sample_idx} from the checkpoint model shall be used for evaluation
'''
checkpoint_path = f'{experiment_path}trained_ddpm/{checkpoint}'
......@@ -30,7 +31,7 @@ def ddpm_evaluator(model,
output_dir = f'{experiment_path}evaluations/'
os.makedirs(output_dir, exist_ok=True)
# create sample directory for the current version of the trained model
# create evaluation directory for the current version of the trained model
model_name = os.path.basename(checkpoint_path)
epoch = re.findall(r'\d+', model_name)
if epoch:
......@@ -40,14 +41,14 @@ def ddpm_evaluator(model,
model_dir = os.path.join(output_dir,f'epoch_{e}')
os.makedirs(model_dir, exist_ok=True)
# create the sample directory for this sampling run for the current version of the model
# create the evaluation directory for this evaluation run for the current version of the model
eval_dir_list = [d for d in os.listdir(model_dir) if os.path.isdir(os.path.join(model_dir, d))]
indx_list = [int(d.split('_')[1]) for d in eval_dir_list if d.startswith('evaluation_')]
j = max(indx_list, default=-1) + 1
eval_dir = os.path.join(model_dir, f'evaluation_{j}')
os.makedirs(eval_dir, exist_ok=True)
# Compute FID SCORE
# Compute metrics
eval_path = os.path.join(eval_dir, 'eval.txt')
# get sampled images
......@@ -61,17 +62,18 @@ def ddpm_evaluator(model,
img = Image.open(os.path.join(sample_path, samplename))
img = transform(img)
images.append(img)
# split them into batches for GPU memory
generated = torch.stack(images).to(device)
generated_batches = torch.split(generated, dataloader.batch_size)
nr_generated_batches = len(generated_batches)
nr_real_batches = len(dataloader)
# Init FID and IS
# Init FID, IS and KID scores
fid = FrechetInceptionDistance(normalize = False).to(device)
iscore = InceptionScore(normalize=False).to(device)
kid = KernelInceptionDistance(normalize=False, subset_size=32).to(device)
# Update FID score for full testing dataset and the sampled batch
# Update scores for the full testing dataset w.r.t. the sampled batches
for idx,(data, _) in enumerate(dataloader):
data = data.to(device)
fid.update(data, real=True)
......@@ -82,19 +84,19 @@ def ddpm_evaluator(model,
kid.update(gen, real=False)
iscore.update(gen)
# If there are more generated images left, add them too
# If there are sampled images left, add them too
for idx in range(nr_real_batches, nr_generated_batches):
gen = generated_batches[idx].to(device)
fid.update(gen, real=False)
kid.update(gen, real=False)
iscore.update(gen)
# compute total FID and IS
# compute total FID, IS and KID
fid_score = fid.compute()
i_score = iscore.compute()
kid_score = kid.compute()
# store in txt file
# store results in txt file
with open(str(eval_path), 'a') as txt:
result = f'FID_epoch_{e}_sample_{sample_idx}:'
txt.write(result + str(fid_score.item()) + '\n')
......
......@@ -16,9 +16,12 @@ def ddpm_sampler(model, checkpoint, experiment_path, device, intermediate=False,
batch_size: The number of images to sample
intermediate: Bool value. If False the sampling function will draw a batch of images, else it will just
sample a single image, but store all the intermediate noised latents along the reverse chain
sample_all: If True, samples a batch of images for the given model at every stored checkpoint at once
n_times: Integer denoting how many times we draw a batch of 'batch_size'. If we want to draw 10k images
the GPU will draw batches of 512 images 20 times to reach this goal.
'''
# If we want to sample from every checkpoint of the current model, recursively call this function for all checkpoints
if sample_all:
f = f'{experiment_path}trained_ddpm/'
checkpoint_list = [checkpoint_i for checkpoint_i in os.listdir(f) if checkpoint_i.endswith(".pth")]
......@@ -26,6 +29,7 @@ def ddpm_sampler(model, checkpoint, experiment_path, device, intermediate=False,
if checkpoint_i.endswith(".pth"):
ddpm_sampler(model, checkpoint_i, experiment_path, device, sample_all=False)
return 0
# load model
try:
checkpoint_path = f'{experiment_path}trained_ddpm/{checkpoint}'
......@@ -62,7 +66,6 @@ def ddpm_sampler(model, checkpoint, experiment_path, device, intermediate=False,
# transform
back2pil = transforms.Compose([transforms.Normalize(mean=(-1,-1,-1),std=(2,2,2)),transforms.ToPILImage()])
n = n_times
for k in range(n):
# generate batch_size images
......
import numpy as np
import copy
import torch
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment