Commit df668ae6 authored by Christian Schmidt

Intro to pytorch

parent 469f76a2
Showing 1886 additions and 5 deletions
.idea/
.ipynb_checkpoints/
.DS_Store
__pycache__/
datasets/
runs/
.vscode/
*.egg-info/
*.pt
dev.py
dev.ipynb
.mise.toml
*.aux
*.toc
*.fdb_latexmk
*.fls
*.log
*.out
*.pdf
*.rar
*.synctex.gz
*.tar.bz2
*.tar.gz
*.zip
autograded
feedback
release
solution
submitted
submitted*/
gradebook.db
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
from collections import OrderedDict
import random
from torch.utils.data import DataLoader, Subset
from sklearn.datasets import make_moons, make_circles
class ToyDataset:
@staticmethod
def _add_bias_column(X):
return torch.cat((torch.ones(X.shape[0], 1), X), 1)
@classmethod
def sine(cls):
"""
1D regression dataset with a sine wave.
"""
X = torch.linspace(0, 1, 32).view(-1, 1)
y = torch.sin(2 * torch.pi * 2 * X)
return X, y
@classmethod
def sinc(cls):
"""
1D regression dataset with a sinc function.
"""
X = torch.linspace(-10, 10, 100).view(-1, 1)
y = torch.where(X != 0, torch.sin(X) / X, torch.tensor(1.0))
return X, y
@classmethod
def binary_blob(cls):
"""
2D binary classification dataset with two blobs.
"""
data1 = 0.4 * torch.randn(200, 2) + 3
data2 = 0.8 * torch.randn(200, 2) - 1
X = torch.cat((data1, data2), 0)
X = cls._add_bias_column(X)
y = torch.cat((torch.zeros(200), torch.ones(200)))
return X, y
@classmethod
def two_moons(cls, n_samples=400, noise=0.1):
"""
2D binary classification dataset with two moons.
"""
X, y = make_moons(n_samples=n_samples, noise=noise, random_state=0)
X = torch.tensor(X, dtype=torch.float32)
X = cls._add_bias_column(X)
y = torch.tensor(y, dtype=torch.float32)
return X, y
@classmethod
def xor(cls, n_samples=400):
"""
2D binary classification dataset with XOR pattern.
"""
X = torch.rand(n_samples, 2) * 2 - 1
y = ((X[:, 0] > 0) != (X[:, 1] > 0)).float()
X = cls._add_bias_column(X)
return X, y
@classmethod
def concentric_circles(cls, n_samples=400, noise=0.1, factor=0.3):
"""
2D binary classification dataset with two concentric circles.
"""
X, y = make_circles(
n_samples=n_samples, noise=noise, factor=factor, random_state=0
)
X = torch.tensor(X, dtype=torch.float32)
X = cls._add_bias_column(X)
y = torch.tensor(y, dtype=torch.float32)
return X, y
@classmethod
def list_datasets(cls):
torch.manual_seed(0)
return [
method
for method in dir(cls)
if not method.startswith("_")
and method not in ("get_dataset", "list_datasets")
and callable(getattr(cls, method))
]
@classmethod
def get_dataset(cls, name, **kwargs):
if hasattr(cls, name):
return getattr(cls, name)(**kwargs)
else:
raise ValueError(
f"Dataset '{name}' not found. Available datasets: {cls.list_datasets()}"
)
def get_dataset(name, **kwargs):
return ToyDataset.get_dataset(name, **kwargs)
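# Illustrative usage (a hedged sketch; the dataset name and keyword arguments below are
# just one valid combination of the methods defined above):
#     X, y = get_dataset("two_moons", n_samples=200, noise=0.05)
#     X.shape  # torch.Size([200, 3]) -- a bias column is prepended to the two features
#     y.shape  # torch.Size([200])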
def plot_logistic_regression(data, labels, weights, feature_transform_fn=None):
colors = ["#1f77b4", "#ff7f0e"] # Blue for Class 0, Orange for Class 1
plt.figure(figsize=(5, 4))
plt.scatter(
data[labels == 0, 1],
data[labels == 0, 2],
c=colors[0],
label="Class 0",
alpha=0.6,
)
plt.scatter(
data[labels == 1, 1],
data[labels == 1, 2],
c=colors[1],
label="Class 1",
alpha=0.6,
)
x_min, x_max = data[:, 1].min().item() - 1, data[:, 1].max().item() + 1
y_min, y_max = data[:, 2].min().item() - 1, data[:, 2].max().item() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
grid = torch.tensor(
np.c_[np.ones_like(xx.ravel()), xx.ravel(), yy.ravel()], dtype=torch.float32
)
if feature_transform_fn is not None:
grid = feature_transform_fn(grid)
with torch.no_grad():
z = torch.sigmoid(grid @ weights).reshape(xx.shape)
custom_cmap = plt.cm.colors.LinearSegmentedColormap.from_list("custom", colors)
plt.contourf(xx, yy, z.numpy(), levels=8, cmap=custom_cmap, alpha=0.2)
plt.contour(
xx,
yy,
z.numpy(),
levels=[0.5],
colors="k",
linestyles="--",
linewidths=1,
alpha=0.5,
)
plt.title("Logistic Regression Decision Boundary")
plt.legend()
plt.show()
def create_small_loader(loader, num_samples=1000, batch_size=1000, seed=42):
assert isinstance(loader, DataLoader)
random.seed(seed)
torch.manual_seed(seed)
dataset = loader.dataset
total_samples = len(dataset)
num_samples = min(num_samples, total_samples)
subset_indices = random.sample(range(total_samples), num_samples)
small_dataset = Subset(dataset, subset_indices)
small_loader = DataLoader(
small_dataset,
batch_size=min(
batch_size, num_samples
), # Ensure batch_size doesn't exceed num_samples
shuffle=False,
)
return small_loader
def extract_activations(model, data_loader):
activations = OrderedDict()
def get_activation(name):
def hook(model, input, output):
activations[name] = output.detach()
return hook
# Register hooks for each layer
hooks = []
for name, module in model.named_modules():
if isinstance(module, nn.Linear) or isinstance(module, nn.ReLU):
hooks.append(module.register_forward_hook(get_activation(name)))
model.eval()
all_activations = OrderedDict()
all_labels = []
with torch.no_grad():
for inputs, labels in data_loader:
inputs = inputs.view(inputs.shape[0], -1)
_ = model(inputs)
for name, activation in activations.items():
if name not in all_activations:
all_activations[name] = []
all_activations[name].append(activation)
all_labels.append(labels)
for hook in hooks:
hook.remove()
# Combine activations from all batches
combined_activations = OrderedDict()
for name, activations_list in all_activations.items():
combined_activations[name] = torch.cat(activations_list)
labels = torch.cat(all_labels)
return combined_activations, labels
def apply_tsne(activations, perplexity=70, n_components=2):
tsne = TSNE(n_components=n_components, perplexity=perplexity, random_state=42)
tsne_results = OrderedDict()
for name, activation in activations.items():
tsne_results[name] = tsne.fit_transform(
activation.reshape(activation.shape[0], -1).numpy()
)
return tsne_results
def map_to_custom_names(activations):
layer_display_names = {
"network.0": "Input Layer",
"network.1": "Hidden Layer (ReLU)",
"network.2": "Output Layer",
}
updated_activations = OrderedDict()
for key, value in activations.items():
if key in layer_display_names:
updated_activations[layer_display_names[key]] = value
else:
raise ValueError(f"Unknown layer name: {key}")
return updated_activations
def plot_tsne(tsne_results, labels, num_classes):
num_plots = len(tsne_results)
rows = (num_plots + 2) // 3 # Ceiling division
cols = min(num_plots, 3)
fig, axes = plt.subplots(rows, cols, figsize=(5 * cols, 5 * rows))
if rows == 1 and cols == 1:
axes = [axes]
else:
axes = axes.flatten()
for i, (layer_name, tsne_result) in enumerate(tsne_results.items()):
ax = axes[i]
scatter = ax.scatter(
tsne_result[:, 0], tsne_result[:, 1], c=labels, cmap="tab10", s=5
)
ax.set_title(layer_name)
ax.set_xticks([])
ax.set_yticks([])
# Remove any unused subplots
for i in range(num_plots, len(axes)):
fig.delaxes(axes[i])
plt.colorbar(scatter, ax=axes, label="Class", ticks=range(num_classes))
return fig
def show_tsne_plot(model, loader, num_samples=1000):
small_loader = create_small_loader(
loader, num_samples=num_samples
) # sample 1000 images from the dataset
activations, labels = extract_activations(model, small_loader)
activations = map_to_custom_names(activations)
tsne_results = apply_tsne(activations)
fig = plot_tsne(tsne_results, labels, num_classes=10)
plt.show()
def make_grid(images, predictions=None, labels=None):
"""
Helper function for logging image prediction results
"""
n = images.shape[0]
grid_n = int(n**0.5) # grid will be grid_n x grid_n
fig, axes = plt.subplots(grid_n, grid_n, figsize=(grid_n * 2, grid_n * 2))
for i in range(grid_n):
for j in range(grid_n):
idx = i * grid_n + j
axes[i, j].imshow(images[idx].squeeze(), cmap="gray")
# kids, don't code like this at home
title = []
if predictions is not None:
title.append(f"Pred: {predictions[idx]}")
if labels is not None:
title.append(f"Label: {labels[idx]}")
axes[i, j].set_title(" | ".join(title))
axes[i, j].axis("off")
plt.tight_layout()
return fig
# Deep Learning - Exercises
This repository contains the Jupyter notebooks for the MOOC _Deep Learning_.
To set up your environment, you can use (for example) conda, a free package manager.
We recommend installing it via miniforge; you can then create an environment with `conda env create -f environment.yaml`.
To activate this environment, run `conda activate basicsml`.
# Deep Learning Exercises
Welcome to the **Deep Learning** exercises!
In these exercises, you'll implement fundamental concepts from scratch and recreate key ideas from influential papers.
## Getting Started
You'll need Python and several deep learning libraries, which are listed in `environment.yaml`.
You can use `conda` to set up your environment (we recommend installing it via [miniforge](https://github.com/conda-forge/miniforge)):
```bash
conda env create -f environment.yaml
conda activate deeplearning
```
## Course Overview
### Week 1: Getting Started with PyTorch
We'll begin with PyTorch fundamentals - tensors, automatic differentiation, and hardware-accelerated operations (if available).
You'll implement basic optimization tasks and build your first neural networks, culminating in a handwritten digit classifier.
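As a small, purely illustrative taste of the autograd mechanics covered this week (the values are arbitrary):

```python
import torch

# A tensor that records operations for automatic differentiation.
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
loss = (x ** 2).sum()   # a tiny scalar "loss"
loss.backward()         # autograd computes d(loss)/dx
print(x.grad)           # tensor([2., 4., 6.])
```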
### Week 2: Deep Learning Building Blocks
This week focuses on the essential components that make deep networks work.
You'll implement various initialization techniques, normalization layers, dropout, and modern optimizers.
Through experiments on Fashion-MNIST, you'll see how these details affect training dynamics.
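For orientation, here is a minimal sketch (not the exercise solution) of how these pieces commonly fit together in PyTorch; the layer sizes, dropout rate, and optimizer are illustrative choices:

```python
import torch
import torch.nn as nn

# A small fully connected network combining normalization and dropout.
model = nn.Sequential(
    nn.Linear(784, 256),
    nn.BatchNorm1d(256),   # normalization layer
    nn.ReLU(),
    nn.Dropout(p=0.2),     # regularization
    nn.Linear(256, 10),
)
nn.init.kaiming_normal_(model[0].weight, nonlinearity="relu")  # explicit initialization
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)     # a modern optimizer
```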
### Week 3: Computer Vision and CNNs
You'll dive into modern computer vision by implementing ResNets and semantic segmentation models.
We'll explore different backbone architectures and the trade-offs between accuracy and computational efficiency.
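The central idea behind ResNets is the residual (skip) connection; a minimal sketch, with channel counts chosen only for illustration:

```python
import torch
import torch.nn as nn

class ResidualBlock(nn.Module):
    """Minimal residual block: output = ReLU(F(x) + x)."""

    def __init__(self, channels: int):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(channels)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = torch.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        return torch.relu(out + x)  # the skip connection

print(ResidualBlock(16)(torch.randn(2, 16, 8, 8)).shape)  # torch.Size([2, 16, 8, 8])
```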
### Week 4: Sequential Data and RNNs
We'll tackle sequence modeling by building RNNs from scratch.
You'll create a character-level language model to generate Shakespeare-like text and implement a German-English translation system.
This introduces key NLP concepts like tokenization and sequence handling.
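At its core, character-level language modeling means mapping characters to integer IDs and predicting the next ID; a tiny illustration with made-up text:

```python
# Character-level tokenization and next-token targets (illustrative values).
text = "hello"
vocab = sorted(set(text))                       # ['e', 'h', 'l', 'o']
char2id = {ch: i for i, ch in enumerate(vocab)}
ids = [char2id[ch] for ch in text]              # [1, 0, 2, 2, 3]
inputs, targets = ids[:-1], ids[1:]             # the model predicts the next character
print(inputs, targets)                          # [1, 0, 2, 2] [0, 2, 2, 3]
```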
### Week 5: The Transformer Architecture
You'll implement the fundamental building blocks of many modern neural networks - from attention mechanisms to complete Transformers.
The exercises cover both autoregressive text generation (GPT-style) and neural machine translation tasks.
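The heart of the Transformer is scaled dot-product attention; here is a minimal sketch (single head, no masking) with assumed tensor shapes:

```python
import torch
import torch.nn.functional as F

def scaled_dot_product_attention(q, k, v):
    """Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V."""
    d_k = q.shape[-1]
    scores = q @ k.transpose(-2, -1) / d_k**0.5   # (batch, seq_q, seq_k)
    weights = F.softmax(scores, dim=-1)           # attention weights over the keys
    return weights @ v                            # (batch, seq_q, d_v)

q = k = v = torch.randn(1, 5, 32)                 # one sequence of 5 tokens, d_model=32
print(scaled_dot_product_attention(q, k, v).shape)  # torch.Size([1, 5, 32])
```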
### Week 6: Scaling and Multimodality
In the final week, we'll look at how models behave as they grow larger, examining scaling laws empirically.
You'll also build a simple system that can generate captions for images, bringing together concepts from vision and language processing.
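Empirically, a scaling law is typically fit as a straight line in log-log space; a sketch with synthetic numbers (the data below is made up purely to show the recipe):

```python
import numpy as np

# Fit a power law L(N) = a * N**b to (model size, loss) pairs.
N = np.array([1e6, 3e6, 1e7, 3e7, 1e8])          # parameter counts (synthetic)
L = 5.0 * N ** -0.08                             # losses generated from a known power law
b, log_a = np.polyfit(np.log(N), np.log(L), 1)   # linear fit in log-log space
print(f"estimated exponent b = {b:.3f}, prefactor a = {np.exp(log_a):.2f}")
```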
Each week builds on the previous ones, taking you from fundamentals to current research topics. Good luck and happy learning!
bridgingai_logo.png (image, 15.1 KiB)
name: deeplearning
channels:
- pytorch
- conda-forge
- defaults
dependencies:
- python=3.10
- pytorch>=2.2
- torchvision
- ipykernel
- scikit-learn
- numpy>=1.23
- matplotlib=3.5
- peft
- pre-commit
- tensorboard
- tensorboardX
- tqdm
- scikit-image
- seaborn
- tokenizers
- transformers
- sacrebleu
- pip
- pip:
- -e exercise-utils
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[project]
name = "exercise-utils"
version = "0.1.0"
description = "Utility functions for the Deep Learning course exercises"
requires-python = ">=3.10"
dependencies = [
"torch>=2.0.0",
"numpy>=1.21.0",
"matplotlib>=3.4.0",
]
import torch
import torch.nn as nn
import os
from tensorboardX import SummaryWriter
import time
from typing import Tuple, Dict, Any
from abc import ABC, abstractmethod
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.optim.lr_scheduler import _LRScheduler
from dataclasses import dataclass
def make_summary_writer(logdir_prefix: str, log_name: str):
data_path = os.path.join(os.getcwd(), "runs")
if not (os.path.exists(data_path)):
os.makedirs(data_path)
logdir = logdir_prefix + "_" + log_name + "_" + time.strftime("%d-%m-%Y_%H-%M-%S")
logdir = os.path.join(data_path, logdir)
if not (os.path.exists(logdir)):
os.makedirs(logdir)
return SummaryWriter(logdir, flush_secs=1, max_queue=1)
class BaseTrainer(ABC):
def __init__(
self,
model: nn.Module,
optimizer: torch.optim.Optimizer,
train_loader: DataLoader,
val_loader: DataLoader,
device: str,
max_steps: int,
eval_every_n_steps: int,
logger: SummaryWriter = None,
scheduler: _LRScheduler = None,
config: dataclass = None,
):
self.model = model.to(device)
self.optimizer = optimizer
self.train_loader = train_loader
self.val_loader = val_loader
self.device = device
self.max_steps = max_steps
self.eval_every_n_steps = eval_every_n_steps
self.logger = logger
self.scheduler = scheduler
self.config = config
self.gradient_accumulation_steps = getattr(
config, "gradient_accumulation_steps", 1
)
@abstractmethod
def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor]) -> Dict[str, Any]:
pass
@abstractmethod
def validation_step(
self, batch: Tuple[torch.Tensor, torch.Tensor]
) -> Dict[str, Any]:
pass
def train_step(self, batch: Tuple[torch.Tensor, torch.Tensor]) -> Dict[str, float]:
self.model.train()
batch = [item.to(self.device) for item in batch]
metrics = self.training_step(batch)
loss = metrics["loss"] / self.gradient_accumulation_steps
loss.backward()
if (self._step + 1) % self.gradient_accumulation_steps == 0:
self.optimizer.step()
self.optimizer.zero_grad()
if self.scheduler is not None:
self.scheduler.step()
out_metrics = {}
for key, value in metrics.items():
if isinstance(value, torch.Tensor):
out_metrics[key] = value.item()
else:
out_metrics[key] = value
return out_metrics
@torch.no_grad()
def validate(self) -> Dict[str, float]:
self.model.eval()
val_metrics = {}
for batch in self.val_loader:
batch = [item.to(self.device) for item in batch]
metrics = self.validation_step(batch)
for key, value in metrics.items():
if key not in val_metrics:
val_metrics[key] = []
val_metrics[key].append(
value.item() if isinstance(value, torch.Tensor) else value
)
return {key: sum(values) / len(values) for key, values in val_metrics.items()}
def fit(self):
train_iter = iter(self.train_loader)
pbar = tqdm(total=self.max_steps, unit="step", desc="Training", mininterval=0)
for self._step in range(self.max_steps):
try:
batch = next(train_iter)
except StopIteration:
train_iter = iter(self.train_loader)
batch = next(train_iter)
# Train step
train_metrics = self.train_step(batch)
pbar.set_description(
f"Training Step: {self._step+1}/{self.max_steps} | Loss: {train_metrics['loss']:.3f}"
)
# log metrics
if self.logger is not None:
for key, value in train_metrics.items():
self.logger.add_scalar(
f"{self.config.exp_name}/train_{key}", value, self._step
)
# TODO: log the "base" learning rate
self.logger.add_scalar(
f"{self.config.exp_name}/learning_rate",
self.optimizer.param_groups[0]["lr"],
self._step,
)
# Validation
if self._step % self.eval_every_n_steps == 0:
pbar.set_description(
f"Validating Step: {self._step+1}/{self.max_steps}"
)
val_metrics = self.validate()
if self.logger is not None:
for key, value in val_metrics.items():
self.logger.add_scalar(
f"{self.config.exp_name}/val_{key}", value, self._step
)
yield self._step, train_metrics, val_metrics
pbar.update(1)
# Final validation
val_metrics = self.validate()
if self.logger is not None:
pbar.set_description("Final Validation")
for key, value in val_metrics.items():
self.logger.add_scalar(
f"{self.config.exp_name}/val_{key}", value, self.max_steps
)
pbar.close()
yield self.max_steps, train_metrics, val_metrics
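# Illustrative usage of BaseTrainer (a hypothetical subclass, shown only as a sketch):
#
#     class ClassifierTrainer(BaseTrainer):
#         def training_step(self, batch):
#             inputs, targets = batch
#             loss = nn.functional.cross_entropy(self.model(inputs), targets)
#             return {"loss": loss}
#
#         def validation_step(self, batch):
#             inputs, targets = batch
#             logits = self.model(inputs)
#             loss = nn.functional.cross_entropy(logits, targets)
#             accuracy = (logits.argmax(dim=-1) == targets).float().mean()
#             return {"loss": loss, "accuracy": accuracy}
#
#     for step, train_metrics, val_metrics in trainer.fit():
#         pass  # fit() yields every eval_every_n_steps steps and once at the end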
import torch
from torch.utils.data import Dataset
from typing import Tuple
from .tokenizer import LMTokenizer
class LMDataset(Dataset):
def __init__(self, text: str, context_length: int, tokenizer: LMTokenizer):
self.context_length = context_length
self.encoded_data = tokenizer.encode(text)
def __len__(self) -> int:
return len(self.encoded_data) - self.context_length
def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
data = self.encoded_data[idx : idx + self.context_length + 1]
input_data = data[:-1]
target_data = data[1:]
input_tensor = torch.tensor(input_data, dtype=torch.long)
target_tensor = torch.tensor(target_data, dtype=torch.long)
return input_tensor, target_tensor
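# Example of the sliding-window indexing above (illustrative token IDs):
#     encoded_data = [5, 7, 9, 2, 4], context_length = 3
#     len(dataset) == 5 - 3 == 2
#     dataset[0] -> input (5, 7, 9), target (7, 9, 2)
#     dataset[1] -> input (7, 9, 2), target (9, 2, 4)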
from tokenizers import SentencePieceBPETokenizer
from typing import List
class CharacterTokenizer:
def __init__(self):
self.chars = None
self.char2id = None
self.id2char = None
self._vocab_size = None
def train(self, files: list[str]):
text = ""
for file in files:
with open(file, "r") as f:
text += f.read()
self.chars = sorted(set(text))
self.char2id = {ch: i for i, ch in enumerate(self.chars)}
self.id2char = {i: ch for ch, i in self.char2id.items()}
self._vocab_size = len(self.chars)
@property
def vocab_size(self):
return self._vocab_size
def encode(self, text: str) -> list[int]:
return [self.char2id[ch] for ch in text]
def decode(self, ids: list[int]) -> str:
return "".join([self.id2char[id] for id in ids])
def encode_batch(self, texts: List[str]) -> List[List[int]]:
return [self.encode(text) for text in texts]
def decode_batch(self, sequences: List[List[int]]) -> List[str]:
return [self.decode(seq) for seq in sequences]
class LMTokenizer:
def __init__(self):
self.tokenizer = SentencePieceBPETokenizer()
@property
def vocab_size(self):
return self.tokenizer.get_vocab_size()
def train(self, files, vocab_size):
self.tokenizer.train(
files=[str(file) for file in files],
vocab_size=vocab_size,
show_progress=False,
)
def encode(self, text: str) -> List[int]:
"""Tokenize and convert tokens to IDs."""
return self.tokenizer.encode(text).ids
def decode(self, ids: List[int]) -> str:
"""Convert IDs to tokens and join to form a string."""
return self.tokenizer.decode(ids)
def encode_batch(self, texts: List[str]) -> List[List[int]]:
"""Batched version of encode."""
return [self.encode(text) for text in texts]
def decode_batch(self, sequences: List[List[int]]) -> List[str]:
"""Batched version of decode."""
return [self.decode(seq) for seq in sequences]
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from .dataset import LMDataset
def generate_text(
model,
tokenizer,
max_length: int,
prompt: str = None,
temperature: float = 1.0,
) -> str:
"""Generate text using a language model.
Args:
model: Language model (RNN or Transformer)
tokenizer: Tokenizer with encode() and decode() methods
max_length: Maximum number of tokens to generate
prompt: Initial text prompt to start generation
temperature: Controls randomness (higher = more random)
Returns:
Generated text string including the initial prompt if provided
"""
was_training = model.training
model.eval()
# Generate random token if no prompt provided
if prompt is None:
rand_id = torch.randint(0, tokenizer.vocab_size, (1,))
prompt = tokenizer.decode(rand_id.tolist())
# Convert initial text to tokens
generated = list(prompt)
input_ids = torch.tensor(tokenizer.encode(prompt))
# Generate continuation
with torch.no_grad():
generated_tokens = model.generate(input_ids, max_length, temperature)
generated += tokenizer.decode(generated_tokens.tolist())
if was_training:
model.train(was_training)
return "".join(generated)
def format_generation_logging(generated_text: str, prompt: str = None) -> str:
"""Format generated text for logging in Markdown format.
Args:
generated_text: Full generated text
prompt: Optional initial text prompt used for generation
Returns:
Formatted string with Markdown formatting
"""
formatted_lines = []
if prompt:
formatted_lines.extend(
[f"**Prompt:** {prompt}", "", "```", generated_text, "```"]
)
else:
formatted_lines.extend(["**Random Start**", "", "```", generated_text, "```"])
formatted_lines.extend(["---", ""])
return "\n".join(formatted_lines)
def create_lm_dataloaders(config, context_length):
with open(config.train_file.path, "r") as f:
data = f.read()
train_len = int(len(data) * config.train_file.train_ratio)
train_data = data[:train_len]
val_data = data[train_len:]
tokenizer = config.tokenizer
train_dataset = LMDataset(train_data, context_length, tokenizer)
val_dataset = LMDataset(val_data, context_length, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=config.batch_size, shuffle=False)
return train_loader, val_loader
"""
The shape of data is (batch_size, seq_len) for transformer models.
This is different from the convention we used for RNNs where the shape was (seq_len, batch_size).
"""
import torch
from torch.utils.data import Dataset
from typing import List, Tuple
from pathlib import Path
import json
from .tokenizer import Tokenizer
def read_data(file_paths: List[Path]) -> tuple[List[str], List[str]]:
src_data = []
tgt_data = []
for file_path in file_paths:
with open(file_path, "r", encoding="utf-8") as file:
for i, line in enumerate(file):
json_obj = json.loads(line.strip())
src_data.append(json_obj["de"])
tgt_data.append(json_obj["en"])
return src_data, tgt_data
class Multi30kDataset(Dataset):
def __init__(
self, file_paths: List[Path], src_tokenizer: Tokenizer, tgt_tokenizer: Tokenizer
):
self.src_data, self.tgt_data = read_data(file_paths)
assert len(self.src_data) == len(
self.tgt_data
), "Number of source and target examples do not match"
self.src_tokenizer = src_tokenizer
self.tgt_tokenizer = tgt_tokenizer
# special tokens in source language
self.special_tokens = [
self.src_tokenizer.pad_id,
self.src_tokenizer.start_id,
self.src_tokenizer.end_id,
self.src_tokenizer.unk_id,
]
def __len__(self) -> int:
return len(self.src_data)
def __getitem__(self, idx: int) -> Tuple[List[int], List[int]]:
src_text = self.src_data[idx]
tgt_text = self.tgt_data[idx]
src_tokens = self.src_tokenizer.encode(src_text)
tgt_tokens = self.tgt_tokenizer.encode(tgt_text)
# Add start and end tokens to target
tgt_tokens = (
[self.tgt_tokenizer.start_id] + tgt_tokens + [self.tgt_tokenizer.end_id]
)
return src_tokens, tgt_tokens
def to_padded_tensor(
sequences: List[List[int]], pad_id: int, batch_first=True
) -> torch.Tensor:
"""Convert a list of sequences to a padded tensor.
Args:
sequences: List of token sequences.
pad_id: Token ID to use for padding.
batch_first: If True, the output tensor shape will be (batch_size, seq_len).
If False, the shape will be (seq_len, batch_size).
Returns:
torch.Tensor: Padded tensor of the specified shape.
"""
# Find maximum sequence length in the batch
max_len = max(len(seq) for seq in sequences)
# Pad each sequence to max_len
padded_sequences = []
for seq in sequences:
# Calculate padding length
pad_len = max_len - len(seq)
# Add padding to the right
padded_seq = seq + [pad_id] * pad_len
padded_sequences.append(padded_seq)
# Convert to tensor of shape (batch_size, seq_len)
padded_tensor = torch.tensor(padded_sequences, dtype=torch.long)
if not batch_first:
# Transpose to shape (seq_len, batch_size)
padded_tensor = padded_tensor.T
return padded_tensor
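# Example (illustrative): to_padded_tensor([[4, 5, 6], [7]], pad_id=0) returns
#     tensor([[4, 5, 6],
#             [7, 0, 0]])   # shape (batch_size=2, seq_len=3)
# With batch_first=False the same data is returned transposed, shape (3, 2).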
class BatchCollator:
"""Handles the collation of batches with padding."""
def __init__(self, src_pad_id: int, tgt_pad_id: int, batch_first: bool):
self.src_pad_id = src_pad_id
self.tgt_pad_id = tgt_pad_id
self.batch_first = batch_first
def __call__(
self, batch: List[Tuple[List[int], List[int]]]
) -> Tuple[torch.Tensor, torch.Tensor]:
"""Convert a batch of sequences into padded tensors."""
src_sequences, tgt_sequences = zip(*batch)
return (
to_padded_tensor(list(src_sequences), self.src_pad_id, self.batch_first),
to_padded_tensor(list(tgt_sequences), self.tgt_pad_id, self.batch_first),
)
from tokenizers import SentencePieceBPETokenizer
from typing import List
class SpecialTokens:
PAD = "<pad>"
START = "<s>" # <sos>
END = "</s>" # <eos>
UNK = "<unk>"
class Tokenizer:
def __init__(self):
self.tokenizer = SentencePieceBPETokenizer()
self.special_tokens = [
SpecialTokens.PAD,
SpecialTokens.START,
SpecialTokens.END,
SpecialTokens.UNK,
]
self.pad_id = None
self.unk_id = None
self.start_id = None
self.end_id = None
@property
def vocab_size(self):
return self.tokenizer.get_vocab_size() + len(self.special_tokens)
def train(self, files, vocab_size):
self.tokenizer.train(
files=[str(file) for file in files],
vocab_size=vocab_size - len(self.special_tokens),
special_tokens=self.special_tokens,
show_progress=False,
)
# After training, get the IDs for special tokens
self.pad_id = self.tokenizer.token_to_id(SpecialTokens.PAD)
self.unk_id = self.tokenizer.token_to_id(SpecialTokens.UNK)
self.start_id = self.tokenizer.token_to_id(SpecialTokens.START)
self.end_id = self.tokenizer.token_to_id(SpecialTokens.END)
def encode(self, text: str) -> List[int]:
"""Tokenize and convert tokens to IDs."""
return self.tokenizer.encode(text).ids
def decode(self, ids: List[int]) -> str:
"""Convert IDs to tokens and join to form a string."""
return self.tokenizer.decode(ids)
def encode_batch(self, texts: List[str]) -> List[List[int]]:
"""Batched version of encode."""
return [self.encode(text) for text in texts]
def decode_batch(self, sequences: List[List[int]]) -> List[str]:
"""Batched version of decode."""
return [self.decode(seq) for seq in sequences]
import torch
from typing import List
import sacrebleu
from .dataset import BatchCollator, Multi30kDataset
from torch.utils.data import DataLoader
class TranslationExamplesMixin:
"""Mixin class for generating translation examples during training."""
def is_usable(self):
"""Check if the mixin can be used."""
assert hasattr(self, "model")
assert hasattr(self.model, "translate")
assert hasattr(self, "val_loader")
assert hasattr(self, "src_tokenizer")
assert hasattr(self, "tgt_tokenizer")
assert hasattr(self, "logger")
def get_translation_examples(self, indices, example_type: str) -> str:
"""Get translation examples for given indices."""
src_tokens = [self.val_loader.dataset[i][0] for i in indices]
tgt_tokens = [self.val_loader.dataset[i][1] for i in indices]
src_sents = self.src_tokenizer.decode_batch(src_tokens)
tgt_sents = self.tgt_tokenizer.decode_batch(tgt_tokens)
pred_sents = self.model.translate(src_sents)
return format_translation_logging(
src_sents, pred_sents, tgt_sents, f"{example_type} Examples"
)
def get_fixed_examples(self) -> str:
"""Get translations for fixed example indices."""
fixed_indices = torch.tensor([0, 1, 2, 3, 4])
return self.get_translation_examples(fixed_indices, "Fixed")
def get_random_examples(self) -> str:
"""Get translations for random example indices."""
random_indices = torch.randint(0, len(self.val_loader.dataset), (5,))
return self.get_translation_examples(random_indices, "Random")
def log_translation_examples(self, step: int) -> None:
"""Log both fixed and random translation examples."""
formatted_result = self.get_fixed_examples()
formatted_result += self.get_random_examples()
self.logger.add_text("translations", formatted_result, global_step=step)
def format_translation_logging(
src_sents: List[str],
tgt_sents: List[str],
ref_sents: List[str],
section_header: str = None,
) -> str:
"""Format translation results in a readable Markdown format for Tensorboard logging.
Args:
src_sents: List of source sentences
tgt_sents: List of model predictions/translations
ref_sents: List of reference/ground truth sentences
section_header: Optional Markdown header added above this group of samples
Returns:
Formatted string combining all samples with Markdown formatting
"""
if not (len(src_sents) == len(tgt_sents) == len(ref_sents)):
raise ValueError("All input lists must have the same length")
formatted_lines = []
if section_header is not None:
formatted_lines.append(f"\n### {section_header}\n")
for idx, (src, tgt, ref) in enumerate(zip(src_sents, tgt_sents, ref_sents)):
sample = []
sample.extend(
[
f"**Source:** {src} ",
f"**Prediction:** {tgt} ",
f"**Reference:** {ref} ",
"---",
]
)
formatted_lines.extend(sample)
return "\n\n".join(formatted_lines)
def compute_bleu(
model, val_loader, src_tokenizer, tgt_tokenizer, beam_width=None, num_examples=None
):
"""Compute BLEU score on the validation dataset."""
hypotheses = []
references = []
for i, (src, tgt) in enumerate(val_loader):
if num_examples is not None and i >= num_examples:
break
# Translate the source sentence
src_sents = src_tokenizer.decode_batch(src.tolist())
tgt_sents = tgt_tokenizer.decode_batch(tgt.tolist())
pred_sents = model.translate(src_sents, beam_width=beam_width)
# Log translations
hypotheses.extend(pred_sents)
references.extend(tgt_sents)
# Compute BLEU score
score = sacrebleu.corpus_bleu(hypotheses, [references]).score
return score
def create_nmt_dataloaders(config):
src_tokenizer = config.src_tokenizer
tgt_tokenizer = config.tgt_tokenizer
collator = BatchCollator(
src_tokenizer.pad_id, tgt_tokenizer.pad_id, batch_first=True
)
train_dataset = Multi30kDataset(config.files.train, src_tokenizer, tgt_tokenizer)
train_loader = DataLoader(
train_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=collator
)
val_dataset = Multi30kDataset(config.files.val, src_tokenizer, tgt_tokenizer)
val_loader = DataLoader(
val_dataset, batch_size=config.batch_size, collate_fn=collator
)
return train_loader, val_loader