diff --git a/.gitignore b/.gitignore
index 29a2e36..721619c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -63,3 +63,7 @@
 src/ml-adder/.env
 src/ml-adder/adnet
 tutorials/embeddings/__pycache__/
+
+src/grads/__pycache__/
+src/grads/data
+src/grads/training_report.json
diff --git a/.vscode/settings.json b/.vscode/settings.json
index c7687de..c553534 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -6,6 +6,7 @@
         "Backpropagation",
         "bigram",
         "bigrams",
+        "cfgs",
         "dotenv",
         "ffwd",
         "Goodfellow",
@@ -13,8 +14,10 @@
         "kaiming",
         "localdata",
         "logsoftmax",
+        "maxpool",
         "Morphence",
         "multinomial",
+        "Nesterov",
         "NGRAM",
         "ngrams",
         "OPENAI",
@@ -22,6 +25,7 @@
         "randn",
         "relu",
         "softmax",
+        "sysname",
         "tqdm",
         "trange",
         "tril",
diff --git a/src/grads/.python-version b/src/grads/.python-version
new file mode 100644
index 0000000..e18b651
--- /dev/null
+++ b/src/grads/.python-version
@@ -0,0 +1 @@
+devops
diff --git a/src/grads/config.py b/src/grads/config.py
new file mode 100644
index 0000000..be2e0a7
--- /dev/null
+++ b/src/grads/config.py
@@ -0,0 +1,44 @@
+import hashlib
+
+import torch
+import torch.nn as nn
+
+import model as m
+
+
+class Conv_Configs:
+    adam = torch.optim.Adam
+    sgd = torch.optim.SGD
+    sgd_kwargs = {
+        'lr': 0.001,            # learning rate
+        'momentum': 0.9,        # momentum value
+        'weight_decay': 1e-4,   # L2 regularization (weight decay)
+        'dampening': 0,         # dampening for momentum
+        'nesterov': True        # use Nesterov momentum
+    }
+    adam_kwargs = {
+        'lr': 0.001,            # learning rate
+        'betas': (0.9, 0.999),  # decay rates for the first- and second-moment estimates
+        'eps': 1e-8,            # small epsilon to avoid division by zero
+        'weight_decay': 1e-4    # L2 regularization (optional)
+    }
+
+    def __init__(self, data: tuple, device: torch.device, epochs: int, n_layers: int,
+                 hidden_units: int, num_classes: int, in_channels: int, out_channels: int,
+                 image_size: tuple[int, int], maxpool: bool, logits: bool = False,
+                 use_adam: bool = False) -> None:
+        self.model: nn.Module = m.Conv_Model(num_classes=num_classes,
+                                             n_layers=n_layers,
+                                             hidden_units=hidden_units,
+                                             in_channels=in_channels,
+                                             out_channels=out_channels,
+                                             image_size=image_size,
+                                             maxpool=maxpool,
+                                             logits=logits)
+        self.device = device
+        # CrossEntropyLoss expects raw logits; NLLLoss expects log-probabilities.
+        self.loss_fn = torch.nn.CrossEntropyLoss() if logits else torch.nn.NLLLoss()
+        self.optimizer = (self.adam(self.model.parameters(), **self.adam_kwargs) if use_adam
+                          else self.sgd(self.model.parameters(), **self.sgd_kwargs))
+        self.epochs = epochs
+        self.train_loader, self.eval_loader = data
+        # Fingerprint for identifying this run configuration in reports.
+        self.hash = hashlib.sha256(
+            f'{device}{epochs}{n_layers}{hidden_units}{out_channels}{logits}'.encode()).hexdigest()
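For reference, a rough sketch of the parameter update that `torch.optim.SGD` performs under the `sgd_kwargs` above, following its documented algorithm (variable names here are illustrative, not part of the diff):

```python
# One SGD step with momentum=0.9, weight_decay=1e-4, dampening=0, nesterov=True.
mu, lr, wd, damp = 0.9, 0.001, 1e-4, 0.0

def sgd_step(param: float, grad: float, buf: float) -> tuple[float, float]:
    grad = grad + wd * param              # L2 weight decay folded into the gradient
    buf = mu * buf + (1 - damp) * grad    # update the momentum buffer
    step = grad + mu * buf                # Nesterov: look ahead along the momentum direction
    return param - lr * step, buf

p, b = sgd_step(1.0, 0.5, 0.0)            # first step starts from an empty buffer
```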
diff --git a/src/grads/data.py b/src/grads/data.py
new file mode 100644
index 0000000..b62e96c
--- /dev/null
+++ b/src/grads/data.py
@@ -0,0 +1,106 @@
+import torchvision
+from torch.utils.data import DataLoader
+
+
+class MNIST:
+    DEFAULT_BATCH_SIZE = 256
+    # Per-channel mean/std of the MNIST training set.
+    TF = torchvision.transforms.Compose([
+        torchvision.transforms.ToTensor(),
+        torchvision.transforms.Normalize((0.1307,), (0.3081,))])
+
+    def __init__(self, eval_only: bool = False):
+        self.eval_only = eval_only
+        self.mnist = self.get_mnist()
+
+    def get_mnist(self) -> tuple[DataLoader | None, DataLoader]:
+        mnist_eval = torchvision.datasets.MNIST(
+            "./data", download=True, train=False, transform=self.TF)
+        eval_loader = DataLoader(
+            mnist_eval, batch_size=self.DEFAULT_BATCH_SIZE, shuffle=False)
+
+        if self.eval_only:
+            return None, eval_loader
+
+        mnist_train = torchvision.datasets.MNIST(
+            "./data", download=True, train=True, transform=self.TF)
+        train_loader = DataLoader(
+            mnist_train, batch_size=self.DEFAULT_BATCH_SIZE, shuffle=True)
+        return train_loader, eval_loader
+
+
+class CIFAR10:
+    DEFAULT_BATCH_SIZE = 256
+    # Light augmentation for training; evaluation only normalizes.
+    TF_TRAIN = torchvision.transforms.Compose([
+        torchvision.transforms.RandomCrop(32, padding=4),
+        torchvision.transforms.RandomHorizontalFlip(),
+        torchvision.transforms.ToTensor(),
+        torchvision.transforms.Normalize(
+            (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+    ])
+    TF_EVAL = torchvision.transforms.Compose([
+        torchvision.transforms.ToTensor(),
+        torchvision.transforms.Normalize(
+            (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+    ])
+
+    def __init__(self, eval_only: bool = False):
+        self.eval_only = eval_only
+        self.cifar10 = self.get_cifar10()
+
+    def get_cifar10(self) -> tuple[DataLoader | None, DataLoader]:
+        cifar10_eval = torchvision.datasets.CIFAR10(
+            "./data", download=True, train=False, transform=self.TF_EVAL)
+        eval_loader = DataLoader(
+            cifar10_eval, batch_size=self.DEFAULT_BATCH_SIZE, shuffle=False)
+
+        if self.eval_only:
+            return None, eval_loader
+
+        cifar10_train = torchvision.datasets.CIFAR10(
+            "./data", download=True, train=True, transform=self.TF_TRAIN)
+        train_loader = DataLoader(
+            cifar10_train, batch_size=self.DEFAULT_BATCH_SIZE, shuffle=True)
+        return train_loader, eval_loader
+
+
+class CIFAR100:
+    DEFAULT_BATCH_SIZE = 256
+    # NOTE: these are the CIFAR-10 channel statistics, reused here for CIFAR-100.
+    TF_TRAIN = torchvision.transforms.Compose([
+        torchvision.transforms.RandomCrop(32, padding=4),
+        torchvision.transforms.RandomHorizontalFlip(),
+        torchvision.transforms.ToTensor(),
+        torchvision.transforms.Normalize(
+            (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+    ])
+    TF_EVAL = torchvision.transforms.Compose([
+        torchvision.transforms.ToTensor(),
+        torchvision.transforms.Normalize(
+            (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+    ])
+
+    def __init__(self, eval_only: bool = False):
+        self.eval_only = eval_only
+        self.cifar100 = self.get_cifar100()
+
+    def get_cifar100(self) -> tuple[DataLoader | None, DataLoader]:
+        eval_dataset = torchvision.datasets.CIFAR100(
+            "./data", download=True, train=False, transform=self.TF_EVAL)
+        eval_loader = DataLoader(
+            eval_dataset, batch_size=self.DEFAULT_BATCH_SIZE, shuffle=False)
+
+        if self.eval_only:
+            return None, eval_loader
+
+        train_dataset = torchvision.datasets.CIFAR100(
+            "./data", download=True, train=True, transform=self.TF_TRAIN)
+        train_loader = DataLoader(
+            train_dataset, batch_size=self.DEFAULT_BATCH_SIZE, shuffle=True)
+
+        return train_loader, eval_loader
diff --git a/src/grads/grads.png b/src/grads/grads.png
new file mode 100644
index 0000000..708a027
Binary files /dev/null and b/src/grads/grads.png differ
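The `Normalize` constants above are per-channel statistics of the corresponding training sets. A minimal sketch of how such values can be recomputed (same `./data` path as above, CIFAR-10 shown):

```python
import torch
import torchvision
from torch.utils.data import DataLoader

# Accumulate per-channel sums over the raw (un-normalized) training images.
ds = torchvision.datasets.CIFAR10(
    "./data", download=True, train=True,
    transform=torchvision.transforms.ToTensor())
loader = DataLoader(ds, batch_size=1024)

n, mean, sq = 0, torch.zeros(3), torch.zeros(3)
for x, _ in loader:
    n += x.shape[0] * x.shape[2] * x.shape[3]   # total pixels per channel
    mean += x.sum(dim=(0, 2, 3))
    sq += (x ** 2).sum(dim=(0, 2, 3))
mean /= n
std = (sq / n - mean ** 2).sqrt()
print(mean, std)  # per-channel statistics for the Normalize transform
```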
diff --git a/src/grads/main.py b/src/grads/main.py
new file mode 100644
index 0000000..e84f359
--- /dev/null
+++ b/src/grads/main.py
@@ -0,0 +1,202 @@
+import json
+import os
+import time
+from typing import Any, Mapping
+
+import torch
+import torch.nn as nn
+from matplotlib import pyplot as plt
+from tqdm import trange
+
+from config import Conv_Configs
+from data import CIFAR10, CIFAR100, MNIST
+
+# Prefer Apple's MPS backend on macOS; os.uname() is not available on Windows.
+DEFAULT_DEVICE: torch.device = torch.device(
+    "mps") if os.uname().sysname == "Darwin" else torch.device("cpu")
+
+
+def collect_grads(model: nn.Module, collapsed: bool = True):
+    """Collect per-parameter gradients; with `collapsed=True`, flatten them
+    into a single list of scalars."""
+    grads = []
+    for param_name, param_value in model.named_parameters():
+        # Check for None *before* dereferencing .grad (parameters that did not
+        # take part in the backward pass have no gradient).
+        if param_value.grad is None:
+            continue
+        grads.append(
+            (param_name, param_value.grad.detach().cpu().flatten().numpy()))
+    if collapsed:
+        # Plain floats keep the downstream JSON report serializable.
+        return [float(v) for _, g in grads for v in g]
+    return grads
+
+
+def plot_grads(report: Mapping[str, Any]):
+    """Draw one log-scale histogram of gradient values per epoch."""
+    epoch_keys = [k for k in report["training"] if k.startswith("epoch-")]
+    fig = plt.figure(1, figsize=(24, 12))
+    for idx, k in enumerate(epoch_keys):
+        v = report["training"][k]
+        ax = fig.add_subplot(1, len(epoch_keys), idx + 1)
+        plt.hist(v["grads"], bins=100, log=True)
+        ax.set_xlim(-0.25, 0.25)
+        # Stringify the gradients so the report stays JSON-serializable.
+        v["grads"] = str(v["grads"])
+
+    plt.tight_layout()
+    plt.savefig("grads.png", dpi=300, bbox_inches='tight')
+
+
+def train(training_configs: Conv_Configs, save_state_dict: bool = False):
+    """Train a model based on a configuration.
+
+    Args:
+        save_state_dict: if `True`, save the trained weights locally.
+    """
+    device = training_configs.device
+    loss_fn = training_configs.loss_fn
+    optimizer = training_configs.optimizer
+    data = training_configs.train_loader
+    epochs = training_configs.epochs
+    model = training_configs.model
+
+    report = {}
+    losses = []
+    best_loss = (0, torch.inf)
+    accuracies = []
+    best_accuracy = (0, 0)
+
+    train_start = time.time()
+    model.to(device)
+    model.train()
+
+    for epoch in (t := trange(0, epochs)):
+        start = time.time()
+        epoch_loss = 0
+        predictions = 0
+        for x, y_true in data:
+            x, y_true = x.to(device), y_true.to(device)
+            optimizer.zero_grad()
+
+            y = model(x)
+            loss = loss_fn(y, y_true)
+            epoch_loss += loss.item()
+            predictions += (y.argmax(dim=1) == y_true).sum().item()
+
+            loss.backward()
+            optimizer.step()
+
+        epoch_loss /= len(data)
+        losses.append(epoch_loss)
+        accuracy = predictions / len(data.dataset) * 100
+        accuracies.append(accuracy)
+
+        if epoch_loss < best_loss[1]:
+            best_loss = (epoch, epoch_loss)
+        if accuracy > best_accuracy[1]:
+            best_accuracy = (epoch, accuracy)
+
+        report[f'epoch-{epoch+1}'] = {
+            "loss": epoch_loss,
+            "acc": accuracy,
+            "epoch_time": time.time() - start,
+            "best_loss": best_loss[1],
+            "best_acc": best_accuracy[1],
+            # Note: these are the gradients left over from the *last* batch of the epoch.
+            "grads": collect_grads(model),
+        }
+        t.set_description(
+            f'Epoch: {epoch+1} -> Loss: {epoch_loss:.3f} | Acc: {accuracy:.3f} %')
+
+    train_time = time.time() - train_start
+
+    report["loss"] = best_loss
+    report["acc"] = best_accuracy
+    report["train_time"] = train_time
+
+    if save_state_dict:
+        # Save only the weights, as the flag name indicates.
+        torch.save(model.state_dict(), f'{model.model_name}-trained.pth')
+
+    return report
+
+
+def evaluate(configs: Conv_Configs):
+    """Measure loss and accuracy on the evaluation split."""
+    model = configs.model
+    device = configs.device
+    data = configs.eval_loader
+    loss_fn = configs.loss_fn
+
+    model.to(device)
+    model.eval()
+    running_loss = 0
+    acc = 0
+    start = time.time()
+    with torch.no_grad():
+        for x, y_true in data:
+            x, y_true = x.to(device), y_true.to(device)
+
+            y = model(x)
+
+            loss = loss_fn(y, y_true)
+            running_loss += loss.item()
+
+            acc += (torch.argmax(y, dim=1) == y_true).sum().item()
+
+    return {
+        "loss": running_loss / len(data),
+        "acc": float(acc / len(data.dataset) * 100),
+        "eval_time": time.time() - start,
+    }
+
+
+class DotDict(dict):
+    """dict with attribute-style access."""
+
+    def __getattr__(self, attr):
+        return self.get(attr)
+
+    def __setattr__(self, attr, value):
+        self[attr] = value
+
+    def __delattr__(self, attr):
+        del self[attr]
+
+
+if __name__ == "__main__":
+    data_configs = DotDict({
+        "cifar10": DotDict({"data": CIFAR10().cifar10, "num_classes": 10, "in_channels": 3,
+                            "out_channels": 32, "height": 32, "width": 32}),
+        "cifar100": DotDict({"data": CIFAR100().cifar100, "num_classes": 100, "in_channels": 3,
+                             "out_channels": 32, "height": 32, "width": 32}),
+        "mnist": DotDict({"data": MNIST().mnist, "num_classes": 10, "in_channels": 1,
+                          "out_channels": 32, "height": 28, "width": 28}),
+    })
+    data_cfgs = data_configs.cifar10
+    epochs = 10
+
+    model_configs = Conv_Configs(data_cfgs.data,
+                                 device=DEFAULT_DEVICE,
+                                 epochs=epochs,
+                                 n_layers=3,
+                                 hidden_units=128,
+                                 num_classes=data_cfgs.num_classes,
+                                 in_channels=data_cfgs.in_channels,
+                                 out_channels=data_cfgs.out_channels,
+                                 image_size=(data_cfgs.height, data_cfgs.width),
+                                 maxpool=True,
+                                 logits=True,
+                                 use_adam=False)
+
+    report = {}
+    report["hash"] = model_configs.hash
+    report["loss_fn"] = f'{type(model_configs.loss_fn).__name__}'
+    report["optimizer"] = f'{type(model_configs.optimizer).__name__}'
+
+    # --------- perform training ---------
+    report["training"] = train(model_configs, save_state_dict=True)
+    # --------- measure model performance ---------
+    report["eval"] = evaluate(model_configs)
+
+    plot_grads(report)
+
+    with open("training_report.json", 'w') as dumper:
+        json.dump(report, dumper)
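For reference, a minimal sketch of how the resulting `training_report.json` can be inspected afterwards (file and key names as produced by `main.py` above):

```python
import json

# Load the report written at the end of main.py and print a per-epoch summary.
with open("training_report.json") as fh:
    report = json.load(fh)

for key, entry in report["training"].items():
    if key.startswith("epoch-"):
        print(f'{key}: loss={entry["loss"]:.3f} acc={entry["acc"]:.2f}% '
              f'({entry["epoch_time"]:.1f}s)')
print("eval:", report["eval"])
```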
"height": 32, "width": 32}), + "cifar100": DotDict({"data": CIFAR100().cifar100, "num_classes": 100, "in_channels": 3, "out_channels": 32, "height": 32, "width": 32}), + "mnist": DotDict({"data": MNIST().mnist, "num_classes": 10, "in_channels": 1, "out_channels": 32, "height": 28, "width": 28}), + }) + data_cfgs = data_configs.cifar10 + epochs = 10 + + model_configs = Conv_Configs(data_cfgs.data, + device=DEFAULT_DEVICE, + epochs=epochs, + n_layers=3, + hidden_units=128, + num_classes=data_cfgs.num_classes, + in_channels=data_cfgs.in_channels, + out_channels=data_cfgs.out_channels, + image_size=(data_cfgs.height, + data_cfgs.width), + maxpool=True, + logits=True, + use_adam=False) + + report = {} + report["hash"] = model_configs.hash + report["loss_fn"] = f'{type(model_configs.loss_fn).__name__}' + report["optimizer"] = f'{type(model_configs.optimizer).__name__}' + + # --------- perform training --------- + report["training"] = train(model_configs, save_state_dict=True) + # --------- measure model performance --------- + report["eval"] = eval(model_configs) + + plot_grads(report) + + with open("training_report.json", 'w') as dumper: + json.dump(report, dumper) diff --git a/src/grads/model.py b/src/grads/model.py new file mode 100644 index 0000000..afbb8e7 --- /dev/null +++ b/src/grads/model.py @@ -0,0 +1,74 @@ +import torch +import torch.nn as nn +import math + +from collections import OrderedDict + + +class Conv_Model(nn.Module): + def __init__(self, num_classes: int, n_layers: int, hidden_units: int, in_channels: int, out_channels: int, image_size: tuple[int, int], maxpool: bool, logits: bool): + super(Conv_Model, self).__init__() + self.model_name = self._get_name() + self.num_classes = num_classes + self.n_layers = n_layers + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_units = hidden_units + self.height, self.width = image_size + self.logits = logits + self.maxpool = maxpool + + # maxpool config + self.mp2d_padding = 0 + self.mp2d_stride = 2 + self.mp2d_kernel = 2 + self.mp2d_dilation = 1 + + self.new_size = math.floor( + (self.height+2*self.mp2d_padding-self.mp2d_kernel)/self.mp2d_stride+1) + + self.conv, self.linear = self.make_layers() + + def forward(self, x: torch.Tensor): + x = self.conv(x) + if self.maxpool == True: + x = nn.ReLU(inplace=True)(nn.MaxPool2d(kernel_size=self.mp2d_kernel, + stride=self.mp2d_stride, + padding=self.mp2d_padding)(x)) + x = x.view(x.shape[0], -1) + x = self.linear(x) + if self.logits == True: + return x + return nn.functional.log_softmax(x, dim=1) + + def _init_conv_layers(self): + layers = nn.Sequential(OrderedDict([("conv1", nn.Conv2d( + in_channels=self.in_channels, out_channels=self.out_channels, kernel_size=3, padding='same', bias=False))])) + layers.add_module("bn1", nn.BatchNorm2d(self.out_channels)) + for idx in range(self.n_layers-1): + conv = nn.Conv2d(self.out_channels, self.out_channels, + kernel_size=3, padding="same", bias=False) + layers.add_module(f'conv{idx+2}', conv) + layers.add_module(f'bn{idx+2}', nn.BatchNorm2d(self.out_channels)) + layers.add_module(f'relu{idx+2}', nn.ReLU(inplace=True)) + return layers + + def _init_linear_layers(self): + layers = nn.Sequential(OrderedDict( + [("linear1", nn.Linear(self.out_channels*self.new_size**2, + self.hidden_units, bias=False)), ("relu1", nn.ReLU(inplace=True))])) + + for idx in range(self.n_layers-1): + layers.add_module( + f'linear{idx+2}', nn.Linear(self.hidden_units, self.hidden_units, bias=False)) + layers.add_module(f'relu{idx+2}', 
nn.ReLU(inplace=True)) + + layers.add_module(f'linear{self.n_layers+1}', nn.Linear(self.hidden_units, self.num_classes, + bias=False)) + return layers + + def make_layers(self): + conv_layers = self._init_conv_layers() + linear_layers = self._init_linear_layers() + + return conv_layers, linear_layers
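As a sanity check on the size arithmetic above, a short sketch (hyperparameter values are illustrative, matching the CIFAR-10 run in main.py):

```python
import torch
from model import Conv_Model

# A 3-layer model on 32x32 RGB input, as configured in main.py.
model = Conv_Model(num_classes=10, n_layers=3, hidden_units=128,
                   in_channels=3, out_channels=32, image_size=(32, 32),
                   maxpool=True, logits=True)
x = torch.randn(4, 3, 32, 32)   # a batch of four CIFAR-sized images
y = model(x)
print(y.shape)                  # torch.Size([4, 10])
# With maxpool=True the 32x32 maps are pooled to 16x16 before flattening,
# so the first linear layer sees 32 * 16 * 16 = 8192 input features.
```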