
Feature/grads #4


Open. Wants to merge 14 commits into base: main.
4 changes: 4 additions & 0 deletions .gitignore
@@ -63,3 +63,7 @@ src/ml-adder/.env

src/ml-adder/adnet
tutorials/embeddings/__pycache__/

src/grads/__pycache__/
src/grads/data
src/grads/training_report.json
4 changes: 4 additions & 0 deletions .vscode/settings.json
@@ -6,22 +6,26 @@
"Backpropagation",
"bigram",
"bigrams",
"cfgs",
"dotenv",
"ffwd",
"Goodfellow",
"huggingface",
"kaiming",
"localdata",
"logsoftmax",
"maxpool",
"Morphence",
"multinomial",
"Nesterov",
"NGRAM",
"ngrams",
"OPENAI",
"optim",
"randn",
"relu",
"softmax",
"sysname",
"tqdm",
"trange",
"tril",
1 change: 1 addition & 0 deletions src/grads/.python-version
@@ -0,0 +1 @@
devops
44 changes: 44 additions & 0 deletions src/grads/config.py
@@ -0,0 +1,44 @@

import torch
import torch.nn as nn

import model as m


import hashlib


class Conv_Configs():
adam = torch.optim.Adam
sgd = torch.optim.SGD
sgd_kwargs = {
'lr': 0.001, # Learning rate
'momentum': 0.9, # Momentum value
'weight_decay': 1e-4, # L2 regularization (weight decay)
'dampening': 0, # Dampening for momentum
'nesterov': True # Use Nesterov momentum
}
adam_kwargs = {
'lr': 0.001, # Learning rate
'betas': (0.9, 0.999), # Momentum terms (first is for momentum)
'eps': 1e-8, # Small epsilon to avoid division by zero
'weight_decay': 1e-4 # L2 regularization (optional)
}

def __init__(self, data: tuple, device: torch.device, epochs: int, n_layers: int, hidden_units: int, num_classes: int, in_channels: int, out_channels: int, image_size: tuple[int, int], maxpool: bool, logits: bool = False, use_adam: bool = False) -> None:
self.model: nn.Module = m.Conv_Model(num_classes=num_classes,
n_layers=n_layers,
hidden_units=hidden_units,
in_channels=in_channels,
out_channels=out_channels,
image_size=image_size,
maxpool=maxpool,
logits=logits)
self.device = device
        self.loss_fn = torch.nn.CrossEntropyLoss() if logits else torch.nn.NLLLoss()
        self.optimizer = self.adam(self.model.parameters(), **self.adam_kwargs) if use_adam else self.sgd(
            self.model.parameters(), **self.sgd_kwargs)
self.epochs = epochs
self.train_loader, self.eval_loader = data
        # Fingerprint of the key hyperparameters, used to tag the training report.
        self.hash = hashlib.sha256(
            f'{device}{epochs}{n_layers}{hidden_units}{out_channels}{logits}'.encode()).hexdigest()
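
For reference, a minimal usage sketch of `Conv_Configs` (illustrative values mirroring `main.py`; assumes `model.Conv_Model` and the MNIST loaders from `data.py` below are importable):

import torch
from data import MNIST
from config import Conv_Configs

# Illustrative only: wire the MNIST loaders into a config on CPU.
cfg = Conv_Configs(data=MNIST().mnist,
                   device=torch.device("cpu"),
                   epochs=2,
                   n_layers=3,
                   hidden_units=128,
                   num_classes=10,
                   in_channels=1,
                   out_channels=32,
                   image_size=(28, 28),
                   maxpool=True,
                   logits=True,
                   use_adam=True)
print(cfg.hash)  # fingerprint that ends up in the training report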
106 changes: 106 additions & 0 deletions src/grads/data.py
@@ -0,0 +1,106 @@
import torch
import torch.utils.data
import torchvision
from torch.utils.data import DataLoader


class MNIST:
DEFAULT_BATCH_SIZE = 256
TF = torchvision.transforms.Compose([torchvision.transforms.ToTensor(
), torchvision.transforms.Normalize((0.1307,), (0.3081,))])

def __init__(self, eval_only: bool = False):
self.eval_only = eval_only

self.mnist = self.get_mnist()

def get_mnist(self) -> tuple[DataLoader | None, DataLoader]:
mnist_eval = torchvision.datasets.MNIST(
"./data", download=True, train=False, transform=self.TF)
eval_loader = DataLoader(
mnist_eval, batch_size=self.DEFAULT_BATCH_SIZE, shuffle=False)

        if self.eval_only:
return None, eval_loader

mnist_train = torchvision.datasets.MNIST(
"./data", download=True, train=True, transform=self.TF)
train_loader = DataLoader(
mnist_train, batch_size=self.DEFAULT_BATCH_SIZE, shuffle=True)
return train_loader, eval_loader


class CIFAR10:
DEFAULT_BATCH_SIZE = 256
TF_TRAIN = torchvision.transforms.Compose([
torchvision.transforms.RandomCrop(32, padding=4),
torchvision.transforms.RandomHorizontalFlip(),
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize(
(0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
TF_EVAL = torchvision.transforms.Compose([
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize(
(0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

def __init__(self, eval_only: bool = False):
self.eval_only = eval_only

self.cifar10 = self.get_cifar10()

def get_cifar10(self) -> tuple[DataLoader | None, DataLoader]:
cifar10_eval = torchvision.datasets.CIFAR10(
"./data", download=True, train=False, transform=self.TF_EVAL)
eval_loader = torch.utils.data.DataLoader(
cifar10_eval, batch_size=self.DEFAULT_BATCH_SIZE, shuffle=False)

        if self.eval_only:
return None, eval_loader

cifar10_train = torchvision.datasets.CIFAR10(
"./data", download=True, train=True, transform=self.TF_TRAIN)
train_loader = torch.utils.data.DataLoader(
cifar10_train, batch_size=self.DEFAULT_BATCH_SIZE, shuffle=True)
return train_loader, eval_loader


class CIFAR100:
DEFAULT_BATCH_SIZE = 256

TF_TRAIN = torchvision.transforms.Compose([
torchvision.transforms.RandomCrop(32, padding=4),
torchvision.transforms.RandomHorizontalFlip(),
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize(
(0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
TF_EVAL = torchvision.transforms.Compose([
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize(
(0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

def __init__(self, eval_only: bool = False):
self.eval_only = eval_only

self.cifar100 = self.get_cifar100()

def get_cifar100(self) -> tuple[DataLoader | None, DataLoader]:
eval_dataset = torchvision.datasets.CIFAR100(
"./data", download=True, train=False, transform=self.TF_EVAL)
eval_loader = torch.utils.data.DataLoader(
eval_dataset, batch_size=self.DEFAULT_BATCH_SIZE, shuffle=False)

if self.eval_only:
return None, eval_loader

train_dataset = torchvision.datasets.CIFAR100(
"./data", download=True, train=True, transform=self.TF_TRAIN)
train_loader = torch.utils.data.DataLoader(
train_dataset, batch_size=self.DEFAULT_BATCH_SIZE, shuffle=True)

return train_loader, eval_loader
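
As a quick sanity check, the loaders can be used like this (a sketch; `eval_only=True` returns `(None, eval_loader)` and the datasets are downloaded into `./data`):

from data import CIFAR10

# Each wrapper exposes a (train_loader, eval_loader) tuple.
train_loader, eval_loader = CIFAR10().cifar10
images, labels = next(iter(train_loader))
print(images.shape, labels.shape)  # torch.Size([256, 3, 32, 32]) torch.Size([256])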
Binary file added src/grads/grads.png
202 changes: 202 additions & 0 deletions src/grads/main.py
@@ -0,0 +1,202 @@

from typing import Mapping, Any

import torch
import torch.nn as nn

from matplotlib import pyplot as plt
from tqdm import trange

import time
import os
import json

from data import MNIST, CIFAR10, CIFAR100
from config import Conv_Configs


# Prefer Apple's MPS backend on macOS; fall back to CPU elsewhere.
DEFAULT_DEVICE: torch.device = torch.device(
    "mps") if os.uname().sysname == "Darwin" else torch.device("cpu")


def collect_grads(model: nn.Module, collapsed: bool = True) -> list:
    """Collects gradients from every parameter; returns a flat list of values
    when `collapsed`, otherwise a list of (name, gradient array) tuples."""
    grads = []
    for param_name, param_value in model.named_parameters():
        # Skip parameters without gradients (e.g. frozen or unused layers).
        if param_value.grad is None:
            continue
        grads.append((param_name, param_value.grad.detach().cpu().flatten().numpy()))
    if collapsed:
        from itertools import chain
        return list(chain.from_iterable(g for _, g in grads))
    return grads


def plot_grads(report: Mapping[str, Any]):
    # One log-scaled histogram of gradient values per training epoch.
    epoch_keys = [k for k in report["training"] if k.startswith("epoch-")]
    fig = plt.figure(1, figsize=(24, 12))
    for idx, k in enumerate(epoch_keys):
        v = report["training"][k]
        ax = fig.add_subplot(1, len(epoch_keys), idx + 1)
        plt.hist(v["grads"], bins=100, log=True)
        ax.set_xlim(-0.25, 0.25)
        # Stringify the raw gradients so the report stays JSON-serializable.
        v["grads"] = str(v["grads"])

    plt.tight_layout()
    plt.savefig("grads.png", dpi=300, bbox_inches='tight')


def train(training_configs: Conv_Configs, with_grads: bool = False, save_state_dict: bool = False):
"""
- trains a model based on a configuration

Args:
- `with_grads`: if set to `True` it will print grads during the training
- `save_state_dict`: if set to `True` it will save the model locally
"""
device = training_configs.device
loss_fn = training_configs.loss_fn
optimizer = training_configs.optimizer
data = training_configs.train_loader
epochs = training_configs.epochs
model = training_configs.model

report = {}
losses = []
best_loss = (0, torch.inf)
accuracies = []
best_accuracy = (0, 0)

train_start = time.time()
model.to(device)
model.train()

for epoch in (t := trange(0, epochs)):
start = time.time()
epoch_loss = 0
predictions = 0
for x, y_true in data:
x, y_true = x.to(device), y_true.to(device)
optimizer.zero_grad()

y = model(x)
loss = loss_fn(y, y_true)
epoch_loss += loss.item()
predictions += (y.argmax(dim=1) == y_true).sum().item()

loss.backward()
optimizer.step()

epoch_loss /= len(data)
losses.append(epoch_loss)
accuracy = predictions/len(data.dataset)*100
accuracies.append(accuracy)

if epoch_loss < best_loss[1]:
best_loss = (epoch, epoch_loss)
if accuracy > best_accuracy[1]:
best_accuracy = (epoch, accuracy)

report[f'epoch-{epoch+1}'] = {
"loss": epoch_loss,
"acc": accuracy,
"epoch_time": time.time()-start,
"best_loss": best_loss[1],
"best_acc": best_accuracy[1],
"grads": collect_grads(model),
}
t.set_description(
f'Epoch: {epoch+1} -> Loss: {epoch_loss:.3f} | Acc: {accuracy:.3f} %')

train_time = time.time()-train_start

report["loss"] = best_loss
report["acc"] = best_accuracy
report["train_time"] = train_time

    if save_state_dict:
        torch.save(model.state_dict(), f'{model.model_name}-trained.pth')

return report


def eval(configs: Conv_Configs):
model = configs.model
device = configs.device
data = configs.eval_loader
loss_fn = configs.loss_fn

model.to(device)
model.eval()
running_loss = 0
acc = 0
start = time.time()
with torch.no_grad():
for x, y_true in data:
x, y_true = x.to(device), y_true.to(device)

y = model(x)

loss = loss_fn(y, y_true)
running_loss += loss.item()

acc += (torch.argmax(y, dim=1) == y_true).sum().item()

return {
"loss": running_loss/len(data),
"acc": float(acc/len(data.dataset)*100),
"eval_time": time.time()-start,
}


class DotDict(dict):
    """dict with attribute-style access, used for the data configs below."""

    def __getattr__(self, attr):
return self.get(attr)

def __setattr__(self, attr, value):
self[attr] = value

def __delattr__(self, attr):
del self[attr]


if __name__ == "__main__":

data_configs = DotDict({
"cifar10": DotDict({"data": CIFAR10().cifar10, "num_classes": 10, "in_channels": 3, "out_channels": 32, "height": 32, "width": 32}),
"cifar100": DotDict({"data": CIFAR100().cifar100, "num_classes": 100, "in_channels": 3, "out_channels": 32, "height": 32, "width": 32}),
"mnist": DotDict({"data": MNIST().mnist, "num_classes": 10, "in_channels": 1, "out_channels": 32, "height": 28, "width": 28}),
})
data_cfgs = data_configs.cifar10
epochs = 10

model_configs = Conv_Configs(data_cfgs.data,
device=DEFAULT_DEVICE,
epochs=epochs,
n_layers=3,
hidden_units=128,
num_classes=data_cfgs.num_classes,
in_channels=data_cfgs.in_channels,
out_channels=data_cfgs.out_channels,
image_size=(data_cfgs.height,
data_cfgs.width),
maxpool=True,
logits=True,
use_adam=False)

report = {}
report["hash"] = model_configs.hash
report["loss_fn"] = f'{type(model_configs.loss_fn).__name__}'
report["optimizer"] = f'{type(model_configs.optimizer).__name__}'

# --------- perform training ---------
report["training"] = train(model_configs, save_state_dict=True)
# --------- measure model performance ---------
report["eval"] = eval(model_configs)

plot_grads(report)

with open("training_report.json", 'w') as dumper:
json.dump(report, dumper)
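
A small sketch of how the resulting report might be read back (key names as defined above; the file is gitignored, so it only exists after a local run):

import json

with open("training_report.json") as fp:
    report = json.load(fp)

print("config hash:", report["hash"])
print("eval accuracy:", report["eval"]["acc"])
# The "training" dict also holds summary keys ("loss", "acc", "train_time"),
# so filter for the per-epoch entries.
for name, stats in report["training"].items():
    if name.startswith("epoch-"):
        print(name, "loss", round(stats["loss"], 3), "acc", round(stats["acc"], 2))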