diff --git a/wresnet-pytorch/src/distillation_train.py b/wresnet-pytorch/src/distillation_train.py
index f0a5cbb..7ad9cc4 100644
--- a/wresnet-pytorch/src/distillation_train.py
+++ b/wresnet-pytorch/src/distillation_train.py
@@ -148,8 +148,6 @@ def main():
         max_grad_norm=norm,
     )
 
-
-    teacher.load_state_dict(torch.load(os.path.join("wrn-1733078278-8e-1e-05d-12.0n-dict.pt"), weights_only=True))
     teacher.to(device)
     teacher.eval()
 
diff --git a/wresnet-pytorch/src/train.py b/wresnet-pytorch/src/train.py
index 64acb55..a0db1c7 100644
--- a/wresnet-pytorch/src/train.py
+++ b/wresnet-pytorch/src/train.py
@@ -1,4 +1,5 @@
 import os
+import time
 import torch
 from torch import optim
 from torch.optim.lr_scheduler import MultiStepLR
@@ -21,11 +22,67 @@ def set_seed(seed=42):
     torch.cuda.manual_seed(seed)
 
 
-def _train_seed(net, loaders, device, dataset, log=False, checkpoint=False, logfile='', checkpointFile='', epochs=200, norm=1.0):
+def train_no_cap(net, epochs, data_loader, device, optimizer, criterion, scheduler, test_loader, log, logfile, checkpointFile):
+    best_test_set_accuracy = 0
+
+    for epoch in range(epochs):
+        net.train()
+        #for i, data in tqdm(enumerate(train_loader, 0), leave=False):
+        for i, data in enumerate(data_loader, 0):
+            inputs, labels = data
+            inputs = inputs.to(device)
+            labels = labels.to(device)
+
+            optimizer.zero_grad()
+
+            wrn_outputs = net(inputs)
+            outputs = wrn_outputs[0]
+            loss = criterion(outputs, labels)
+            loss.backward()
+            optimizer.step()
+
+        scheduler.step()
+
+        if epoch % 10 == 0 or epoch == epochs - 1:
+            with torch.no_grad():
+
+                correct = 0
+                total = 0
+
+                net.eval()
+                for data in test_loader:
+                    images, labels = data
+                    images = images.to(device)
+                    labels = labels.to(device)
+
+                    wrn_outputs = net(images)
+                    outputs = wrn_outputs[0]
+                    _, predicted = torch.max(outputs.data, 1)
+                    total += labels.size(0)
+                    correct += (predicted == labels).sum().item()
+
+            epoch_accuracy = correct / total
+            epoch_accuracy = round(100 * epoch_accuracy, 2)
+
+            if log:
+                print('Accuracy at epoch {} is {}%'.format(epoch + 1, epoch_accuracy))
+                with open(logfile, 'a') as temp:
+                    temp.write('Accuracy at epoch {} is {}%\n'.format(epoch + 1, epoch_accuracy))
+
+            if epoch_accuracy > best_test_set_accuracy:
+                best_test_set_accuracy = epoch_accuracy
+                torch.save(net.state_dict(), checkpointFile)
+
+    return best_test_set_accuracy
+
+
+def _train_seed(net, loaders, device, dataset, log=False, logfile='', epochs=200, norm=1.0):
     train_loader, test_loader = loaders
 
-    dp_epsilon = 8
+    dp_epsilon = None
     dp_delta = 1e-5
+    checkpointFile = 'wrn-{}-{}e-{}d-{}n-dict.pt'.format(int(time.time()), dp_epsilon, dp_delta, norm)
+
     if dp_epsilon is not None:
         print(f"DP epsilon = {dp_epsilon}, delta = {dp_delta}")
         #net = ModuleValidator.fix(net, replace_bn_with_in=True)
@@ -36,8 +93,6 @@ def _train_seed(net, loaders, device, dataset, log=False, checkpoint=False, logf
     optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9, nesterov=True, weight_decay=5e-4)
     scheduler = MultiStepLR(optimizer, milestones=[int(elem*epochs) for elem in [0.3, 0.6, 0.8]], gamma=0.2)
 
-    best_test_set_accuracy = 0
-
     if dp_epsilon is not None:
         privacy_engine = opacus.PrivacyEngine()
         net, optimizer, train_loader = privacy_engine.make_private_with_epsilon(
@@ -55,60 +110,16 @@ def _train_seed(net, loaders, device, dataset, log=False, checkpoint=False, logf
         print("Training without differential privacy")
     print(f"Training with {epochs} epochs")
 
-    #for epoch in tqdm(range(epochs)):
-    with BatchMemoryManager(
-        data_loader=train_loader,
-        max_physical_batch_size=1000, # Roughly 12gb vram, uses 9.4
-        optimizer=optimizer
-    ) as memory_safe_data_loader:
-        for epoch in range(epochs):
-            net.train()
-            #for i, data in tqdm(enumerate(train_loader, 0), leave=False):
-            for i, data in enumerate(memory_safe_data_loader, 0):
-                inputs, labels = data
-                inputs = inputs.to(device)
-                labels = labels.to(device)
-                optimizer.zero_grad()
-
-                wrn_outputs = net(inputs)
-                outputs = wrn_outputs[0]
-                loss = criterion(outputs, labels)
-                loss.backward()
-                optimizer.step()
-
-            scheduler.step()
-
-            if epoch % 10 == 0 or epoch == epochs - 1:
-                with torch.no_grad():
-
-                    correct = 0
-                    total = 0
-
-                    net.eval()
-                    for data in test_loader:
-                        images, labels = data
-                        images = images.to(device)
-                        labels = labels.to(device)
-
-                        wrn_outputs = net(images)
-                        outputs = wrn_outputs[0]
-                        _, predicted = torch.max(outputs.data, 1)
-                        total += labels.size(0)
-                        correct += (predicted == labels).sum().item()
-
-                epoch_accuracy = correct / total
-                epoch_accuracy = round(100 * epoch_accuracy, 2)
-
-                if log:
-                    print('Accuracy at epoch {} is {}%'.format(epoch + 1, epoch_accuracy))
-                    with open(logfile, 'a') as temp:
-                        temp.write('Accuracy at epoch {} is {}%\n'.format(epoch + 1, epoch_accuracy))
-
-                if epoch_accuracy > best_test_set_accuracy:
-                    best_test_set_accuracy = epoch_accuracy
-                    if checkpoint:
-                        torch.save(net.state_dict(), checkpointFile)
+    if dp_epsilon is not None:
+        with BatchMemoryManager(
+            data_loader=train_loader,
+            max_physical_batch_size=1000, # Roughly 12gb vram, uses 9.4
+            optimizer=optimizer
+        ) as memory_safe_data_loader:
+            best_test_set_accuracy = train_no_cap(net, epochs, memory_safe_data_loader, device, optimizer, criterion, scheduler, test_loader, log, logfile, checkpointFile)
+    else:
+        best_test_set_accuracy = train_no_cap(net, epochs, train_loader, device, optimizer, criterion, scheduler, test_loader, log, logfile, checkpointFile)
 
     return best_test_set_accuracy
 
@@ -154,9 +165,8 @@ def train(args):
         net = WideResNet(d=wrn_depth, k=wrn_width, n_classes=10, input_features=3, output_features=16, strides=strides)
         net = net.to(device)
 
-        checkpointFile = 'wrn-{}-{}-seed-{}-{}-dict.pth'.format(wrn_depth, wrn_width, dataset, seed) if checkpoint else ''
        epochs = training_configurations.epochs
-        best_test_set_accuracy = _train_seed(net, loaders, device, dataset, log, checkpoint, logfile, checkpointFile, epochs, args.norm)
+        best_test_set_accuracy = _train_seed(net, loaders, device, dataset, log, logfile, epochs, args.norm)
 
         if log:
             with open(logfile, 'a') as temp:
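
Note (not part of the diff): below is a minimal sketch of how a checkpoint written by the new train_no_cap() path could be reloaded, for instance as the teacher in distillation_train.py. The WideResNet hyperparameters and the import path are assumptions and must match whatever was used at save time; the filename follows the new 'wrn-{timestamp}-{epsilon}e-{delta}d-{norm}n-dict.pt' pattern introduced in _train_seed().

    import torch
    from WideResNet import WideResNet  # assumed import path for this repo's model

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Illustrative hyperparameters; they must match the trained network.
    net = WideResNet(d=16, k=1, n_classes=10, input_features=3,
                     output_features=16, strides=[1, 1, 2, 2])

    # weights_only=True restricts unpickling to tensors (safer load).
    state_dict = torch.load("wrn-1733078278-8e-1e-05d-12.0n-dict.pt",
                            weights_only=True)

    # If the checkpoint was saved while the net was wrapped by Opacus'
    # GradSampleModule (the DP branch), keys carry a "_module." prefix
    # that must be stripped before loading into a plain WideResNet.
    state_dict = {(k[len("_module."):] if k.startswith("_module.") else k): v
                  for k, v in state_dict.items()}

    net.load_state_dict(state_dict)
    net.to(device)
    net.eval()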