From 339261c19f0fcb4ff366c5725790e8b8fe8273d8 Mon Sep 17 00:00:00 2001 From: "Cody A. Coleman" Date: Mon, 11 Dec 2017 11:35:48 -0800 Subject: [PATCH] Update pytorch benchmark code with new command line interface --- pytorch/CIFAR10/.gitignore | 6 + pytorch/CIFAR10/benchmark/cifar10/__init__.py | 0 pytorch/CIFAR10/benchmark/cifar10/__main__.py | 17 + .../CIFAR10/benchmark/{ => cifar10}/infer.py | 23 +- .../{ => cifar10}/models/densenet.py | 0 .../benchmark/{ => cifar10}/models/resnet.py | 119 +++++- pytorch/CIFAR10/benchmark/cifar10/results.py | 153 +++++++ pytorch/CIFAR10/benchmark/cifar10/train.py | 374 +++++++++++++++++ .../CIFAR10/benchmark/imagenet/__main__.py | 14 + pytorch/CIFAR10/benchmark/imagenet/train.py | 339 ++++++++++++++++ pytorch/CIFAR10/benchmark/train.py | 336 --------------- pytorch/CIFAR10/benchmark/utils.py | 61 +++ pytorch/CIFAR10/benchmark/yellowfin.py | 384 ++++++++++++++++++ pytorch/CIFAR10/setup.py | 5 +- 14 files changed, 1476 insertions(+), 355 deletions(-) create mode 100644 pytorch/CIFAR10/.gitignore create mode 100644 pytorch/CIFAR10/benchmark/cifar10/__init__.py create mode 100644 pytorch/CIFAR10/benchmark/cifar10/__main__.py rename pytorch/CIFAR10/benchmark/{ => cifar10}/infer.py (87%) rename pytorch/CIFAR10/benchmark/{ => cifar10}/models/densenet.py (100%) rename pytorch/CIFAR10/benchmark/{ => cifar10}/models/resnet.py (74%) create mode 100644 pytorch/CIFAR10/benchmark/cifar10/results.py create mode 100644 pytorch/CIFAR10/benchmark/cifar10/train.py create mode 100644 pytorch/CIFAR10/benchmark/imagenet/__main__.py create mode 100644 pytorch/CIFAR10/benchmark/imagenet/train.py delete mode 100644 pytorch/CIFAR10/benchmark/train.py create mode 100644 pytorch/CIFAR10/benchmark/utils.py create mode 100644 pytorch/CIFAR10/benchmark/yellowfin.py diff --git a/pytorch/CIFAR10/.gitignore b/pytorch/CIFAR10/.gitignore new file mode 100644 index 0000000..f797795 --- /dev/null +++ b/pytorch/CIFAR10/.gitignore @@ -0,0 +1,6 @@ +*.pyc +__pycache__/ +.eggs/ +*.egg-info/ +.cache +data/ diff --git a/pytorch/CIFAR10/benchmark/cifar10/__init__.py b/pytorch/CIFAR10/benchmark/cifar10/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pytorch/CIFAR10/benchmark/cifar10/__main__.py b/pytorch/CIFAR10/benchmark/cifar10/__main__.py new file mode 100644 index 0000000..9e5be16 --- /dev/null +++ b/pytorch/CIFAR10/benchmark/cifar10/__main__.py @@ -0,0 +1,17 @@ +import click + +from benchmark.cifar10.train import train +from benchmark.cifar10.infer import infer + + +@click.group() +def cli(): + pass + + +cli.add_command(train, name='train') +cli.add_command(infer, name='infer') + + +if __name__ == '__main__': + cli() diff --git a/pytorch/CIFAR10/benchmark/infer.py b/pytorch/CIFAR10/benchmark/cifar10/infer.py similarity index 87% rename from pytorch/CIFAR10/benchmark/infer.py rename to pytorch/CIFAR10/benchmark/cifar10/infer.py index f52c194..795875b 100644 --- a/pytorch/CIFAR10/benchmark/infer.py +++ b/pytorch/CIFAR10/benchmark/cifar10/infer.py @@ -10,15 +10,20 @@ from torch.autograd import Variable from torchvision import transforms from torchvision import datasets -from benchmark.train import load, MEAN, STD, save_result, MODELS +from benchmark.utils import save_result +from benchmark.cifar10.train import MEAN, STD, MODELS class PyTorchEngine: - def __init__(self, filename, use_cuda=False, name=None): - self.filename = filename + def __init__(self, path, arch, use_cuda=False): + self.path = path self.use_cuda = use_cuda - self.name = name - model, epoch, accuracy = 
load(self.filename) + self.arch = arch + model = MODELS[self.arch]() + restored_state = torch.load(path) + model = model.load_state_dict(restored_state['model']) + accuracy = restored_state['accuracy'] + epoch = restored_state['epoch'] + 1 if self.use_cuda: self.model = model.cuda() @@ -66,13 +71,13 @@ def infer_cifar10(dataset, engine, start=1, end=128, repeat=100, log2=True, result = OrderedDict() result['nodename'] = os.uname().nodename - result['model'] = engine.name + result['model'] = engine.arch result['use_cuda'] = engine.use_cuda result['batch_size'] = batch_size result['mean'] = np.mean(times) result['std'] = np.std(times) result['throughput'] = batch_size / np.mean(times) - result['filename'] = engine.filename + result['path'] = engine.path if output is not None: save_result(result, output) @@ -122,13 +127,13 @@ def infer(dataset_dir, run_dir, output_file, start, end, repeat, log2, if cpu: print('With CPU:') - engine = PyTorchEngine(path, use_cuda=False, name=model) + engine = PyTorchEngine(path, use_cuda=False, arch=model) infer_cifar10(testset, engine, start=start, end=end, log2=log2, repeat=repeat, output=output_path) if gpu and torch.cuda.is_available(): print('With GPU:') - engine = PyTorchEngine(path, use_cuda=True, name=model) + engine = PyTorchEngine(path, use_cuda=True, arch=model) # Warmup time_batch_size(testset, 1, engine.pred, engine.use_cuda, repeat=1) diff --git a/pytorch/CIFAR10/benchmark/models/densenet.py b/pytorch/CIFAR10/benchmark/cifar10/models/densenet.py similarity index 100% rename from pytorch/CIFAR10/benchmark/models/densenet.py rename to pytorch/CIFAR10/benchmark/cifar10/models/densenet.py diff --git a/pytorch/CIFAR10/benchmark/models/resnet.py b/pytorch/CIFAR10/benchmark/cifar10/models/resnet.py similarity index 74% rename from pytorch/CIFAR10/benchmark/models/resnet.py rename to pytorch/CIFAR10/benchmark/cifar10/models/resnet.py index 54d4d8e..0908c98 100644 --- a/pytorch/CIFAR10/benchmark/models/resnet.py +++ b/pytorch/CIFAR10/benchmark/cifar10/models/resnet.py @@ -1,6 +1,7 @@ import math from functools import partial +import torch from torch import nn from torch.nn import functional as F @@ -40,6 +41,50 @@ class BasicBlock(nn.Module): return outputs +class StochasticBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, survival_rate=1): + super().__init__() + self.survival_rate = survival_rate + self.conv1 = nn.Conv2d(inplanes, planes, 3, stride=stride, padding=1, + bias=False) + self.bn1 = nn.BatchNorm2d(planes) + + self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.increasing = inplanes != (planes * self.expansion) + if self.increasing: + assert ((1. 
* planes * self.expansion) / inplanes) == 2 + if stride != 1: + self.shortcut = nn.Sequential(nn.AvgPool2d(stride)) + else: + self.shortcut = nn.Sequential() + + def forward(self, inputs): + shortcut = self.shortcut(inputs) + if self.increasing: + shortcut = torch.cat([shortcut] + [shortcut.mul(0)], 1) + + if not self.training or torch.rand(1)[0] <= self.survival_rate: + H = self.conv1(inputs) + H = self.bn1(H) + H = F.relu(H) + + H = self.conv2(H) + H = self.bn2(H) + + if self.training: + H /= self.survival_rate + H += shortcut + else: + H = shortcut + outputs = F.relu(H) + + return outputs + + class PreActBlock(nn.Module): expansion = 1 @@ -267,6 +312,30 @@ class ResNet(nn.Module): return outputs +class StochasticResNet(ResNet): + + def __init__(self, Block, layers, filters, num_classes=10, inplanes=None, + min_survival_rate=1.0, decay='linear'): + super().__init__(Block, layers, filters, + num_classes=num_classes, + inplanes=inplanes) + L = sum(layers) + l = 1 + for section_index in range(self.num_sections): + section = getattr(self, f'section_{section_index}') + for name, module in section.named_children(): + if decay == 'linear': + survival_rate = 1 - ((l / L) * (1 - min_survival_rate)) + elif decay == 'uniform': + survival_rate = min_survival_rate + else: + raise NotImplementedError( + f"{decay} decay has not been implemented.") + module.survival_rate = survival_rate + l += 1 + assert (l - 1) == L + + # From "Deep Residual Learning for Image Recognition" def ResNet20(): return ResNet(BasicBlock, layers=[3] * 3, filters=[16, 32, 64]) @@ -292,7 +361,28 @@ def ResNet1202(): return ResNet(BasicBlock, layers=[200] * 3, filters=[16, 32, 64]) -# Based on but not it "Identity Mappings in Deep Residual Networks" +# From "Identity Mappings in Deep Residual Networks" +def PreActResNet110(): + return ResNet(PreActBlock, layers=[18] * 3, filters=[16, 32, 64]) + + +def PreActResNet164(): + return ResNet(PreActBottleneck, layers=[18] * 3, filters=[16, 32, 64]) + + +def PreActResNet1001(): + return ResNet(PreActBottleneck, layers=[111] * 3, filters=[16, 32, 64]) + + +# Based on but not in "Identity Mappings in Deep Residual Networks" +def PreActResNet8(): + return ResNet(PreActBlock, layers=[1] * 3, filters=[16, 32, 64]) + + +def PreActResNet14(): + return ResNet(PreActBlock, layers=[2] * 3, filters=[16, 32, 64]) + + def PreActResNet20(): return ResNet(PreActBlock, layers=[3] * 3, filters=[16, 32, 64]) @@ -305,17 +395,30 @@ def PreActResNet164Basic(): return ResNet(PreActBlock, layers=[27] * 3, filters=[16, 32, 64]) -# From "Identity Mappings in Deep Residual Networks" -def PreActResNet110(): - return ResNet(PreActBlock, layers=[18] * 3, filters=[16, 32, 64]) +# From "Deep Networks with Stochastic Depth" +def StochasticResNet110(): + return StochasticResNet(StochasticBlock, layers=[18] * 3, + filters=[16, 32, 64], min_survival_rate=0.5, + decay='linear') -def PreActResNet164(): - return ResNet(PreActBottleneck, layers=[18] * 3, filters=[16, 32, 64]) +def StochasticResNet1202(): + return StochasticResNet(StochasticBlock, layers=[200] * 3, + filters=[16, 32, 64], min_survival_rate=0.5, + decay='linear') -def PreActResNet1001(): - return ResNet(PreActBottleneck, layers=[111] * 3, filters=[16, 32, 64]) +# Based on but not in "Deep Networks for Stochastic Depth" +def StochasticResNet56(): + return StochasticResNet(StochasticBlock, layers=[9] * 3, + filters=[16, 32, 64], min_survival_rate=0.5, + decay='linear') + + +def StochasticResNet56_08(): + return StochasticResNet(StochasticBlock, layers=[9] * 3, + 
filters=[16, 32, 64], min_survival_rate=0.8, + decay='linear') # From "Wide Residual Networks" diff --git a/pytorch/CIFAR10/benchmark/cifar10/results.py b/pytorch/CIFAR10/benchmark/cifar10/results.py new file mode 100644 index 0000000..95a3d9d --- /dev/null +++ b/pytorch/CIFAR10/benchmark/cifar10/results.py @@ -0,0 +1,153 @@ +import os +import json + +import pandas as pd + +from benchmark.cifar10.train import MODELS +from benchmark.utils import count_parameters + + +MODEL_SIZES = {key: count_parameters(MODELS[key]()) for key in MODELS.keys()} + + +def single_run_acc(df): + df = df.copy() + df['duration'] = (df['timestamp'] - df['prev_timestamp']).apply(lambda x: x.total_seconds()) + df['batch_duration'] = df['batch_duration'].apply(lambda x: x.total_seconds()) + + tmp = df.loc[:, ['epoch', 'batch_size', 'ncorrect', 'duration', 'batch_duration']].groupby('epoch').sum() + tmp['accuracy'] = tmp['ncorrect'] / tmp['batch_size'] + tmp['throughput'] = tmp['batch_size'] / tmp['duration'] + tmp['_throughput'] = tmp['batch_size'] / tmp['batch_duration'] + tmp['elapsed'] = df.groupby('epoch')['elapsed'].agg('max') + tmp.reset_index(inplace=True) + + return tmp + + +def load_file(file, start_timestamp=None): + df = pd.read_csv(file) + df['timestamp'] = pd.to_datetime(df['timestamp']) + df['batch_duration'] = pd.to_timedelta(df['batch_duration']) + df['ncorrect'] = df['top1_correct'] + start_timestamp = start_timestamp or df['timestamp'].iloc[0] + df['elapsed'] = df['timestamp'] - start_timestamp + df['batch_accuracy'] = df['ncorrect'] / df['batch_size'] + return df + + +def load_data(directory, verbose=True): + train_file = os.path.join(directory, 'train_results.csv') + train = load_file(train_file) + start_timestamp = train['timestamp'].iloc[0] + + if verbose: + print(train_file) + print("Training results shape: {}".format(train.shape)) + + try: + test_file = os.path.join(directory, 'test_results.csv') + test = load_file(test_file, start_timestamp=start_timestamp) + except FileNotFoundError: + test_file = os.path.join(directory, 'valid_results.csv') + test = load_file(test_file, start_timestamp=start_timestamp) + + if verbose: + print(test_file) + print('Test results shape: {}'.format(test.shape)) + + train['mode'] = 'train' + test['mode'] = 'test' + + combined = pd.concat([train, test], ignore_index=True).sort_values(by=['timestamp']) + combined['prev_timestamp'] = combined['timestamp'].shift(1) + combined.loc[0, 'prev_timestamp'] = combined.loc[0, 'timestamp'] - combined.loc[0, 'batch_duration'] + train = combined[combined['mode'] == 'train'].copy() + test = combined[combined['mode'] == 'test'].copy() + + return single_run_acc(train), single_run_acc(test) + + +def load_multiple(directory, timestamps=None, verbose=False): + timestamps = timestamps or os.listdir(directory) + train_sets = [] + test_sets = [] + for timestamp in sorted(timestamps): + _dir = os.path.join(directory, timestamp) + train, test = load_data(_dir, verbose=verbose) + if verbose: + print() + train['run'] = _dir + test['run'] = _dir + train['job_start'] = timestamp + test['job_start'] = timestamp + train_sets.append(train) + test_sets.append(test) + + return pd.concat(train_sets), pd.concat(test_sets) + + +def load_multiple_models(directory, verbose=False): + paths = os.listdir(directory) + models = [path for path in paths if path in MODELS] + + train_sets = [] + test_sets = [] + for model in sorted(models): + if verbose: + print(f"Loading {model}") + _dir = os.path.join(directory, model) + train, test = load_multiple(_dir, 
verbose=verbose) + train['model'] = model + train['nparameters'] = MODEL_SIZES[model] + test['model'] = model + test['nparameters'] = MODEL_SIZES[model] + + train_sets.append(train) + test_sets.append(test) + + return pd.concat(train_sets), pd.concat(test_sets) + + +def concat_update(existing, other, repeat=False): + for key in other.keys(): + if key in existing: + if existing[key] != other[key] or repeat: + current = existing[key] + if isinstance(current, list): + current.append(other[key]) + else: + existing[key] = [current, other[key]] + else: + existing[key] = other[key] + + +def run_config(run, repeat=False): + full = {} + configs = (os.path.join(run, entry.name) for entry in os.scandir(run) if 'config' in entry.name) + + for config in sorted(configs): + with open(config) as file: + tmp = json.load(file) + + tmp['path'] = config + concat_update(full, tmp, repeat=repeat) + return full + + +def search_configs(criteria, configs): + matches = [] + for run, config in configs.items(): + is_match = True + for key, value in criteria.items(): + try: + config_value = config[key] + if config_value != value: + is_match = False + except KeyError: + is_match = False + + if is_match: + matches.append(run) + + return matches diff --git a/pytorch/CIFAR10/benchmark/cifar10/train.py b/pytorch/CIFAR10/benchmark/cifar10/train.py new file mode 100644 index 0000000..31e7bc9 --- /dev/null +++ b/pytorch/CIFAR10/benchmark/cifar10/train.py @@ -0,0 +1,374 @@ +import os +from datetime import datetime +from collections import OrderedDict + +import click +import torch +import tqdm +import numpy as np +from torch import nn, optim +from torch.autograd import Variable +from torch.utils.data.sampler import SubsetRandomSampler +from torchvision import transforms +from torchvision import datasets + +from benchmark import utils +from benchmark.yellowfin import YFOptimizer +from benchmark.cifar10.models import resnet, densenet + +MEAN = (0.4914, 0.4822, 0.4465) +STD = (0.2023, 0.1994, 0.2010) + +MODELS = { + # "Deep Residual Learning for Image Recognition" + 'resnet20': resnet.ResNet20, + 'resnet32': resnet.ResNet32, + 'resnet44': resnet.ResNet44, + 'resnet56': resnet.ResNet56, + 'resnet110': resnet.ResNet110, + 'resnet1202': resnet.ResNet1202, + + # "Wide Residual Networks" + 'wrn-40-4': resnet.WRN_40_4, + 'wrn-16-8': resnet.WRN_16_8, + 'wrn-28-10': resnet.WRN_28_10, + + # Based on "Identity Mappings in Deep Residual Networks" + 'preact8': resnet.PreActResNet8, + 'preact14': resnet.PreActResNet14, + 'preact20': resnet.PreActResNet20, + 'preact56': resnet.PreActResNet56, + 'preact164-basic': resnet.PreActResNet164Basic, + + # "Identity Mappings in Deep Residual Networks" + 'preact110': resnet.PreActResNet110, + 'preact164': resnet.PreActResNet164, + 'preact1001': resnet.PreActResNet1001, + + # Based on "Deep Networks with Stochastic Depth" + 'stochastic56': resnet.StochasticResNet56, + 'stochastic56-08': resnet.StochasticResNet56_08, + 'stochastic110': resnet.StochasticResNet110, + 'stochastic1202': resnet.StochasticResNet1202, + + # "Aggregated Residual Transformations for Deep Neural Networks" + 'resnext29-8-64': lambda _=None: resnet.ResNeXt29(8, 64), + 'resnext29-16-64': lambda _=None: resnet.ResNeXt29(16, 64), + + # "Densely Connected Convolutional Networks" + 'densenetbc100': densenet.DenseNetBC100, + 'densenetbc250': densenet.DenseNetBC250, + 'densenetbc190': densenet.DenseNetBC190, + + # Kuangliu/pytorch-cifar + 'resnet18': resnet.ResNet18, + 'resnet50': resnet.ResNet50, + 'resnet101': resnet.ResNet101, + 
'resnet152': resnet.ResNet152, +} + + +def correct(outputs, targets, top=(1, )): + _, predictions = outputs.topk(max(top), dim=1, largest=True, sorted=True) + targets = targets.view(-1, 1).expand_as(predictions) + + corrects = predictions.eq(targets).cpu().int().cumsum(1).sum(0) + tops = list(map(lambda k: corrects.data[0][k - 1], top)) + return tops + + +def run(epoch, model, loader, criterion=None, optimizer=None, top=(1, 5), + use_cuda=False, tracking=None, train=True, half=False): + accuracies = [utils.AverageMeter() for _ in top] + + assert criterion is not None or not train, 'Need criterion to train model' + assert optimizer is not None or not train, 'Need optimizer to train model' + loader = tqdm.tqdm(loader) + if train: + model.train() + losses = utils.AverageMeter() + else: + model.eval() + + start = datetime.now() + for batch_index, (inputs, targets) in enumerate(loader): + inputs = Variable(inputs, requires_grad=False, volatile=not train) + targets = Variable(targets, requires_grad=False, volatile=not train) + batch_size = targets.size(0) + assert batch_size < 2**32, 'Size is too large! correct will overflow' + + if use_cuda: + inputs = inputs.cuda() + targets = targets.cuda() + if half: + inputs = inputs.half() + + outputs = model(inputs) + + if train: + loss = criterion(outputs, targets) + optimizer.zero_grad() + loss.backward() + optimizer.step() + losses.update(loss.data[0], batch_size) + + _, predictions = torch.max(outputs.data, 1) + top_correct = correct(outputs, targets, top=top) + for i, count in enumerate(top_correct): + accuracies[i].update(count * (100. / batch_size), batch_size) + + end = datetime.now() + if tracking is not None: + result = OrderedDict() + result['timestamp'] = datetime.now() + result['batch_duration'] = end - start + result['epoch'] = epoch + result['batch'] = batch_index + result['batch_size'] = batch_size + for i, k in enumerate(top): + result['top{}_correct'.format(k)] = top_correct[i] + result['top{}_accuracy'.format(k)] = accuracies[i].val + if train: + result['loss'] = loss.data[0] + utils.save_result(result, tracking) + + desc = 'Epoch {} {}'.format(epoch, '(Train):' if train else '(Val): ') + if train: + desc += ' Loss {loss.val:.4f} ({loss.avg:.4f})'.format(loss=losses) + for k, acc in zip(top, accuracies): + desc += ' Prec@{} {acc.val:.3f} ({acc.avg:.3f})'.format(k, acc=acc) + loader.set_description(desc) + start = datetime.now() + + if train: + message = 'Training accuracy of' + else: + message = 'Validation accuracy of' + for i, k in enumerate(top): + message += ' top-{}: {}'.format(k, accuracies[i].avg) + print(message) + return accuracies[0].avg + + +@click.command() +@click.option('--dataset-dir', default='./data/cifar10') +@click.option('--checkpoint', '-c', type=click.Choice(['best', 'all', 'last']), + default='last') +@click.option('--restore', '-r') +@click.option('--tracking/--no-tracking', default=True) +@click.option('--cuda/--no-cuda', default=True) +@click.option('--epochs', '-e', default=200) +@click.option('--batch-size', '-b', default=32) +@click.option('--learning-rate', '-l', default=1e-3) +@click.option('--lr-factor', default=1.0, help='only for yellowfin') +@click.option('--momentum', default=0.9) +@click.option('--optimizer', '-o', type=click.Choice(['sgd', 'adam', 'yellowfin']), + default='sgd') +@click.option('--augmentation/--no-augmentation', default=True) +@click.option('device_ids', '--device', '-d', multiple=True, type=int) +@click.option('--num-workers', type=int) +@click.option('--weight-decay', 
default=5e-4) +@click.option('--validation', '-v', default=0.0) +@click.option('--evaluate', is_flag=True) +@click.option('--shuffle/--no-shuffle', default=True) +@click.option('--half', is_flag=True) +@click.option('--arch', '-a', type=click.Choice(MODELS.keys()), + default='resnet20') +def train(dataset_dir, checkpoint, restore, tracking, cuda, epochs, + batch_size, learning_rate, lr_factor, momentum, optimizer, augmentation, + device_ids, num_workers, weight_decay, validation, evaluate, shuffle, + half, arch): + timestamp = "{:.0f}".format(datetime.utcnow().timestamp()) + local_timestamp = str(datetime.now()) + config = {k: v for k, v in locals().items()} + + use_cuda = cuda and torch.cuda.is_available() + + # create model + model = MODELS[arch]() + + # create optimizer + if optimizer == 'adam': + optimizer = optim.Adam(model.parameters(), lr=learning_rate) + elif optimizer == 'sgd': + optimizer = optim.SGD(model.parameters(), lr=learning_rate, + momentum=momentum, + weight_decay=weight_decay) + elif optimizer == 'yellowfin': + optimizer = YFOptimizer(model.parameters(), lr=learning_rate, + mu=momentum, weight_decay=weight_decay) + + else: + raise NotImplementedError("Unknown optimizer: {}".format(optimizer)) + + if restore is not None: + if restore == 'latest': + restore = utils.latest_file(arch) + print(f'Restoring model from {restore}') + assert os.path.exists(restore) + restored_state = torch.load(restore) + assert restored_state['arch'] == arch + + model.load_state_dict(restored_state['model']) + optimizer.load_state_dict(restored_state['optimizer']) + if not isinstance(optimizer, YFOptimizer): + for group in optimizer.param_groups: + group['lr'] = learning_rate + + best_accuracy = restored_state['accuracy'] + start_epoch = restored_state['epoch'] + 1 + run_dir = os.path.split(restore)[0] + else: + best_accuracy = 0.0 + start_epoch = 1 + run_dir = f"./run/{arch}/{timestamp}" + + print('Starting accuracy is {}'.format(best_accuracy)) + + if not os.path.exists(run_dir): + os.makedirs(run_dir) + utils.save_config(config, run_dir) + + print(model) + print("{} parameters".format(utils.count_parameters(model))) + print(f"Run directory set to {run_dir}") + + # Save model text description + with open(os.path.join(run_dir, 'model.txt'), 'w') as file: + file.write(str(model)) + + if tracking: + train_results_file = os.path.join(run_dir, 'train_results.csv') + valid_results_file = os.path.join(run_dir, 'valid_results.csv') + test_results_file = os.path.join(run_dir, 'test_results.csv') + else: + train_results_file = None + valid_results_file = None + test_results_file = None + + # create loss + criterion = nn.CrossEntropyLoss() + + if use_cuda: + print('Copying model to GPU') + model = model.cuda() + criterion = criterion.cuda() + + if half: + model = model.half() + criterion = criterion.half() + device_ids = device_ids or list(range(torch.cuda.device_count())) + model = torch.nn.DataParallel( + model, device_ids=device_ids) + num_workers = num_workers or len(device_ids) + else: + num_workers = num_workers or 1 + if half: + print('Half precision (16-bit floating point) only works on GPU') + print(f"using {num_workers} workers for data loading") + + # load data + print("Preparing data:") + transform_test = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(MEAN, STD), + ]) + + test_loader = torch.utils.data.DataLoader( + datasets.CIFAR10(root=dataset_dir, train=False, download=True, + transform=transform_test), + batch_size=batch_size, shuffle=False, num_workers=num_workers, 
+ pin_memory=use_cuda) + + if evaluate: + print("Only running evaluation of model on test dataset") + run(start_epoch - 1, model, test_loader, use_cuda=use_cuda, + tracking=test_results_file, train=False) + return + + if augmentation: + transform_train = [ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip() + ] + else: + transform_train = [] + + transform_train = transforms.Compose(transform_train + [ + transforms.ToTensor(), + transforms.Normalize(MEAN, STD), + ]) + + train_dataset = datasets.CIFAR10(root=dataset_dir, train=True, + download=True, transform=transform_train) + + num_train = len(train_dataset) + indices = list(range(num_train)) + assert 1 > validation and validation >= 0, "Validation must be in [0, 1)" + split = num_train - int(validation * num_train) + + if shuffle: + np.random.shuffle(indices) + + train_indices = indices[:split] + valid_indices = indices[split:] + + print('Using {} examples for training'.format(len(train_indices))) + print('Using {} examples for validation'.format(len(valid_indices))) + + train_sampler = SubsetRandomSampler(train_indices) + valid_sampler = SubsetRandomSampler(valid_indices) + + train_loader = torch.utils.data.DataLoader( + train_dataset, sampler=train_sampler, batch_size=batch_size, + num_workers=num_workers, pin_memory=use_cuda) + if validation != 0: + valid_loader = torch.utils.data.DataLoader( + train_dataset, sampler=valid_sampler, batch_size=batch_size, + num_workers=num_workers, pin_memory=use_cuda) + else: + print('Using test dataset for validation') + valid_loader = test_loader + + end_epoch = start_epoch + epochs + # YellowFin doesn't have param_groups causing AttributeError + if not isinstance(optimizer, YFOptimizer): + for group in optimizer.param_groups: + if 'lr' in group: + print('Learning rate set to {}'.format(group['lr'])) + assert group['lr'] == learning_rate + else: + print(f"set lr_factor to {lr_factor}") + optimizer.set_lr_factor(lr_factor) + for epoch in range(start_epoch, end_epoch): + run(epoch, model, train_loader, criterion, optimizer, + use_cuda=use_cuda, tracking=train_results_file, train=True, + half=half) + + valid_acc = run(epoch, model, valid_loader, use_cuda=use_cuda, + tracking=valid_results_file, train=False, half=half) + + is_best = valid_acc > best_accuracy + last_epoch = epoch == (end_epoch - 1) + if is_best or checkpoint == 'all' or (checkpoint == 'last' and last_epoch): + state = { + 'epoch': epoch, + 'arch': arch, + 'model': (model.module if use_cuda else model).state_dict(), + 'accuracy': valid_acc, + 'optimizer': optimizer.state_dict() + } + if is_best: + print('New best model!') + filename = os.path.join(run_dir, 'checkpoint_best_model.t7') + print(f'Saving checkpoint to {filename}') + best_accuracy = valid_acc + torch.save(state, filename) + if checkpoint == 'all' or (checkpoint == 'last' and last_epoch): + filename = os.path.join(run_dir, f'checkpoint_{epoch}.t7') + print(f'Saving checkpoint to {filename}') + torch.save(state, filename) + + +if __name__ == '__main__': + train() diff --git a/pytorch/CIFAR10/benchmark/imagenet/__main__.py b/pytorch/CIFAR10/benchmark/imagenet/__main__.py new file mode 100644 index 0000000..f546ab2 --- /dev/null +++ b/pytorch/CIFAR10/benchmark/imagenet/__main__.py @@ -0,0 +1,14 @@ +import click + +from benchmark.imagenet.train import train + + +@click.group() +def cli(): + pass + + +cli.add_command(train, name='train') + +if __name__ == '__main__': + cli() diff --git a/pytorch/CIFAR10/benchmark/imagenet/train.py 
b/pytorch/CIFAR10/benchmark/imagenet/train.py new file mode 100644 index 0000000..5f49a23 --- /dev/null +++ b/pytorch/CIFAR10/benchmark/imagenet/train.py @@ -0,0 +1,339 @@ +import os +import time +from datetime import datetime +from collections import OrderedDict + +import click +import tqdm +import torch +import torch.nn as nn +import torch.nn.parallel +import torch.backends.cudnn as cudnn +import torch.optim +import torch.utils.data +import torchvision.transforms as transforms +import torchvision.datasets as datasets +import torchvision.models as models + +from benchmark import utils + +model_names = sorted(name for name in models.__dict__ + if name.islower() and not name.startswith("__") + and callable(models.__dict__[name])) + + +@click.command() +@click.option('--dataset-dir', default='./data/imagenet') +@click.option('--checkpoint', '-c', type=click.Choice(['best', 'all', 'last']), + default='last') +@click.option('--restore', '-r') +@click.option('--tracking/--no-tracking', default=True) +@click.option('--cuda/--no-cuda', default=True) +@click.option('--epochs', '-e', default=90) +@click.option('--batch-size', '-b', default=256) +@click.option('--learning-rate', '-l', default=0.1) +@click.option('--learning-rate-decay', default=0.1) +@click.option('--learning-rate-freq', default=30) +@click.option('--momentum', default=0.9) +@click.option('--optimizer', '-o', type=click.Choice(['sgd', 'adam']), + default='sgd') +@click.option('--augmentation/--no-augmentation', default=True) +@click.option('--pretrained', is_flag=True) +@click.option('--evaluate', is_flag=True) +@click.option('--num-workers', type=int) +@click.option('--weight-decay', default=1e-4) +@click.option('--arch', '-a', type=click.Choice(model_names), + default='resnet18') +def train(dataset_dir, checkpoint, restore, tracking, cuda, epochs, + batch_size, learning_rate, learning_rate_decay, + learning_rate_freq, momentum, optimizer, augmentation, + pretrained, evaluate, num_workers, weight_decay, arch): + timestamp = "{:.0f}".format(datetime.utcnow().timestamp()) + config = {k: v for k, v in locals().items()} + + use_cuda = cuda and torch.cuda.is_available() + + # create model + if pretrained: + print("=> using pre-trained model '{}'".format(arch)) + model = models.__dict__[arch](pretrained=True) + else: + print("=> creating model '{}'".format(arch)) + model = models.__dict__[arch]() + + if optimizer == 'adam': + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) + elif optimizer == 'sgd': + optimizer = torch.optim.SGD(model.parameters(), learning_rate, + momentum=momentum, + weight_decay=weight_decay) + else: + raise NotImplementedError("Unknown optimizer: {}".format(optimizer)) + + # optionally resume from a checkpoint + if restore is not None: + if restore == 'latest': + restore = utils.latest_file(arch) + print(f'=> restoring model from {restore}') + restored_state = torch.load(restore) + start_epoch = restored_state['epoch'] + 1 + best_prec1 = restored_state['prec1'] + model.load_state_dict(restored_state['state_dict']) + optimizer.load_state_dict(restored_state['optimizer']) + print('=> starting accuracy is {} (epoch {})' + .format(best_prec1, start_epoch)) + run_dir = os.path.split(restore)[0] + else: + best_prec1 = 0.0 + start_epoch = 1 + run_dir = f"./run/{arch}/{timestamp}" + + if not os.path.exists(run_dir): + os.makedirs(run_dir) + utils.save_config(config, run_dir) + + print(model) + print("{} parameters".format(utils.count_parameters(model))) + print(f"Run directory set to {run_dir}") + + # save 
model text description + with open(os.path.join(run_dir, 'model.txt'), 'w') as file: + file.write(str(model)) + + if tracking: + train_results_file = os.path.join(run_dir, 'train_results.csv') + test_results_file = os.path.join(run_dir, 'test_results.csv') + else: + train_results_file = None + test_results_file = None + + # define loss function (criterion) and optimizer + criterion = nn.CrossEntropyLoss() + + # move model and criterion to GPU + if use_cuda: + model.cuda() + criterion = criterion.cuda() + model = torch.nn.parallel.DataParallel(model) + num_workers = num_workers or torch.cuda.device_count() + else: + num_workers = num_workers or 1 + print(f"=> using {num_workers} workers for data loading") + + cudnn.benchmark = True + + # Data loading code + print("=> preparing data:") + traindir = os.path.join(dataset_dir, 'train') + valdir = os.path.join(dataset_dir, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + train_sampler = None + train_loader = torch.utils.data.DataLoader( + datasets.ImageFolder(traindir, transforms.Compose([ + transforms.RandomSizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])), + batch_size=batch_size, shuffle=(train_sampler is None), + num_workers=num_workers, pin_memory=True, sampler=train_sampler) + + val_loader = torch.utils.data.DataLoader( + datasets.ImageFolder(valdir, transforms.Compose([ + transforms.Scale(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])), + batch_size=batch_size, shuffle=False, + num_workers=num_workers, pin_memory=True) + + if evaluate: + validate(val_loader, model, criterion) + return + + end_epoch = start_epoch + epochs + for epoch in range(start_epoch, end_epoch): + print('Epoch {} of {}'.format(epoch, end_epoch - 1)) + adjust_learning_rate(optimizer, epoch, learning_rate, + decay=learning_rate_decay, + freq=learning_rate_freq) + + # train for one epoch + _ = train_one_epoch( + train_loader, model, criterion, optimizer, epoch, + tracking=train_results_file) + + # evaluate on validation set + prec1, _ = validate( + val_loader, model, criterion, epoch, tracking=test_results_file) + + # remember best prec@1 and save checkpoint + is_best = prec1 > best_prec1 + last_epoch = epoch == (end_epoch - 1) + if is_best or checkpoint == 'all' or (checkpoint == 'last' and last_epoch): + state = { + 'epoch': epoch, + 'arch': arch, + 'state_dict': (model.module if use_cuda else model).state_dict(), + 'prec1': prec1, + 'optimizer': optimizer.state_dict(), + } + if is_best: + print('New best model!') + filename = os.path.join(run_dir, 'checkpoint_best_model.t7') + print(f'=> saving checkpoint to {filename}') + torch.save(state, filename) + best_prec1 = prec1 + if checkpoint == 'all' or (checkpoint == 'last' and last_epoch): + filename = os.path.join(run_dir, f'checkpoint_{epoch}.t7') + print(f'=> saving checkpoint to {filename}') + torch.save(state, filename) + + +def train_one_epoch(train_loader, model, criterion, optimizer, epoch, + tracking=None): + train_loader = tqdm.tqdm(train_loader) + batch_time = utils.AverageMeter() + data_time = utils.AverageMeter() + losses = utils.AverageMeter() + top1 = utils.AverageMeter() + top5 = utils.AverageMeter() + + # switch to train mode + model.train() + + end = time.time() + for i, (input, target) in enumerate(train_loader): + # measure data loading time + data_time.update(time.time() - end) + + target = target.cuda(async=True) + input_var = torch.autograd.Variable(input) + 
target_var = torch.autograd.Variable(target) + + # compute output + output = model(input_var) + loss = criterion(output, target_var) + + # measure accuracy and record loss + prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) + losses.update(loss.data[0], input.size(0)) + top1.update(prec1[0], input.size(0)) + top5.update(prec5[0], input.size(0)) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # measure elapsed time + batch_time.update(time.time() - end) + if tracking is not None: + result = OrderedDict() + result['timestamp'] = datetime.now() + result['batch_duration'] = batch_time.val + result['epoch'] = epoch + result['batch'] = i + result['batch_size'] = input.size(0) + result['top1_accuracy'] = prec1[0] + result['top5_accuracy'] = prec5[0] + result['loss'] = loss.data[0] + result['data_duration'] = data_time.val + utils.save_result(result, tracking) + + desc = ('Epoch {0} (Train):' + ' Loss {loss.val:.4f} ({loss.avg:.4f})' + ' Prec@1 {top1.val:.3f} ({top1.avg:.3f})' + ' Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( + epoch, i, len(train_loader), batch_time=batch_time, + data_time=data_time, loss=losses, top1=top1, top5=top5)) + train_loader.set_description(desc) + + end = time.time() + + return top1.avg, top5.avg + + +def validate(val_loader, model, criterion, epoch, tracking=None): + val_loader = tqdm.tqdm(val_loader) + batch_time = utils.AverageMeter() + losses = utils.AverageMeter() + top1 = utils.AverageMeter() + top5 = utils.AverageMeter() + + # switch to evaluate mode + model.eval() + + end = time.time() + for i, (input, target) in enumerate(val_loader): + target = target.cuda(async=True) + input_var = torch.autograd.Variable(input, volatile=True) + target_var = torch.autograd.Variable(target, volatile=True) + + # compute output + output = model(input_var) + loss = criterion(output, target_var) + + # measure accuracy and record loss + prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) + losses.update(loss.data[0], input.size(0)) + top1.update(prec1[0], input.size(0)) + top5.update(prec5[0], input.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + if tracking is not None: + result = OrderedDict() + result['timestamp'] = datetime.now() + result['batch_duration'] = batch_time.val + result['epoch'] = epoch + result['batch'] = i + result['batch_size'] = input.size(0) + result['top1_accuracy'] = prec1[0] + result['top5_accuracy'] = prec5[0] + result['loss'] = loss.data[0] + utils.save_result(result, tracking) + + desc = ('Epoch {0} (Val): ' + ' Loss {loss.val:.4f} ({loss.avg:.4f})' + ' Prec@1 {top1.val:.3f} ({top1.avg:.3f})' + ' Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( + epoch, i, len(val_loader), batch_time=batch_time, + loss=losses, top1=top1, top5=top5)) + val_loader.set_description(desc) + end = time.time() + + print("Evaluation: Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}" + .format(top1=top1, top5=top5)) + return top1.avg, top5.avg + + +def adjust_learning_rate(optimizer, epoch, initial_learning_rate, decay, freq): + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" + lr = initial_learning_rate * (decay ** ((epoch - 1) // freq)) + print(f'=> learning rate is set to {lr}') + for param_group in optimizer.param_groups: + param_group['lr'] = lr + + +def accuracy(output, target, topk=(1,)): + """Computes the precision@k for the specified values of k""" + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = 
pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ == '__main__': + train() diff --git a/pytorch/CIFAR10/benchmark/train.py b/pytorch/CIFAR10/benchmark/train.py deleted file mode 100644 index 78cdd8a..0000000 --- a/pytorch/CIFAR10/benchmark/train.py +++ /dev/null @@ -1,336 +0,0 @@ -import os -import re -import json -from functools import reduce -from datetime import datetime -from collections import OrderedDict - -import click -import torch -import progressbar -from torch import nn, optim -from torch.autograd import Variable -from torchvision import transforms -from torchvision import datasets as dset - -from benchmark.models import resnet, densenet - -MEAN = (0.4914, 0.4822, 0.4465) -STD = (0.2023, 0.1994, 0.2010) - -MODELS = { - # "Deep Residual Learning for Image Recognition" - 'resnet20': resnet.ResNet20, - 'resnet32': resnet.ResNet32, - 'resnet44': resnet.ResNet44, - 'resnet56': resnet.ResNet56, - 'resnet110': resnet.ResNet110, - 'resnet1202': resnet.ResNet1202, - - # "Wide Residual Networks" - 'wrn-40-4': resnet.WRN_40_4, - 'wrn-16-8': resnet.WRN_16_8, - 'wrn-28-10': resnet.WRN_28_10, - - # Based on "Identity Mappings in Deep Residual Networks" - 'preact20': resnet.PreActResNet20, - 'preact56': resnet.PreActResNet56, - 'preact164-basic': resnet.PreActResNet164Basic, - - # "Identity Mappings in Deep Residual Networks" - 'preact110': resnet.PreActResNet110, - 'preact164': resnet.PreActResNet164, - 'preact1001': resnet.PreActResNet1001, - - # "Aggregated Residual Transformations for Deep Neural Networks" - 'resnext29-8-64': lambda _=None: resnet.ResNeXt29(8, 64), - 'resnext29-16-64': lambda _=None: resnet.ResNeXt29(16, 64), - - # "Densely Connected Convolutional Networks" - 'densenetbc100': densenet.DenseNetBC100, - 'densenetbc250': densenet.DenseNetBC250, - 'densenetbc190': densenet.DenseNetBC190, - - # Kuangliu/pytorch-cifar - 'resnet18': resnet.ResNet18, - 'resnet50': resnet.ResNet50, - 'resnet101': resnet.ResNet101, - 'resnet152': resnet.ResNet152, -} - - -def count_parameters(model): - c = map(lambda p: reduce(lambda x, y: x * y, p.size()), model.parameters()) - return sum(c) - - -def correct(outputs, targets, top=(1, )): - _, predictions = outputs.topk(max(top), dim=1, largest=True, sorted=True) - targets = targets.view(-1, 1).expand_as(predictions) - corrects = predictions.eq(targets).cpu().cumsum(1).sum(0) - tops = list(map(lambda k: corrects.data[0][k - 1], top)) - return tops - - -def save_result(result, path): - write_heading = not os.path.exists(path) - with open(path, mode='a') as out: - if write_heading: - out.write(",".join([str(k) for k, v in result.items()]) + '\n') - out.write(",".join([str(v) for k, v in result.items()]) + '\n') - - -def run(epoch, model, loader, criterion=None, optimizer=None, top=(1, 5), - use_cuda=False, tracking=None, max_value=None, train=True): - - assert criterion is not None or not train, 'Need criterion to train model' - assert optimizer is not None or not train, 'Need optimizer to train model' - max_value = max_value or progressbar.UnknownLength - bar = progressbar.ProgressBar(max_value=max_value) - total = 0 - correct_counts = {} - if train: - model.train() - else: - model.eval() - - start = datetime.now() - for batch_index, (inputs, targets) in enumerate(loader): - inputs = Variable(inputs, requires_grad=False, volatile=not train) - targets = 
Variable(targets, requires_grad=False, volatile=not train) - - if use_cuda: - inputs = inputs.cuda() - targets = targets.cuda() - - outputs = model(inputs) - - if train: - loss = criterion(outputs, targets) - optimizer.zero_grad() - loss.backward() - optimizer.step() - - _, predictions = torch.max(outputs.data, 1) - batch_size = targets.size(0) - top_correct = correct(outputs, targets, top=top) - total += batch_size - for k, count in zip(top, top_correct): - correct_counts[k] = correct_counts.get(k, 0) + count - - end = datetime.now() - if tracking is not None: - result = OrderedDict() - result['timestamp'] = datetime.now() - result['batch_duration'] = end - start - result['epoch'] = epoch - result['batch'] = batch_index - result['batch_size'] = batch_size - for i, k in enumerate(top): - result['top{}_correct'.format(k)] = top_correct[i] - if train: - result['loss'] = loss.data[0] - save_result(result, tracking) - - bar.update(batch_index + 1) - start = datetime.now() - - print() - if train: - message = 'Training accuracy of' - else: - message = 'Test accuracy of' - for k in top: - accuracy = correct_counts[k] / total - message += ' top-{}: {}'.format(k, accuracy) - print(message) - return (1. * correct_counts[top[0]]) / total, batch_index + 1 - - -def save(model, directory, epoch, accuracy, use_cuda=False, filename=None): - state = { - 'model': model.module if use_cuda else model, - 'epoch': epoch, - 'accuracy': accuracy - } - - filename = filename or 'checkpoint_{}.t7'.format(epoch) - torch.save(state, os.path.join(directory, filename)) - - -def save_config(config, run_dir): - path = os.path.join(run_dir, "config_{}.json".format(config['timestamp'])) - with open(path, 'w') as config_file: - json.dump(config, config_file) - config_file.write('\n') - - -def load(path): - assert os.path.exists(path) - state = torch.load(path) - model = state['model'] - epoch = state['epoch'] - accuracy = state['accuracy'] - return model, epoch, accuracy - - -def latest_file(model): - restore = f'./run/{model}' - timestamps = sorted(os.listdir(restore)) - assert len(timestamps) > 0 - run_dir = os.path.join(restore, timestamps[-1]) - files = os.listdir(run_dir) - max_checkpoint = -1 - for filename in files: - if re.search('checkpoint_\d+.t7', filename): - num = int(re.search('\d+', filename).group()) - - if num > max_checkpoint: - max_checkpoint = num - max_checkpoint_file = filename - - assert max_checkpoint != -1 - return os.path.join(run_dir, max_checkpoint_file) - - -@click.command() -@click.option('--dataset-dir', default='./data/cifar10') -@click.option('--checkpoint', '-c', type=click.Choice(['best', 'all', 'last']), - default='last') -@click.option('--restore', '-r') -@click.option('--tracking/--no-tracking', default=True) -@click.option('--cuda/--no-cuda', default=True) -@click.option('--epochs', '-e', default=200) -@click.option('--batch-size', '-b', default=32) -@click.option('--learning-rate', '-l', default=1e-3) -@click.option('--sgd', 'optimizer', flag_value='sgd') -@click.option('--adam', 'optimizer', flag_value='adam', default=True) -@click.option('--augmentation/--no-augmentation', default=True) -@click.option('--num-workers', type=int) -@click.option('--weight-decay', default=5e-4) -@click.option('--model', '-m', type=click.Choice(MODELS.keys()), - default='resnet20') -def main(dataset_dir, checkpoint, restore, tracking, cuda, epochs, - batch_size, learning_rate, optimizer, augmentation, num_workers, - weight_decay, model): - timestamp = "{:.0f}".format(datetime.utcnow().timestamp()) - 
config = {k: v for k, v in locals().items()} - - use_cuda = cuda and torch.cuda.is_available() - if use_cuda: - num_workers = num_workers or torch.cuda.device_count() - else: - num_workers = num_workers or 1 - - print(f"using {num_workers} workers for data loading") - - print("Preparing data:") - - if augmentation: - transform_train = [ - transforms.RandomCrop(32, padding=4), - transforms.RandomHorizontalFlip() - ] - else: - transform_train = [] - - transform_train = transforms.Compose(transform_train + [ - transforms.ToTensor(), - transforms.Normalize(MEAN, STD), - ]) - - trainset = dset.CIFAR10(root=dataset_dir, train=True, download=True, - transform=transform_train) - train_loader = torch.utils.data.DataLoader( - trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers, - pin_memory=use_cuda) - - transform_test = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize(MEAN, STD), - ]) - - testset = dset.CIFAR10(root=dataset_dir, train=False, download=True, - transform=transform_test) - test_loader = torch.utils.data.DataLoader( - testset, batch_size=batch_size, shuffle=False, num_workers=num_workers, - pin_memory=use_cuda) - - if restore is not None: - if restore == 'latest': - restore = latest_file(model) - print(f'Restoring model from {restore}') - model, start_epoch, best_accuracy = load(restore) - start_epoch += 1 - print('Starting accuracy is {}'.format(best_accuracy)) - run_dir = os.path.split(restore)[0] - else: - print(f'Building {model} model') - best_accuracy = -1 - start_epoch = 1 - run_dir = f"./run/{model}/{timestamp}" - model = MODELS[model]() - - if not os.path.exists(run_dir): - os.makedirs(run_dir) - save_config(config, run_dir) - - print(model) - print("{} parameters".format(count_parameters(model))) - print(f"Run directory set to {run_dir}") - - # Save model text description - with open(os.path.join(run_dir, 'model.txt'), 'w') as file: - file.write(str(model)) - - if tracking: - train_results_file = os.path.join(run_dir, 'train_results.csv') - test_results_file = os.path.join(run_dir, 'test_results.csv') - else: - train_results_file = None - test_results_file = None - - if use_cuda: - print('Copying model to GPU') - model.cuda() - model = torch.nn.DataParallel( - model, device_ids=range(torch.cuda.device_count())) - criterion = nn.CrossEntropyLoss() - - # Other parameters? 
- if optimizer == 'adam': - optimizer = optim.Adam(model.parameters(), lr=learning_rate) - elif optimizer == 'sgd': - optimizer = optim.SGD(model.parameters(), lr=learning_rate, - momentum=0.9, - weight_decay=weight_decay) - else: - raise NotImplementedError("Unknown optimizer: {}".format(optimizer)) - - train_max_value = None - test_max_value = None - end_epoch = start_epoch + epochs - for epoch in range(start_epoch, end_epoch): - print('Epoch {} of {}'.format(epoch, end_epoch - 1)) - train_acc, train_max_value = run(epoch, model, train_loader, criterion, - optimizer, use_cuda=use_cuda, - tracking=train_results_file, - max_value=train_max_value, train=True) - - test_acc, test_max_value = run(epoch, model, test_loader, - use_cuda=use_cuda, - tracking=test_results_file, train=False) - - if test_acc > best_accuracy: - print('New best model!') - save(model, run_dir, epoch, test_acc, use_cuda=use_cuda, - filename='checkpoint_best_model.t7') - best_accuracy = test_acc - - last_epoch = epoch == (end_epoch - 1) - if checkpoint == 'all' or (checkpoint == 'last' and last_epoch): - save(model, run_dir, epoch, test_acc, use_cuda=use_cuda) - - -if __name__ == '__main__': - main() diff --git a/pytorch/CIFAR10/benchmark/utils.py b/pytorch/CIFAR10/benchmark/utils.py new file mode 100644 index 0000000..df8141c --- /dev/null +++ b/pytorch/CIFAR10/benchmark/utils.py @@ -0,0 +1,61 @@ +import os +import json +import re +from functools import reduce + + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def count_parameters(model): + c = map(lambda p: reduce(lambda x, y: x * y, p.size()), model.parameters()) + return sum(c) + + +def latest_file(model): + restore = f'./run/{model}' + timestamps = sorted(os.listdir(restore)) + assert len(timestamps) > 0 + run_dir = os.path.join(restore, timestamps[-1]) + files = os.listdir(run_dir) + max_checkpoint = -1 + for filename in files: + if re.search('checkpoint_\d+.t7', filename): + num = int(re.search('\d+', filename).group()) + + if num > max_checkpoint: + max_checkpoint = num + max_checkpoint_file = filename + + assert max_checkpoint != -1 + return os.path.join(run_dir, max_checkpoint_file) + + +def save_result(result, path): + write_heading = not os.path.exists(path) + with open(path, mode='a') as out: + if write_heading: + out.write(",".join([str(k) for k, v in result.items()]) + '\n') + out.write(",".join([str(v) for k, v in result.items()]) + '\n') + + +def save_config(config, run_dir): + path = os.path.join(run_dir, "config_{}.json".format(config['timestamp'])) + with open(path, 'w') as config_file: + json.dump(config, config_file) + config_file.write('\n') diff --git a/pytorch/CIFAR10/benchmark/yellowfin.py b/pytorch/CIFAR10/benchmark/yellowfin.py new file mode 100644 index 0000000..2f1c732 --- /dev/null +++ b/pytorch/CIFAR10/benchmark/yellowfin.py @@ -0,0 +1,384 @@ + +import math +import numpy as np +import torch + +# eps for numerical stability +eps = 1e-15 + +class YFOptimizer(object): + def __init__(self, var_list, lr=0.1, mu=0.0, clip_thresh=None, weight_decay=0.0, + beta=0.999, curv_win_width=20, zero_debias=True, sparsity_debias=True, delta_mu=0.0, + auto_clip_fac=None, force_non_inc_step=False): + ''' + clip thresh is the threshold value on ||lr * gradient|| + delta_mu can 
be place holder/variable/python scalar. They are used for additional + momentum in situations such as asynchronous-parallel training. The default is 0.0 + for basic usage of the optimizer. + Args: + lr: python scalar. The initial value of learning rate, we use 1.0 in our paper. + mu: python scalar. The initial value of momentum, we use 0.0 in our paper. + clip_thresh: python scalar. The manaully-set clipping threshold for tf.clip_by_global_norm. + if None, the automatic clipping can be carried out. The automatic clipping + feature is parameterized by argument auto_clip_fac. The auto clip feature + can be switched off with auto_clip_fac = None + beta: python scalar. The smoothing parameter for estimations. + sparsity_debias: gradient norm and curvature are biased to larger values when + calculated with sparse gradient. This is useful when the model is very sparse, + e.g. LSTM with word embedding. For non-sparse CNN, turning it off could slightly + accelerate the speed. + delta_mu: for extensions. Not necessary in the basic use. + force_non_inc_step: in some very rare cases, it is necessary to force ||lr * gradient|| + to be not increasing dramatically for stableness after some iterations. + In practice, if turned on, we enforce lr * sqrt(smoothed ||grad||^2) + to be less than 2x of the minimal value of historical value on smoothed || lr * grad ||. + This feature is turned off by default. + Other features: + If you want to manually control the learning rates, self.lr_factor is + an interface to the outside, it is an multiplier for the internal learning rate + in YellowFin. It is helpful when you want to do additional hand tuning + or some decaying scheme to the tuned learning rate in YellowFin. + Example on using lr_factor can be found here: + https://github.com/JianGoForIt/YellowFin_Pytorch/blob/master/pytorch-cifar/main.py#L109 + ''' + self._lr = lr + self._mu = mu + # we convert var_list from generator to list so that + # it can be used for multiple times + self._var_list = list(var_list) + self._clip_thresh = clip_thresh + self._auto_clip_fac = auto_clip_fac + self._beta = beta + self._curv_win_width = curv_win_width + self._zero_debias = zero_debias + self._sparsity_debias = sparsity_debias + self._force_non_inc_step = force_non_inc_step + self._optimizer = torch.optim.SGD(self._var_list, lr=self._lr, + momentum=self._mu, weight_decay=weight_decay) + self._iter = 0 + # global states are the statistics + self._global_state = {} + + # for decaying learning rate and etc. 
+ self._lr_factor = 1.0 + + + def state_dict(self): + # for checkpoint saving + sgd_state_dict = self._optimizer.state_dict() + global_state = self._global_state + lr_factor = self._lr_factor + iter = self._iter + lr = self._lr + mu = self._mu + clip_thresh = self._clip_thresh + beta = self._beta + curv_win_width = self._curv_win_width + zero_debias = self._zero_debias + h_min = self._h_min + h_max = self._h_max + + return { + "sgd_state_dict": sgd_state_dict, + "global_state": global_state, + "lr_factor": lr_factor, + "iter": iter, + "lr": lr, + "mu": mu, + "clip_thresh": clip_thresh, + "beta": beta, + "curv_win_width": curv_win_width, + "zero_debias": zero_debias, + "h_min": h_min, + "h_max": h_max + } + + + def load_state_dict(self, state_dict): + # for checkpoint saving + self._optimizer.load_state_dict(state_dict['sgd_state_dict']) + self._global_state = state_dict['global_state'] + self._lr_factor = state_dict['lr_factor'] + self._iter = state_dict['iter'] + self._lr = state_dict['lr'] + self._mu = state_dict['mu'] + self._clip_thresh = state_dict['clip_thresh'] + self._beta = state_dict['beta'] + self._curv_win_width = state_dict['curv_win_width'] + self._zero_debias = state_dict['zero_debias'] + self._h_min = state_dict["h_min"] + self._h_max = state_dict["h_max"] + return + + + def set_lr_factor(self, factor): + self._lr_factor = factor + return + + + def get_lr_factor(self): + return self._lr_factor + + + def zero_grad(self): + self._optimizer.zero_grad() + return + + + def zero_debias_factor(self): + return 1.0 - self._beta ** (self._iter + 1) + + + def zero_debias_factor_delay(self, delay): + # for exponentially averaged stat which starts at non-zero iter + return 1.0 - self._beta ** (self._iter - delay + 1) + + + def curvature_range(self): + global_state = self._global_state + if self._iter == 0: + global_state["curv_win"] = torch.FloatTensor(self._curv_win_width, 1).zero_() + curv_win = global_state["curv_win"] + grad_norm_squared = self._global_state["grad_norm_squared"] + curv_win[self._iter % self._curv_win_width] = np.log(grad_norm_squared + eps) + valid_end = min(self._curv_win_width, self._iter + 1) + # we use running average over log scale, accelerating + # h_max / min in the begining to follow the varying trend of curvature. 
+ beta = self._beta + if self._iter == 0: + global_state["h_min_avg"] = 0.0 + global_state["h_max_avg"] = 0.0 + self._h_min = 0.0 + self._h_max = 0.0 + global_state["h_min_avg"] = \ + global_state["h_min_avg"] * beta + (1 - beta) * torch.min(curv_win[:valid_end] ) + global_state["h_max_avg"] = \ + global_state["h_max_avg"] * beta + (1 - beta) * torch.max(curv_win[:valid_end] ) + if self._zero_debias: + debias_factor = self.zero_debias_factor() + self._h_min = np.exp(global_state["h_min_avg"] / debias_factor) + self._h_max = np.exp(global_state["h_max_avg"] / debias_factor) + else: + self._h_min = np.exp(global_state["h_min_avg"] ) + self._h_max = np.exp(global_state["h_max_avg"] ) + if self._sparsity_debias: + self._h_min *= self._sparsity_avg + self._h_max *= self._sparsity_avg + return + + + def grad_variance(self): + global_state = self._global_state + beta = self._beta + self._grad_var = np.array(0.0, dtype=np.float32) + for group in self._optimizer.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + state = self._optimizer.state[p] + + if self._iter == 0: + state["grad_avg"] = grad.new().resize_as_(grad).zero_() + state["grad_avg_squared"] = 0.0 + state["grad_avg"].mul_(beta).add_(1 - beta, grad) + self._grad_var += torch.sum(state["grad_avg"] * state["grad_avg"] ) + + if self._zero_debias: + debias_factor = self.zero_debias_factor() + else: + debias_factor = 1.0 + + self._grad_var /= -(debias_factor**2) + self._grad_var += global_state['grad_norm_squared_avg'] / debias_factor + # in case of negative variance: the two term are using different debias factors + self._grad_var = max(self._grad_var, eps) + if self._sparsity_debias: + self._grad_var *= self._sparsity_avg + return + + + def dist_to_opt(self): + global_state = self._global_state + beta = self._beta + if self._iter == 0: + global_state["grad_norm_avg"] = 0.0 + global_state["dist_to_opt_avg"] = 0.0 + global_state["grad_norm_avg"] = \ + global_state["grad_norm_avg"] * beta + (1 - beta) * math.sqrt(global_state["grad_norm_squared"] ) + global_state["dist_to_opt_avg"] = \ + global_state["dist_to_opt_avg"] * beta \ + + (1 - beta) * global_state["grad_norm_avg"] / (global_state['grad_norm_squared_avg'] + eps) + if self._zero_debias: + debias_factor = self.zero_debias_factor() + self._dist_to_opt = global_state["dist_to_opt_avg"] / debias_factor + else: + self._dist_to_opt = global_state["dist_to_opt_avg"] + if self._sparsity_debias: + self._dist_to_opt /= (np.sqrt(self._sparsity_avg) + eps) + return + + + def grad_sparsity(self): + global_state = self._global_state + if self._iter == 0: + global_state["sparsity_avg"] = 0.0 + non_zero_cnt = 0.0 + all_entry_cnt = 0.0 + for group in self._optimizer.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + grad_non_zero = grad.nonzero() + if grad_non_zero.dim() > 0: + non_zero_cnt += grad_non_zero.size()[0] + all_entry_cnt += torch.numel(grad) + beta = self._beta + global_state["sparsity_avg"] = beta * global_state["sparsity_avg"] \ + + (1 - beta) * non_zero_cnt / float(all_entry_cnt) + self._sparsity_avg = \ + global_state["sparsity_avg"] / self.zero_debias_factor() + return + + + def lr_grad_norm_avg(self): + # this is for enforcing lr * grad_norm not + # increasing dramatically in case of instability. + # Not necessary for basic use. 
+        global_state = self._global_state
+        beta = self._beta
+        if "lr_grad_norm_avg" not in global_state:
+            global_state['grad_norm_squared_avg_log'] = 0.0
+        global_state['grad_norm_squared_avg_log'] = \
+            global_state['grad_norm_squared_avg_log'] * beta \
+            + (1 - beta) * np.log(global_state['grad_norm_squared'] + eps)
+        if "lr_grad_norm_avg" not in global_state:
+            global_state["lr_grad_norm_avg"] = \
+                0.0 * beta + (1 - beta) * np.log(self._lr * np.sqrt(global_state['grad_norm_squared']) + eps)
+            # we monitor the minimal smoothed ||lr * grad||
+            global_state["lr_grad_norm_avg_min"] = \
+                np.exp(global_state["lr_grad_norm_avg"] / self.zero_debias_factor())
+        else:
+            global_state["lr_grad_norm_avg"] = global_state["lr_grad_norm_avg"] * beta \
+                + (1 - beta) * np.log(self._lr * np.sqrt(global_state['grad_norm_squared']) + eps)
+            global_state["lr_grad_norm_avg_min"] = \
+                min(global_state["lr_grad_norm_avg_min"],
+                    np.exp(global_state["lr_grad_norm_avg"] / self.zero_debias_factor()))
+
+
+    def after_apply(self):
+        # compute running averages of the gradient and of the gradient norm
+        beta = self._beta
+        global_state = self._global_state
+        if self._iter == 0:
+            global_state["grad_norm_squared_avg"] = 0.0
+
+        global_state["grad_norm_squared"] = 0.0
+        for group in self._optimizer.param_groups:
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                grad = p.grad.data
+                global_state['grad_norm_squared'] += torch.sum(grad * grad)
+
+        global_state['grad_norm_squared_avg'] = \
+            global_state['grad_norm_squared_avg'] * beta + (1 - beta) * global_state['grad_norm_squared']
+
+        if self._sparsity_debias:
+            self.grad_sparsity()
+
+        self.curvature_range()
+        self.grad_variance()
+        self.dist_to_opt()
+
+        if self._iter > 0:
+            self.get_mu()
+            self.get_lr()
+
+            self._lr = beta * self._lr + (1 - beta) * self._lr_t
+            self._mu = beta * self._mu + (1 - beta) * self._mu_t
+        return
+
+
+    def get_lr(self):
+        self._lr_t = (1.0 - math.sqrt(self._mu_t))**2 / (self._h_min + eps)
+        return
+
+
+    def get_cubic_root(self):
+        # We minimize the expression x^2 D^2 + (1-x)^4 * C / h_min^2 over x,
+        # where x = sqrt(mu). Setting its derivative to zero and substituting
+        # x = y + 1 gives the depressed cubic
+        #     y^3 + p*y = q
+        # where p = (D^2 h_min^2)/(2*C) and q = -p.
+        # We use Vieta's substitution to compute the root.
+        # There is only one real solution y (which is in [0, 1]).
+        # http://mathworld.wolfram.com/VietasSubstitution.html
+        # eps in the numerator prevents momentum = 1 in the case of a zero gradient.
+        p = (self._dist_to_opt + eps)**2 * (self._h_min + eps)**2 / 2 / (self._grad_var + eps)
+        w3 = (-math.sqrt(p**2 + 4.0 / 27.0 * p**3) - p) / 2.0
+        w = math.copysign(1.0, w3) * math.pow(math.fabs(w3), 1.0/3.0)
+        y = w - p / 3.0 / (w + eps)
+        x = y + 1
+        return x
+
+
+    def get_mu(self):
+        root = self.get_cubic_root()
+        dr = self._h_max / self._h_min
+        self._mu_t = max(root**2, ((np.sqrt(dr) - 1) / (np.sqrt(dr) + 1))**2)
+        return
+
+
+    def update_hyper_param(self):
+        for group in self._optimizer.param_groups:
+            group['momentum'] = self._mu
+            if self._force_non_inc_step == False:
+                group['lr'] = self._lr * self._lr_factor
+            elif self._iter > self._curv_win_width:
+                # enforce that lr * grad_norm does not increase dramatically.
+                # Not necessary for basic use. Please refer to the comments
+                # in YFOptimizer.__init__ for more details.
+                self.lr_grad_norm_avg()
+                debias_factor = self.zero_debias_factor()
+                group['lr'] = min(self._lr * self._lr_factor,
+                    2.0 * self._global_state["lr_grad_norm_avg_min"] \
+                    / np.sqrt(np.exp(self._global_state['grad_norm_squared_avg_log'] / debias_factor)))
+        return
+
+
+    def auto_clip_thresh(self):
+        # Heuristic to automatically prevent sudden exploding gradients.
+        # Not necessary for basic use.
+        return math.sqrt(self._h_max) * self._auto_clip_fac
+
+
+    def step(self):
+        # add weight decay
+        for group in self._optimizer.param_groups:
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                grad = p.grad.data
+
+                if group['weight_decay'] != 0:
+                    grad = grad.add(group['weight_decay'], p.data)
+
+        if self._clip_thresh != None:
+            torch.nn.utils.clip_grad_norm(self._var_list, self._clip_thresh)
+        elif (self._iter != 0 and self._auto_clip_fac != None):
+            # do not clip on the first iteration
+            torch.nn.utils.clip_grad_norm(self._var_list, self.auto_clip_thresh())
+
+        # apply the update
+        self._optimizer.step()
+
+        # after applying the update, measure the gradient statistics
+        self.after_apply()
+
+        # update learning rate and momentum
+        self.update_hyper_param()
+
+        self._iter += 1
+        return
diff --git a/pytorch/CIFAR10/setup.py b/pytorch/CIFAR10/setup.py
index 5f2a1ed..70ee5be 100644
--- a/pytorch/CIFAR10/setup.py
+++ b/pytorch/CIFAR10/setup.py
@@ -9,12 +9,13 @@ setup(
     packages=['benchmark'],
     entry_points={
         'console_scripts': [
-            'bench = benchmark.train:main'
+            'cifar10 = benchmark.cifar10.__main__:cli',
+            'imagenet = benchmark.imagenet.__main__:cli'
         ]
     },
     install_requires=[
+        'tqdm',
         'torchvision',
         'click',
-        'progressbar2'
     ]
 )
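
Note for reviewers: a minimal usage sketch of the YFOptimizer wrapper added in benchmark/yellowfin.py. The constructor is defined earlier in that file and is not shown in the hunks above, so the argument names below (the parameter list plus the lr and mu keywords) are assumptions inferred from the state it checkpoints; the model, loss, and loader are placeholders and not part of the patch.

    import torch
    from torch.autograd import Variable
    from benchmark.yellowfin import YFOptimizer

    model = torch.nn.Linear(32, 10)
    criterion = torch.nn.CrossEntropyLoss()
    # assumed signature: the wrapper drives an inner SGD optimizer over these parameters
    optimizer = YFOptimizer(model.parameters(), lr=0.1, mu=0.0)

    # tiny synthetic dataset so the sketch is self-contained
    loader = [(torch.randn(8, 32), torch.LongTensor(8).zero_()) for _ in range(4)]

    for inputs, targets in loader:
        inputs, targets = Variable(inputs), Variable(targets)
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()  # applies the SGD update, then re-tunes lr and momentum

    # checkpointing goes through the state_dict / load_state_dict pair defined above
    torch.save({'optimizer': optimizer.state_dict()}, 'checkpoint.pt')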