Update pytorch benchmark code with new command line interface

Cody A. Coleman 2017-12-11 11:35:48 -08:00
parent bf53b6a029
commit 339261c19f
14 changed files with 1476 additions and 355 deletions

6
pytorch/CIFAR10/.gitignore vendored Normal file
View file

@ -0,0 +1,6 @@
*.pyc
__pycache__/
.eggs/
*.egg-info/
.cache
data/

View file

@ -0,0 +1,17 @@
import click
from benchmark.cifar10.train import train
from benchmark.cifar10.infer import infer
@click.group()
def cli():
pass
cli.add_command(train, name='train')
cli.add_command(infer, name='infer')
if __name__ == '__main__':
cli()
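A minimal sketch (not part of this commit) of driving the new click group in-process with click's test runner; the 'train' options used below come from benchmark/cifar10/train.py in this change, and the one-epoch CPU run is purely illustrative.
from click.testing import CliRunner
from benchmark.cifar10.__main__ import cli
runner = CliRunner()
# equivalent to the installed console script: cifar10 train --arch resnet20 --epochs 1 --batch-size 128 --no-cuda
result = runner.invoke(cli, ['train', '--arch', 'resnet20', '--epochs', '1',
                             '--batch-size', '128', '--no-cuda'])
print(result.exit_code)
print(result.output)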

View file

@ -10,15 +10,20 @@ from torch.autograd import Variable
from torchvision import transforms
from torchvision import datasets
from benchmark.train import load, MEAN, STD, save_result, MODELS
from benchmark.utils import save_result
from benchmark.cifar10.train import MEAN, STD, MODELS
class PyTorchEngine:
def __init__(self, filename, use_cuda=False, name=None):
self.filename = filename
def __init__(self, path, arch, use_cuda=False):
self.path = path
self.use_cuda = use_cuda
self.name = name
model, epoch, accuracy = load(self.filename)
self.arch = arch
model = MODELS[self.arch]()
restored_state = torch.load(path)
model.load_state_dict(restored_state['model'])
accuracy = restored_state['accuracy']
epoch = restored_state['epoch'] + 1
if self.use_cuda:
self.model = model.cuda()
@ -66,13 +71,13 @@ def infer_cifar10(dataset, engine, start=1, end=128, repeat=100, log2=True,
result = OrderedDict()
result['nodename'] = os.uname().nodename
result['model'] = engine.name
result['model'] = engine.arch
result['use_cuda'] = engine.use_cuda
result['batch_size'] = batch_size
result['mean'] = np.mean(times)
result['std'] = np.std(times)
result['throughput'] = batch_size / np.mean(times)
result['filename'] = engine.filename
result['path'] = engine.path
if output is not None:
save_result(result, output)
@ -122,13 +127,13 @@ def infer(dataset_dir, run_dir, output_file, start, end, repeat, log2,
if cpu:
print('With CPU:')
engine = PyTorchEngine(path, use_cuda=False, name=model)
engine = PyTorchEngine(path, use_cuda=False, arch=model)
infer_cifar10(testset, engine, start=start, end=end, log2=log2,
repeat=repeat, output=output_path)
if gpu and torch.cuda.is_available():
print('With GPU:')
engine = PyTorchEngine(path, use_cuda=True, name=model)
engine = PyTorchEngine(path, use_cuda=True, arch=model)
# Warmup
time_batch_size(testset, 1, engine.pred, engine.use_cuda, repeat=1)

View file

@ -1,6 +1,7 @@
import math
from functools import partial
import torch
from torch import nn
from torch.nn import functional as F
@ -40,6 +41,50 @@ class BasicBlock(nn.Module):
return outputs
class StochasticBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, survival_rate=1):
super().__init__()
self.survival_rate = survival_rate
self.conv1 = nn.Conv2d(inplanes, planes, 3, stride=stride, padding=1,
bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.increasing = inplanes != (planes * self.expansion)
if self.increasing:
assert ((1. * planes * self.expansion) / inplanes) == 2
if stride != 1:
self.shortcut = nn.Sequential(nn.AvgPool2d(stride))
else:
self.shortcut = nn.Sequential()
def forward(self, inputs):
shortcut = self.shortcut(inputs)
if self.increasing:
shortcut = torch.cat([shortcut] + [shortcut.mul(0)], 1)
if not self.training or torch.rand(1)[0] <= self.survival_rate:
H = self.conv1(inputs)
H = self.bn1(H)
H = F.relu(H)
H = self.conv2(H)
H = self.bn2(H)
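# Inverted scaling (as in inverted dropout): surviving residual branches are
# scaled up by 1 / survival_rate during training, so no rescaling is needed at inference.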
if self.training:
H /= self.survival_rate
H += shortcut
else:
H = shortcut
outputs = F.relu(H)
return outputs
class PreActBlock(nn.Module):
expansion = 1
@ -267,6 +312,30 @@ class ResNet(nn.Module):
return outputs
class StochasticResNet(ResNet):
def __init__(self, Block, layers, filters, num_classes=10, inplanes=None,
min_survival_rate=1.0, decay='linear'):
super().__init__(Block, layers, filters,
num_classes=num_classes,
inplanes=inplanes)
L = sum(layers)
l = 1
for section_index in range(self.num_sections):
section = getattr(self, f'section_{section_index}')
for name, module in section.named_children():
if decay == 'linear':
survival_rate = 1 - ((l / L) * (1 - min_survival_rate))
elif decay == 'uniform':
survival_rate = min_survival_rate
else:
raise NotImplementedError(
f"{decay} decay has not been implemented.")
module.survival_rate = survival_rate
l += 1
assert (l - 1) == L
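# Worked example of the linear decay above: with layers=[18] * 3 there are
# L = 54 stochastic blocks, so min_survival_rate=0.5 (as in StochasticResNet110
# below) gives block l a survival rate of 1 - (l / 54) * 0.5, falling from
# roughly 0.99 for the first block to exactly 0.5 for the last.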
# From "Deep Residual Learning for Image Recognition"
def ResNet20():
return ResNet(BasicBlock, layers=[3] * 3, filters=[16, 32, 64])
@ -292,7 +361,28 @@ def ResNet1202():
return ResNet(BasicBlock, layers=[200] * 3, filters=[16, 32, 64])
# Based on but not in "Identity Mappings in Deep Residual Networks"
# From "Identity Mappings in Deep Residual Networks"
def PreActResNet110():
return ResNet(PreActBlock, layers=[18] * 3, filters=[16, 32, 64])
def PreActResNet164():
return ResNet(PreActBottleneck, layers=[18] * 3, filters=[16, 32, 64])
def PreActResNet1001():
return ResNet(PreActBottleneck, layers=[111] * 3, filters=[16, 32, 64])
# Based on but not in "Identity Mappings in Deep Residual Networks"
def PreActResNet8():
return ResNet(PreActBlock, layers=[1] * 3, filters=[16, 32, 64])
def PreActResNet14():
return ResNet(PreActBlock, layers=[2] * 3, filters=[16, 32, 64])
def PreActResNet20():
return ResNet(PreActBlock, layers=[3] * 3, filters=[16, 32, 64])
@ -305,17 +395,30 @@ def PreActResNet164Basic():
return ResNet(PreActBlock, layers=[27] * 3, filters=[16, 32, 64])
# From "Identity Mappings in Deep Residual Networks"
def PreActResNet110():
return ResNet(PreActBlock, layers=[18] * 3, filters=[16, 32, 64])
# From "Deep Networks with Stochastic Depth"
def StochasticResNet110():
return StochasticResNet(StochasticBlock, layers=[18] * 3,
filters=[16, 32, 64], min_survival_rate=0.5,
decay='linear')
def PreActResNet164():
return ResNet(PreActBottleneck, layers=[18] * 3, filters=[16, 32, 64])
def StochasticResNet1202():
return StochasticResNet(StochasticBlock, layers=[200] * 3,
filters=[16, 32, 64], min_survival_rate=0.5,
decay='linear')
def PreActResNet1001():
return ResNet(PreActBottleneck, layers=[111] * 3, filters=[16, 32, 64])
# Based on but not in "Deep Networks for Stochastic Depth"
def StochasticResNet56():
return StochasticResNet(StochasticBlock, layers=[9] * 3,
filters=[16, 32, 64], min_survival_rate=0.5,
decay='linear')
def StochasticResNet56_08():
return StochasticResNet(StochasticBlock, layers=[9] * 3,
filters=[16, 32, 64], min_survival_rate=0.8,
decay='linear')
# From "Wide Residual Networks"

View file

@ -0,0 +1,153 @@
import os
import json
import pandas as pd
from benchmark.cifar10.train import MODELS
from benchmark.utils import count_parameters
MODEL_SIZES = {key: count_parameters(MODELS[key]()) for key in MODELS.keys()}
def single_run_acc(df):
df = df.copy()
df['duration'] = (df['timestamp'] - df['prev_timestamp']).apply(lambda x: x.total_seconds())
df['batch_duration'] = df['batch_duration'].apply(lambda x: x.total_seconds())
tmp = df.loc[:, ['epoch', 'batch_size', 'ncorrect', 'duration', 'batch_duration']].groupby('epoch').sum()
tmp['accuracy'] = tmp['ncorrect'] / tmp['batch_size']
tmp['throughput'] = tmp['batch_size'] / tmp['duration']
tmp['_throughput'] = tmp['batch_size'] / tmp['batch_duration']
tmp['elapsed'] = df.groupby('epoch')['elapsed'].agg('max')
tmp.reset_index(inplace=True)
return tmp
def load_file(file, start_timestamp=None):
df = pd.read_csv(file)
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['batch_duration'] = pd.to_timedelta(df['batch_duration'])
df['ncorrect'] = df['top1_correct']
start_timestamp = start_timestamp or df['timestamp'].iloc[0]
df['elapsed'] = df['timestamp'] - start_timestamp
df['batch_accuracy'] = df['ncorrect'] / df['batch_size']
return df
def load_data(directory, verbose=True):
train_file = os.path.join(directory, 'train_results.csv')
train = load_file(train_file)
start_timestamp = train['timestamp'].iloc[0]
if verbose:
print(train_file)
print("Training results shape: {}".format(train.shape))
try:
test_file = os.path.join(directory, 'test_results.csv')
test = load_file(test_file, start_timestamp=start_timestamp)
except FileNotFoundError:
test_file = os.path.join(directory, 'valid_results.csv')
test = load_file(test_file, start_timestamp=start_timestamp)
if verbose:
print(test_file)
print('Test results shape: {}'.format(test.shape))
train['mode'] = 'train'
test['mode'] = 'test'
combined = pd.concat([train, test], ignore_index=True).sort_values(by=['timestamp'])
combined['prev_timestamp'] = combined['timestamp'].shift(1)
combined.loc[0, 'prev_timestamp'] = combined.loc[0, 'timestamp'] - combined.loc[0, 'batch_duration']
train = combined[combined['mode'] == 'train'].copy()
test = combined[combined['mode'] == 'test'].copy()
return single_run_acc(train), single_run_acc(test)
def load_multiple(directory, timestamps=None, verbose=False):
timestamps = timestamps or os.listdir(directory)
train_sets = []
test_sets = []
for timestamp in sorted(timestamps):
_dir = os.path.join(directory, timestamp)
train, test = load_data(_dir, verbose=verbose)
if verbose:
print()
train['run'] = _dir
test['run'] = _dir
train['job_start'] = timestamp
test['job_start'] = timestamp
train_sets.append(train)
test_sets.append(test)
return pd.concat(train_sets), pd.concat(test_sets)
def load_multiple_models(directory, verbose=False):
paths = os.listdir(directory)
models = [path for path in paths if path in MODELS]
train_sets = []
test_sets = []
for model in sorted(models):
if verbose:
print(f"Loading {model}")
_dir = os.path.join(directory, model)
train, test = load_multiple(_dir, verbose=verbose)
train['model'] = model
train['nparameters'] = MODEL_SIZES[model]
test['model'] = model
test['nparameters'] = MODEL_SIZES[model]
train_sets.append(train)
test_sets.append(test)
return pd.concat(train_sets), pd.concat(test_sets)
def concat_update(existing, other, repeat=False):
for key in other.keys():
if key in existing:
if existing[key] != other[key] or repeat:
current = existing[key]
if isinstance(current, list):
current.append(other[key])
else:
existing[key] = [current, other[key]]
else:
existing[key] = other[key]
def run_config(run, repeat=False):
full = {}
configs = (os.path.join(run, entry.name) for entry in os.scandir(run) if 'config' in entry.name)
for config in sorted(configs):
with open(config) as file:
tmp = json.load(file)
tmp['path'] = config
concat_update(full, tmp, repeat=repeat)
return full
def search_configs(criteria, configs):
matches = []
for run, config in configs.items():
is_match = True
for key, value in criteria.items():
try:
config_value = config[key]
if config_value != value:
is_match = False
except KeyError:
is_match = False
if is_match:
matches.append(run)
return matches
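A hedged usage sketch (not part of this commit) combining these helpers: index the configs of one model's runs, filter them with search_configs, and load only the matching runs. It assumes the functions above are importable (or that this runs inside the module) and that './run/resnet20' follows the run_dir convention in benchmark/cifar10/train.py.
import os
run_root = './run/resnet20'
# one config dict per run directory, keyed by the run's timestamp
configs = {t: run_config(os.path.join(run_root, t)) for t in os.listdir(run_root)}
# keep only runs trained with this batch size and architecture
matching = search_configs({'batch_size': 128, 'arch': 'resnet20'}, configs)
train_df, test_df = load_multiple(run_root, timestamps=matching)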

View file

@ -0,0 +1,374 @@
import os
from datetime import datetime
from collections import OrderedDict
import click
import torch
import tqdm
import numpy as np
from torch import nn, optim
from torch.autograd import Variable
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import transforms
from torchvision import datasets
from benchmark import utils
from benchmark.yellowfin import YFOptimizer
from benchmark.cifar10.models import resnet, densenet
MEAN = (0.4914, 0.4822, 0.4465)
STD = (0.2023, 0.1994, 0.2010)
MODELS = {
# "Deep Residual Learning for Image Recognition"
'resnet20': resnet.ResNet20,
'resnet32': resnet.ResNet32,
'resnet44': resnet.ResNet44,
'resnet56': resnet.ResNet56,
'resnet110': resnet.ResNet110,
'resnet1202': resnet.ResNet1202,
# "Wide Residual Networks"
'wrn-40-4': resnet.WRN_40_4,
'wrn-16-8': resnet.WRN_16_8,
'wrn-28-10': resnet.WRN_28_10,
# Based on "Identity Mappings in Deep Residual Networks"
'preact8': resnet.PreActResNet8,
'preact14': resnet.PreActResNet14,
'preact20': resnet.PreActResNet20,
'preact56': resnet.PreActResNet56,
'preact164-basic': resnet.PreActResNet164Basic,
# "Identity Mappings in Deep Residual Networks"
'preact110': resnet.PreActResNet110,
'preact164': resnet.PreActResNet164,
'preact1001': resnet.PreActResNet1001,
# Based on "Deep Networks with Stochastic Depth"
'stochastic56': resnet.StochasticResNet56,
'stochastic56-08': resnet.StochasticResNet56_08,
'stochastic110': resnet.StochasticResNet110,
'stochastic1202': resnet.StochasticResNet1202,
# "Aggregated Residual Transformations for Deep Neural Networks"
'resnext29-8-64': lambda _=None: resnet.ResNeXt29(8, 64),
'resnext29-16-64': lambda _=None: resnet.ResNeXt29(16, 64),
# "Densely Connected Convolutional Networks"
'densenetbc100': densenet.DenseNetBC100,
'densenetbc250': densenet.DenseNetBC250,
'densenetbc190': densenet.DenseNetBC190,
# Kuangliu/pytorch-cifar
'resnet18': resnet.ResNet18,
'resnet50': resnet.ResNet50,
'resnet101': resnet.ResNet101,
'resnet152': resnet.ResNet152,
}
def correct(outputs, targets, top=(1, )):
_, predictions = outputs.topk(max(top), dim=1, largest=True, sorted=True)
targets = targets.view(-1, 1).expand_as(predictions)
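# eq() gives a 0/1 hit matrix over the sorted top-k predictions; cumsum along
# the k dimension and then summing over the batch puts, at index k - 1, the
# number of examples whose target appears within the top k.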
corrects = predictions.eq(targets).cpu().int().cumsum(1).sum(0)
tops = list(map(lambda k: corrects.data[0][k - 1], top))
return tops
def run(epoch, model, loader, criterion=None, optimizer=None, top=(1, 5),
use_cuda=False, tracking=None, train=True, half=False):
accuracies = [utils.AverageMeter() for _ in top]
assert criterion is not None or not train, 'Need criterion to train model'
assert optimizer is not None or not train, 'Need optimizer to train model'
loader = tqdm.tqdm(loader)
if train:
model.train()
losses = utils.AverageMeter()
else:
model.eval()
start = datetime.now()
for batch_index, (inputs, targets) in enumerate(loader):
inputs = Variable(inputs, requires_grad=False, volatile=not train)
targets = Variable(targets, requires_grad=False, volatile=not train)
batch_size = targets.size(0)
assert batch_size < 2**32, 'Size is too large! correct will overflow'
if use_cuda:
inputs = inputs.cuda()
targets = targets.cuda()
if half:
inputs = inputs.half()
outputs = model(inputs)
if train:
loss = criterion(outputs, targets)
optimizer.zero_grad()
loss.backward()
optimizer.step()
losses.update(loss.data[0], batch_size)
_, predictions = torch.max(outputs.data, 1)
top_correct = correct(outputs, targets, top=top)
for i, count in enumerate(top_correct):
accuracies[i].update(count * (100. / batch_size), batch_size)
end = datetime.now()
if tracking is not None:
result = OrderedDict()
result['timestamp'] = datetime.now()
result['batch_duration'] = end - start
result['epoch'] = epoch
result['batch'] = batch_index
result['batch_size'] = batch_size
for i, k in enumerate(top):
result['top{}_correct'.format(k)] = top_correct[i]
result['top{}_accuracy'.format(k)] = accuracies[i].val
if train:
result['loss'] = loss.data[0]
utils.save_result(result, tracking)
desc = 'Epoch {} {}'.format(epoch, '(Train):' if train else '(Val): ')
if train:
desc += ' Loss {loss.val:.4f} ({loss.avg:.4f})'.format(loss=losses)
for k, acc in zip(top, accuracies):
desc += ' Prec@{} {acc.val:.3f} ({acc.avg:.3f})'.format(k, acc=acc)
loader.set_description(desc)
start = datetime.now()
if train:
message = 'Training accuracy of'
else:
message = 'Validation accuracy of'
for i, k in enumerate(top):
message += ' top-{}: {}'.format(k, accuracies[i].avg)
print(message)
return accuracies[0].avg
@click.command()
@click.option('--dataset-dir', default='./data/cifar10')
@click.option('--checkpoint', '-c', type=click.Choice(['best', 'all', 'last']),
default='last')
@click.option('--restore', '-r')
@click.option('--tracking/--no-tracking', default=True)
@click.option('--cuda/--no-cuda', default=True)
@click.option('--epochs', '-e', default=200)
@click.option('--batch-size', '-b', default=32)
@click.option('--learning-rate', '-l', default=1e-3)
@click.option('--lr-factor', default=1.0, help='only for yellowfin')
@click.option('--momentum', default=0.9)
@click.option('--optimizer', '-o', type=click.Choice(['sgd', 'adam', 'yellowfin']),
default='sgd')
@click.option('--augmentation/--no-augmentation', default=True)
@click.option('device_ids', '--device', '-d', multiple=True, type=int)
@click.option('--num-workers', type=int)
@click.option('--weight-decay', default=5e-4)
@click.option('--validation', '-v', default=0.0)
@click.option('--evaluate', is_flag=True)
@click.option('--shuffle/--no-shuffle', default=True)
@click.option('--half', is_flag=True)
@click.option('--arch', '-a', type=click.Choice(MODELS.keys()),
default='resnet20')
def train(dataset_dir, checkpoint, restore, tracking, cuda, epochs,
batch_size, learning_rate, lr_factor, momentum, optimizer, augmentation,
device_ids, num_workers, weight_decay, validation, evaluate, shuffle,
half, arch):
timestamp = "{:.0f}".format(datetime.utcnow().timestamp())
local_timestamp = str(datetime.now())
config = {k: v for k, v in locals().items()}
use_cuda = cuda and torch.cuda.is_available()
# create model
model = MODELS[arch]()
# create optimizer
if optimizer == 'adam':
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
elif optimizer == 'sgd':
optimizer = optim.SGD(model.parameters(), lr=learning_rate,
momentum=momentum,
weight_decay=weight_decay)
elif optimizer == 'yellowfin':
optimizer = YFOptimizer(model.parameters(), lr=learning_rate,
mu=momentum, weight_decay=weight_decay)
else:
raise NotImplementedError("Unknown optimizer: {}".format(optimizer))
if restore is not None:
if restore == 'latest':
restore = utils.latest_file(arch)
print(f'Restoring model from {restore}')
assert os.path.exists(restore)
restored_state = torch.load(restore)
assert restored_state['arch'] == arch
model.load_state_dict(restored_state['model'])
optimizer.load_state_dict(restored_state['optimizer'])
if not isinstance(optimizer, YFOptimizer):
for group in optimizer.param_groups:
group['lr'] = learning_rate
best_accuracy = restored_state['accuracy']
start_epoch = restored_state['epoch'] + 1
run_dir = os.path.split(restore)[0]
else:
best_accuracy = 0.0
start_epoch = 1
run_dir = f"./run/{arch}/{timestamp}"
print('Starting accuracy is {}'.format(best_accuracy))
if not os.path.exists(run_dir):
os.makedirs(run_dir)
utils.save_config(config, run_dir)
print(model)
print("{} parameters".format(utils.count_parameters(model)))
print(f"Run directory set to {run_dir}")
# Save model text description
with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
file.write(str(model))
if tracking:
train_results_file = os.path.join(run_dir, 'train_results.csv')
valid_results_file = os.path.join(run_dir, 'valid_results.csv')
test_results_file = os.path.join(run_dir, 'test_results.csv')
else:
train_results_file = None
valid_results_file = None
test_results_file = None
# create loss
criterion = nn.CrossEntropyLoss()
if use_cuda:
print('Copying model to GPU')
model = model.cuda()
criterion = criterion.cuda()
if half:
model = model.half()
criterion = criterion.half()
device_ids = device_ids or list(range(torch.cuda.device_count()))
model = torch.nn.DataParallel(
model, device_ids=device_ids)
num_workers = num_workers or len(device_ids)
else:
num_workers = num_workers or 1
if half:
print('Half precision (16-bit floating point) only works on GPU')
print(f"using {num_workers} workers for data loading")
# load data
print("Preparing data:")
transform_test = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize(MEAN, STD),
])
test_loader = torch.utils.data.DataLoader(
datasets.CIFAR10(root=dataset_dir, train=False, download=True,
transform=transform_test),
batch_size=batch_size, shuffle=False, num_workers=num_workers,
pin_memory=use_cuda)
if evaluate:
print("Only running evaluation of model on test dataset")
run(start_epoch - 1, model, test_loader, use_cuda=use_cuda,
tracking=test_results_file, train=False)
return
if augmentation:
transform_train = [
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip()
]
else:
transform_train = []
transform_train = transforms.Compose(transform_train + [
transforms.ToTensor(),
transforms.Normalize(MEAN, STD),
])
train_dataset = datasets.CIFAR10(root=dataset_dir, train=True,
download=True, transform=transform_train)
num_train = len(train_dataset)
indices = list(range(num_train))
assert 1 > validation and validation >= 0, "Validation must be in [0, 1)"
split = num_train - int(validation * num_train)
if shuffle:
np.random.shuffle(indices)
train_indices = indices[:split]
valid_indices = indices[split:]
print('Using {} examples for training'.format(len(train_indices)))
print('Using {} examples for validation'.format(len(valid_indices)))
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(valid_indices)
train_loader = torch.utils.data.DataLoader(
train_dataset, sampler=train_sampler, batch_size=batch_size,
num_workers=num_workers, pin_memory=use_cuda)
if validation != 0:
valid_loader = torch.utils.data.DataLoader(
train_dataset, sampler=valid_sampler, batch_size=batch_size,
num_workers=num_workers, pin_memory=use_cuda)
else:
print('Using test dataset for validation')
valid_loader = test_loader
end_epoch = start_epoch + epochs
# YellowFin doesn't expose param_groups; accessing it would raise an AttributeError
if not isinstance(optimizer, YFOptimizer):
for group in optimizer.param_groups:
if 'lr' in group:
print('Learning rate set to {}'.format(group['lr']))
assert group['lr'] == learning_rate
else:
print(f"set lr_factor to {lr_factor}")
optimizer.set_lr_factor(lr_factor)
for epoch in range(start_epoch, end_epoch):
run(epoch, model, train_loader, criterion, optimizer,
use_cuda=use_cuda, tracking=train_results_file, train=True,
half=half)
valid_acc = run(epoch, model, valid_loader, use_cuda=use_cuda,
tracking=valid_results_file, train=False, half=half)
is_best = valid_acc > best_accuracy
last_epoch = epoch == (end_epoch - 1)
if is_best or checkpoint == 'all' or (checkpoint == 'last' and last_epoch):
state = {
'epoch': epoch,
'arch': arch,
'model': (model.module if use_cuda else model).state_dict(),
'accuracy': valid_acc,
'optimizer': optimizer.state_dict()
}
if is_best:
print('New best model!')
filename = os.path.join(run_dir, 'checkpoint_best_model.t7')
print(f'Saving checkpoint to {filename}')
best_accuracy = valid_acc
torch.save(state, filename)
if checkpoint == 'all' or (checkpoint == 'last' and last_epoch):
filename = os.path.join(run_dir, f'checkpoint_{epoch}.t7')
print(f'Saving checkpoint to {filename}')
torch.save(state, filename)
if __name__ == '__main__':
train()

View file

@ -0,0 +1,14 @@
import click
from benchmark.imagenet.train import train
@click.group()
def cli():
pass
cli.add_command(train, name='train')
if __name__ == '__main__':
cli()

View file

@ -0,0 +1,339 @@
import os
import time
from datetime import datetime
from collections import OrderedDict
import click
import tqdm
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
from benchmark import utils
model_names = sorted(name for name in models.__dict__
if name.islower() and not name.startswith("__")
and callable(models.__dict__[name]))
@click.command()
@click.option('--dataset-dir', default='./data/imagenet')
@click.option('--checkpoint', '-c', type=click.Choice(['best', 'all', 'last']),
default='last')
@click.option('--restore', '-r')
@click.option('--tracking/--no-tracking', default=True)
@click.option('--cuda/--no-cuda', default=True)
@click.option('--epochs', '-e', default=90)
@click.option('--batch-size', '-b', default=256)
@click.option('--learning-rate', '-l', default=0.1)
@click.option('--learning-rate-decay', default=0.1)
@click.option('--learning-rate-freq', default=30)
@click.option('--momentum', default=0.9)
@click.option('--optimizer', '-o', type=click.Choice(['sgd', 'adam']),
default='sgd')
@click.option('--augmentation/--no-augmentation', default=True)
@click.option('--pretrained', is_flag=True)
@click.option('--evaluate', is_flag=True)
@click.option('--num-workers', type=int)
@click.option('--weight-decay', default=1e-4)
@click.option('--arch', '-a', type=click.Choice(model_names),
default='resnet18')
def train(dataset_dir, checkpoint, restore, tracking, cuda, epochs,
batch_size, learning_rate, learning_rate_decay,
learning_rate_freq, momentum, optimizer, augmentation,
pretrained, evaluate, num_workers, weight_decay, arch):
timestamp = "{:.0f}".format(datetime.utcnow().timestamp())
config = {k: v for k, v in locals().items()}
use_cuda = cuda and torch.cuda.is_available()
# create model
if pretrained:
print("=> using pre-trained model '{}'".format(arch))
model = models.__dict__[arch](pretrained=True)
else:
print("=> creating model '{}'".format(arch))
model = models.__dict__[arch]()
if optimizer == 'adam':
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
elif optimizer == 'sgd':
optimizer = torch.optim.SGD(model.parameters(), learning_rate,
momentum=momentum,
weight_decay=weight_decay)
else:
raise NotImplementedError("Unknown optimizer: {}".format(optimizer))
# optionally resume from a checkpoint
if restore is not None:
if restore == 'latest':
restore = utils.latest_file(arch)
print(f'=> restoring model from {restore}')
restored_state = torch.load(restore)
start_epoch = restored_state['epoch'] + 1
best_prec1 = restored_state['prec1']
model.load_state_dict(restored_state['state_dict'])
optimizer.load_state_dict(restored_state['optimizer'])
print('=> starting accuracy is {} (epoch {})'
.format(best_prec1, start_epoch))
run_dir = os.path.split(restore)[0]
else:
best_prec1 = 0.0
start_epoch = 1
run_dir = f"./run/{arch}/{timestamp}"
if not os.path.exists(run_dir):
os.makedirs(run_dir)
utils.save_config(config, run_dir)
print(model)
print("{} parameters".format(utils.count_parameters(model)))
print(f"Run directory set to {run_dir}")
# save model text description
with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
file.write(str(model))
if tracking:
train_results_file = os.path.join(run_dir, 'train_results.csv')
test_results_file = os.path.join(run_dir, 'test_results.csv')
else:
train_results_file = None
test_results_file = None
# define loss function (criterion) and optimizer
criterion = nn.CrossEntropyLoss()
# move model and criterion to GPU
if use_cuda:
model.cuda()
criterion = criterion.cuda()
model = torch.nn.parallel.DataParallel(model)
num_workers = num_workers or torch.cuda.device_count()
else:
num_workers = num_workers or 1
print(f"=> using {num_workers} workers for data loading")
cudnn.benchmark = True
# Data loading code
print("=> preparing data:")
traindir = os.path.join(dataset_dir, 'train')
valdir = os.path.join(dataset_dir, 'val')
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
train_sampler = None
train_loader = torch.utils.data.DataLoader(
datasets.ImageFolder(traindir, transforms.Compose([
transforms.RandomSizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
normalize,
])),
batch_size=batch_size, shuffle=(train_sampler is None),
num_workers=num_workers, pin_memory=True, sampler=train_sampler)
val_loader = torch.utils.data.DataLoader(
datasets.ImageFolder(valdir, transforms.Compose([
transforms.Scale(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
normalize,
])),
batch_size=batch_size, shuffle=False,
num_workers=num_workers, pin_memory=True)
if evaluate:
validate(val_loader, model, criterion)
return
end_epoch = start_epoch + epochs
for epoch in range(start_epoch, end_epoch):
print('Epoch {} of {}'.format(epoch, end_epoch - 1))
adjust_learning_rate(optimizer, epoch, learning_rate,
decay=learning_rate_decay,
freq=learning_rate_freq)
# train for one epoch
_ = train_one_epoch(
train_loader, model, criterion, optimizer, epoch,
tracking=train_results_file)
# evaluate on validation set
prec1, _ = validate(
val_loader, model, criterion, epoch, tracking=test_results_file)
# remember best prec@1 and save checkpoint
is_best = prec1 > best_prec1
last_epoch = epoch == (end_epoch - 1)
if is_best or checkpoint == 'all' or (checkpoint == 'last' and last_epoch):
state = {
'epoch': epoch,
'arch': arch,
'state_dict': (model.module if use_cuda else model).state_dict(),
'prec1': prec1,
'optimizer': optimizer.state_dict(),
}
if is_best:
print('New best model!')
filename = os.path.join(run_dir, 'checkpoint_best_model.t7')
print(f'=> saving checkpoint to {filename}')
torch.save(state, filename)
best_prec1 = prec1
if checkpoint == 'all' or (checkpoint == 'last' and last_epoch):
filename = os.path.join(run_dir, f'checkpoint_{epoch}.t7')
print(f'=> saving checkpoint to {filename}')
torch.save(state, filename)
def train_one_epoch(train_loader, model, criterion, optimizer, epoch,
tracking=None):
train_loader = tqdm.tqdm(train_loader)
batch_time = utils.AverageMeter()
data_time = utils.AverageMeter()
losses = utils.AverageMeter()
top1 = utils.AverageMeter()
top5 = utils.AverageMeter()
# switch to train mode
model.train()
end = time.time()
for i, (input, target) in enumerate(train_loader):
# measure data loading time
data_time.update(time.time() - end)
target = target.cuda(async=True)
input_var = torch.autograd.Variable(input)
target_var = torch.autograd.Variable(target)
# compute output
output = model(input_var)
loss = criterion(output, target_var)
# measure accuracy and record loss
prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
losses.update(loss.data[0], input.size(0))
top1.update(prec1[0], input.size(0))
top5.update(prec5[0], input.size(0))
# compute gradient and do SGD step
optimizer.zero_grad()
loss.backward()
optimizer.step()
# measure elapsed time
batch_time.update(time.time() - end)
if tracking is not None:
result = OrderedDict()
result['timestamp'] = datetime.now()
result['batch_duration'] = batch_time.val
result['epoch'] = epoch
result['batch'] = i
result['batch_size'] = input.size(0)
result['top1_accuracy'] = prec1[0]
result['top5_accuracy'] = prec5[0]
result['loss'] = loss.data[0]
result['data_duration'] = data_time.val
utils.save_result(result, tracking)
desc = ('Epoch {0} (Train):'
' Loss {loss.val:.4f} ({loss.avg:.4f})'
' Prec@1 {top1.val:.3f} ({top1.avg:.3f})'
' Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
epoch, i, len(train_loader), batch_time=batch_time,
data_time=data_time, loss=losses, top1=top1, top5=top5))
train_loader.set_description(desc)
end = time.time()
return top1.avg, top5.avg
def validate(val_loader, model, criterion, epoch, tracking=None):
val_loader = tqdm.tqdm(val_loader)
batch_time = utils.AverageMeter()
losses = utils.AverageMeter()
top1 = utils.AverageMeter()
top5 = utils.AverageMeter()
# switch to evaluate mode
model.eval()
end = time.time()
for i, (input, target) in enumerate(val_loader):
target = target.cuda(async=True)
input_var = torch.autograd.Variable(input, volatile=True)
target_var = torch.autograd.Variable(target, volatile=True)
# compute output
output = model(input_var)
loss = criterion(output, target_var)
# measure accuracy and record loss
prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
losses.update(loss.data[0], input.size(0))
top1.update(prec1[0], input.size(0))
top5.update(prec5[0], input.size(0))
# measure elapsed time
batch_time.update(time.time() - end)
if tracking is not None:
result = OrderedDict()
result['timestamp'] = datetime.now()
result['batch_duration'] = batch_time.val
result['epoch'] = epoch
result['batch'] = i
result['batch_size'] = input.size(0)
result['top1_accuracy'] = prec1[0]
result['top5_accuracy'] = prec5[0]
result['loss'] = loss.data[0]
utils.save_result(result, tracking)
desc = ('Epoch {0} (Val): '
' Loss {loss.val:.4f} ({loss.avg:.4f})'
' Prec@1 {top1.val:.3f} ({top1.avg:.3f})'
' Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
epoch, i, len(val_loader), batch_time=batch_time,
loss=losses, top1=top1, top5=top5))
val_loader.set_description(desc)
end = time.time()
print("Evaluation: Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}"
.format(top1=top1, top5=top5))
return top1.avg, top5.avg
def adjust_learning_rate(optimizer, epoch, initial_learning_rate, decay, freq):
"""Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
lr = initial_learning_rate * (decay ** ((epoch - 1) // freq))
print(f'=> learning rate is set to {lr}')
for param_group in optimizer.param_groups:
param_group['lr'] = lr
def accuracy(output, target, topk=(1,)):
"""Computes the precision@k for the specified values of k"""
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk:
correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
res.append(correct_k.mul_(100.0 / batch_size))
return res
if __name__ == '__main__':
train()

View file

@ -1,336 +0,0 @@
import os
import re
import json
from functools import reduce
from datetime import datetime
from collections import OrderedDict
import click
import torch
import progressbar
from torch import nn, optim
from torch.autograd import Variable
from torchvision import transforms
from torchvision import datasets as dset
from benchmark.models import resnet, densenet
MEAN = (0.4914, 0.4822, 0.4465)
STD = (0.2023, 0.1994, 0.2010)
MODELS = {
# "Deep Residual Learning for Image Recognition"
'resnet20': resnet.ResNet20,
'resnet32': resnet.ResNet32,
'resnet44': resnet.ResNet44,
'resnet56': resnet.ResNet56,
'resnet110': resnet.ResNet110,
'resnet1202': resnet.ResNet1202,
# "Wide Residual Networks"
'wrn-40-4': resnet.WRN_40_4,
'wrn-16-8': resnet.WRN_16_8,
'wrn-28-10': resnet.WRN_28_10,
# Based on "Identity Mappings in Deep Residual Networks"
'preact20': resnet.PreActResNet20,
'preact56': resnet.PreActResNet56,
'preact164-basic': resnet.PreActResNet164Basic,
# "Identity Mappings in Deep Residual Networks"
'preact110': resnet.PreActResNet110,
'preact164': resnet.PreActResNet164,
'preact1001': resnet.PreActResNet1001,
# "Aggregated Residual Transformations for Deep Neural Networks"
'resnext29-8-64': lambda _=None: resnet.ResNeXt29(8, 64),
'resnext29-16-64': lambda _=None: resnet.ResNeXt29(16, 64),
# "Densely Connected Convolutional Networks"
'densenetbc100': densenet.DenseNetBC100,
'densenetbc250': densenet.DenseNetBC250,
'densenetbc190': densenet.DenseNetBC190,
# Kuangliu/pytorch-cifar
'resnet18': resnet.ResNet18,
'resnet50': resnet.ResNet50,
'resnet101': resnet.ResNet101,
'resnet152': resnet.ResNet152,
}
def count_parameters(model):
c = map(lambda p: reduce(lambda x, y: x * y, p.size()), model.parameters())
return sum(c)
def correct(outputs, targets, top=(1, )):
_, predictions = outputs.topk(max(top), dim=1, largest=True, sorted=True)
targets = targets.view(-1, 1).expand_as(predictions)
corrects = predictions.eq(targets).cpu().cumsum(1).sum(0)
tops = list(map(lambda k: corrects.data[0][k - 1], top))
return tops
def save_result(result, path):
write_heading = not os.path.exists(path)
with open(path, mode='a') as out:
if write_heading:
out.write(",".join([str(k) for k, v in result.items()]) + '\n')
out.write(",".join([str(v) for k, v in result.items()]) + '\n')
def run(epoch, model, loader, criterion=None, optimizer=None, top=(1, 5),
use_cuda=False, tracking=None, max_value=None, train=True):
assert criterion is not None or not train, 'Need criterion to train model'
assert optimizer is not None or not train, 'Need optimizer to train model'
max_value = max_value or progressbar.UnknownLength
bar = progressbar.ProgressBar(max_value=max_value)
total = 0
correct_counts = {}
if train:
model.train()
else:
model.eval()
start = datetime.now()
for batch_index, (inputs, targets) in enumerate(loader):
inputs = Variable(inputs, requires_grad=False, volatile=not train)
targets = Variable(targets, requires_grad=False, volatile=not train)
if use_cuda:
inputs = inputs.cuda()
targets = targets.cuda()
outputs = model(inputs)
if train:
loss = criterion(outputs, targets)
optimizer.zero_grad()
loss.backward()
optimizer.step()
_, predictions = torch.max(outputs.data, 1)
batch_size = targets.size(0)
top_correct = correct(outputs, targets, top=top)
total += batch_size
for k, count in zip(top, top_correct):
correct_counts[k] = correct_counts.get(k, 0) + count
end = datetime.now()
if tracking is not None:
result = OrderedDict()
result['timestamp'] = datetime.now()
result['batch_duration'] = end - start
result['epoch'] = epoch
result['batch'] = batch_index
result['batch_size'] = batch_size
for i, k in enumerate(top):
result['top{}_correct'.format(k)] = top_correct[i]
if train:
result['loss'] = loss.data[0]
save_result(result, tracking)
bar.update(batch_index + 1)
start = datetime.now()
print()
if train:
message = 'Training accuracy of'
else:
message = 'Test accuracy of'
for k in top:
accuracy = correct_counts[k] / total
message += ' top-{}: {}'.format(k, accuracy)
print(message)
return (1. * correct_counts[top[0]]) / total, batch_index + 1
def save(model, directory, epoch, accuracy, use_cuda=False, filename=None):
state = {
'model': model.module if use_cuda else model,
'epoch': epoch,
'accuracy': accuracy
}
filename = filename or 'checkpoint_{}.t7'.format(epoch)
torch.save(state, os.path.join(directory, filename))
def save_config(config, run_dir):
path = os.path.join(run_dir, "config_{}.json".format(config['timestamp']))
with open(path, 'w') as config_file:
json.dump(config, config_file)
config_file.write('\n')
def load(path):
assert os.path.exists(path)
state = torch.load(path)
model = state['model']
epoch = state['epoch']
accuracy = state['accuracy']
return model, epoch, accuracy
def latest_file(model):
restore = f'./run/{model}'
timestamps = sorted(os.listdir(restore))
assert len(timestamps) > 0
run_dir = os.path.join(restore, timestamps[-1])
files = os.listdir(run_dir)
max_checkpoint = -1
for filename in files:
if re.search('checkpoint_\d+.t7', filename):
num = int(re.search('\d+', filename).group())
if num > max_checkpoint:
max_checkpoint = num
max_checkpoint_file = filename
assert max_checkpoint != -1
return os.path.join(run_dir, max_checkpoint_file)
@click.command()
@click.option('--dataset-dir', default='./data/cifar10')
@click.option('--checkpoint', '-c', type=click.Choice(['best', 'all', 'last']),
default='last')
@click.option('--restore', '-r')
@click.option('--tracking/--no-tracking', default=True)
@click.option('--cuda/--no-cuda', default=True)
@click.option('--epochs', '-e', default=200)
@click.option('--batch-size', '-b', default=32)
@click.option('--learning-rate', '-l', default=1e-3)
@click.option('--sgd', 'optimizer', flag_value='sgd')
@click.option('--adam', 'optimizer', flag_value='adam', default=True)
@click.option('--augmentation/--no-augmentation', default=True)
@click.option('--num-workers', type=int)
@click.option('--weight-decay', default=5e-4)
@click.option('--model', '-m', type=click.Choice(MODELS.keys()),
default='resnet20')
def main(dataset_dir, checkpoint, restore, tracking, cuda, epochs,
batch_size, learning_rate, optimizer, augmentation, num_workers,
weight_decay, model):
timestamp = "{:.0f}".format(datetime.utcnow().timestamp())
config = {k: v for k, v in locals().items()}
use_cuda = cuda and torch.cuda.is_available()
if use_cuda:
num_workers = num_workers or torch.cuda.device_count()
else:
num_workers = num_workers or 1
print(f"using {num_workers} workers for data loading")
print("Preparing data:")
if augmentation:
transform_train = [
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip()
]
else:
transform_train = []
transform_train = transforms.Compose(transform_train + [
transforms.ToTensor(),
transforms.Normalize(MEAN, STD),
])
trainset = dset.CIFAR10(root=dataset_dir, train=True, download=True,
transform=transform_train)
train_loader = torch.utils.data.DataLoader(
trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers,
pin_memory=use_cuda)
transform_test = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize(MEAN, STD),
])
testset = dset.CIFAR10(root=dataset_dir, train=False, download=True,
transform=transform_test)
test_loader = torch.utils.data.DataLoader(
testset, batch_size=batch_size, shuffle=False, num_workers=num_workers,
pin_memory=use_cuda)
if restore is not None:
if restore == 'latest':
restore = latest_file(model)
print(f'Restoring model from {restore}')
model, start_epoch, best_accuracy = load(restore)
start_epoch += 1
print('Starting accuracy is {}'.format(best_accuracy))
run_dir = os.path.split(restore)[0]
else:
print(f'Building {model} model')
best_accuracy = -1
start_epoch = 1
run_dir = f"./run/{model}/{timestamp}"
model = MODELS[model]()
if not os.path.exists(run_dir):
os.makedirs(run_dir)
save_config(config, run_dir)
print(model)
print("{} parameters".format(count_parameters(model)))
print(f"Run directory set to {run_dir}")
# Save model text description
with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
file.write(str(model))
if tracking:
train_results_file = os.path.join(run_dir, 'train_results.csv')
test_results_file = os.path.join(run_dir, 'test_results.csv')
else:
train_results_file = None
test_results_file = None
if use_cuda:
print('Copying model to GPU')
model.cuda()
model = torch.nn.DataParallel(
model, device_ids=range(torch.cuda.device_count()))
criterion = nn.CrossEntropyLoss()
# Other parameters?
if optimizer == 'adam':
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
elif optimizer == 'sgd':
optimizer = optim.SGD(model.parameters(), lr=learning_rate,
momentum=0.9,
weight_decay=weight_decay)
else:
raise NotImplementedError("Unknown optimizer: {}".format(optimizer))
train_max_value = None
test_max_value = None
end_epoch = start_epoch + epochs
for epoch in range(start_epoch, end_epoch):
print('Epoch {} of {}'.format(epoch, end_epoch - 1))
train_acc, train_max_value = run(epoch, model, train_loader, criterion,
optimizer, use_cuda=use_cuda,
tracking=train_results_file,
max_value=train_max_value, train=True)
test_acc, test_max_value = run(epoch, model, test_loader,
use_cuda=use_cuda,
tracking=test_results_file, train=False)
if test_acc > best_accuracy:
print('New best model!')
save(model, run_dir, epoch, test_acc, use_cuda=use_cuda,
filename='checkpoint_best_model.t7')
best_accuracy = test_acc
last_epoch = epoch == (end_epoch - 1)
if checkpoint == 'all' or (checkpoint == 'last' and last_epoch):
save(model, run_dir, epoch, test_acc, use_cuda=use_cuda)
if __name__ == '__main__':
main()

View file

@ -0,0 +1,61 @@
import os
import json
import re
from functools import reduce
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self):
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def count_parameters(model):
c = map(lambda p: reduce(lambda x, y: x * y, p.size()), model.parameters())
return sum(c)
def latest_file(model):
restore = f'./run/{model}'
timestamps = sorted(os.listdir(restore))
assert len(timestamps) > 0
run_dir = os.path.join(restore, timestamps[-1])
files = os.listdir(run_dir)
max_checkpoint = -1
for filename in files:
if re.search('checkpoint_\d+.t7', filename):
num = int(re.search('\d+', filename).group())
if num > max_checkpoint:
max_checkpoint = num
max_checkpoint_file = filename
assert max_checkpoint != -1
return os.path.join(run_dir, max_checkpoint_file)
def save_result(result, path):
write_heading = not os.path.exists(path)
with open(path, mode='a') as out:
if write_heading:
out.write(",".join([str(k) for k, v in result.items()]) + '\n')
out.write(",".join([str(v) for k, v in result.items()]) + '\n')
def save_config(config, run_dir):
path = os.path.join(run_dir, "config_{}.json".format(config['timestamp']))
with open(path, 'w') as config_file:
json.dump(config, config_file)
config_file.write('\n')

View file

@ -0,0 +1,384 @@
import math
import numpy as np
import torch
# eps for numerical stability
eps = 1e-15
class YFOptimizer(object):
def __init__(self, var_list, lr=0.1, mu=0.0, clip_thresh=None, weight_decay=0.0,
beta=0.999, curv_win_width=20, zero_debias=True, sparsity_debias=True, delta_mu=0.0,
auto_clip_fac=None, force_non_inc_step=False):
'''
clip thresh is the threshold value on ||lr * gradient||
delta_mu can be a placeholder/variable/python scalar, used for additional
momentum in situations such as asynchronous-parallel training. The default is 0.0
for basic usage of the optimizer.
Args:
lr: python scalar. The initial value of learning rate, we use 1.0 in our paper.
mu: python scalar. The initial value of momentum, we use 0.0 in our paper.
clip_thresh: python scalar. The manually-set clipping threshold for torch.nn.utils.clip_grad_norm.
if None, the automatic clipping can be carried out. The automatic clipping
feature is parameterized by argument auto_clip_fac. The auto clip feature
can be switched off with auto_clip_fac = None
beta: python scalar. The smoothing parameter for estimations.
sparsity_debias: gradient norm and curvature are biased to larger values when
calculated with sparse gradient. This is useful when the model is very sparse,
e.g. an LSTM with word embeddings. For a non-sparse CNN, turning it off can slightly speed up training.
delta_mu: for extensions. Not necessary in the basic use.
force_non_inc_step: in some very rare cases, it is necessary to force ||lr * gradient||
not to increase dramatically, for stability, after some iterations.
In practice, if turned on, we enforce lr * sqrt(smoothed ||grad||^2)
to be less than 2x of the minimal value of historical value on smoothed || lr * grad ||.
This feature is turned off by default.
Other features:
If you want to manually control the learning rates, self.lr_factor is
an interface to the outside; it is a multiplier for the internal learning rate
in YellowFin. It is helpful when you want to do additional hand tuning
or some decaying scheme to the tuned learning rate in YellowFin.
Example on using lr_factor can be found here:
https://github.com/JianGoForIt/YellowFin_Pytorch/blob/master/pytorch-cifar/main.py#L109
'''
self._lr = lr
self._mu = mu
# we convert var_list from generator to list so that
# it can be used for multiple times
self._var_list = list(var_list)
self._clip_thresh = clip_thresh
self._auto_clip_fac = auto_clip_fac
self._beta = beta
self._curv_win_width = curv_win_width
self._zero_debias = zero_debias
self._sparsity_debias = sparsity_debias
self._force_non_inc_step = force_non_inc_step
self._optimizer = torch.optim.SGD(self._var_list, lr=self._lr,
momentum=self._mu, weight_decay=weight_decay)
self._iter = 0
# global states are the statistics
self._global_state = {}
# for decaying learning rate and etc.
self._lr_factor = 1.0
def state_dict(self):
# for checkpoint saving
sgd_state_dict = self._optimizer.state_dict()
global_state = self._global_state
lr_factor = self._lr_factor
iter = self._iter
lr = self._lr
mu = self._mu
clip_thresh = self._clip_thresh
beta = self._beta
curv_win_width = self._curv_win_width
zero_debias = self._zero_debias
h_min = self._h_min
h_max = self._h_max
return {
"sgd_state_dict": sgd_state_dict,
"global_state": global_state,
"lr_factor": lr_factor,
"iter": iter,
"lr": lr,
"mu": mu,
"clip_thresh": clip_thresh,
"beta": beta,
"curv_win_width": curv_win_width,
"zero_debias": zero_debias,
"h_min": h_min,
"h_max": h_max
}
def load_state_dict(self, state_dict):
# for checkpoint saving
self._optimizer.load_state_dict(state_dict['sgd_state_dict'])
self._global_state = state_dict['global_state']
self._lr_factor = state_dict['lr_factor']
self._iter = state_dict['iter']
self._lr = state_dict['lr']
self._mu = state_dict['mu']
self._clip_thresh = state_dict['clip_thresh']
self._beta = state_dict['beta']
self._curv_win_width = state_dict['curv_win_width']
self._zero_debias = state_dict['zero_debias']
self._h_min = state_dict["h_min"]
self._h_max = state_dict["h_max"]
return
def set_lr_factor(self, factor):
self._lr_factor = factor
return
def get_lr_factor(self):
return self._lr_factor
def zero_grad(self):
self._optimizer.zero_grad()
return
def zero_debias_factor(self):
return 1.0 - self._beta ** (self._iter + 1)
def zero_debias_factor_delay(self, delay):
# for exponentially averaged stat which starts at non-zero iter
return 1.0 - self._beta ** (self._iter - delay + 1)
def curvature_range(self):
global_state = self._global_state
if self._iter == 0:
global_state["curv_win"] = torch.FloatTensor(self._curv_win_width, 1).zero_()
curv_win = global_state["curv_win"]
grad_norm_squared = self._global_state["grad_norm_squared"]
curv_win[self._iter % self._curv_win_width] = np.log(grad_norm_squared + eps)
valid_end = min(self._curv_win_width, self._iter + 1)
# we use running average over log scale, accelerating
# h_max / min in the beginning to follow the varying trend of curvature.
beta = self._beta
if self._iter == 0:
global_state["h_min_avg"] = 0.0
global_state["h_max_avg"] = 0.0
self._h_min = 0.0
self._h_max = 0.0
global_state["h_min_avg"] = \
global_state["h_min_avg"] * beta + (1 - beta) * torch.min(curv_win[:valid_end] )
global_state["h_max_avg"] = \
global_state["h_max_avg"] * beta + (1 - beta) * torch.max(curv_win[:valid_end] )
if self._zero_debias:
debias_factor = self.zero_debias_factor()
self._h_min = np.exp(global_state["h_min_avg"] / debias_factor)
self._h_max = np.exp(global_state["h_max_avg"] / debias_factor)
else:
self._h_min = np.exp(global_state["h_min_avg"] )
self._h_max = np.exp(global_state["h_max_avg"] )
if self._sparsity_debias:
self._h_min *= self._sparsity_avg
self._h_max *= self._sparsity_avg
return
def grad_variance(self):
global_state = self._global_state
beta = self._beta
self._grad_var = np.array(0.0, dtype=np.float32)
for group in self._optimizer.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data
state = self._optimizer.state[p]
if self._iter == 0:
state["grad_avg"] = grad.new().resize_as_(grad).zero_()
state["grad_avg_squared"] = 0.0
state["grad_avg"].mul_(beta).add_(1 - beta, grad)
self._grad_var += torch.sum(state["grad_avg"] * state["grad_avg"] )
if self._zero_debias:
debias_factor = self.zero_debias_factor()
else:
debias_factor = 1.0
self._grad_var /= -(debias_factor**2)
self._grad_var += global_state['grad_norm_squared_avg'] / debias_factor
# in case of negative variance: the two terms use different debias factors
self._grad_var = max(self._grad_var, eps)
if self._sparsity_debias:
self._grad_var *= self._sparsity_avg
return
def dist_to_opt(self):
global_state = self._global_state
beta = self._beta
if self._iter == 0:
global_state["grad_norm_avg"] = 0.0
global_state["dist_to_opt_avg"] = 0.0
global_state["grad_norm_avg"] = \
global_state["grad_norm_avg"] * beta + (1 - beta) * math.sqrt(global_state["grad_norm_squared"] )
global_state["dist_to_opt_avg"] = \
global_state["dist_to_opt_avg"] * beta \
+ (1 - beta) * global_state["grad_norm_avg"] / (global_state['grad_norm_squared_avg'] + eps)
if self._zero_debias:
debias_factor = self.zero_debias_factor()
self._dist_to_opt = global_state["dist_to_opt_avg"] / debias_factor
else:
self._dist_to_opt = global_state["dist_to_opt_avg"]
if self._sparsity_debias:
self._dist_to_opt /= (np.sqrt(self._sparsity_avg) + eps)
return
def grad_sparsity(self):
global_state = self._global_state
if self._iter == 0:
global_state["sparsity_avg"] = 0.0
non_zero_cnt = 0.0
all_entry_cnt = 0.0
for group in self._optimizer.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data
grad_non_zero = grad.nonzero()
if grad_non_zero.dim() > 0:
non_zero_cnt += grad_non_zero.size()[0]
all_entry_cnt += torch.numel(grad)
beta = self._beta
global_state["sparsity_avg"] = beta * global_state["sparsity_avg"] \
+ (1 - beta) * non_zero_cnt / float(all_entry_cnt)
self._sparsity_avg = \
global_state["sparsity_avg"] / self.zero_debias_factor()
return
def lr_grad_norm_avg(self):
# this is for enforcing lr * grad_norm not
# increasing dramatically in case of instability.
# Not necessary for basic use.
global_state = self._global_state
beta = self._beta
if "lr_grad_norm_avg" not in global_state:
global_state['grad_norm_squared_avg_log'] = 0.0
global_state['grad_norm_squared_avg_log'] = \
global_state['grad_norm_squared_avg_log'] * beta \
+ (1 - beta) * np.log(global_state['grad_norm_squared'] + eps)
if "lr_grad_norm_avg" not in global_state:
global_state["lr_grad_norm_avg"] = \
0.0 * beta + (1 - beta) * np.log(self._lr * np.sqrt(global_state['grad_norm_squared'] ) + eps)
# we monitor the minimal smoothed ||lr * grad||
global_state["lr_grad_norm_avg_min"] = \
np.exp(global_state["lr_grad_norm_avg"] / self.zero_debias_factor() )
else:
global_state["lr_grad_norm_avg"] = global_state["lr_grad_norm_avg"] * beta \
+ (1 - beta) * np.log(self._lr * np.sqrt(global_state['grad_norm_squared'] ) + eps)
global_state["lr_grad_norm_avg_min"] = \
min(global_state["lr_grad_norm_avg_min"],
np.exp(global_state["lr_grad_norm_avg"] / self.zero_debias_factor() ) )
def after_apply(self):
# compute running average of gradient and norm of gradient
beta = self._beta
global_state = self._global_state
if self._iter == 0:
global_state["grad_norm_squared_avg"] = 0.0
global_state["grad_norm_squared"] = 0.0
for group in self._optimizer.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data
global_state['grad_norm_squared'] += torch.sum(grad * grad)
global_state['grad_norm_squared_avg'] = \
global_state['grad_norm_squared_avg'] * beta + (1 - beta) * global_state['grad_norm_squared']
if self._sparsity_debias:
self.grad_sparsity()
self.curvature_range()
self.grad_variance()
self.dist_to_opt()
if self._iter > 0:
self.get_mu()
self.get_lr()
self._lr = beta * self._lr + (1 - beta) * self._lr_t
self._mu = beta * self._mu + (1 - beta) * self._mu_t
return
def get_lr(self):
self._lr_t = (1.0 - math.sqrt(self._mu_t) )**2 / (self._h_min + eps)
return
def get_cubic_root(self):
# We have the equation x^2 D^2 + (1-x)^4 * C / h_min^2
# where x = sqrt(mu).
# We substitute x, which is sqrt(mu), with x = y + 1.
# It gives y^3 + py = q
# where p = (D^2 h_min^2)/(2*C) and q = -p.
# We use Vieta's substitution to compute the root.
# There is only one real solution y (which is in [0, 1] ).
# http://mathworld.wolfram.com/VietasSubstitution.html
# eps in the numerator is to prevent momentum = 1 in case of zero gradient
p = (self._dist_to_opt + eps)**2 * (self._h_min + eps)**2 / 2 / (self._grad_var + eps)
w3 = (-math.sqrt(p**2 + 4.0 / 27.0 * p**3) - p) / 2.0
w = math.copysign(1.0, w3) * math.pow(math.fabs(w3), 1.0/3.0)
y = w - p / 3.0 / (w + eps)
x = y + 1
return x
def get_mu(self):
root = self.get_cubic_root()
dr = self._h_max / self._h_min
self._mu_t = max(root**2, ( (np.sqrt(dr) - 1) / (np.sqrt(dr) + 1) )**2 )
return
def update_hyper_param(self):
for group in self._optimizer.param_groups:
group['momentum'] = self._mu
if self._force_non_inc_step == False:
group['lr'] = self._lr * self._lr_factor
elif self._iter > self._curv_win_width:
# force to guarantee lr * grad_norm not increasing dramatically.
# Not necessary for basic use. Please refer to the comments
# in YFOptimizer.__init__ for more details
self.lr_grad_norm_avg()
debias_factor = self.zero_debias_factor()
group['lr'] = min(self._lr * self._lr_factor,
2.0 * self._global_state["lr_grad_norm_avg_min"] \
/ np.sqrt(np.exp(self._global_state['grad_norm_squared_avg_log'] / debias_factor) ) )
return
def auto_clip_thresh(self):
# Heuristic to automatically prevent sudden exploding gradient
# Not necessary for basic use.
return math.sqrt(self._h_max) * self._auto_clip_fac
def step(self):
# add weight decay
for group in self._optimizer.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data
if group['weight_decay'] != 0:
grad = grad.add(group['weight_decay'], p.data)
if self._clip_thresh != None:
torch.nn.utils.clip_grad_norm(self._var_list, self._clip_thresh)
elif (self._iter != 0 and self._auto_clip_fac != None):
# do not clip the first iteration
torch.nn.utils.clip_grad_norm(self._var_list, self.auto_clip_thresh() )
# apply update
self._optimizer.step()
# after apply
self.after_apply()
# update learning rate and momentum
self.update_hyper_param()
self._iter += 1
return

View file

@ -9,12 +9,13 @@ setup(
packages=['benchmark'],
entry_points={
'console_scripts': [
'bench = benchmark.train:main'
'cifar10 = benchmark.cifar10.__main__:cli',
'imagenet = benchmark.imagenet.__main__:cli'
]
},
install_requires=[
'tqdm',
'torchvision',
'click',
'progressbar2'
]
)