Update pytorch benchmark code with new command line interface

parent bf53b6a029
commit 339261c19f

14 changed files with 1476 additions and 355 deletions
pytorch/CIFAR10/.gitignore  (vendored, new file, +6)

@@ -0,0 +1,6 @@
*.pyc
__pycache__/
.eggs/
*.egg-info/
.cache
data/
pytorch/CIFAR10/benchmark/cifar10/__init__.py  (new, empty file)
pytorch/CIFAR10/benchmark/cifar10/__main__.py  (new file, +17)

@@ -0,0 +1,17 @@
import click

from benchmark.cifar10.train import train
from benchmark.cifar10.infer import infer


@click.group()
def cli():
    pass


cli.add_command(train, name='train')
cli.add_command(infer, name='infer')


if __name__ == '__main__':
    cli()
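With this `__main__.py`, the CIFAR-10 benchmark is meant to be driven as `python -m benchmark.cifar10 train ...` or `python -m benchmark.cifar10 infer ...`. A minimal smoke test of the command group, sketched with click's test runner (the import path assumes the package layout introduced by this commit):

# Sketch, not part of the diff: exercise the new click group without training.
from click.testing import CliRunner

from benchmark.cifar10.__main__ import cli

runner = CliRunner()
result = runner.invoke(cli, ['--help'])   # lists the registered subcommands
assert result.exit_code == 0
assert 'train' in result.output and 'infer' in result.output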
@@ -10,15 +10,20 @@ from torch.autograd import Variable
from torchvision import transforms
from torchvision import datasets

from benchmark.train import load, MEAN, STD, save_result, MODELS
from benchmark.utils import save_result
from benchmark.cifar10.train import MEAN, STD, MODELS


class PyTorchEngine:
    def __init__(self, filename, use_cuda=False, name=None):
        self.filename = filename
    def __init__(self, path, arch, use_cuda=False):
        self.path = path
        self.use_cuda = use_cuda
        self.name = name
        model, epoch, accuracy = load(self.filename)
        self.arch = arch
        model = MODELS[self.arch]()
        restored_state = torch.load(path)
        model = model.load_state_dict(restored_state['model'])
        accuracy = restored_state['accuracy']
        epoch = restored_state['epoch'] + 1

        if self.use_cuda:
            self.model = model.cuda()

@@ -66,13 +71,13 @@ def infer_cifar10(dataset, engine, start=1, end=128, repeat=100, log2=True,

    result = OrderedDict()
    result['nodename'] = os.uname().nodename
    result['model'] = engine.name
    result['model'] = engine.arch
    result['use_cuda'] = engine.use_cuda
    result['batch_size'] = batch_size
    result['mean'] = np.mean(times)
    result['std'] = np.std(times)
    result['throughput'] = batch_size / np.mean(times)
    result['filename'] = engine.filename
    result['path'] = engine.path
    if output is not None:
        save_result(result, output)


@@ -122,13 +127,13 @@ def infer(dataset_dir, run_dir, output_file, start, end, repeat, log2,

    if cpu:
        print('With CPU:')
        engine = PyTorchEngine(path, use_cuda=False, name=model)
        engine = PyTorchEngine(path, use_cuda=False, arch=model)
        infer_cifar10(testset, engine, start=start, end=end, log2=log2,
                      repeat=repeat, output=output_path)

    if gpu and torch.cuda.is_available():
        print('With GPU:')
        engine = PyTorchEngine(path, use_cuda=True, name=model)
        engine = PyTorchEngine(path, use_cuda=True, arch=model)
        # Warmup
        time_batch_size(testset, 1, engine.pred, engine.use_cuda, repeat=1)
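The reworked PyTorchEngine consumes the checkpoint dictionary written by benchmark.cifar10.train (keys 'arch', 'model', 'accuracy', 'epoch') instead of a pickled module. A sketch of that contract, with a hypothetical checkpoint path; note that load_state_dict does not return the module, so the `model = model.load_state_dict(...)` line in the hunk above discards the network it just restored:

# Sketch, not part of the diff: the checkpoint layout assumed by the new engine.
import torch

from benchmark.cifar10.train import MODELS

state = torch.load('./run/resnet20/1500000000/checkpoint_best_model.t7')  # hypothetical path
model = MODELS[state['arch']]()
model.load_state_dict(state['model'])     # restores weights in place
print(state['epoch'], state['accuracy'])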
@@ -1,6 +1,7 @@
import math
from functools import partial

import torch
from torch import nn
from torch.nn import functional as F


@@ -40,6 +41,50 @@ class BasicBlock(nn.Module):
        return outputs


class StochasticBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, survival_rate=1):
        super().__init__()
        self.survival_rate = survival_rate
        self.conv1 = nn.Conv2d(inplanes, planes, 3, stride=stride, padding=1,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(planes)

        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.increasing = inplanes != (planes * self.expansion)
        if self.increasing:
            assert ((1. * planes * self.expansion) / inplanes) == 2
        if stride != 1:
            self.shortcut = nn.Sequential(nn.AvgPool2d(stride))
        else:
            self.shortcut = nn.Sequential()

    def forward(self, inputs):
        shortcut = self.shortcut(inputs)
        if self.increasing:
            shortcut = torch.cat([shortcut] + [shortcut.mul(0)], 1)

        if not self.training or torch.rand(1)[0] <= self.survival_rate:
            H = self.conv1(inputs)
            H = self.bn1(H)
            H = F.relu(H)

            H = self.conv2(H)
            H = self.bn2(H)

            if self.training:
                H /= self.survival_rate
            H += shortcut
        else:
            H = shortcut
        outputs = F.relu(H)

        return outputs


class PreActBlock(nn.Module):
    expansion = 1


@@ -267,6 +312,30 @@ class ResNet(nn.Module):
        return outputs


class StochasticResNet(ResNet):

    def __init__(self, Block, layers, filters, num_classes=10, inplanes=None,
                 min_survival_rate=1.0, decay='linear'):
        super().__init__(Block, layers, filters,
                         num_classes=num_classes,
                         inplanes=inplanes)
        L = sum(layers)
        l = 1
        for section_index in range(self.num_sections):
            section = getattr(self, f'section_{section_index}')
            for name, module in section.named_children():
                if decay == 'linear':
                    survival_rate = 1 - ((l / L) * (1 - min_survival_rate))
                elif decay == 'uniform':
                    survival_rate = min_survival_rate
                else:
                    raise NotImplementedError(
                        f"{decay} decay has not been implemented.")
                module.survival_rate = survival_rate
                l += 1
        assert (l - 1) == L


# From "Deep Residual Learning for Image Recognition"
def ResNet20():
    return ResNet(BasicBlock, layers=[3] * 3, filters=[16, 32, 64])

@@ -292,7 +361,28 @@ def ResNet1202():
    return ResNet(BasicBlock, layers=[200] * 3, filters=[16, 32, 64])


# Based on but not it "Identity Mappings in Deep Residual Networks"
# From "Identity Mappings in Deep Residual Networks"
def PreActResNet110():
    return ResNet(PreActBlock, layers=[18] * 3, filters=[16, 32, 64])


def PreActResNet164():
    return ResNet(PreActBottleneck, layers=[18] * 3, filters=[16, 32, 64])


def PreActResNet1001():
    return ResNet(PreActBottleneck, layers=[111] * 3, filters=[16, 32, 64])


# Based on but not in "Identity Mappings in Deep Residual Networks"
def PreActResNet8():
    return ResNet(PreActBlock, layers=[1] * 3, filters=[16, 32, 64])


def PreActResNet14():
    return ResNet(PreActBlock, layers=[2] * 3, filters=[16, 32, 64])


def PreActResNet20():
    return ResNet(PreActBlock, layers=[3] * 3, filters=[16, 32, 64])


@@ -305,17 +395,30 @@ def PreActResNet164Basic():
    return ResNet(PreActBlock, layers=[27] * 3, filters=[16, 32, 64])


# From "Identity Mappings in Deep Residual Networks"
def PreActResNet110():
    return ResNet(PreActBlock, layers=[18] * 3, filters=[16, 32, 64])
# From "Deep Networks with Stochastic Depth"
def StochasticResNet110():
    return StochasticResNet(StochasticBlock, layers=[18] * 3,
                            filters=[16, 32, 64], min_survival_rate=0.5,
                            decay='linear')


def PreActResNet164():
    return ResNet(PreActBottleneck, layers=[18] * 3, filters=[16, 32, 64])
def StochasticResNet1202():
    return StochasticResNet(StochasticBlock, layers=[200] * 3,
                            filters=[16, 32, 64], min_survival_rate=0.5,
                            decay='linear')


def PreActResNet1001():
    return ResNet(PreActBottleneck, layers=[111] * 3, filters=[16, 32, 64])
# Based on but not in "Deep Networks for Stochastic Depth"
def StochasticResNet56():
    return StochasticResNet(StochasticBlock, layers=[9] * 3,
                            filters=[16, 32, 64], min_survival_rate=0.5,
                            decay='linear')


def StochasticResNet56_08():
    return StochasticResNet(StochasticBlock, layers=[9] * 3,
                            filters=[16, 32, 64], min_survival_rate=0.8,
                            decay='linear')


# From "Wide Residual Networks"
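For reference, the linear decay used by StochasticResNet assigns survival rates from nearly 1 at the first residual block down to min_survival_rate at the last. A quick check (not part of the diff) for the 54-block configuration used by StochasticResNet110 above (layers=[18] * 3, min_survival_rate=0.5):

layers = [18] * 3
min_survival_rate = 0.5
L = sum(layers)                                                      # 54 residual blocks
rates = [1 - (l / L) * (1 - min_survival_rate) for l in range(1, L + 1)]
print(round(rates[0], 4), rates[-1])                                 # 0.9907 0.5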
pytorch/CIFAR10/benchmark/cifar10/results.py  (new file, +153)

@@ -0,0 +1,153 @@
import os
import json

import pandas as pd

from benchmark.cifar10.train import MODELS
from benchmark.utils import count_parameters


MODEL_SIZES = {key: count_parameters(MODELS[key]()) for key in MODELS.keys()}


def single_run_acc(df):
    df = df.copy()
    df['duration'] = (df['timestamp'] - df['prev_timestamp']).apply(lambda x: x.total_seconds())
    df['batch_duration'] = df['batch_duration'].apply(lambda x: x.total_seconds())

    tmp = df.loc[:, ['epoch', 'batch_size', 'ncorrect', 'duration', 'batch_duration']].groupby('epoch').sum()
    tmp['accuracy'] = tmp['ncorrect'] / tmp['batch_size']
    tmp['throughput'] = tmp['batch_size'] / tmp['duration']
    tmp['_throughput'] = tmp['batch_size'] / tmp['batch_duration']
    tmp['elapsed'] = df.groupby('epoch')['elapsed'].agg('max')
    tmp.reset_index(inplace=True)

    return tmp


def load_file(file, start_timestamp=None):
    df = pd.read_csv(file)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['batch_duration'] = pd.to_timedelta(df['batch_duration'])
    df['ncorrect'] = df['top1_correct']
    start_timestamp = start_timestamp or df['timestamp'].iloc[0]
    df['elapsed'] = df['timestamp'] - start_timestamp
    df['batch_accuracy'] = df['ncorrect'] / df['batch_size']
    return df


def load_data(directory, verbose=True):
    train_file = os.path.join(directory, 'train_results.csv')
    train = load_file(train_file)
    start_timestamp = train['timestamp'].iloc[0]

    if verbose:
        print(train_file)
        print("Training results shape: {}".format(train.shape))

    try:
        test_file = os.path.join(directory, 'test_results.csv')
        test = load_file(test_file, start_timestamp=start_timestamp)
    except FileNotFoundError:
        test_file = os.path.join(directory, 'valid_results.csv')
        test = load_file(test_file, start_timestamp=start_timestamp)

    if verbose:
        print(test_file)
        print('Test results shape: {}'.format(test.shape))

    train['mode'] = 'train'
    test['mode'] = 'test'

    combined = pd.concat([train, test], ignore_index=True).sort_values(by=['timestamp'])
    combined['prev_timestamp'] = combined['timestamp'].shift(1)
    combined.loc[0, 'prev_timestamp'] = combined.loc[0, 'timestamp'] - combined.loc[0, 'batch_duration']
    train = combined[combined['mode'] == 'train'].copy()
    test = combined[combined['mode'] == 'test'].copy()

    return single_run_acc(train), single_run_acc(test)


def load_multiple(directory, timestamps=None, verbose=False):
    timestamps = timestamps or os.listdir(directory)
    train_sets = []
    test_sets = []
    for timestamp in sorted(timestamps):
        _dir = os.path.join(directory, timestamp)
        train, test = load_data(_dir, verbose=verbose)
        if verbose:
            print()
        train['run'] = _dir
        test['run'] = _dir
        train['job_start'] = timestamp
        test['job_start'] = timestamp
        train_sets.append(train)
        test_sets.append(test)

    return pd.concat(train_sets), pd.concat(test_sets)


def load_multiple_models(directory, verbose=False):
    paths = os.listdir(directory)
    models = [path for path in paths if path in MODELS]

    train_sets = []
    test_sets = []
    for model in sorted(models):
        if verbose:
            print(f"Loading {model}")
        _dir = os.path.join(directory, model)
        train, test = load_multiple(_dir, verbose=verbose)
        train['model'] = model
        train['nparameters'] = MODEL_SIZES[model]
        test['model'] = model
        test['nparameters'] = MODEL_SIZES[model]

        train_sets.append(train)
        test_sets.append(test)

    return pd.concat(train_sets), pd.concat(test_sets)


def concat_update(existing, other, repeat=False):
    for key in other.keys():
        if key in existing:
            if existing[key] != other[key] or repeat:
                current = existing[key]
                if isinstance(current, list):
                    current.append(other[key])
                else:
                    existing[key] = [current, other[key]]
        else:
            existing[key] = other[key]


def run_config(run, repeat=False):
    full = {}
    configs = (os.path.join(run, entry.name) for entry in os.scandir(run) if 'config' in entry.name)

    for config in sorted(configs):
        with open(config) as file:
            tmp = json.load(file)

        tmp['path'] = config
        concat_update(full, tmp, repeat=repeat)
    return full


def search_configs(criteria, configs):
    matches = []
    for run, config in configs.items():
        is_match = True
        for key, value in criteria.items():
            try:
                config_value = config[key]
                if config_value != value:
                    is_match = False
            except KeyError:
                is_match = False

        if is_match:
            matches.append(run)

    return matches
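A hypothetical way to use the loaders above, assuming the ./run/<arch>/<timestamp>/ layout that benchmark.cifar10.train writes:

# Sketch, not part of the diff.
from benchmark.cifar10 import results

train_df, test_df = results.load_multiple_models('./run', verbose=True)
print(test_df.groupby('model')['accuracy'].max().sort_values(ascending=False))

# Match runs against the JSON configs they were launched with.
configs = {run: results.run_config(run) for run in test_df['run'].unique()}
print(results.search_configs({'arch': 'resnet20', 'batch_size': 32}, configs))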
pytorch/CIFAR10/benchmark/cifar10/train.py  (new file, +374)

@@ -0,0 +1,374 @@
import os
from datetime import datetime
from collections import OrderedDict

import click
import torch
import tqdm
import numpy as np
from torch import nn, optim
from torch.autograd import Variable
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import transforms
from torchvision import datasets

from benchmark import utils
from benchmark.yellowfin import YFOptimizer
from benchmark.cifar10.models import resnet, densenet

MEAN = (0.4914, 0.4822, 0.4465)
STD = (0.2023, 0.1994, 0.2010)

MODELS = {
    # "Deep Residual Learning for Image Recognition"
    'resnet20': resnet.ResNet20,
    'resnet32': resnet.ResNet32,
    'resnet44': resnet.ResNet44,
    'resnet56': resnet.ResNet56,
    'resnet110': resnet.ResNet110,
    'resnet1202': resnet.ResNet1202,

    # "Wide Residual Networks"
    'wrn-40-4': resnet.WRN_40_4,
    'wrn-16-8': resnet.WRN_16_8,
    'wrn-28-10': resnet.WRN_28_10,

    # Based on "Identity Mappings in Deep Residual Networks"
    'preact8': resnet.PreActResNet8,
    'preact14': resnet.PreActResNet14,
    'preact20': resnet.PreActResNet20,
    'preact56': resnet.PreActResNet56,
    'preact164-basic': resnet.PreActResNet164Basic,

    # "Identity Mappings in Deep Residual Networks"
    'preact110': resnet.PreActResNet110,
    'preact164': resnet.PreActResNet164,
    'preact1001': resnet.PreActResNet1001,

    # Based on "Deep Networks with Stochastic Depth"
    'stochastic56': resnet.StochasticResNet56,
    'stochastic56-08': resnet.StochasticResNet56_08,
    'stochastic110': resnet.StochasticResNet110,
    'stochastic1202': resnet.StochasticResNet1202,

    # "Aggregated Residual Transformations for Deep Neural Networks"
    'resnext29-8-64': lambda _=None: resnet.ResNeXt29(8, 64),
    'resnext29-16-64': lambda _=None: resnet.ResNeXt29(16, 64),

    # "Densely Connected Convolutional Networks"
    'densenetbc100': densenet.DenseNetBC100,
    'densenetbc250': densenet.DenseNetBC250,
    'densenetbc190': densenet.DenseNetBC190,

    # Kuangliu/pytorch-cifar
    'resnet18': resnet.ResNet18,
    'resnet50': resnet.ResNet50,
    'resnet101': resnet.ResNet101,
    'resnet152': resnet.ResNet152,
}


def correct(outputs, targets, top=(1, )):
    _, predictions = outputs.topk(max(top), dim=1, largest=True, sorted=True)
    targets = targets.view(-1, 1).expand_as(predictions)

    corrects = predictions.eq(targets).cpu().int().cumsum(1).sum(0)
    tops = list(map(lambda k: corrects.data[0][k - 1], top))
    return tops


def run(epoch, model, loader, criterion=None, optimizer=None, top=(1, 5),
        use_cuda=False, tracking=None, train=True, half=False):
    accuracies = [utils.AverageMeter() for _ in top]

    assert criterion is not None or not train, 'Need criterion to train model'
    assert optimizer is not None or not train, 'Need optimizer to train model'
    loader = tqdm.tqdm(loader)
    if train:
        model.train()
        losses = utils.AverageMeter()
    else:
        model.eval()

    start = datetime.now()
    for batch_index, (inputs, targets) in enumerate(loader):
        inputs = Variable(inputs, requires_grad=False, volatile=not train)
        targets = Variable(targets, requires_grad=False, volatile=not train)
        batch_size = targets.size(0)
        assert batch_size < 2**32, 'Size is too large! correct will overflow'

        if use_cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()
            if half:
                inputs = inputs.half()

        outputs = model(inputs)

        if train:
            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.update(loss.data[0], batch_size)

        _, predictions = torch.max(outputs.data, 1)
        top_correct = correct(outputs, targets, top=top)
        for i, count in enumerate(top_correct):
            accuracies[i].update(count * (100. / batch_size), batch_size)

        end = datetime.now()
        if tracking is not None:
            result = OrderedDict()
            result['timestamp'] = datetime.now()
            result['batch_duration'] = end - start
            result['epoch'] = epoch
            result['batch'] = batch_index
            result['batch_size'] = batch_size
            for i, k in enumerate(top):
                result['top{}_correct'.format(k)] = top_correct[i]
                result['top{}_accuracy'.format(k)] = accuracies[i].val
            if train:
                result['loss'] = loss.data[0]
            utils.save_result(result, tracking)

        desc = 'Epoch {} {}'.format(epoch, '(Train):' if train else '(Val): ')
        if train:
            desc += ' Loss {loss.val:.4f} ({loss.avg:.4f})'.format(loss=losses)
        for k, acc in zip(top, accuracies):
            desc += ' Prec@{} {acc.val:.3f} ({acc.avg:.3f})'.format(k, acc=acc)
        loader.set_description(desc)
        start = datetime.now()

    if train:
        message = 'Training accuracy of'
    else:
        message = 'Validation accuracy of'
    for i, k in enumerate(top):
        message += ' top-{}: {}'.format(k, accuracies[i].avg)
    print(message)
    return accuracies[0].avg


@click.command()
@click.option('--dataset-dir', default='./data/cifar10')
@click.option('--checkpoint', '-c', type=click.Choice(['best', 'all', 'last']),
              default='last')
@click.option('--restore', '-r')
@click.option('--tracking/--no-tracking', default=True)
@click.option('--cuda/--no-cuda', default=True)
@click.option('--epochs', '-e', default=200)
@click.option('--batch-size', '-b', default=32)
@click.option('--learning-rate', '-l', default=1e-3)
@click.option('--lr-factor', default=1.0, help='only for yellowfin')
@click.option('--momentum', default=0.9)
@click.option('--optimizer', '-o', type=click.Choice(['sgd', 'adam', 'yellowfin']),
              default='sgd')
@click.option('--augmentation/--no-augmentation', default=True)
@click.option('device_ids', '--device', '-d', multiple=True, type=int)
@click.option('--num-workers', type=int)
@click.option('--weight-decay', default=5e-4)
@click.option('--validation', '-v', default=0.0)
@click.option('--evaluate', is_flag=True)
@click.option('--shuffle/--no-shuffle', default=True)
@click.option('--half', is_flag=True)
@click.option('--arch', '-a', type=click.Choice(MODELS.keys()),
              default='resnet20')
def train(dataset_dir, checkpoint, restore, tracking, cuda, epochs,
          batch_size, learning_rate, lr_factor, momentum, optimizer, augmentation,
          device_ids, num_workers, weight_decay, validation, evaluate, shuffle,
          half, arch):
    timestamp = "{:.0f}".format(datetime.utcnow().timestamp())
    local_timestamp = str(datetime.now())
    config = {k: v for k, v in locals().items()}

    use_cuda = cuda and torch.cuda.is_available()

    # create model
    model = MODELS[arch]()

    # create optimizer
    if optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=learning_rate,
                              momentum=momentum,
                              weight_decay=weight_decay)
    elif optimizer == 'yellowfin':
        optimizer = YFOptimizer(model.parameters(), lr=learning_rate,
                                mu=momentum, weight_decay=weight_decay)

    else:
        raise NotImplementedError("Unknown optimizer: {}".format(optimizer))

    if restore is not None:
        if restore == 'latest':
            restore = utils.latest_file(arch)
        print(f'Restoring model from {restore}')
        assert os.path.exists(restore)
        restored_state = torch.load(restore)
        assert restored_state['arch'] == arch

        model.load_state_dict(restored_state['model'])
        optimizer.load_state_dict(restored_state['optimizer'])
        if not isinstance(optimizer, YFOptimizer):
            for group in optimizer.param_groups:
                group['lr'] = learning_rate

        best_accuracy = restored_state['accuracy']
        start_epoch = restored_state['epoch'] + 1
        run_dir = os.path.split(restore)[0]
    else:
        best_accuracy = 0.0
        start_epoch = 1
        run_dir = f"./run/{arch}/{timestamp}"

    print('Starting accuracy is {}'.format(best_accuracy))

    if not os.path.exists(run_dir):
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    print(model)
    print("{} parameters".format(utils.count_parameters(model)))
    print(f"Run directory set to {run_dir}")

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    if tracking:
        train_results_file = os.path.join(run_dir, 'train_results.csv')
        valid_results_file = os.path.join(run_dir, 'valid_results.csv')
        test_results_file = os.path.join(run_dir, 'test_results.csv')
    else:
        train_results_file = None
        valid_results_file = None
        test_results_file = None

    # create loss
    criterion = nn.CrossEntropyLoss()

    if use_cuda:
        print('Copying model to GPU')
        model = model.cuda()
        criterion = criterion.cuda()

        if half:
            model = model.half()
            criterion = criterion.half()
        device_ids = device_ids or list(range(torch.cuda.device_count()))
        model = torch.nn.DataParallel(
            model, device_ids=device_ids)
        num_workers = num_workers or len(device_ids)
    else:
        num_workers = num_workers or 1
        if half:
            print('Half precision (16-bit floating point) only works on GPU')
    print(f"using {num_workers} workers for data loading")

    # load data
    print("Preparing data:")
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(MEAN, STD),
    ])

    test_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10(root=dataset_dir, train=False, download=True,
                         transform=transform_test),
        batch_size=batch_size, shuffle=False, num_workers=num_workers,
        pin_memory=use_cuda)

    if evaluate:
        print("Only running evaluation of model on test dataset")
        run(start_epoch - 1, model, test_loader, use_cuda=use_cuda,
            tracking=test_results_file, train=False)
        return

    if augmentation:
        transform_train = [
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip()
        ]
    else:
        transform_train = []

    transform_train = transforms.Compose(transform_train + [
        transforms.ToTensor(),
        transforms.Normalize(MEAN, STD),
    ])

    train_dataset = datasets.CIFAR10(root=dataset_dir, train=True,
                                     download=True, transform=transform_train)

    num_train = len(train_dataset)
    indices = list(range(num_train))
    assert 1 > validation and validation >= 0, "Validation must be in [0, 1)"
    split = num_train - int(validation * num_train)

    if shuffle:
        np.random.shuffle(indices)

    train_indices = indices[:split]
    valid_indices = indices[split:]

    print('Using {} examples for training'.format(len(train_indices)))
    print('Using {} examples for validation'.format(len(valid_indices)))

    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(valid_indices)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, sampler=train_sampler, batch_size=batch_size,
        num_workers=num_workers, pin_memory=use_cuda)
    if validation != 0:
        valid_loader = torch.utils.data.DataLoader(
            train_dataset, sampler=valid_sampler, batch_size=batch_size,
            num_workers=num_workers, pin_memory=use_cuda)
    else:
        print('Using test dataset for validation')
        valid_loader = test_loader

    end_epoch = start_epoch + epochs
    # YellowFin doesn't have param_groups causing AttributeError
    if not isinstance(optimizer, YFOptimizer):
        for group in optimizer.param_groups:
            if 'lr' in group:
                print('Learning rate set to {}'.format(group['lr']))
                assert group['lr'] == learning_rate
    else:
        print(f"set lr_factor to {lr_factor}")
        optimizer.set_lr_factor(lr_factor)
    for epoch in range(start_epoch, end_epoch):
        run(epoch, model, train_loader, criterion, optimizer,
            use_cuda=use_cuda, tracking=train_results_file, train=True,
            half=half)

        valid_acc = run(epoch, model, valid_loader, use_cuda=use_cuda,
                        tracking=valid_results_file, train=False, half=half)

        is_best = valid_acc > best_accuracy
        last_epoch = epoch == (end_epoch - 1)
        if is_best or checkpoint == 'all' or (checkpoint == 'last' and last_epoch):
            state = {
                'epoch': epoch,
                'arch': arch,
                'model': (model.module if use_cuda else model).state_dict(),
                'accuracy': valid_acc,
                'optimizer': optimizer.state_dict()
            }
            if is_best:
                print('New best model!')
                filename = os.path.join(run_dir, 'checkpoint_best_model.t7')
                print(f'Saving checkpoint to {filename}')
                best_accuracy = valid_acc
                torch.save(state, filename)
            if checkpoint == 'all' or (checkpoint == 'last' and last_epoch):
                filename = os.path.join(run_dir, f'checkpoint_{epoch}.t7')
                print(f'Saving checkpoint to {filename}')
                torch.save(state, filename)


if __name__ == '__main__':
    train()
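The correct() helper above counts how many samples in a batch have the target inside the top-k predictions. The same computation on plain tensors, as a standalone sketch (it uses the current tensor API rather than the Variable-era calls in the file):

import torch

outputs = torch.tensor([[0.10, 0.70, 0.20],
                        [0.80, 0.15, 0.05],
                        [0.20, 0.30, 0.50]])
targets = torch.tensor([1, 1, 2])

_, pred = outputs.topk(2, dim=1, largest=True, sorted=True)
hits = pred.eq(targets.view(-1, 1).expand_as(pred))
print(hits[:, :1].sum().item())   # 2 samples correct at top-1
print(hits.sum().item())          # 3 samples correct within top-2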
pytorch/CIFAR10/benchmark/imagenet/__main__.py  (new file, +14)

@@ -0,0 +1,14 @@
import click

from benchmark.imagenet.train import train


@click.group()
def cli():
    pass


cli.add_command(train, name='train')

if __name__ == '__main__':
    cli()
pytorch/CIFAR10/benchmark/imagenet/train.py  (new file, +339)

@@ -0,0 +1,339 @@
import os
import time
from datetime import datetime
from collections import OrderedDict

import click
import tqdm
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

from benchmark import utils

model_names = sorted(name for name in models.__dict__
                     if name.islower() and not name.startswith("__")
                     and callable(models.__dict__[name]))


@click.command()
@click.option('--dataset-dir', default='./data/imagenet')
@click.option('--checkpoint', '-c', type=click.Choice(['best', 'all', 'last']),
              default='last')
@click.option('--restore', '-r')
@click.option('--tracking/--no-tracking', default=True)
@click.option('--cuda/--no-cuda', default=True)
@click.option('--epochs', '-e', default=90)
@click.option('--batch-size', '-b', default=256)
@click.option('--learning-rate', '-l', default=0.1)
@click.option('--learning-rate-decay', default=0.1)
@click.option('--learning-rate-freq', default=30)
@click.option('--momentum', default=0.9)
@click.option('--optimizer', '-o', type=click.Choice(['sgd', 'adam']),
              default='sgd')
@click.option('--augmentation/--no-augmentation', default=True)
@click.option('--pretrained', is_flag=True)
@click.option('--evaluate', is_flag=True)
@click.option('--num-workers', type=int)
@click.option('--weight-decay', default=1e-4)
@click.option('--arch', '-a', type=click.Choice(model_names),
              default='resnet18')
def train(dataset_dir, checkpoint, restore, tracking, cuda, epochs,
          batch_size, learning_rate, learning_rate_decay,
          learning_rate_freq, momentum, optimizer, augmentation,
          pretrained, evaluate, num_workers, weight_decay, arch):
    timestamp = "{:.0f}".format(datetime.utcnow().timestamp())
    config = {k: v for k, v in locals().items()}

    use_cuda = cuda and torch.cuda.is_available()

    # create model
    if pretrained:
        print("=> using pre-trained model '{}'".format(arch))
        model = models.__dict__[arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(arch))
        model = models.__dict__[arch]()

    if optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    elif optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), learning_rate,
                                    momentum=momentum,
                                    weight_decay=weight_decay)
    else:
        raise NotImplementedError("Unknown optimizer: {}".format(optimizer))

    # optionally resume from a checkpoint
    if restore is not None:
        if restore == 'latest':
            restore = utils.latest_file(arch)
        print(f'=> restoring model from {restore}')
        restored_state = torch.load(restore)
        start_epoch = restored_state['epoch'] + 1
        best_prec1 = restored_state['prec1']
        model.load_state_dict(restored_state['state_dict'])
        optimizer.load_state_dict(restored_state['optimizer'])
        print('=> starting accuracy is {} (epoch {})'
              .format(best_prec1, start_epoch))
        run_dir = os.path.split(restore)[0]
    else:
        best_prec1 = 0.0
        start_epoch = 1
        run_dir = f"./run/{arch}/{timestamp}"

    if not os.path.exists(run_dir):
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    print(model)
    print("{} parameters".format(utils.count_parameters(model)))
    print(f"Run directory set to {run_dir}")

    # save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    if tracking:
        train_results_file = os.path.join(run_dir, 'train_results.csv')
        test_results_file = os.path.join(run_dir, 'test_results.csv')
    else:
        train_results_file = None
        test_results_file = None

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()

    # move model and criterion to GPU
    if use_cuda:
        model.cuda()
        criterion = criterion.cuda()
        model = torch.nn.parallel.DataParallel(model)
        num_workers = num_workers or torch.cuda.device_count()
    else:
        num_workers = num_workers or 1
    print(f"=> using {num_workers} workers for data loading")

    cudnn.benchmark = True

    # Data loading code
    print("=> preparing data:")
    traindir = os.path.join(dataset_dir, 'train')
    valdir = os.path.join(dataset_dir, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_sampler = None
    train_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(traindir, transforms.Compose([
            transforms.RandomSizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=batch_size, shuffle=(train_sampler is None),
        num_workers=num_workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Scale(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=batch_size, shuffle=False,
        num_workers=num_workers, pin_memory=True)

    if evaluate:
        validate(val_loader, model, criterion)
        return

    end_epoch = start_epoch + epochs
    for epoch in range(start_epoch, end_epoch):
        print('Epoch {} of {}'.format(epoch, end_epoch - 1))
        adjust_learning_rate(optimizer, epoch, learning_rate,
                             decay=learning_rate_decay,
                             freq=learning_rate_freq)

        # train for one epoch
        _ = train_one_epoch(
            train_loader, model, criterion, optimizer, epoch,
            tracking=train_results_file)

        # evaluate on validation set
        prec1, _ = validate(
            val_loader, model, criterion, epoch, tracking=test_results_file)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        last_epoch = epoch == (end_epoch - 1)
        if is_best or checkpoint == 'all' or (checkpoint == 'last' and last_epoch):
            state = {
                'epoch': epoch,
                'arch': arch,
                'state_dict': (model.module if use_cuda else model).state_dict(),
                'prec1': prec1,
                'optimizer': optimizer.state_dict(),
            }
            if is_best:
                print('New best model!')
                filename = os.path.join(run_dir, 'checkpoint_best_model.t7')
                print(f'=> saving checkpoint to {filename}')
                torch.save(state, filename)
                best_prec1 = prec1
            if checkpoint == 'all' or (checkpoint == 'last' and last_epoch):
                filename = os.path.join(run_dir, f'checkpoint_{epoch}.t7')
                print(f'=> saving checkpoint to {filename}')
                torch.save(state, filename)


def train_one_epoch(train_loader, model, criterion, optimizer, epoch,
                    tracking=None):
    train_loader = tqdm.tqdm(train_loader)
    batch_time = utils.AverageMeter()
    data_time = utils.AverageMeter()
    losses = utils.AverageMeter()
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        target = target.cuda(async=True)
        input_var = torch.autograd.Variable(input)
        target_var = torch.autograd.Variable(target)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss.data[0], input.size(0))
        top1.update(prec1[0], input.size(0))
        top5.update(prec5[0], input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        if tracking is not None:
            result = OrderedDict()
            result['timestamp'] = datetime.now()
            result['batch_duration'] = batch_time.val
            result['epoch'] = epoch
            result['batch'] = i
            result['batch_size'] = input.size(0)
            result['top1_accuracy'] = prec1[0]
            result['top5_accuracy'] = prec5[0]
            result['loss'] = loss.data[0]
            result['data_duration'] = data_time.val
            utils.save_result(result, tracking)

        desc = ('Epoch {0} (Train):'
                ' Loss {loss.val:.4f} ({loss.avg:.4f})'
                ' Prec@1 {top1.val:.3f} ({top1.avg:.3f})'
                ' Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                    epoch, i, len(train_loader), batch_time=batch_time,
                    data_time=data_time, loss=losses, top1=top1, top5=top5))
        train_loader.set_description(desc)

        end = time.time()

    return top1.avg, top5.avg


def validate(val_loader, model, criterion, epoch, tracking=None):
    val_loader = tqdm.tqdm(val_loader)
    batch_time = utils.AverageMeter()
    losses = utils.AverageMeter()
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    for i, (input, target) in enumerate(val_loader):
        target = target.cuda(async=True)
        input_var = torch.autograd.Variable(input, volatile=True)
        target_var = torch.autograd.Variable(target, volatile=True)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss.data[0], input.size(0))
        top1.update(prec1[0], input.size(0))
        top5.update(prec5[0], input.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        if tracking is not None:
            result = OrderedDict()
            result['timestamp'] = datetime.now()
            result['batch_duration'] = batch_time.val
            result['epoch'] = epoch
            result['batch'] = i
            result['batch_size'] = input.size(0)
            result['top1_accuracy'] = prec1[0]
            result['top5_accuracy'] = prec5[0]
            result['loss'] = loss.data[0]
            utils.save_result(result, tracking)

        desc = ('Epoch {0} (Val): '
                ' Loss {loss.val:.4f} ({loss.avg:.4f})'
                ' Prec@1 {top1.val:.3f} ({top1.avg:.3f})'
                ' Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                    epoch, i, len(val_loader), batch_time=batch_time,
                    loss=losses, top1=top1, top5=top5))
        val_loader.set_description(desc)
        end = time.time()

    print("Evaluation: Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}"
          .format(top1=top1, top5=top5))
    return top1.avg, top5.avg


def adjust_learning_rate(optimizer, epoch, initial_learning_rate, decay, freq):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
    lr = initial_learning_rate * (decay ** ((epoch - 1) // freq))
    print(f'=> learning rate is set to {lr}')
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k"""
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


if __name__ == '__main__':
    train()
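adjust_learning_rate above implements a plain step schedule, lr = initial * decay ** ((epoch - 1) // freq). A quick check (not part of the diff) with the defaults in this file (initial 0.1, decay 0.1, freq 30):

initial, decay, freq = 0.1, 0.1, 30
for epoch in (1, 30, 31, 60, 61, 90):
    print(epoch, initial * (decay ** ((epoch - 1) // freq)))
# epochs 1-30 train at 0.1, 31-60 at 0.01, 61-90 at 0.001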
@@ -1,336 +0,0 @@
import os
import re
import json
from functools import reduce
from datetime import datetime
from collections import OrderedDict

import click
import torch
import progressbar
from torch import nn, optim
from torch.autograd import Variable
from torchvision import transforms
from torchvision import datasets as dset

from benchmark.models import resnet, densenet

MEAN = (0.4914, 0.4822, 0.4465)
STD = (0.2023, 0.1994, 0.2010)

MODELS = {
    # "Deep Residual Learning for Image Recognition"
    'resnet20': resnet.ResNet20,
    'resnet32': resnet.ResNet32,
    'resnet44': resnet.ResNet44,
    'resnet56': resnet.ResNet56,
    'resnet110': resnet.ResNet110,
    'resnet1202': resnet.ResNet1202,

    # "Wide Residual Networks"
    'wrn-40-4': resnet.WRN_40_4,
    'wrn-16-8': resnet.WRN_16_8,
    'wrn-28-10': resnet.WRN_28_10,

    # Based on "Identity Mappings in Deep Residual Networks"
    'preact20': resnet.PreActResNet20,
    'preact56': resnet.PreActResNet56,
    'preact164-basic': resnet.PreActResNet164Basic,

    # "Identity Mappings in Deep Residual Networks"
    'preact110': resnet.PreActResNet110,
    'preact164': resnet.PreActResNet164,
    'preact1001': resnet.PreActResNet1001,

    # "Aggregated Residual Transformations for Deep Neural Networks"
    'resnext29-8-64': lambda _=None: resnet.ResNeXt29(8, 64),
    'resnext29-16-64': lambda _=None: resnet.ResNeXt29(16, 64),

    # "Densely Connected Convolutional Networks"
    'densenetbc100': densenet.DenseNetBC100,
    'densenetbc250': densenet.DenseNetBC250,
    'densenetbc190': densenet.DenseNetBC190,

    # Kuangliu/pytorch-cifar
    'resnet18': resnet.ResNet18,
    'resnet50': resnet.ResNet50,
    'resnet101': resnet.ResNet101,
    'resnet152': resnet.ResNet152,
}


def count_parameters(model):
    c = map(lambda p: reduce(lambda x, y: x * y, p.size()), model.parameters())
    return sum(c)


def correct(outputs, targets, top=(1, )):
    _, predictions = outputs.topk(max(top), dim=1, largest=True, sorted=True)
    targets = targets.view(-1, 1).expand_as(predictions)
    corrects = predictions.eq(targets).cpu().cumsum(1).sum(0)
    tops = list(map(lambda k: corrects.data[0][k - 1], top))
    return tops


def save_result(result, path):
    write_heading = not os.path.exists(path)
    with open(path, mode='a') as out:
        if write_heading:
            out.write(",".join([str(k) for k, v in result.items()]) + '\n')
        out.write(",".join([str(v) for k, v in result.items()]) + '\n')


def run(epoch, model, loader, criterion=None, optimizer=None, top=(1, 5),
        use_cuda=False, tracking=None, max_value=None, train=True):

    assert criterion is not None or not train, 'Need criterion to train model'
    assert optimizer is not None or not train, 'Need optimizer to train model'
    max_value = max_value or progressbar.UnknownLength
    bar = progressbar.ProgressBar(max_value=max_value)
    total = 0
    correct_counts = {}
    if train:
        model.train()
    else:
        model.eval()

    start = datetime.now()
    for batch_index, (inputs, targets) in enumerate(loader):
        inputs = Variable(inputs, requires_grad=False, volatile=not train)
        targets = Variable(targets, requires_grad=False, volatile=not train)

        if use_cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()

        outputs = model(inputs)

        if train:
            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        _, predictions = torch.max(outputs.data, 1)
        batch_size = targets.size(0)
        top_correct = correct(outputs, targets, top=top)
        total += batch_size
        for k, count in zip(top, top_correct):
            correct_counts[k] = correct_counts.get(k, 0) + count

        end = datetime.now()
        if tracking is not None:
            result = OrderedDict()
            result['timestamp'] = datetime.now()
            result['batch_duration'] = end - start
            result['epoch'] = epoch
            result['batch'] = batch_index
            result['batch_size'] = batch_size
            for i, k in enumerate(top):
                result['top{}_correct'.format(k)] = top_correct[i]
            if train:
                result['loss'] = loss.data[0]
            save_result(result, tracking)

        bar.update(batch_index + 1)
        start = datetime.now()

    print()
    if train:
        message = 'Training accuracy of'
    else:
        message = 'Test accuracy of'
    for k in top:
        accuracy = correct_counts[k] / total
        message += ' top-{}: {}'.format(k, accuracy)
    print(message)
    return (1. * correct_counts[top[0]]) / total, batch_index + 1


def save(model, directory, epoch, accuracy, use_cuda=False, filename=None):
    state = {
        'model': model.module if use_cuda else model,
        'epoch': epoch,
        'accuracy': accuracy
    }

    filename = filename or 'checkpoint_{}.t7'.format(epoch)
    torch.save(state, os.path.join(directory, filename))


def save_config(config, run_dir):
    path = os.path.join(run_dir, "config_{}.json".format(config['timestamp']))
    with open(path, 'w') as config_file:
        json.dump(config, config_file)
        config_file.write('\n')


def load(path):
    assert os.path.exists(path)
    state = torch.load(path)
    model = state['model']
    epoch = state['epoch']
    accuracy = state['accuracy']
    return model, epoch, accuracy


def latest_file(model):
    restore = f'./run/{model}'
    timestamps = sorted(os.listdir(restore))
    assert len(timestamps) > 0
    run_dir = os.path.join(restore, timestamps[-1])
    files = os.listdir(run_dir)
    max_checkpoint = -1
    for filename in files:
        if re.search('checkpoint_\d+.t7', filename):
            num = int(re.search('\d+', filename).group())

            if num > max_checkpoint:
                max_checkpoint = num
                max_checkpoint_file = filename

    assert max_checkpoint != -1
    return os.path.join(run_dir, max_checkpoint_file)


@click.command()
@click.option('--dataset-dir', default='./data/cifar10')
@click.option('--checkpoint', '-c', type=click.Choice(['best', 'all', 'last']),
              default='last')
@click.option('--restore', '-r')
@click.option('--tracking/--no-tracking', default=True)
@click.option('--cuda/--no-cuda', default=True)
@click.option('--epochs', '-e', default=200)
@click.option('--batch-size', '-b', default=32)
@click.option('--learning-rate', '-l', default=1e-3)
@click.option('--sgd', 'optimizer', flag_value='sgd')
@click.option('--adam', 'optimizer', flag_value='adam', default=True)
@click.option('--augmentation/--no-augmentation', default=True)
@click.option('--num-workers', type=int)
@click.option('--weight-decay', default=5e-4)
@click.option('--model', '-m', type=click.Choice(MODELS.keys()),
              default='resnet20')
def main(dataset_dir, checkpoint, restore, tracking, cuda, epochs,
         batch_size, learning_rate, optimizer, augmentation, num_workers,
         weight_decay, model):
    timestamp = "{:.0f}".format(datetime.utcnow().timestamp())
    config = {k: v for k, v in locals().items()}

    use_cuda = cuda and torch.cuda.is_available()
    if use_cuda:
        num_workers = num_workers or torch.cuda.device_count()
    else:
        num_workers = num_workers or 1

    print(f"using {num_workers} workers for data loading")

    print("Preparing data:")

    if augmentation:
        transform_train = [
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip()
        ]
    else:
        transform_train = []

    transform_train = transforms.Compose(transform_train + [
        transforms.ToTensor(),
        transforms.Normalize(MEAN, STD),
    ])

    trainset = dset.CIFAR10(root=dataset_dir, train=True, download=True,
                            transform=transform_train)
    train_loader = torch.utils.data.DataLoader(
        trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers,
        pin_memory=use_cuda)

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(MEAN, STD),
    ])

    testset = dset.CIFAR10(root=dataset_dir, train=False, download=True,
                           transform=transform_test)
    test_loader = torch.utils.data.DataLoader(
        testset, batch_size=batch_size, shuffle=False, num_workers=num_workers,
        pin_memory=use_cuda)

    if restore is not None:
        if restore == 'latest':
            restore = latest_file(model)
        print(f'Restoring model from {restore}')
        model, start_epoch, best_accuracy = load(restore)
        start_epoch += 1
        print('Starting accuracy is {}'.format(best_accuracy))
        run_dir = os.path.split(restore)[0]
    else:
        print(f'Building {model} model')
        best_accuracy = -1
        start_epoch = 1
        run_dir = f"./run/{model}/{timestamp}"
        model = MODELS[model]()

    if not os.path.exists(run_dir):
        os.makedirs(run_dir)
    save_config(config, run_dir)

    print(model)
    print("{} parameters".format(count_parameters(model)))
    print(f"Run directory set to {run_dir}")

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    if tracking:
        train_results_file = os.path.join(run_dir, 'train_results.csv')
        test_results_file = os.path.join(run_dir, 'test_results.csv')
    else:
        train_results_file = None
        test_results_file = None

    if use_cuda:
        print('Copying model to GPU')
        model.cuda()
        model = torch.nn.DataParallel(
            model, device_ids=range(torch.cuda.device_count()))
    criterion = nn.CrossEntropyLoss()

    # Other parameters?
    if optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=learning_rate,
                              momentum=0.9,
                              weight_decay=weight_decay)
    else:
        raise NotImplementedError("Unknown optimizer: {}".format(optimizer))

    train_max_value = None
    test_max_value = None
    end_epoch = start_epoch + epochs
    for epoch in range(start_epoch, end_epoch):
        print('Epoch {} of {}'.format(epoch, end_epoch - 1))
        train_acc, train_max_value = run(epoch, model, train_loader, criterion,
                                         optimizer, use_cuda=use_cuda,
                                         tracking=train_results_file,
                                         max_value=train_max_value, train=True)

        test_acc, test_max_value = run(epoch, model, test_loader,
                                       use_cuda=use_cuda,
                                       tracking=test_results_file, train=False)

        if test_acc > best_accuracy:
            print('New best model!')
            save(model, run_dir, epoch, test_acc, use_cuda=use_cuda,
                 filename='checkpoint_best_model.t7')
            best_accuracy = test_acc

        last_epoch = epoch == (end_epoch - 1)
        if checkpoint == 'all' or (checkpoint == 'last' and last_epoch):
            save(model, run_dir, epoch, test_acc, use_cuda=use_cuda)


if __name__ == '__main__':
    main()
pytorch/CIFAR10/benchmark/utils.py  (new file, +61)

@@ -0,0 +1,61 @@
import os
import json
import re
from functools import reduce


class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def count_parameters(model):
    c = map(lambda p: reduce(lambda x, y: x * y, p.size()), model.parameters())
    return sum(c)


def latest_file(model):
    restore = f'./run/{model}'
    timestamps = sorted(os.listdir(restore))
    assert len(timestamps) > 0
    run_dir = os.path.join(restore, timestamps[-1])
    files = os.listdir(run_dir)
    max_checkpoint = -1
    for filename in files:
        if re.search('checkpoint_\d+.t7', filename):
            num = int(re.search('\d+', filename).group())

            if num > max_checkpoint:
                max_checkpoint = num
                max_checkpoint_file = filename

    assert max_checkpoint != -1
    return os.path.join(run_dir, max_checkpoint_file)


def save_result(result, path):
    write_heading = not os.path.exists(path)
    with open(path, mode='a') as out:
        if write_heading:
            out.write(",".join([str(k) for k, v in result.items()]) + '\n')
        out.write(",".join([str(v) for k, v in result.items()]) + '\n')


def save_config(config, run_dir):
    path = os.path.join(run_dir, "config_{}.json".format(config['timestamp']))
    with open(path, 'w') as config_file:
        json.dump(config, config_file)
        config_file.write('\n')
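A minimal sketch (not part of the diff) of how these helpers are used by the training loops above:

from collections import OrderedDict

from benchmark import utils

meter = utils.AverageMeter()
meter.update(90.0, n=32)              # 90% top-1 on a batch of 32
meter.update(95.0, n=32)
print(meter.val, meter.avg)           # 95.0 92.5

row = OrderedDict(epoch=1, top1_accuracy=meter.avg)
utils.save_result(row, 'train_results.csv')   # appends a CSV row, writing the header once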
384
pytorch/CIFAR10/benchmark/yellowfin.py
Normal file
384
pytorch/CIFAR10/benchmark/yellowfin.py
Normal file
|
@ -0,0 +1,384 @@
|
|||
|
||||
import math
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
# eps for numerical stability
|
||||
eps = 1e-15
|
||||
|
||||
class YFOptimizer(object):
    def __init__(self, var_list, lr=0.1, mu=0.0, clip_thresh=None, weight_decay=0.0,
                 beta=0.999, curv_win_width=20, zero_debias=True, sparsity_debias=True, delta_mu=0.0,
                 auto_clip_fac=None, force_non_inc_step=False):
        '''
        clip_thresh is the threshold value on ||lr * gradient||.
        delta_mu can be a placeholder/variable/python scalar. It is used for additional
        momentum in situations such as asynchronous-parallel training. The default is 0.0
        for basic usage of the optimizer.
        Args:
          lr: python scalar. The initial value of the learning rate; we use 1.0 in our paper.
          mu: python scalar. The initial value of momentum; we use 0.0 in our paper.
          clip_thresh: python scalar. The manually-set clipping threshold for tf.clip_by_global_norm.
            If None, automatic clipping is carried out. The automatic clipping
            feature is parameterized by the argument auto_clip_fac. The auto clip feature
            can be switched off with auto_clip_fac = None.
          beta: python scalar. The smoothing parameter for the estimations.
          sparsity_debias: gradient norm and curvature are biased towards larger values when
            calculated with sparse gradients. This is useful when the model is very sparse,
            e.g. an LSTM with word embeddings. For a non-sparse CNN, turning it off could slightly
            accelerate the speed.
          delta_mu: for extensions. Not necessary for basic use.
          force_non_inc_step: in some very rare cases, it is necessary to force ||lr * gradient||
            not to increase dramatically, for stability after some iterations.
            In practice, if turned on, we enforce lr * sqrt(smoothed ||grad||^2)
            to be less than 2x the minimal historical value of the smoothed ||lr * grad||.
            This feature is turned off by default.
        Other features:
          If you want to manually control the learning rate, self.lr_factor is
          an interface to the outside; it is a multiplier for the internal learning rate
          in YellowFin. It is helpful when you want to do additional hand tuning
          or apply some decaying scheme to the tuned learning rate in YellowFin.
          An example of using lr_factor can be found here:
          https://github.com/JianGoForIt/YellowFin_Pytorch/blob/master/pytorch-cifar/main.py#L109
        '''
        self._lr = lr
        self._mu = mu
        # we convert var_list from a generator to a list so that
        # it can be iterated over multiple times
        self._var_list = list(var_list)
        self._clip_thresh = clip_thresh
        self._auto_clip_fac = auto_clip_fac
        self._beta = beta
        self._curv_win_width = curv_win_width
        self._zero_debias = zero_debias
        self._sparsity_debias = sparsity_debias
        self._force_non_inc_step = force_non_inc_step
        self._optimizer = torch.optim.SGD(self._var_list, lr=self._lr,
                                          momentum=self._mu, weight_decay=weight_decay)
        self._iter = 0
        # global states are the statistics
        self._global_state = {}

        # for decaying the learning rate, etc.
        self._lr_factor = 1.0

    def state_dict(self):
        # for checkpoint saving
        sgd_state_dict = self._optimizer.state_dict()
        global_state = self._global_state
        lr_factor = self._lr_factor
        iter = self._iter
        lr = self._lr
        mu = self._mu
        clip_thresh = self._clip_thresh
        beta = self._beta
        curv_win_width = self._curv_win_width
        zero_debias = self._zero_debias
        h_min = self._h_min
        h_max = self._h_max

        return {
            "sgd_state_dict": sgd_state_dict,
            "global_state": global_state,
            "lr_factor": lr_factor,
            "iter": iter,
            "lr": lr,
            "mu": mu,
            "clip_thresh": clip_thresh,
            "beta": beta,
            "curv_win_width": curv_win_width,
            "zero_debias": zero_debias,
            "h_min": h_min,
            "h_max": h_max
        }

    def load_state_dict(self, state_dict):
        # for checkpoint restoring
        self._optimizer.load_state_dict(state_dict['sgd_state_dict'])
        self._global_state = state_dict['global_state']
        self._lr_factor = state_dict['lr_factor']
        self._iter = state_dict['iter']
        self._lr = state_dict['lr']
        self._mu = state_dict['mu']
        self._clip_thresh = state_dict['clip_thresh']
        self._beta = state_dict['beta']
        self._curv_win_width = state_dict['curv_win_width']
        self._zero_debias = state_dict['zero_debias']
        self._h_min = state_dict["h_min"]
        self._h_max = state_dict["h_max"]
        return

    def set_lr_factor(self, factor):
        self._lr_factor = factor
        return

    def get_lr_factor(self):
        return self._lr_factor

    def zero_grad(self):
        self._optimizer.zero_grad()
        return

    def zero_debias_factor(self):
        return 1.0 - self._beta ** (self._iter + 1)

    def zero_debias_factor_delay(self, delay):
        # for exponentially averaged stat which starts at non-zero iter
        return 1.0 - self._beta ** (self._iter - delay + 1)

    def curvature_range(self):
        global_state = self._global_state
        if self._iter == 0:
            global_state["curv_win"] = torch.FloatTensor(self._curv_win_width, 1).zero_()
        curv_win = global_state["curv_win"]
        grad_norm_squared = self._global_state["grad_norm_squared"]
        curv_win[self._iter % self._curv_win_width] = np.log(grad_norm_squared + eps)
        valid_end = min(self._curv_win_width, self._iter + 1)
        # we use a running average over log scale, accelerating
        # h_max / h_min in the beginning to follow the varying trend of the curvature.
        beta = self._beta
        if self._iter == 0:
            global_state["h_min_avg"] = 0.0
            global_state["h_max_avg"] = 0.0
            self._h_min = 0.0
            self._h_max = 0.0
        global_state["h_min_avg"] = \
            global_state["h_min_avg"] * beta + (1 - beta) * torch.min(curv_win[:valid_end])
        global_state["h_max_avg"] = \
            global_state["h_max_avg"] * beta + (1 - beta) * torch.max(curv_win[:valid_end])
        if self._zero_debias:
            debias_factor = self.zero_debias_factor()
            self._h_min = np.exp(global_state["h_min_avg"] / debias_factor)
            self._h_max = np.exp(global_state["h_max_avg"] / debias_factor)
        else:
            self._h_min = np.exp(global_state["h_min_avg"])
            self._h_max = np.exp(global_state["h_max_avg"])
        if self._sparsity_debias:
            self._h_min *= self._sparsity_avg
            self._h_max *= self._sparsity_avg
        return

    def grad_variance(self):
        global_state = self._global_state
        beta = self._beta
        self._grad_var = np.array(0.0, dtype=np.float32)
        for group in self._optimizer.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self._optimizer.state[p]

                if self._iter == 0:
                    state["grad_avg"] = grad.new().resize_as_(grad).zero_()
                    state["grad_avg_squared"] = 0.0
                state["grad_avg"].mul_(beta).add_(1 - beta, grad)
                self._grad_var += torch.sum(state["grad_avg"] * state["grad_avg"])

        if self._zero_debias:
            debias_factor = self.zero_debias_factor()
        else:
            debias_factor = 1.0

        self._grad_var /= -(debias_factor**2)
        self._grad_var += global_state['grad_norm_squared_avg'] / debias_factor
        # guard against negative variance: the two terms use different debias factors
        self._grad_var = max(self._grad_var, eps)
        if self._sparsity_debias:
            self._grad_var *= self._sparsity_avg
        return

    def dist_to_opt(self):
        global_state = self._global_state
        beta = self._beta
        if self._iter == 0:
            global_state["grad_norm_avg"] = 0.0
            global_state["dist_to_opt_avg"] = 0.0
        global_state["grad_norm_avg"] = \
            global_state["grad_norm_avg"] * beta + (1 - beta) * math.sqrt(global_state["grad_norm_squared"])
        global_state["dist_to_opt_avg"] = \
            global_state["dist_to_opt_avg"] * beta \
            + (1 - beta) * global_state["grad_norm_avg"] / (global_state['grad_norm_squared_avg'] + eps)
        if self._zero_debias:
            debias_factor = self.zero_debias_factor()
            self._dist_to_opt = global_state["dist_to_opt_avg"] / debias_factor
        else:
            self._dist_to_opt = global_state["dist_to_opt_avg"]
        if self._sparsity_debias:
            self._dist_to_opt /= (np.sqrt(self._sparsity_avg) + eps)
        return

    def grad_sparsity(self):
        global_state = self._global_state
        if self._iter == 0:
            global_state["sparsity_avg"] = 0.0
        non_zero_cnt = 0.0
        all_entry_cnt = 0.0
        for group in self._optimizer.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                grad_non_zero = grad.nonzero()
                if grad_non_zero.dim() > 0:
                    non_zero_cnt += grad_non_zero.size()[0]
                all_entry_cnt += torch.numel(grad)
        beta = self._beta
        global_state["sparsity_avg"] = beta * global_state["sparsity_avg"] \
            + (1 - beta) * non_zero_cnt / float(all_entry_cnt)
        self._sparsity_avg = \
            global_state["sparsity_avg"] / self.zero_debias_factor()
        return

    def lr_grad_norm_avg(self):
        # this is for enforcing that lr * grad_norm does not
        # increase dramatically in case of instability.
        # Not necessary for basic use.
        global_state = self._global_state
        beta = self._beta
        if "lr_grad_norm_avg" not in global_state:
            global_state['grad_norm_squared_avg_log'] = 0.0
        global_state['grad_norm_squared_avg_log'] = \
            global_state['grad_norm_squared_avg_log'] * beta \
            + (1 - beta) * np.log(global_state['grad_norm_squared'] + eps)
        if "lr_grad_norm_avg" not in global_state:
            global_state["lr_grad_norm_avg"] = \
                0.0 * beta + (1 - beta) * np.log(self._lr * np.sqrt(global_state['grad_norm_squared']) + eps)
            # we monitor the minimal smoothed ||lr * grad||
            global_state["lr_grad_norm_avg_min"] = \
                np.exp(global_state["lr_grad_norm_avg"] / self.zero_debias_factor())
        else:
            global_state["lr_grad_norm_avg"] = global_state["lr_grad_norm_avg"] * beta \
                + (1 - beta) * np.log(self._lr * np.sqrt(global_state['grad_norm_squared']) + eps)
            global_state["lr_grad_norm_avg_min"] = \
                min(global_state["lr_grad_norm_avg_min"],
                    np.exp(global_state["lr_grad_norm_avg"] / self.zero_debias_factor()))

    def after_apply(self):
        # compute running average of gradient and norm of gradient
        beta = self._beta
        global_state = self._global_state
        if self._iter == 0:
            global_state["grad_norm_squared_avg"] = 0.0

        global_state["grad_norm_squared"] = 0.0
        for group in self._optimizer.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                global_state['grad_norm_squared'] += torch.sum(grad * grad)

        global_state['grad_norm_squared_avg'] = \
            global_state['grad_norm_squared_avg'] * beta + (1 - beta) * global_state['grad_norm_squared']

        if self._sparsity_debias:
            self.grad_sparsity()

        self.curvature_range()
        self.grad_variance()
        self.dist_to_opt()

        if self._iter > 0:
            self.get_mu()
            self.get_lr()

            self._lr = beta * self._lr + (1 - beta) * self._lr_t
            self._mu = beta * self._mu + (1 - beta) * self._mu_t
        return

    def get_lr(self):
        self._lr_t = (1.0 - math.sqrt(self._mu_t))**2 / (self._h_min + eps)
        return

    def get_cubic_root(self):
        # We have the equation x^2 D^2 + (1-x)^4 * C / h_min^2
        # where x = sqrt(mu).
        # We substitute x, which is sqrt(mu), with x = y + 1.
        # It gives y^3 + py = q
        # where p = (D^2 h_min^2)/(2*C) and q = -p.
        # We use Vieta's substitution to compute the root.
        # There is only one real solution y (which is in [0, 1]).
        # http://mathworld.wolfram.com/VietasSubstitution.html
        # eps in the numerator is to prevent momentum = 1 in case of zero gradient
        p = (self._dist_to_opt + eps)**2 * (self._h_min + eps)**2 / 2 / (self._grad_var + eps)
        w3 = (-math.sqrt(p**2 + 4.0 / 27.0 * p**3) - p) / 2.0
        w = math.copysign(1.0, w3) * math.pow(math.fabs(w3), 1.0/3.0)
        y = w - p / 3.0 / (w + eps)
        x = y + 1
        return x

    def get_mu(self):
        root = self.get_cubic_root()
        dr = self._h_max / self._h_min
        self._mu_t = max(root**2, ((np.sqrt(dr) - 1) / (np.sqrt(dr) + 1))**2)
        return

    def update_hyper_param(self):
        for group in self._optimizer.param_groups:
            group['momentum'] = self._mu
            if not self._force_non_inc_step:
                group['lr'] = self._lr * self._lr_factor
            elif self._iter > self._curv_win_width:
                # force-guarantee that lr * grad_norm is not increasing dramatically.
                # Not necessary for basic use. Please refer to the comments
                # in YFOptimizer.__init__ for more details.
                self.lr_grad_norm_avg()
                debias_factor = self.zero_debias_factor()
                group['lr'] = min(self._lr * self._lr_factor,
                                  2.0 * self._global_state["lr_grad_norm_avg_min"]
                                  / np.sqrt(np.exp(self._global_state['grad_norm_squared_avg_log'] / debias_factor)))
        return

    def auto_clip_thresh(self):
        # Heuristic to automatically prevent sudden exploding gradient.
        # Not necessary for basic use.
        return math.sqrt(self._h_max) * self._auto_clip_fac

    def step(self):
        # add weight decay
        for group in self._optimizer.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data

                if group['weight_decay'] != 0:
                    grad = grad.add(group['weight_decay'], p.data)

        if self._clip_thresh is not None:
            torch.nn.utils.clip_grad_norm(self._var_list, self._clip_thresh)
        elif self._iter != 0 and self._auto_clip_fac is not None:
            # do not clip on the first iteration
            torch.nn.utils.clip_grad_norm(self._var_list, self.auto_clip_thresh())

        # apply the update
        self._optimizer.step()

        # after apply
        self.after_apply()

        # update learning rate and momentum
        self.update_hyper_param()

        self._iter += 1
        return

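A minimal training-loop sketch of how this optimizer is intended to be driven (illustrative only; model, criterion, train_loader and num_epochs are assumed to exist elsewhere, and the decay schedule is made up):

optimizer = YFOptimizer(model.parameters(), lr=0.1, mu=0.0, weight_decay=5e-4)

for epoch in range(num_epochs):
    optimizer.set_lr_factor(0.5 ** (epoch // 30))   # optional hand-tuned decay on top of YellowFin
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()   # SGD update, then YellowFin re-tunes lr and momentum
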
@@ -9,12 +9,13 @@ setup(
    packages=['benchmark'],
    entry_points={
        'console_scripts': [
            'bench = benchmark.train:main'
            'cifar10 = benchmark.cifar10.__main__:cli',
            'imagenet = benchmark.imagenet.__main__:cli'
        ]
    },
    install_requires=[
        'tqdm',
        'torchvision',
        'click',
        'progressbar2'
    ]
)
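For reference, the new console scripts map to click command groups; one way to exercise them from Python without installing the package is click's test runner (only --help is assumed here; the real options are whatever the train and infer commands declare):

from click.testing import CliRunner
from benchmark.cifar10.__main__ import cli

runner = CliRunner()
result = runner.invoke(cli, ['train', '--help'])   # or ['infer', '--help']
print(result.output)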