384 lines
14 KiB
Python
384 lines
14 KiB
Python
|
|
import math
|
|
import numpy as np
|
|
import torch
|
|
|
|
# eps for numerical stability
|
|
eps = 1e-15
|
|
|
|
class YFOptimizer(object):
|
|
def __init__(self, var_list, lr=0.1, mu=0.0, clip_thresh=None, weight_decay=0.0,
|
|
beta=0.999, curv_win_width=20, zero_debias=True, sparsity_debias=True, delta_mu=0.0,
|
|
auto_clip_fac=None, force_non_inc_step=False):
|
|
'''
|
|
clip thresh is the threshold value on ||lr * gradient||
|
|
delta_mu can be place holder/variable/python scalar. They are used for additional
|
|
momentum in situations such as asynchronous-parallel training. The default is 0.0
|
|
for basic usage of the optimizer.
|
|
Args:
|
|
lr: python scalar. The initial value of learning rate, we use 1.0 in our paper.
|
|
mu: python scalar. The initial value of momentum, we use 0.0 in our paper.
|
|
clip_thresh: python scalar. The manaully-set clipping threshold for tf.clip_by_global_norm.
|
|
if None, the automatic clipping can be carried out. The automatic clipping
|
|
feature is parameterized by argument auto_clip_fac. The auto clip feature
|
|
can be switched off with auto_clip_fac = None
|
|
beta: python scalar. The smoothing parameter for estimations.
|
|
sparsity_debias: gradient norm and curvature are biased to larger values when
|
|
calculated with sparse gradient. This is useful when the model is very sparse,
|
|
e.g. LSTM with word embedding. For non-sparse CNN, turning it off could slightly
|
|
accelerate the speed.
|
|
delta_mu: for extensions. Not necessary in the basic use.
|
|
force_non_inc_step: in some very rare cases, it is necessary to force ||lr * gradient||
|
|
to be not increasing dramatically for stableness after some iterations.
|
|
In practice, if turned on, we enforce lr * sqrt(smoothed ||grad||^2)
|
|
to be less than 2x of the minimal value of historical value on smoothed || lr * grad ||.
|
|
This feature is turned off by default.
|
|
Other features:
|
|
If you want to manually control the learning rates, self.lr_factor is
|
|
an interface to the outside, it is an multiplier for the internal learning rate
|
|
in YellowFin. It is helpful when you want to do additional hand tuning
|
|
or some decaying scheme to the tuned learning rate in YellowFin.
|
|
Example on using lr_factor can be found here:
|
|
https://github.com/JianGoForIt/YellowFin_Pytorch/blob/master/pytorch-cifar/main.py#L109
|
|
'''
|
|
self._lr = lr
|
|
self._mu = mu
|
|
# we convert var_list from generator to list so that
|
|
# it can be used for multiple times
|
|
self._var_list = list(var_list)
|
|
self._clip_thresh = clip_thresh
|
|
self._auto_clip_fac = auto_clip_fac
|
|
self._beta = beta
|
|
self._curv_win_width = curv_win_width
|
|
self._zero_debias = zero_debias
|
|
self._sparsity_debias = sparsity_debias
|
|
self._force_non_inc_step = force_non_inc_step
|
|
self._optimizer = torch.optim.SGD(self._var_list, lr=self._lr,
|
|
momentum=self._mu, weight_decay=weight_decay)
|
|
self._iter = 0
|
|
# global states are the statistics
|
|
self._global_state = {}
|
|
|
|
# for decaying learning rate and etc.
|
|
self._lr_factor = 1.0
|
|
|
|
|
|
def state_dict(self):
|
|
# for checkpoint saving
|
|
sgd_state_dict = self._optimizer.state_dict()
|
|
global_state = self._global_state
|
|
lr_factor = self._lr_factor
|
|
iter = self._iter
|
|
lr = self._lr
|
|
mu = self._mu
|
|
clip_thresh = self._clip_thresh
|
|
beta = self._beta
|
|
curv_win_width = self._curv_win_width
|
|
zero_debias = self._zero_debias
|
|
h_min = self._h_min
|
|
h_max = self._h_max
|
|
|
|
return {
|
|
"sgd_state_dict": sgd_state_dict,
|
|
"global_state": global_state,
|
|
"lr_factor": lr_factor,
|
|
"iter": iter,
|
|
"lr": lr,
|
|
"mu": mu,
|
|
"clip_thresh": clip_thresh,
|
|
"beta": beta,
|
|
"curv_win_width": curv_win_width,
|
|
"zero_debias": zero_debias,
|
|
"h_min": h_min,
|
|
"h_max": h_max
|
|
}
|
|
|
|
|
|
def load_state_dict(self, state_dict):
|
|
# for checkpoint saving
|
|
self._optimizer.load_state_dict(state_dict['sgd_state_dict'])
|
|
self._global_state = state_dict['global_state']
|
|
self._lr_factor = state_dict['lr_factor']
|
|
self._iter = state_dict['iter']
|
|
self._lr = state_dict['lr']
|
|
self._mu = state_dict['mu']
|
|
self._clip_thresh = state_dict['clip_thresh']
|
|
self._beta = state_dict['beta']
|
|
self._curv_win_width = state_dict['curv_win_width']
|
|
self._zero_debias = state_dict['zero_debias']
|
|
self._h_min = state_dict["h_min"]
|
|
self._h_max = state_dict["h_max"]
|
|
return
|
|
|
|
|
|
def set_lr_factor(self, factor):
|
|
self._lr_factor = factor
|
|
return
|
|
|
|
|
|
def get_lr_factor(self):
|
|
return self._lr_factor
|
|
|
|
|
|
def zero_grad(self):
|
|
self._optimizer.zero_grad()
|
|
return
|
|
|
|
|
|
def zero_debias_factor(self):
|
|
return 1.0 - self._beta ** (self._iter + 1)
|
|
|
|
|
|
def zero_debias_factor_delay(self, delay):
|
|
# for exponentially averaged stat which starts at non-zero iter
|
|
return 1.0 - self._beta ** (self._iter - delay + 1)
|
|
|
|
|
|
def curvature_range(self):
|
|
global_state = self._global_state
|
|
if self._iter == 0:
|
|
global_state["curv_win"] = torch.FloatTensor(self._curv_win_width, 1).zero_()
|
|
curv_win = global_state["curv_win"]
|
|
grad_norm_squared = self._global_state["grad_norm_squared"]
|
|
curv_win[self._iter % self._curv_win_width] = np.log(grad_norm_squared + eps)
|
|
valid_end = min(self._curv_win_width, self._iter + 1)
|
|
# we use running average over log scale, accelerating
|
|
# h_max / min in the begining to follow the varying trend of curvature.
|
|
beta = self._beta
|
|
if self._iter == 0:
|
|
global_state["h_min_avg"] = 0.0
|
|
global_state["h_max_avg"] = 0.0
|
|
self._h_min = 0.0
|
|
self._h_max = 0.0
|
|
global_state["h_min_avg"] = \
|
|
global_state["h_min_avg"] * beta + (1 - beta) * torch.min(curv_win[:valid_end] )
|
|
global_state["h_max_avg"] = \
|
|
global_state["h_max_avg"] * beta + (1 - beta) * torch.max(curv_win[:valid_end] )
|
|
if self._zero_debias:
|
|
debias_factor = self.zero_debias_factor()
|
|
self._h_min = np.exp(global_state["h_min_avg"] / debias_factor)
|
|
self._h_max = np.exp(global_state["h_max_avg"] / debias_factor)
|
|
else:
|
|
self._h_min = np.exp(global_state["h_min_avg"] )
|
|
self._h_max = np.exp(global_state["h_max_avg"] )
|
|
if self._sparsity_debias:
|
|
self._h_min *= self._sparsity_avg
|
|
self._h_max *= self._sparsity_avg
|
|
return
|
|
|
|
|
|
def grad_variance(self):
|
|
global_state = self._global_state
|
|
beta = self._beta
|
|
self._grad_var = np.array(0.0, dtype=np.float32)
|
|
for group in self._optimizer.param_groups:
|
|
for p in group['params']:
|
|
if p.grad is None:
|
|
continue
|
|
grad = p.grad.data
|
|
state = self._optimizer.state[p]
|
|
|
|
if self._iter == 0:
|
|
state["grad_avg"] = grad.new().resize_as_(grad).zero_()
|
|
state["grad_avg_squared"] = 0.0
|
|
state["grad_avg"].mul_(beta).add_(1 - beta, grad)
|
|
self._grad_var += torch.sum(state["grad_avg"] * state["grad_avg"] )
|
|
|
|
if self._zero_debias:
|
|
debias_factor = self.zero_debias_factor()
|
|
else:
|
|
debias_factor = 1.0
|
|
|
|
self._grad_var /= -(debias_factor**2)
|
|
self._grad_var += global_state['grad_norm_squared_avg'] / debias_factor
|
|
# in case of negative variance: the two term are using different debias factors
|
|
self._grad_var = max(self._grad_var, eps)
|
|
if self._sparsity_debias:
|
|
self._grad_var *= self._sparsity_avg
|
|
return
|
|
|
|
|
|
def dist_to_opt(self):
|
|
global_state = self._global_state
|
|
beta = self._beta
|
|
if self._iter == 0:
|
|
global_state["grad_norm_avg"] = 0.0
|
|
global_state["dist_to_opt_avg"] = 0.0
|
|
global_state["grad_norm_avg"] = \
|
|
global_state["grad_norm_avg"] * beta + (1 - beta) * math.sqrt(global_state["grad_norm_squared"] )
|
|
global_state["dist_to_opt_avg"] = \
|
|
global_state["dist_to_opt_avg"] * beta \
|
|
+ (1 - beta) * global_state["grad_norm_avg"] / (global_state['grad_norm_squared_avg'] + eps)
|
|
if self._zero_debias:
|
|
debias_factor = self.zero_debias_factor()
|
|
self._dist_to_opt = global_state["dist_to_opt_avg"] / debias_factor
|
|
else:
|
|
self._dist_to_opt = global_state["dist_to_opt_avg"]
|
|
if self._sparsity_debias:
|
|
self._dist_to_opt /= (np.sqrt(self._sparsity_avg) + eps)
|
|
return
|
|
|
|
|
|
def grad_sparsity(self):
|
|
global_state = self._global_state
|
|
if self._iter == 0:
|
|
global_state["sparsity_avg"] = 0.0
|
|
non_zero_cnt = 0.0
|
|
all_entry_cnt = 0.0
|
|
for group in self._optimizer.param_groups:
|
|
for p in group['params']:
|
|
if p.grad is None:
|
|
continue
|
|
grad = p.grad.data
|
|
grad_non_zero = grad.nonzero()
|
|
if grad_non_zero.dim() > 0:
|
|
non_zero_cnt += grad_non_zero.size()[0]
|
|
all_entry_cnt += torch.numel(grad)
|
|
beta = self._beta
|
|
global_state["sparsity_avg"] = beta * global_state["sparsity_avg"] \
|
|
+ (1 - beta) * non_zero_cnt / float(all_entry_cnt)
|
|
self._sparsity_avg = \
|
|
global_state["sparsity_avg"] / self.zero_debias_factor()
|
|
return
|
|
|
|
|
|
def lr_grad_norm_avg(self):
|
|
# this is for enforcing lr * grad_norm not
|
|
# increasing dramatically in case of instability.
|
|
# Not necessary for basic use.
|
|
global_state = self._global_state
|
|
beta = self._beta
|
|
if "lr_grad_norm_avg" not in global_state:
|
|
global_state['grad_norm_squared_avg_log'] = 0.0
|
|
global_state['grad_norm_squared_avg_log'] = \
|
|
global_state['grad_norm_squared_avg_log'] * beta \
|
|
+ (1 - beta) * np.log(global_state['grad_norm_squared'] + eps)
|
|
if "lr_grad_norm_avg" not in global_state:
|
|
global_state["lr_grad_norm_avg"] = \
|
|
0.0 * beta + (1 - beta) * np.log(self._lr * np.sqrt(global_state['grad_norm_squared'] ) + eps)
|
|
# we monitor the minimal smoothed ||lr * grad||
|
|
global_state["lr_grad_norm_avg_min"] = \
|
|
np.exp(global_state["lr_grad_norm_avg"] / self.zero_debias_factor() )
|
|
else:
|
|
global_state["lr_grad_norm_avg"] = global_state["lr_grad_norm_avg"] * beta \
|
|
+ (1 - beta) * np.log(self._lr * np.sqrt(global_state['grad_norm_squared'] ) + eps)
|
|
global_state["lr_grad_norm_avg_min"] = \
|
|
min(global_state["lr_grad_norm_avg_min"],
|
|
np.exp(global_state["lr_grad_norm_avg"] / self.zero_debias_factor() ) )
|
|
|
|
|
|
def after_apply(self):
|
|
# compute running average of gradient and norm of gradient
|
|
beta = self._beta
|
|
global_state = self._global_state
|
|
if self._iter == 0:
|
|
global_state["grad_norm_squared_avg"] = 0.0
|
|
|
|
global_state["grad_norm_squared"] = 0.0
|
|
for group in self._optimizer.param_groups:
|
|
for p in group['params']:
|
|
if p.grad is None:
|
|
continue
|
|
grad = p.grad.data
|
|
global_state['grad_norm_squared'] += torch.sum(grad * grad)
|
|
|
|
global_state['grad_norm_squared_avg'] = \
|
|
global_state['grad_norm_squared_avg'] * beta + (1 - beta) * global_state['grad_norm_squared']
|
|
|
|
if self._sparsity_debias:
|
|
self.grad_sparsity()
|
|
|
|
self.curvature_range()
|
|
self.grad_variance()
|
|
self.dist_to_opt()
|
|
|
|
if self._iter > 0:
|
|
self.get_mu()
|
|
self.get_lr()
|
|
|
|
self._lr = beta * self._lr + (1 - beta) * self._lr_t
|
|
self._mu = beta * self._mu + (1 - beta) * self._mu_t
|
|
return
|
|
|
|
|
|
def get_lr(self):
|
|
self._lr_t = (1.0 - math.sqrt(self._mu_t) )**2 / (self._h_min + eps)
|
|
return
|
|
|
|
|
|
def get_cubic_root(self):
|
|
# We have the equation x^2 D^2 + (1-x)^4 * C / h_min^2
|
|
# where x = sqrt(mu).
|
|
# We substitute x, which is sqrt(mu), with x = y + 1.
|
|
# It gives y^3 + py = q
|
|
# where p = (D^2 h_min^2)/(2*C) and q = -p.
|
|
# We use the Vieta's substution to compute the root.
|
|
# There is only one real solution y (which is in [0, 1] ).
|
|
# http://mathworld.wolfram.com/VietasSubstitution.html
|
|
# eps in the numerator is to prevent momentum = 1 in case of zero gradient
|
|
p = (self._dist_to_opt + eps)**2 * (self._h_min + eps)**2 / 2 / (self._grad_var + eps)
|
|
w3 = (-math.sqrt(p**2 + 4.0 / 27.0 * p**3) - p) / 2.0
|
|
w = math.copysign(1.0, w3) * math.pow(math.fabs(w3), 1.0/3.0)
|
|
y = w - p / 3.0 / (w + eps)
|
|
x = y + 1
|
|
return x
|
|
|
|
|
|
def get_mu(self):
|
|
root = self.get_cubic_root()
|
|
dr = self._h_max / self._h_min
|
|
self._mu_t = max(root**2, ( (np.sqrt(dr) - 1) / (np.sqrt(dr) + 1) )**2 )
|
|
return
|
|
|
|
|
|
def update_hyper_param(self):
|
|
for group in self._optimizer.param_groups:
|
|
group['momentum'] = self._mu
|
|
if self._force_non_inc_step == False:
|
|
group['lr'] = self._lr * self._lr_factor
|
|
elif self._iter > self._curv_win_width:
|
|
# force to guarantee lr * grad_norm not increasing dramatically.
|
|
# Not necessary for basic use. Please refer to the comments
|
|
# in YFOptimizer.__init__ for more details
|
|
self.lr_grad_norm_avg()
|
|
debias_factor = self.zero_debias_factor()
|
|
group['lr'] = min(self._lr * self._lr_factor,
|
|
2.0 * self._global_state["lr_grad_norm_avg_min"] \
|
|
/ np.sqrt(np.exp(self._global_state['grad_norm_squared_avg_log'] / debias_factor) ) )
|
|
return
|
|
|
|
|
|
def auto_clip_thresh(self):
|
|
# Heuristic to automatically prevent sudden exploding gradient
|
|
# Not necessary for basic use.
|
|
return math.sqrt(self._h_max) * self._auto_clip_fac
|
|
|
|
|
|
def step(self):
|
|
# add weight decay
|
|
for group in self._optimizer.param_groups:
|
|
for p in group['params']:
|
|
if p.grad is None:
|
|
continue
|
|
grad = p.grad.data
|
|
|
|
if group['weight_decay'] != 0:
|
|
grad = grad.add(group['weight_decay'], p.data)
|
|
|
|
if self._clip_thresh != None:
|
|
torch.nn.utils.clip_grad_norm(self._var_list, self._clip_thresh)
|
|
elif (self._iter != 0 and self._auto_clip_fac != None):
|
|
# do not clip the first iteration
|
|
torch.nn.utils.clip_grad_norm(self._var_list, self.auto_clip_thresh() )
|
|
|
|
# apply update
|
|
self._optimizer.step()
|
|
|
|
# after appply
|
|
self.after_apply()
|
|
|
|
# update learning rate and momentum
|
|
self.update_hyper_param()
|
|
|
|
self._iter += 1
|
|
return
|