# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================

"""file containing all auxillary functions for running the optimization algorithms"""
|
|
|
|
# pylint: skip-file
|
|
# pyformat: disable
|
|
|
|
import time
|
|
from my_logistic_regression import MyLogisticRegression
|
|
import numpy as np
|
|
|
|
|
|


class CompareAlgs:
  """Class to run multiple iterative algorithms and compare the results."""

  def __init__(self, lrp, optimal_w, hyper_dict):
    """Initialize the problem.

    lrp = an instance of MyLogisticRegression built on the dataset
    (features, labels)
    optimal_w = optimal minimizer of the logistic loss on the dataset
    without privacy
    hyper_dict = dictionary of hyperparameters (including "num_iteration")
    """
    self.w_opt = optimal_w
    self.lrp = lrp
    self.iters = hyper_dict["num_iteration"]
    self.hyper_params = hyper_dict
    self.clock_time = []
    self.params = []
    self.names = []

  def add_algo(self, update_rule, name):
    """Run an iterative update rule and record its iterates under `name`."""
    _, dim = self.lrp.num_samples, self.lrp.dim
    wint_un = np.random.multivariate_normal(np.zeros(dim), np.eye(dim))
    w_int = wint_un / np.linalg.norm(wint_un)
    cutoff_norm = (
        100 * np.linalg.norm(self.w_opt) + 100 * np.linalg.norm(w_int) + 100
    )
    w_cur = w_int
    params = [w_cur]
    start_t = time.time()
    wall_clock = [0]
    for _ in range(self.iters):
      w_cur = update_rule(w_cur, self.lrp, self.hyper_params)
      if np.linalg.norm(w_cur) > cutoff_norm:
        w_cur = w_int
        print("Stop Things Exploding!")
      params.append(w_cur)
      wall_clock.append(time.time() - start_t)
    self.clock_time.append(wall_clock)
    self.params.append(params)
    self.names.append(name)

  def wall_clock_alg(self):
    """Return the wall-clock time per iteration for the different algorithms."""
    clock_time_dict = {}
    for time_alg, name in zip(self.clock_time, self.names):
      clock_time_dict[name] = [time_alg]
    return clock_time_dict

  def loss_vals(self):
    """Return the excess loss per iteration for the different methods."""
    baseline = self.lrp.loss_wor(self.w_opt)
    loss_dict = {}
    for params, name in zip(self.params, self.names):
      losses = [self.lrp.loss_wor(w) - baseline for w in params]
      loss_dict[name] = [losses]
    return loss_dict

  def accuracy_vals(self):
    """Return the accuracy per iteration for the different methods."""
    acc_dict = {}
    for params, name in zip(self.params, self.names):
      acc_vec = [self.lrp.accuracy(w) for w in params]
      acc_dict[name] = [acc_vec]
    return acc_dict

  def accuracy_np(self):
    """Return the accuracy of the optimal model without privacy."""
    return self.lrp.accuracy(self.w_opt)

  def gradnorm_vals(self):
    """Return the gradient norm per iteration for the different methods."""
    gradnorm_dict = {}
    for params, name in zip(self.params, self.names):
      grad_norms = [np.linalg.norm(self.lrp.grad_wor(w)) for w in params]
      gradnorm_dict[name] = [grad_norms]
    return gradnorm_dict
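

# Usage sketch for CompareAlgs (commented out, not executed here). It assumes
# `features`, `labels`, and a non-private minimizer `w_opt` of matching
# dimension already exist (e.g. computed with `newton` below), and that
# MyLogisticRegression can be built directly from the features and labels.
# The hyperparameter keys follow the dictionaries consumed by the update
# rules defined later in this file.
#
#   lrp = MyLogisticRegression(features, labels)
#   hyper_dict = {"num_iteration": 50, "total": 1.0, "grad_frac": 0.5}
#   compare = CompareAlgs(lrp, w_opt, hyper_dict)
#   compare.add_algo(gd_priv, "DP-GD")
#   compare.add_algo(private_newton, "DP-Newton")
#   losses = compare.loss_vals()       # excess loss per iteration, keyed by name
#   runtimes = compare.wall_clock_alg()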


def private_newton(w_cur, lrp, hyper_dict):
  """Implementation of the private Newton method from [ABL21].

  w_cur = current iterate
  lrp = an instance of MyLogisticRegression
  hyper_dict = privacy budget, budget split, and number of iterations

  return the next iterate
  """
  total = hyper_dict["total"]
  grad_frac = hyper_dict["grad_frac"]
  iters = hyper_dict["num_iteration"]
  hess = lrp.hess(w_cur)
  rho_grad = grad_frac * total / iters  # divide total privacy budget up.
  rho_hess = (1 - grad_frac) * total / iters
  hess_noise = np.random.normal(
      scale=(0.25 / lrp.num_samples) * np.sqrt(0.5 / rho_hess),
      size=(lrp.dim, lrp.dim),
  )
  hess_noise = (hess_noise + hess_noise.T) / 2
  hess_noisy = eigenclip(hess + hess_noise)
  grad = lrp.grad(w_cur)
  grad_noisy = grad + np.random.normal(
      scale=(1 / lrp.num_samples) * np.sqrt(0.5 / rho_grad), size=lrp.dim
  )
  dir_noisy = np.linalg.solve(hess_noisy, grad_noisy)
  dir_size = np.linalg.norm(np.linalg.solve(hess, grad))
  return w_cur - min(np.log(1 + dir_size) * (1 / dir_size), 1) * dir_noisy
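

# A minimal sketch of a single private Newton step (assuming `lrp` and an
# iterate `w` are already defined; the values below are only for
# illustration). "total" is the overall privacy budget, which the code divides
# evenly across "num_iteration" steps and then splits between the gradient and
# Hessian estimates according to "grad_frac".
#
#   hyper_dict = {"total": 1.0, "grad_frac": 0.5, "num_iteration": 20}
#   w_next = private_newton(w, lrp, hyper_dict)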


def eigenclip(sym_mat, min_eval=1e-5):
  """Clip the eigenvalues of a symmetric matrix from below.

  sym_mat = symmetric matrix
  min_eval = minimum eigenvalue for clipping

  return the modified matrix
  """
  eig_val, eig_vec = np.linalg.eigh(sym_mat)
  eval_mod = np.maximum(eig_val, min_eval * np.ones(eig_val.shape))
  clipped_mat = np.dot(eig_vec * eval_mod, eig_vec.T)
  return clipped_mat
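

# Small illustrative check for eigenclip (commented out; the matrix is just an
# example): eigenvalues below `min_eval` are raised to `min_eval` while the
# eigenvectors are kept.
#
#   m = np.array([[1.0, 0.0], [0.0, -2.0]])
#   eigenclip(m, min_eval=1e-5)   # ~ [[1.0, 0.0], [0.0, 1e-5]]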


def gd_priv(w_cur, lrp, hyper_dict):
  """Implementation of DP-GD.

  w_cur = current point
  lrp = an instance of MyLogisticRegression
  hyper_dict = auxiliary information: total privacy budget and iteration count

  output is the next iterate
  """
  iters = hyper_dict["num_iteration"]
  inv_lr_gd = 0.25  # inverse learning rate, set to the smoothness constant
  sens = 1 / (lrp.num_samples * (inv_lr_gd))  # sensitivity
  rho = hyper_dict["total"] / iters  # divide total privacy budget up
  noise = np.random.normal(scale=sens / np.sqrt(2 * rho), size=lrp.dim)
  return w_cur - lrp.grad_wor(w_cur) / (inv_lr_gd) + noise
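

# Sketch of the hyperparameter dictionary gd_priv expects (values are
# assumptions for illustration). The step size is fixed at 1 / 0.25 = 4 by the
# smoothness bound, and the per-step noise is calibrated from the total budget
# split evenly across the iterations.
#
#   hyper_dict = {"total": 1.0, "num_iteration": 100}
#   w_next = gd_priv(w, lrp, hyper_dict)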


def sgd_priv(w_cur, lrp, hyper_dict):
  """Implementation of DP-SGD.

  w_cur = current point
  lrp = an instance of MyLogisticRegression
  hyper_dict = auxiliary information: batch size and noise multiplier

  output is the next iterate
  """
  batch_size = hyper_dict["batch_size"]
  sigma_privacy = hyper_dict["noise_multiplier"]
  lr_sgd = 4  # learning rate based on the smoothness
  sample_rate = batch_size / lrp.num_samples  # sampling probability
  sample_vec = np.random.binomial(n=1, p=sample_rate, size=lrp.num_samples)
  batch_idx = np.where(sample_vec == 1)[0]  # indices of the sampled batch
  batch_size_t = len(batch_idx)
  noise = np.random.normal(scale=sigma_privacy, size=lrp.dim)
  grad_minibatch = lrp.grad_wor(
      w_cur, batch_idx
  )  # average gradient over batch_idx
  return w_cur - lr_sgd * (
      batch_size_t / batch_size * grad_minibatch + noise / batch_size
  )
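

# Sketch of the hyperparameters sgd_priv expects (assumed values for
# illustration). Unlike gd_priv, the privacy noise is specified directly via a
# noise multiplier rather than derived from a total budget, and the batch is
# Poisson-sampled with rate batch_size / num_samples.
#
#   hyper_dict = {"batch_size": 256, "noise_multiplier": 1.0}
#   w_next = sgd_priv(w, lrp, hyper_dict)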


def gd_priv_optls(w_cur, lrp, hyper_dict):
  """Implementation of DP-GD with backtracking line search.

  Note: this method is NOT private. We only use it as a baseline.

  w_cur = current point
  lrp = an instance of MyLogisticRegression
  hyper_dict = auxiliary information: total privacy budget and iteration count

  output is the next iterate
  """
  iters = hyper_dict["num_iteration"]
  rho_grad = hyper_dict["total"] / iters  # divide total privacy budget up
  grad_scale = (1 / lrp.num_samples) * np.sqrt(0.5 / rho_grad)
  grad_noise = np.random.normal(scale=grad_scale, size=lrp.dim)
  dir_srch = lrp.grad(w_cur) + grad_noise
  stepsize_opt = backtracking_ls(lrp, dir_srch, w_cur)
  return w_cur - stepsize_opt * dir_srch


def backtracking_ls(lrp, dir_srch, w_start, alpha=0.4, beta=0.95):
  """Implementation of backtracking line search.

  lrp = an instance of MyLogisticRegression
  dir_srch = the (possibly noisy) search direction
  w_start = current point
  alpha and beta trade off the precision and complexity of the line search

  output is a step size that approximately satisfies the sufficient-decrease
  (Armijo) condition
  """
  step_size = 100
  val_0 = lrp.loss(w_start)
  inner_prod = np.dot(dir_srch, lrp.grad(w_start))
  while (
      lrp.loss(w_start - step_size * dir_srch)
      >= val_0 - step_size * alpha * inner_prod
  ):
    step_size = beta * step_size
    if step_size < 1e-6:
      break
  return step_size
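

# Usage sketch (assuming `lrp` and a current point `w` exist; names are only
# illustrative). The search direction can be any descent direction, e.g. the
# gradient as in gd_priv_optls or the Newton step as in `newton` below.
#
#   direction = lrp.grad(w)
#   t = backtracking_ls(lrp, direction, w)
#   w_next = w - t * direction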


def newton(dataset, w_init, bias=True):
  """Newton's method with line search, without privacy constraints.

  dataset = dataset in the format (features, labels)
  w_init = initialization point

  output is the model parameter
  """
  feature_vecs, labels = dataset
  if bias is True:
    feature_vecs = np.hstack(
        (np.ones(shape=(np.shape(feature_vecs)[0], 1)), feature_vecs)
    )
  lrp = MyLogisticRegression(feature_vecs, labels, reg=1e-9)
  w_cur = w_init
  for _ in range(8):
    hess = lrp.hess(w_cur)
    dir_srch = np.linalg.solve(hess, lrp.grad_wor(w_cur))
    step_size = backtracking_ls(lrp, dir_srch, w_cur)
    w_cur = w_cur - step_size * dir_srch
  if lrp.loss_wor(w_cur) < lrp.loss_wor(w_init):
    w_out = w_cur
  else:
    w_out = w_init
  return w_out
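

# Example of computing a non-private baseline minimizer with `newton`
# (assuming `features` of shape (n, d) and binary `labels` exist; commented
# out). With bias=True a column of ones is prepended to the features, so the
# initial point needs d + 1 coordinates.
#
#   w0 = np.zeros(features.shape[1] + 1)
#   w_opt = newton((features, labels), w0, bias=True)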


class DoubleNoiseMech:
  """Our Method: Double Noise Mechanism."""

  def __init__(self, lrp, type_reg="add", curvature_info="hessian"):
    """Initializer of the double noise mechanism.

    lrp = an instance of MyLogisticRegression
    type_reg = minimum eigenvalue modification type, it can be either 'add' or
    'clip'
    curvature_info = type of the second-order information, either 'hessian' or
    'ub' (an upper bound on the Hessian)
    """
    self.type_reg = type_reg
    self.curvature_info = curvature_info
    if self.curvature_info == "hessian":
      self.hess = lrp.hess_wor
    elif self.curvature_info == "ub":
      self.hess = lrp.upperbound_wor

  def update_rule(self, w_cur, lrp, hyper_dict):
    """Implementation of the double noise mechanism update rule--full batch."""
    noisy_grad_cur = self.noisy_grad(w_cur, lrp, hyper_dict)
    w_next = self.noisy_direction(w_cur, lrp, hyper_dict, noisy_grad_cur)
    return w_next

  def update_rule_stochastic(self, w_cur, lrp, hyper_dict):
    """Implementation of the double noise mechanism update rule--minibatch."""
    noisy_grad_cur = self.noisy_grad(w_cur, lrp, hyper_dict, True)
    w_next = self.noisy_direction_stochastic(
        w_cur, lrp, hyper_dict, noisy_grad_cur
    )
    return w_next

  def noisy_grad(self, w_cur, lrp, hyper_dict, batch=False):
    """Compute the noisy gradient (full batch or Poisson-sampled minibatch)."""
    if batch is False:
      rho_grad = (hyper_dict["grad_frac"] * hyper_dict["total"]) / hyper_dict[
          "num_iteration"
      ]
      noise_grad = np.random.normal(
          scale=(1 / lrp.num_samples) * np.sqrt(0.5 / rho_grad), size=lrp.dim
      )
      noisy_grad = lrp.grad(w_cur) + noise_grad
    else:
      std_grad = hyper_dict["noise_multiplier_grad"]
      pgrad = hyper_dict["batchsize_grad"] / lrp.num_samples
      sample_vec = np.random.binomial(n=1, p=pgrad, size=lrp.num_samples)
      batch_idx_grad = np.where(sample_vec == 1)[0]
      grad_minibatch = lrp.grad_wor(w_cur, batch_idx_grad)
      noise_grad = np.random.normal(scale=std_grad, size=lrp.dim)
      noisy_grad = (
          len(batch_idx_grad) / (lrp.num_samples * pgrad)
      ) * grad_minibatch + (noise_grad) / (lrp.num_samples * pgrad)
    return noisy_grad

  def noisy_direction(self, w_cur, lrp, hyper_dict, noisy_grad):
    """Compute the noisy update direction for the full-batch variant."""
    total = hyper_dict["total"]
    grad_frac = hyper_dict["grad_frac"]
    frac_trace = hyper_dict["trace_frac"]
    trace_coeff = hyper_dict["trace_coeff"]
    iters = hyper_dict["num_iteration"]
    rho_hess = (1 - grad_frac) * total / iters
    smooth_param = 0.25
    hess_cur = self.hess(w_cur)
    noisy_trace = trace_coeff * max(
        np.trace(hess_cur)
        + np.random.normal(
            scale=(0.25 / lrp.num_samples)
            * np.sqrt(0.5 / (frac_trace * rho_hess))
        ),
        0,
    )
    min_eval = max(
        (noisy_trace / ((lrp.num_samples) ** 2 * (1 - frac_trace) * rho_hess))
        ** (1 / 3),
        1 / (lrp.num_samples),
    )
    grad_norm = np.linalg.norm(noisy_grad)
    if self.type_reg == "add":  # Sensitivity is different for add vs clip
      sens2 = (
          grad_norm
          * smooth_param
          / (lrp.num_samples * min_eval**2 + smooth_param * min_eval)
      )
      noise2 = np.random.normal(
          scale=sens2 * np.sqrt(0.5 / ((1 - frac_trace) * rho_hess)),
          size=lrp.dim,
      )
      return (
          w_cur
          - np.linalg.solve(hess_cur + min_eval * np.eye(lrp.dim), noisy_grad)
          + noise2
      )
    # type_reg=clip
    sens2 = (
        grad_norm
        * smooth_param
        / (lrp.num_samples * min_eval**2 - smooth_param * min_eval)
    )
    noise2 = np.random.normal(
        scale=sens2 * np.sqrt(0.5 / ((1 - frac_trace) * rho_hess)), size=lrp.dim
    )
    eval_hess, evec_hess = np.linalg.eigh(hess_cur)
    eval_trunc = eval_hess[eval_hess >= min_eval]
    num_eig = len(eval_trunc)
    if num_eig == 0:
      hess_modified_inv = 1 / min_eval * np.eye(lrp.dim)
    else:
      evec_trun = evec_hess[:, -num_eig:]
      hess_modified_inv = np.dot(
          evec_trun * (1 / eval_trunc - 1 / min_eval * np.ones(num_eig)),
          evec_trun.T,
      ) + 1 / min_eval * np.eye(lrp.dim)
    return w_cur - (hess_modified_inv @ noisy_grad) + noise2

  def noisy_direction_stochastic(self, w_cur, lrp, hyper_dict, noisy_grad):
    """Compute the noisy update direction for the minibatch variant."""
    std_hess = hyper_dict["noise_multiplier_hess"]
    phess = hyper_dict["batchsize_hess"] / lrp.num_samples
    min_eval = hyper_dict["min_eval"]
    sample_vec = np.random.binomial(n=1, p=phess, size=lrp.num_samples)
    batch_idx_hess = np.where(sample_vec == 1)[0]
    batch_size_hess_t = len(batch_idx_hess)
    hess_cur = (
        (batch_size_hess_t)
        / (lrp.num_samples * phess)
        * self.hess(w_cur, batch_idx_hess)
    )
    smooth_param = 0.25  # smoothness parameter
    grad_norm = np.linalg.norm(noisy_grad)
    if self.type_reg == "add":  # Sensitivity is different for add vs clip
      sens2 = (
          grad_norm
          * smooth_param
          / (
              (lrp.num_samples * phess) * min_eval**2
              + smooth_param * min_eval
          )
      )
      noise2 = np.random.normal(scale=sens2 * std_hess, size=lrp.dim)
      return (
          w_cur
          - np.linalg.solve(
              hess_cur + min_eval * np.eye(len(hess_cur)), noisy_grad
          )
          + noise2
      )
    # type_reg=clip
    min_eval_c = max(min_eval, 1 / ((lrp.num_samples * phess)))
    sens2 = (
        grad_norm
        * smooth_param
        / (
            (lrp.num_samples * phess) * min_eval_c**2
            - smooth_param * min_eval_c
        )
    )
    noise2 = np.random.normal(scale=sens2 * std_hess, size=lrp.dim)
    eval_hess, evec_hess = np.linalg.eigh(hess_cur)
    eval_trunc = eval_hess[eval_hess >= min_eval_c]
    num_eig = len(eval_trunc)
    if num_eig == 0:
      hess_modified_inv = 1 / min_eval_c * np.eye(lrp.dim)
    else:
      evec_trun = evec_hess[:, -num_eig:]
      hess_modified_inv = np.dot(
          evec_trun * (1 / eval_trunc - 1 / min_eval_c * np.ones(num_eig)),
          evec_trun.T,
      ) + 1 / min_eval_c * np.eye(lrp.dim)
    return w_cur - (hess_modified_inv @ noisy_grad) + noise2
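

# Usage sketch for DoubleNoiseMech together with CompareAlgs (assumed
# hyperparameter values for illustration; commented out). The full-batch rule
# draws its budget from "total"/"grad_frac"/"trace_frac"/"trace_coeff", while
# the stochastic rule instead takes explicit noise multipliers, batch sizes,
# and a minimum eigenvalue.
#
#   dnm = DoubleNoiseMech(lrp, type_reg="clip", curvature_info="hessian")
#   hyper_dict = {
#       "num_iteration": 50,
#       "total": 1.0,
#       "grad_frac": 0.5,
#       "trace_frac": 0.1,
#       "trace_coeff": 1.0,
#   }
#   compare = CompareAlgs(lrp, w_opt, hyper_dict)
#   compare.add_algo(dnm.update_rule, "DoubleNoise-clip")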