diff --git a/research/dp_newton/dpso-logistic.py b/research/dp_newton/dpso-logistic.py
new file mode 100644
index 0000000..3e5ac43
--- /dev/null
+++ b/research/dp_newton/dpso-logistic.py
@@ -0,0 +1,1238 @@
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# -*- coding: utf-8 -*-
+"""Differentially Private Second-Order Methods for Logistic Regression.
+
+This script implements several algorithms for DP logistic regression and
+tests them on various datasets. It produces plots for our upcoming paper.
+
+Code exported from Colab. Written by Mahdi Haghifam.
+
+"""
+
+# pylint: disable=invalid-name
+# We use upper case to denote matrices and lower case for vectors.
+# This conflicts with pylint's variable naming rules.
+# pylint: disable=redefined-outer-name
+# This is a script, hence we have "global" variables.
+# pylint: disable=unused-argument
+# The update rule functions are meant to have the same signature,
+# so we cannot just remove arguments. Ideally this would have been
+# implemented as a class, but it is much easier to define functions.
+
+import math
+import urllib.request
+
+# from keras.utils.np_utils import to_categorical
+import matplotlib.pyplot as plt
+import numpy as np
+from sklearn import preprocessing
+import sklearn.datasets
+from sklearn.linear_model import LogisticRegression
+import tensorflow as tf
+# from tensorflow import keras
+
+
+class MyLogisticRegression:
+  """Represents a logistic regression problem.
+
+  There is a dataset consisting of features (vectors of norm <=1)
+  and labels (+1,-1), represented as a numpy array.
+  There is also an L2 regularizer.
+  """
+
+  def __init__(self, X, y, reg=1e-8):
+    """Initialize the data and the regularizer.
+
+    Args:
+      X: n x d numpy array representing features
+      y: length n numpy array representing labels
+      reg: L2 regularizing coefficient (to ensure solution is finite)
+
+    Data will be rescaled so that ||X[i,:]|| * |y[i]| <= 1 for all i.
+    """
+    self.reg = float(reg)
+    X = np.array(X)
+    y = np.array(y)
+    assert len(X.shape) == 2
+    assert len(y.shape) == 1
+    self.n, self.d = X.shape
+    assert y.shape[0] == self.n
+    signed_data = X * y[:, np.newaxis]
+    norm = np.linalg.norm(signed_data, axis=1)
+    scale = np.maximum(norm, np.ones_like(norm))
+    self.data = (1 / scale[:, None]) * signed_data
+
+  def loss(self, w):
+    """Computes the loss represented by this object at w.
+
+    Args:
+      w: weight vector
+
+    Returns:
+      If X,y is the data and reg is the regularizer, then the loss is
+      (1/n)sum_i^n log(1+exp(-<X[i,:]*y[i], w>)) + (reg/2)||w||^2
+    """
+    data_loss = np.mean(np.log1p(np.exp(-np.dot(self.data, w))))
+    reg_loss = 0.5 * self.reg * np.linalg.norm(w)**2
+    return data_loss + reg_loss
+
+  def loss_wor(self, w):
+    """Computes the loss represented by this object at w without regularizer.
+
+    Args:
+      w: weight vector
+
+    Returns:
+      If X,y is the data, then the loss is
+      (1/n)sum_i^n log(1+exp(-<X[i,:]*y[i], w>))
+    """
+    data_loss = np.mean(np.log1p(np.exp(-np.dot(self.data, w))))
+    return data_loss
+
+  def grad(self, w):
+    """Computes the gradient of the logistic regression at a given point w.
+
+    Args:
+      w: weight vector
+
+    Returns:
+      If X,y is the data and reg is the regularizer, then the gradient is
+      (-1/n)sum_i^n X[i,:]*y[i]/(1+exp(<X[i,:]*y[i], w>)) + reg*w
+    """
+    coeff_grad = -1/(1+np.exp(np.dot(self.data, w)))
+    data_grad = np.mean(self.data * coeff_grad[:, np.newaxis], axis=0)
+    return data_grad + self.reg * w
+
+  def grad_wor(self, w):
+    """Computes the unregularized gradient of the logistic regression at w.
+
+    Args:
+      w: weight vector
+
+    Returns:
+      If X,y is the data, then the gradient is
+      (-1/n)sum_i^n X[i,:]*y[i]/(1+exp(<X[i,:]*y[i], w>))
+    """
+    coeff_grad = -1/(1+np.exp(np.dot(self.data, w)))
+    data_grad = np.mean(self.data * coeff_grad[:, np.newaxis], axis=0)
+    return data_grad
+
+  def hess(self, w):
+    """Computes the Hessian of the logistic regression at a given point w.
+
+    Args:
+      w: weight vector
+
+    Returns:
+      The Hessian is the matrix of second derivatives.
+      If X,y is the data and reg is the regularizer, then the Hessian is
+      (1/n)sum_i^n X[i,:]*X[i,:]^T / (cosh(<X[i,:]*y[i], w>/2)*2)^2
+      where we assume y[i]^2==1.
+    """
+    a = np.dot(self.data, w)/2
+    coeff_hess = 1 / (np.exp(a)+np.exp(-a))**2
+    raw_hess = np.dot(self.data.T * coeff_hess, self.data)
+    return raw_hess/self.n + self.reg * np.eye(self.d)
+
+  def hess_wor(self, w):
+    """Computes the unregularized Hessian of the logistic regression at w.
+
+    Args:
+      w: weight vector
+
+    Returns:
+      The Hessian is the matrix of second derivatives.
+      If X,y is the data, then the Hessian is
+      (1/n)sum_i^n X[i,:]*X[i,:]^T / (cosh(<X[i,:]*y[i], w>/2)*2)^2
+      where we assume y[i]^2==1.
+    """
+    a = np.dot(self.data, w)/2
+    coeff_hess = 1 / (np.exp(a)+np.exp(-a))**2
+    raw_hess = np.dot(self.data.T * coeff_hess, self.data)
+    return raw_hess/self.n
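+
+  # The DP mechanisms defined later in this file rely on two bounds that the
+  # rescaling in __init__ guarantees: every row of self.data has norm <= 1,
+  # so each per-example gradient term has norm at most 1, and each
+  # per-example Hessian term has eigenvalues in [0, 1/4] (hence the
+  # unregularized loss is (1/4)-smooth). These constants show up below as the
+  # 1/n and 0.25/n sensitivity factors and as the 0.25 constants
+  # (inv_lr_gd in gd_priv, m in DoubleNoiseMech.update_rule).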
+
+  def upperbound(self, w):
+    """Computes tightest universal quadratic upper bound on the loss function.
+
+    log(1+exp(x))<=log(1+exp(a))+(x-a)/(1+exp(-a))+(x-a)^2*tanh(a/2)/(4*a)
+    The constant and linear terms are just the first-order Taylor expansion.
+    This function gives the quadratic term (which replaces the Hessian).
+    https://twitter.com/shortstein/status/1557961202256318464
+
+    Args:
+      w: weight vector
+
+    Returns:
+      Matrix H such that, for all v,
+      loss(v) <= loss(w) + <grad loss(w), v-w> + <v-w, H (v-w)>/2
+    """
+    a = -np.dot(self.data, w)  # vector of -y_i<x_i, w> for i in [n]
+    # v = 0.5*np.tanh(a/2)/a
+    # But avoid 0/0 by special rule
+    v = np.divide(
+        0.5 * np.tanh(a / 2),
+        a,
+        out=(np.ones(a.shape) * 0.25),
+        where=(np.abs(a) > 1e-9))
+    H = np.dot(self.data.T * v, self.data)
+    return H / self.n + self.reg * np.eye(self.d)
+
+  def upperbound_wor(self, w):
+    """Computes tightest quadratic upper bound on the unregularized loss.
+
+    log(1+exp(x))<=log(1+exp(a))+(x-a)/(1+exp(-a))+(x-a)^2*tanh(a/2)/(4*a)
+    The constant and linear terms are just the first-order Taylor expansion.
+    This function gives the quadratic term (which replaces the Hessian).
+    https://twitter.com/shortstein/status/1557961202256318464
+
+    Args:
+      w: weight vector
+
+    Returns:
+      Matrix H such that, for all v,
+      loss(v) <= loss(w) + <grad loss(w), v-w> + <v-w, H (v-w)>/2
+    """
+    a = -np.dot(self.data, w)  # vector of -y_i<x_i, w> for i in [n]
+    # v = 0.5*np.tanh(a/2)/a
+    # But avoid 0/0 by special rule
+    v = np.divide(
+        0.5 * np.tanh(a / 2),
+        a,
+        out=(np.ones(a.shape) * 0.25),
+        where=(np.abs(a) > 1e-9))
+    H = np.dot(self.data.T * v, self.data)
+    return H / self.n
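+
+
+# A small self-check of the quadratic upper bound above. This is an
+# illustrative debugging sketch only: the helper name and its random test
+# problem are ours, and nothing in the experiments below calls it.
+def _check_quadratic_upper_bound(num_trials=100, n=200, d=10, seed=0):
+  """Empirically checks loss(v) <= loss(w) + <g, v-w> + <v-w, H(v-w)>/2."""
+  rng = np.random.RandomState(seed)
+  X = rng.normal(size=(n, d))
+  y = 2 * rng.binomial(1, 0.5, size=n) - 1
+  lr = MyLogisticRegression(X, y, reg=1e-8)
+  for _ in range(num_trials):
+    w = rng.normal(size=d)
+    v = rng.normal(size=d)
+    g = lr.grad(w)
+    H = lr.upperbound(w)
+    quad = (lr.loss(w) + np.dot(g, v - w)
+            + 0.5 * np.dot(v - w, np.dot(H, v - w)))
+    assert lr.loss(v) <= quad + 1e-9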
+
+
+class Mydatasets:
+  """Represents datasets we use for testing the algorithms.
+  """
+
+  def __init__(self):
+    pass
+
+  def find_optimal_classifier(self, dataset, reg=1e-9):
+    """Find the optimal weight vector for the logistic regression.
+
+    Args:
+      dataset: training dataset
+      reg: regularizer
+
+    Returns:
+      Optimal weight vector.
+    """
+    X, y = dataset
+    model_lr = LogisticRegression(max_iter=10000, C=1/reg).fit(X, y)
+    w_opt1 = np.concatenate([model_lr.intercept_, np.squeeze(model_lr.coef_)])
+    w_opt = newton(dataset, w_opt1)
+    print("optimal weight vector norm", np.linalg.norm(w_opt))
+    return w_opt
+
+  def mnist_binary(self):
+    """Download and extract MNIST data.
+
+    We also select only two labels for the binary classification task.
+
+    Returns:
+      Features, labels, and optimal weight vector.
+    """
+    labels = [1, 7]
+    label0, label1 = int(labels[0]), int(labels[1])
+    mnist = tf.keras.datasets.mnist
+    (x_train, y_train), (_, _) = mnist.load_data()
+    x_train = x_train.reshape(x_train.shape[0], -1)
+    scaler = preprocessing.StandardScaler().fit(x_train)
+    x_train = scaler.transform(x_train)
+    nrm = np.linalg.norm(x_train, axis=1)
+    x_train = x_train * 1/nrm[:, None]
+    y_train = y_train.astype(float)
+    indx0 = np.nonzero(y_train == label0)[0]
+    indx1 = np.nonzero(y_train == label1)[0]
+    y_train[indx0] = -1
+    y_train[indx1] = 1
+    indx = np.concatenate((indx0, indx1))
+    x_train = x_train[indx]
+    labels = y_train[indx]
+    dataset = x_train, labels
+    w_opt = self.find_optimal_classifier(dataset)
+    X = np.hstack((np.ones(shape=(np.shape(x_train)[0], 1)),
+                   x_train))  # adding a dummy dimension for the bias term.
+    return X, labels, w_opt
+
+  def w8a_dataset(self):
+    """w8a dataset for logistic regression.
+    """
+    num_points = 15e3
+    w8a_url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/w8a"
+    data_path = "./w8a"
+    urllib.request.urlretrieve(w8a_url, data_path)
+    X, labels = sklearn.datasets.load_svmlight_file(data_path)
+    X = X.toarray()
+    selected_samples = np.random.choice(len(X), int(num_points))
+    X = X[selected_samples, :]
+    labels = labels[selected_samples]
+    scaler = preprocessing.StandardScaler().fit(X)
+    X = scaler.transform(X)
+    nrm = np.linalg.norm(X, axis=1)
+    X = X * 1/nrm[:, None]
+    labels = labels.astype(float)
+    dataset = X, labels
+    w_opt = self.find_optimal_classifier(dataset)
+    X = np.hstack((np.ones(shape=(np.shape(X)[0], 1)),
+                   X))  # adding a dummy dimension for the bias term.
+    return X, labels, w_opt
+
+  def a1a_dataset(self):
+    """Loads a1a dataset for logistic regression.
+ """ + a1a_url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a" + data_path = "./a1a" + urllib.request.urlretrieve(a1a_url, data_path) + X, labels = sklearn.datasets.load_svmlight_file(data_path) + X = X.toarray() + scaler = preprocessing.StandardScaler().fit(X) + X = scaler.transform(X) + nrm = np.linalg.norm(X, axis=1) + X = X * 1/nrm[:, None] + labels = labels.astype(float) + dataset = X, labels + w_opt = self.find_optimal_classifier(dataset) + X = np.hstack((np.ones(shape=(np.shape(X)[0], 1)), + X)) # adding a dummy dimension for the bias term. + return X, labels, w_opt + + def phishing(self): + """phishing dataset for logistic regression. + """ + phishing_url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/phishing" + data_path = "./phishing" + urllib.request.urlretrieve(phishing_url, data_path) + X, labels = sklearn.datasets.load_svmlight_file(data_path) + X = X.toarray() + scaler = preprocessing.StandardScaler().fit(X) + X = scaler.transform(X) + nrm = np.linalg.norm(X, axis=1) + X = X * 1/nrm[:, None] + labels = labels.astype(float) + dataset = X, labels + w_opt = self.find_optimal_classifier(dataset) + X = np.hstack((np.ones(shape=(np.shape(X)[0], 1)), + X)) # adding a dummy dimension for the bias term. + return X, labels, w_opt + + def a5a_dataset(self): + """a5a dataset for logistic regression. + """ + a5a_url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a5a" + data_path = "./a5a" + urllib.request.urlretrieve(a5a_url, data_path) + X, labels = sklearn.datasets.load_svmlight_file(data_path) + X = X.toarray() + scaler = preprocessing.StandardScaler().fit(X) + X = scaler.transform(X) + nrm = np.linalg.norm(X, axis=1) + X = X * 1/nrm[:, None] + labels = labels.astype(float) + dataset = X, labels + w_opt = self.find_optimal_classifier(dataset) + X = np.hstack((np.ones(shape=(np.shape(X)[0], 1)), + X)) # adding a dummy dimension for the bias term. + return X, labels, w_opt + + def a6a_dataset(self): + """a6a dataset for logistic regression. + """ + a6a_url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a5a" + data_path = "./a6a" + urllib.request.urlretrieve(a6a_url, data_path) + X, labels = sklearn.datasets.load_svmlight_file(data_path) + X = X.toarray() + scaler = preprocessing.StandardScaler().fit(X) + X = scaler.transform(X) + nrm = np.linalg.norm(X, axis=1) + X = X * 1/nrm[:, None] + labels = labels.astype(float) + dataset = X, labels + w_opt = self.find_optimal_classifier(dataset) + X = np.hstack((np.ones(shape=(np.shape(X)[0], 1)), + X)) # adding a dummy dimension for the bias term. + return X, labels, w_opt + + def madelon(self): + """madelon dataset for logistic regression. + """ + madelon_url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/madelon" + data_path = "./madelon" + urllib.request.urlretrieve(madelon_url, data_path) + X, labels = sklearn.datasets.load_svmlight_file(data_path) + X = X.toarray() + scaler = preprocessing.StandardScaler().fit(X) + X = scaler.transform(X) + nrm = np.linalg.norm(X, axis=1) + X = X * 1/nrm[:, None] + labels = labels.astype(float) + dataset = X, labels + w_opt = self.find_optimal_classifier(dataset) + X = np.hstack((np.ones(shape=(np.shape(X)[0], 1)), + X)) # adding a dummy dimension for the bias term. + return X, labels, w_opt + + def mushroom_dataset(self): + """mushroom dataset for logistic regression. 
+ """ + mushroom_url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/mushrooms" + data_path = "./mushrooms" + urllib.request.urlretrieve(mushroom_url, data_path) + X, labels = sklearn.datasets.load_svmlight_file(data_path) + X = X.toarray() + scaler = preprocessing.StandardScaler().fit(X) + X = scaler.transform(X) + nrm = np.linalg.norm(X, axis=1) + X = X * 1/nrm[:, None] + labels = labels.astype(float) + dataset = X, labels + w_opt = self.find_optimal_classifier(dataset) + X = np.hstack((np.ones(shape=(np.shape(X)[0], 1)), + X)) # adding a dummy dimension for the bias term. + return X, labels, w_opt + + def synthetic_data(self, n=5000, d=50, cov=None, w=None): + """Generates a synthetic dataset for logistic regression. + + Args: + n: number of samples + d: dimension + cov: covariance of the data (optional, default: identity) + w: true coefficient vector (optional, default:first standard basis vector) + + Returns: + Synthetic dataset. + Features are unit vectors (by default uniformly random). + Labels are sampled from logistic distribution, + where argument w is the "true" solution. + """ + mean = np.zeros(d) + if cov is None: + cov = np.eye(d) + X_un = np.random.multivariate_normal(mean, cov, n) + nrm = np.linalg.norm(X_un, axis=1) + X = X_un * 1/nrm[:, None] + if w is None: + w = np.ones(d) + w[0] = 1 + inner_prod = np.dot(X, w) + params = np.exp(inner_prod)/(1+np.exp(inner_prod)) + labels = 2*np.random.binomial(1, params)-1 + dataset = X, labels + w_opt = self.find_optimal_classifier(dataset) + X = np.hstack((np.ones(shape=(np.shape(X)[0], 1)), + X)) # adding a dummy dimension for the bias term. + return X, labels, w_opt + + +class CompareAlgs: + """Class to run multiple iterative algorithms and compare the results.""" + + def __init__(self, + lr, + dataset, + optimal_w, + iters=10, + w0=None, + pb=None): + """Initialize the problem.""" + X, _ = dataset + self.w_opt = optimal_w + n, d = np.shape(X) + print("dataset is created: (number of samples, dimension)=" + str(n) + "," + + str(d)) + + if w0 is None: + w0_un = np.random.multivariate_normal(np.zeros(d), np.eye(d)) + w0 = w0_un/np.linalg.norm(w0_un) + self.w0 = w0 # initial iterate + self.iters = iters + self.pb = pb + self.lr = lr + self.plots = [] # List of lists of iterates + self.names = [] # List of names + self.linestyles = [] # List of line styles for plotting + self.cutoff = 20 * np.linalg.norm(self.w_opt) + 20 * np.linalg.norm( + self.w0) + 10 # how do you set this value? is it problem-specific? + + def add_plot(self, update_rule, name, linestyle): + """Run an iterative update method & add to plot. + + Args: + update_rule: a function that takes 4 arguments: + current iterate + LogisticRegression problem + index of current iterate + total number of iterations + pb = privacy budget or similar + name: string to display in legend + linestyle: line style for plot + """ + baseline = self.lr.loss_wor(self.w_opt) + print(name) + w = self.w0 + plot = [w] + for i in range(self.iters): + w = update_rule(w, self.lr, i, self.iters, self.pb) + if np.linalg.norm(w) > self.cutoff: + w = self.w0 # Stop things exploding + print("Stop Things Exploding!") + plot.append(w) + print( + str(i) + ": ||grad||=" + str(np.linalg.norm(self.lr.grad_wor(w))) + + " ex_loss=" + str(self.lr.loss_wor(w) - baseline)) + self.plots.append(plot) + self.names.append(name) + self.linestyles.append(linestyle) + print() + + def plot_grad_norms(self, legend=True): + """Plot gradient norms for each iteration. 
+ """ + plt.clf() + for plot, name, ls in zip(self.plots, self.names, self.linestyles): + grad_norms = [np.linalg.norm(self.lr.grad_wor(w)) for w in plot] + plt.plot(range(self.iters+1), grad_norms, ls, label=name) + plt.yscale("log") + ymax = np.linalg.norm(self.lr.grad(self.plots[0][0])) + plt.ylim(top=ymax) + # plt.ylim((1e-3, 1e-1)) + plt.xlabel("Iteration") + plt.ylabel("Norm of Gradient") + if legend: plt.legend() + plt.show() + + def loss_vals(self): + """Outputs the loss vector for different methods. + """ + baseline = self.lr.loss_wor(self.w_opt) + loss_dict = {} + for plot, name in zip(self.plots, self.names): + losses = [self.lr.loss_wor(w)-baseline for w in plot] + loss_dict[name] = [losses] + return loss_dict + + def gradnorm_vals(self): + """Outputs the gradient norm for different methods. + """ + gradnorm_dict = {} + for plot, name in zip(self.plots, self.names): + grad_norms = [np.linalg.norm(self.lr.grad_wor(w)) for w in plot] + gradnorm_dict[name] = [grad_norms] + return gradnorm_dict + + def plot_losses(self): + """Plots excess loss for each iteration. + + output is a dictionary where the keys are name of method and value is + the loss for each iteration. + """ + baseline = self.lr.loss_wor(self.w_opt) + plt.clf() + for plot, name, ls in zip(self.plots, self.names, self.linestyles): + losses = [self.lr.loss_wor(w)-baseline for w in plot] + plt.plot(range(self.iters+1), losses, ls, label=name) + # plt.yscale('log') + plt.xlabel("Iteration") + plt.ylabel("Excess Loss") + plt.legend() + plt.show() + + +def gd_priv(w, lr, i, iters, pb): + """Implementation of DP-GD. + + Args: + w: current point + lr: logistic regression + i: iteration number + iters: total number of iterations + pb: auxillary information + + Returns: + The next iterate. + """ + inv_lr_gd = 0.25 # We select the learning rate based on the smoothness + sens = 1/(lr.n*(inv_lr_gd+lr.reg)) # Sensitivity + rho = pb["total"] / iters # divide total privacy budget up + noise = np.random.normal(scale=sens/math.sqrt(2*rho), size=lr.d) + return w - lr.grad(w)/(inv_lr_gd+lr.reg) + noise + + +def gd_priv_backtrackingls(w, lr, i, iters, pb): + """Implementation of DP-GD with back-tracking line search. + + !!! this method is not private. We only use it as a baseline. + + Args: + w: current point + lr: logistic regression + i: iteration number + iters: total number of iterations + pb: auxillary information + + Returns: + The next iterate + """ + rho_grad = pb["total"] / iters # divide total privacy budget up + grad_scale = (1/lr.n)*math.sqrt(0.5/rho_grad) + grad_noise = np.random.normal(scale=grad_scale, size=lr.d) + direction = lr.grad(w)+grad_noise + stepsize_opt = backtracking_ls(lr, direction, w) + return w - stepsize_opt * direction + + +def backtracking_ls(lr, direction, w_0, alpha=0.4, beta=0.95): + """Implementation of backtracking line search. + + Args: + lr: logistic regression + direction: the "noisy" gradient direction + w_0: current point + alpha: tradeoff the precision and complexity of the linesearch + beta: tradeoff the precision and complexity of the linesearch + + Returns: + A good stepsize + """ + t = 100 + while lr.loss(w_0 - t * direction + ) >= lr.loss(w_0) - t * alpha * np.dot(direction, lr.grad(w_0)): + t = beta * t + if t < 1e-10: + break + return t + + +def newton(dataset, w_opt2): + """Newton update rule. 
+ """ + X, y = dataset + X = np.hstack((np.ones(shape=(np.shape(X)[0], 1)), X)) + lr = MyLogisticRegression(X, y, reg=1e-9) + w_opt = w_opt2 + _, d = np.shape(X) + w = np.zeros(d) + for _ in range(30): + H = lr.hess(w) + direction = np.linalg.solve(H, lr.grad(w)) + step_size = backtracking_ls(lr, direction, w) + w = w - step_size * direction + if lr.loss_wor(w) < lr.loss_wor(w_opt2): + w_opt = w + return w_opt + + +def newton_ur(w, lr, i, iters, pb): + H = lr.hess(w) + direction = np.linalg.solve(H, lr.grad(w)) + step_size = backtracking_ls(lr, direction, w) + return w - step_size * direction + + +class DoubleNoiseMech: + """Our Method: Double Noise.""" + + def __init__(self, + lr, + type_reg="add", + hyper_tuning=False, + curvature_info="hessian", + plot_eigen=False): + """Initializes the algorithm. + + Args: + lr: logistic regression problem we are solving. + type_reg: "add" or "clip" -- how we regularize eigenvalues. + hyper_tuning: do we tune the hyperparameters. + curvature_info: "hessian" or "ub" -- what quadratic we use. + plot_eigen: show eigenvalues for debugging purposes. + + """ + self.type_reg = type_reg + self.hyper_tuning = hyper_tuning + self.curvature_info = curvature_info + self.plot_eigen = plot_eigen + if self.curvature_info == "hessian": + self.H = lr.hess_wor + elif self.curvature_info == "ub": + self.H = lr.upperbound_wor + + def find_opt_reg_wop(self, w, lr, noisy_grad, rho_hess): + """Implementation of finding the optimal lambda. + + Here, we don't pay for privacy of doing it. + + Args: + w: current point + lr: logistic regression problem + noisy_grad: the gradient estimate + rho_hess: the privacy budget + + Returns: + The next iterate. + """ + increase_factor = 1.5 # at each step we increase the clipping + if self.type_reg == "add": + lambda_cur = 5e-6 # starting parameter + elif self.type_reg == "clip": + lambda_cur = 0.25/lr.n + 1e-5 # starting parameter, + num_noise_sample = 5 # we want to estimate expected value over the noise + grad_norm = np.linalg.norm(noisy_grad) + H = self.H(w) + best_loss = 1e6 # a large dummy number + while lambda_cur <= 0.25: + H = self.hess_mod(w, lambda_cur) + if self.type_reg == "add": # Sensitivity is different for add vs clip + sens2 = grad_norm * 0.25/(lr.n*lambda_cur**2 + 0.25*lambda_cur) + elif self.type_reg == "clip": + sens2 = grad_norm * 0.25/(lr.n*lambda_cur**2 - 0.25*lambda_cur) + loss_ = 0 + for _ in range(num_noise_sample): + noise2 = np.random.normal(scale=sens2*math.sqrt(.5/rho_hess), size=lr.d) + loss_ = loss_ + lr.loss_wor(w - np.linalg.solve(H, noisy_grad) + noise2) + if loss_ < best_loss: + best_loss = loss_ + lambda_star = lambda_cur + lambda_cur = lambda_cur * increase_factor + return lambda_star + + def update_rule(self, w, lr, i, iters, pb): + """update rule.""" + total = pb["total"] + grad_frac = pb["grad_frac4"] + rho1 = grad_frac * total / iters # divide total privacy budget for gradient + rho2 = (1-grad_frac) * total / iters # divide total privacy budget + sc1 = (1/lr.n) * math.sqrt(0.5/rho1) + noise1 = np.random.normal(scale=sc1, size=lr.d) + noisy_grad = lr.grad(w)+noise1 + grad_norm = np.linalg.norm(noisy_grad) + m = 0.25 # smoothness parameter + frac_trace = 0.1 # fraction of privacy budget for estimating the trace. 
+ H = self.H(w) + if self.plot_eigen: + val, _ = np.linalg.eigh(H) + hist, bin_edges = np.histogram(val, bins=300, range=(0, 0.01)) + cdf_vals = np.cumsum(hist) + plt.clf() + plt.plot(bin_edges[1:], cdf_vals) + plt.show() + if self.hyper_tuning: + min_eval = self.find_opt_reg_wop(w, lr, noisy_grad, rho2) + print("optimized min_eval", min_eval) + else: + noisy_trace = max( + np.trace(H) + np.random.normal( + scale=(0.25 / lr.n) * math.sqrt(0.5 / (frac_trace * rho2))), 0) + min_eval = (noisy_trace / ((lr.n)**2 * + (1 - frac_trace) * rho2))**(1 / 3) + 5e-4 + print("approx min_eval ", min_eval) + + H = self.hess_mod(w, min_eval, lr.reg) + if self.type_reg == "add": # Sensitivity is different for add vs clip + sens2 = grad_norm * m/(lr.n * min_eval**2 + m * min_eval) + elif self.type_reg == "clip": + sens2 = grad_norm * m / (lr.n * min_eval**2 - m * min_eval) + noise2 = np.random.normal( + scale=sens2 * math.sqrt(0.5 / ((1 - frac_trace) * rho2)), size=lr.d) + return w - np.linalg.solve(H, noisy_grad) + noise2 + + def hess_mod(self, w, min_eval, reg=1e-9): + if self.type_reg == "clip": + evals, evec = np.linalg.eigh(self.H(w)) + # true_min = np.min(evals) + evals = np.maximum(evals, min_eval*np.ones(evals.shape)) + Hclipped = np.dot(evec * evals, evec.T) + return Hclipped + elif self.type_reg == "add": + return self.H(w) + min_eval*np.eye(len(self.H(w))) + + +def helper_fun(datasetname, pb, num_rep=5, Tuning=False, plot_eigen=False): + """This function loads the data & runs the algorithms. + + Args: + datasetname: name of the dataset + pb: a dictionary with the parameters + num_rep: number of times we repeat the optimization for reporting average + Tuning: True or False exhustive search for fining the best min eigenval + plot_eigen: Show eigenvalues + + Returns: + losses and gradient norms + """ + datasets = Mydatasets() + X, y, w_opt = getattr(datasets, datasetname)() + dataset = X, y + lr = MyLogisticRegression(X, y, reg=1e-8) + dnm_hess_add = DoubleNoiseMech( + lr, + type_reg="add", + hyper_tuning=False, + curvature_info="hessian", + plot_eigen=plot_eigen).update_rule + dnm_ub_add = DoubleNoiseMech( + lr, + type_reg="add", + hyper_tuning=False, + curvature_info="ub", + plot_eigen=plot_eigen).update_rule + dnm_hess_clip = DoubleNoiseMech( + lr, + type_reg="clip", + hyper_tuning=False, + curvature_info="hessian", + plot_eigen=plot_eigen).update_rule + dnm_ub_clip = DoubleNoiseMech( + lr, + type_reg="clip", + hyper_tuning=False, + curvature_info="ub", + plot_eigen=plot_eigen).update_rule + if Tuning: + # dnm_hess_add_ht = DoubleNoiseMech(lr,type_reg='add', + # hyper_tuning=True,curvature_info='hessian').update_rule + # dnm_ub_add_ht = DoubleNoiseMech(lr,type_reg='add', + # hyper_tuning=True,curvature_info='ub').update_rule + dnm_hess_clip_ht = DoubleNoiseMech( + lr, + type_reg="clip", + hyper_tuning=True, + curvature_info="hessian", + plot_eigen=plot_eigen).update_rule + # dnm_ub_clip_ht = DoubleNoiseMech(lr,type_reg='clip', + # hyper_tuning=True,curvature_info='ub').update_rule + c = CompareAlgs(lr, dataset, w_opt, iters=10, pb=pb) + for rep in range(num_rep): + c.add_plot(gd_priv, "DPGD", "y--") + c.add_plot(dnm_hess_add, "DN-Hess-add", "k-") + c.add_plot(dnm_ub_add, "DN-UB-add", "b-") + c.add_plot(dnm_hess_clip, "DN-Hess-clip", "k*-") + c.add_plot(dnm_ub_clip, "DN-UB-clip", "b*-") + c.add_plot(gd_priv_backtrackingls, "DP-GD-Oracle", "m") + if Tuning: + c.add_plot(dnm_hess_clip_ht, "DN-Hess-clip-T", "r*-") + # c.add_plot(dnm_hess_add_ht,"DN-Hess-add-T",'r-') + # 
c.add_plot(dnm_ub_clip_ht,"DN-UB-clip-T",'g*-') + # c.add_plot(dnm_ub_add_ht,"DN-UB-add-T",'g-') + losses_dict = c.loss_vals() + gradnorm_dict = c.gradnorm_vals() + if rep == 0: + losses_total = losses_dict + gradnorm_total = gradnorm_dict + else: + for names in losses_total: + losses_total[names].extend(losses_dict[names]) + gradnorm_total[names].extend(gradnorm_dict[names]) + return losses_total, gradnorm_total + +linestyle = { + "DPGD": "y-", + "DN-Hess-add": "k+-", + "DN-UB-add": "b-", + "DN-Hess-clip": "r*-", + "DN-UB-clip": "g-", + "DP-GD-Oracle": "c-" +} +facecolor = { + "DPGD": "yellow", + "DN-Hess-add": "black", + "DN-UB-add": "blue", + "DN-Hess-clip": "red", + "DN-UB-clip": "green", + "DP-GD-Oracle": "cyan" +} +alg_plt = [ + "DPGD", + "DN-Hess-add", + "DN-UB-add", + "DN-Hess-clip", + "DN-UB-clip", + "DP-GD-Oracle" +] + +# Synthethic Data + +pb = { + "total": 1, # Total privacy budget for zCDP + "min_eval4": 5e-3, # Min eigenvalue for clipping + "grad_frac4": + 0.75 # Fraction of privacy budget for gradient vs matrix sensitivity +} +num_rep = 30 +losses_total, gradnorm_total = helper_fun( + "synthetic_data", pb, num_rep=num_rep, Tuning=False) +for alg in losses_total: + losses = np.array(losses_total[alg]) + gradnorm = np.array(gradnorm_total[alg]) + loss_avg, gradnorm_avg = np.mean(losses, axis=0), np.mean(gradnorm, axis=0) + loss_std, gradnorm_std = np.std( + losses, axis=0) / np.sqrt(num_rep), np.std( + gradnorm, axis=0) / np.sqrt(num_rep) + print( + str(alg) + ":" + " ex_loss=" + str(loss_avg[-1]) + ", std=" + + str(loss_std[-1])) + if alg in alg_plt: + iters = len(loss_avg) + plt.figure(1) + plt.plot(range(iters), loss_avg, linestyle[alg], label=alg) + plt.fill_between( + range(iters), + loss_avg - loss_std, + loss_avg + loss_std, + facecolor=facecolor[alg]) + plt.legend() + plt.xlabel("Iteration") + plt.ylabel("Excess Loss") + plt.savefig("synth.pdf") + plt.figure(2) + plt.plot(range(iters), gradnorm_avg, linestyle[alg], label=alg) + plt.yscale("log") + plt.legend() + plt.xlabel("Iteration") + plt.ylabel("Norm of Gradient") + plt.savefig("synth-grad.pdf") + +# a5a dataset + +pb = { + "total": 1, # Total privacy budget for zCDP + "min_eval4": 5e-3, # Min eigenvalue for clipping + "grad_frac4": + 0.75 # Fraction of privacy budget for gradient vs matrix sensitivity +} +num_rep = 30 +losses_total, gradnorm_total = helper_fun( + "a5a_dataset", pb, num_rep=num_rep, Tuning=False) +for alg in losses_total: + losses = np.array(losses_total[alg]) + gradnorm = np.array(gradnorm_total[alg]) + loss_avg, gradnorm_avg = np.mean(losses, axis=0), np.mean(gradnorm, axis=0) + loss_std, gradnorm_std = np.std( + losses, axis=0) / np.sqrt(num_rep), np.std( + gradnorm, axis=0) / np.sqrt(num_rep) + print( + str(alg) + ":" + " ex_loss=" + str(loss_avg[-1]) + ", std=" + + str(loss_std[-1])) + if alg in alg_plt: + iters = len(loss_avg) + plt.figure(3) + plt.plot(range(iters), loss_avg, linestyle[alg], label=alg) + plt.fill_between( + range(iters), + loss_avg - loss_std, + loss_avg + loss_std, + facecolor=facecolor[alg]) + plt.legend() + plt.xlabel("Iteration") + plt.ylabel("Excess Loss") + plt.savefig("a5a.pdf") + plt.figure(4) + plt.plot(range(iters), gradnorm_avg, linestyle[alg], label=alg) + plt.yscale("log") + plt.legend() + plt.xlabel("Iteration") + plt.ylabel("Norm of Gradient") + plt.savefig("a5a-grad.pdf") + +# w8a dataset + +pb = { + "total": 1, # Total privacy budget for zCDP + "min_eval4": 5e-3, # Min eigenvalue for clipping + "grad_frac4": + 0.75 # Fraction of privacy budget for gradient 
vs matrix sensitivity +} +num_rep = 30 +losses_total, gradnorm_total = helper_fun( + "w8a_dataset", pb, num_rep=num_rep, Tuning=False) +for alg in losses_total: + losses = np.array(losses_total[alg]) + gradnorm = np.array(gradnorm_total[alg]) + loss_avg, gradnorm_avg = np.mean(losses, axis=0), np.mean(gradnorm, axis=0) + loss_std, gradnorm_std = np.std( + losses, axis=0) / np.sqrt(num_rep), np.std( + gradnorm, axis=0) / np.sqrt(num_rep) + print( + str(alg) + ":" + " ex_loss=" + str(loss_avg[-1]) + ", std=" + + str(loss_std[-1])) + if alg in alg_plt: + iters = len(loss_avg) + plt.figure(5) + plt.plot(range(iters), loss_avg, linestyle[alg], label=alg) + plt.fill_between( + range(iters), + loss_avg - loss_std, + loss_avg + loss_std, + facecolor=facecolor[alg]) + plt.legend() + plt.xlabel("Iteration") + plt.ylabel("Excess Loss") + plt.savefig("w8a.pdf") + plt.figure(6) + plt.plot(range(iters), gradnorm_avg, linestyle[alg], label=alg) + plt.yscale("log") + plt.legend() + plt.xlabel("Iteration") + plt.ylabel("Norm of Gradient") + plt.savefig("w8a-grad.pdf") + +# a1a dataset + +pb = { + "total": 1, # Total privacy budget for zCDP + "min_eval4": 5e-3, # Min eigenvalue for clipping + "grad_frac4": 0.75 # Fraction of privacy budget for gradient vs matrix +} +num_rep = 30 +losses_total, gradnorm_total = helper_fun( + "a1a_dataset", pb, num_rep=num_rep, Tuning=False) +for alg in losses_total: + losses = np.array(losses_total[alg]) + gradnorm = np.array(gradnorm_total[alg]) + loss_avg, gradnorm_avg = np.mean(losses, axis=0), np.mean(gradnorm, axis=0) + loss_std, gradnorm_std = np.std( + losses, axis=0) / np.sqrt(num_rep), np.std( + gradnorm, axis=0) / np.sqrt(num_rep) + print( + str(alg) + ":" + " ex_loss=" + str(loss_avg[-1]) + ", std=" + + str(loss_std[-1])) + if alg in alg_plt: + iters = len(loss_avg) + plt.figure(7) + plt.plot(range(iters), loss_avg, linestyle[alg], label=alg) + plt.fill_between( + range(iters), + loss_avg - loss_std, + loss_avg + loss_std, + facecolor=facecolor[alg]) + plt.legend() + plt.xlabel("Iteration") + plt.ylabel("Excess Loss") + plt.savefig("a1a.pdf") + plt.figure(8) + plt.plot(range(iters), gradnorm_avg, linestyle[alg], label=alg) + plt.yscale("log") + plt.legend() + plt.xlabel("Iteration") + plt.ylabel("Norm of Gradient") + plt.savefig("a1a-grad.pdf") + +# mushroom dataset + +pb = { + "total": 1, # Total privacy budget for zCDP + "min_eval4": 5e-3, # Min eigenvalue for clipping + "grad_frac4": 0.75 # Fraction of privacy budget for gradient vs matrix +} +num_rep = 30 +losses_total, gradnorm_total = helper_fun( + "mushroom_dataset", pb, num_rep=num_rep, Tuning=False) +for alg in losses_total: + losses = np.array(losses_total[alg]) + gradnorm = np.array(gradnorm_total[alg]) + loss_avg, gradnorm_avg = np.mean(losses, axis=0), np.mean(gradnorm, axis=0) + loss_std, gradnorm_std = np.std( + losses, axis=0) / np.sqrt(num_rep), np.std( + gradnorm, axis=0) / np.sqrt(num_rep) + print( + str(alg) + ":" + " ex_loss=" + str(loss_avg[-1]) + ", std=" + + str(loss_std[-1])) + if alg in alg_plt: + iters = len(loss_avg) + plt.figure(9) + plt.plot(range(iters), loss_avg, linestyle[alg], label=alg) + plt.fill_between( + range(iters), + loss_avg - loss_std, + loss_avg + loss_std, + facecolor=facecolor[alg]) + plt.legend() + plt.xlabel("Iteration") + plt.ylabel("Excess Loss") + plt.savefig("mushroom.pdf") + plt.figure(10) + plt.plot(range(iters), gradnorm_avg, linestyle[alg], label=alg) + plt.yscale("log") + plt.legend() + plt.xlabel("Iteration") + plt.ylabel("Norm of Gradient") + 
plt.savefig("mushroom-grad.pdf") + +# MNIST + +pb = { + "total": 1, # Total privacy budget for zCDP + "min_eval4": 5e-3, # Min eigenvalue for clipping + "grad_frac4": 0.75 # Fraction of privacy budget for gradient vs matrix +} +num_rep = 30 +losses_total, gradnorm_total = helper_fun( + "mnist_binary", pb, num_rep=num_rep, Tuning=False) +for alg in losses_total: + losses = np.array(losses_total[alg]) + gradnorm = np.array(gradnorm_total[alg]) + loss_avg, gradnorm_avg = np.mean(losses, axis=0), np.mean(gradnorm, axis=0) + loss_std, gradnorm_std = np.std( + losses, axis=0) / np.sqrt(num_rep), np.std( + gradnorm, axis=0) / np.sqrt(num_rep) + print( + str(alg) + ":" + " ex_loss=" + str(loss_avg[-1]) + ", std=" + + str(loss_std[-1])) + if alg in alg_plt: + iters = len(loss_avg) + plt.figure(11) + plt.plot(range(iters), loss_avg, linestyle[alg], label=alg) + plt.fill_between( + range(iters), + loss_avg - loss_std, + loss_avg + loss_std, + facecolor=facecolor[alg]) + plt.legend() + plt.xlabel("Iteration") + plt.ylabel("Excess Loss") + plt.savefig("mnist.pdf") + plt.figure(12) + plt.plot(range(iters), gradnorm_avg, linestyle[alg], label=alg) + plt.yscale("log") + plt.legend() + plt.xlabel("Iteration") + plt.ylabel("Norm of Gradient") + plt.savefig("mnist-grad.pdf") + +# Dataset: phishing + +pb = { + "total": 1, # Total privacy budget for zCDP + "min_eval4": 5e-3, # Min eigenvalue for clipping + "grad_frac4": 0.75 # Fraction of privacy budget for gradient vs matrix +} +num_rep = 30 +losses_total, gradnorm_total = helper_fun( + "phishing", pb, num_rep=num_rep, Tuning=False) +for alg in losses_total: + losses = np.array(losses_total[alg]) + gradnorm = np.array(gradnorm_total[alg]) + loss_avg, gradnorm_avg = np.mean(losses, axis=0), np.mean(gradnorm, axis=0) + loss_std, gradnorm_std = np.std( + losses, axis=0) / np.sqrt(num_rep), np.std( + gradnorm, axis=0) / np.sqrt(num_rep) + print( + str(alg) + ":" + " ex_loss=" + str(loss_avg[-1]) + ", std=" + + str(loss_std[-1])) + if alg in alg_plt: + iters = len(loss_avg) + plt.figure(13) + plt.plot(range(iters), loss_avg, linestyle[alg], label=alg) + plt.fill_between( + range(iters), + loss_avg - loss_std, + loss_avg + loss_std, + facecolor=facecolor[alg]) + plt.legend() + plt.xlabel("Iteration") + plt.ylabel("Excess Loss") + plt.savefig("phishing.pdf") + plt.figure(14) + plt.plot(range(iters), gradnorm_avg, linestyle[alg], label=alg) + plt.yscale("log") + plt.legend() + plt.xlabel("Iteration") + plt.ylabel("Norm of Gradient") + plt.savefig("phishing-grad.pdf") + +# Dataset: Madelon + +# pb = { +# "total": 1, # Total privacy budget for zCDP +# "min_eval4": 5e-3, # Min eigenvalue for clipping +# "grad_frac4": 0.4 # Fraction of privacy budget for gradient vs matrix +# } +# num_rep = 1 +# losses_total,gradnorm_total = helper_fun('madelon',pb,num_rep = num_rep, +# Tuning=True,plot_eigen=True) +# for alg in losses_total.keys(): +# losses = np.array(losses_total[alg]) +# gradnorm = np.array(gradnorm_total[alg]) +# loss_avg, gradnorm_avg = np.mean(losses,axis=0), np.mean(gradnorm,axis=0) +# loss_std, gradnorm_std = np.std(losses,axis=0)/np.sqrt(num_rep), +# np.std(gradnorm,axis=0)/np.sqrt(num_rep) +# print(str(alg)+ ':' + " ex_loss="+str(loss_avg[-1])+ ', +# std='+str(loss_std[-1])) +# if alg in alg_plt: +# iters = len(loss_avg) +# plt.figure(1) +# plt.plot(range(iters),loss_avg,linestyle[alg],label=alg) +# plt.fill_between(range(iters), loss_avg-loss_std, loss_avg+loss_std, +# facecolor=facecolor[alg]) +# plt.legend() +# plt.xlabel("Iteration") +# plt.ylabel("Excess 
Loss") +# plt.savefig('madelon.pdf') +# plt.figure(2) +# plt.plot(range(iters),gradnorm_avg,linestyle[alg],label=alg) +# plt.yscale('log') +# plt.legend() +# plt.xlabel("Iteration") +# plt.ylabel("Norm of Gradient") +# plt.savefig('madelon-grad.pdf') + +# Test) a6a Dataset + +pb = { + "total": 1, # Total privacy budget for zCDP + "min_eval4": 5e-3, # Min eigenvalue for clipping + "grad_frac4": 0.75 # Fraction of privacy budget for gradient vs matrix +} +num_rep = 30 +losses_total, gradnorm_total = helper_fun( + "a6a_dataset", pb, num_rep=num_rep, Tuning=False) +for alg in losses_total: + losses = np.array(losses_total[alg]) + gradnorm = np.array(gradnorm_total[alg]) + loss_avg, gradnorm_avg = np.mean(losses, axis=0), np.mean(gradnorm, axis=0) + loss_std, gradnorm_std = np.std( + losses, axis=0) / np.sqrt(num_rep), np.std( + gradnorm, axis=0) / np.sqrt(num_rep) + print( + str(alg) + ":" + " ex_loss=" + str(loss_avg[-1]) + ", std=" + + str(loss_std[-1])) + if alg in alg_plt: + iters = len(loss_avg) + plt.figure(15) + plt.plot(range(iters), loss_avg, linestyle[alg], label=alg) + plt.fill_between( + range(iters), + loss_avg - loss_std, + loss_avg + loss_std, + facecolor=facecolor[alg]) + plt.legend() + plt.xlabel("Iteration") + plt.ylabel("Excess Loss") + plt.savefig("a6a.pdf") + plt.figure(16) + plt.plot(range(iters), gradnorm_avg, linestyle[alg], label=alg) + plt.yscale("log") + plt.legend() + plt.xlabel("Iteration") + plt.ylabel("Norm of Gradient") + plt.savefig("a6a-grad.pdf")