# Source path: tensorflow_privacy/research/pate_2018/ICLR2018/rdp_cumulative.py
# (Repository viewer listing: 379 lines, 13 KiB, Python.)

# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Plots two graphs illustrating cost of privacy per answered query.

A script in support of the paper "Scalable Private Learning with PATE" by
Nicolas Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar,
Ulfar Erlingsson (https://arxiv.org/abs/1802.08908).

The input is a file containing a numpy array of votes, one query per row, one
class per column. Ex:
  43, 1821, ..., 3
  31, 16, ..., 0
  ...
  0, 86, ..., 438

The output is written to a specified directory and consists of two pdf files
(lnmax_vs_gnmax.pdf and lnmax_vs_2xgnmax_large.pdf).
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
import pickle
import sys
sys.path.append('..') # Main modules reside in the parent directory.
from absl import app
from absl import flags
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top
import numpy as np
import core as pate
# Use the 'ggplot' style sheet for every figure produced by this script.
plt.style.use('ggplot')

FLAGS = flags.FLAGS
# --cache: reuse previously pickled analysis results instead of recomputing.
flags.DEFINE_boolean('cache', False,
                     'Read results of privacy analysis from cache.')
# --counts_file: path of the vote counts file (no default; see below).
flags.DEFINE_string('counts_file', None, 'Counts file.')
# --figures_dir: directory that receives the generated pdf files.
flags.DEFINE_string('figures_dir', '', 'Path where figures are written to.')
# counts_file defaults to None, so it is registered as a required flag.
flags.mark_flag_as_required('counts_file')
def run_analysis(votes, mechanism, noise_scale, params):
  """Computes the cumulative data-dependent privacy cost of a query sequence.

  Queries are processed in row order. After every query the accumulated RDP
  values are passed to pate.compute_eps_from_delta (with delta = 1e-8), which
  yields the cumulative epsilon and the order at which it is attained.
  A progress line is printed after every 1000 queries.

  Args:
    votes: A matrix of votes, where each row contains votes in one instance.
    mechanism: A name of the mechanism ('lnmax', 'gnmax', or 'gnmax_conf').
    noise_scale: A mechanism privacy parameter.
    params: Other privacy parameters. Only read for 'gnmax_conf', where the
      keys 't' and 'sigma1' are used.

  Returns:
    A tuple (eps_total, partition, answered, order_opt) with one entry per
    query: cumulative privacy cost epsilon, how the privacy budget is split
    (fractions of epsilon; the last fraction is the contribution from delta),
    how many queries were answered (cumulative), and the optimal order.

  Raises:
    ValueError: If `mechanism` is not one of the supported names.
  """
  # Validate up front so that a bad name is reported even for empty input.
  if mechanism not in ('lnmax', 'gnmax', 'gnmax_conf'):
    raise ValueError(
        'Mechanism must be one of ["lnmax", "gnmax", "gnmax_conf"]')

  def compute_partition(order_opt, eps):
    """Returns the components of eps at order_opt as fractions of eps."""
    order_opt_idx = np.searchsorted(orders, order_opt)
    delta_term = -math.log(delta) / (order_opt - 1)
    if mechanism == 'gnmax_conf':
      p = (rdp_select_cum[order_opt_idx],
           rdp_cum[order_opt_idx] - rdp_select_cum[order_opt_idx],
           delta_term)
    else:
      p = (rdp_cum[order_opt_idx], delta_term)
    return [x / eps for x in p]  # Ensures that sum(x) == 1

  # Short list of orders (alternative, not used):
  # orders = np.round(np.concatenate((np.arange(2, 50 + 1, 1),
  #   np.logspace(np.log10(50), np.log10(1000), num=20))))

  # Long list of orders. The raw concatenation is not monotone (arange ends at
  # 100.5, logspace restarts at 100), while np.searchsorted requires a sorted
  # array. np.unique sorts the orders and drops the duplicate.
  orders = np.unique(np.concatenate((
      np.arange(2, 100 + 1, .5),
      np.logspace(np.log10(100), np.log10(500), num=100))))
  delta = 1e-8

  n = votes.shape[0]
  eps_total = np.zeros(n)
  partition = [None] * n
  order_opt = np.full(n, np.nan, dtype=float)
  answered = np.zeros(n, dtype=float)

  # Running sums over the queries processed so far, one entry per order.
  rdp_cum = np.zeros(len(orders))
  rdp_sqrd_cum = np.zeros(len(orders))
  rdp_select_cum = np.zeros(len(orders))  # Only updated for 'gnmax_conf'.
  answered_sum = 0

  for i in range(n):
    v = votes[i,]
    if mechanism == 'lnmax':
      logq_lnmax = pate.compute_logq_laplace(v, noise_scale)
      rdp_query = pate.rdp_pure_eps(logq_lnmax, 2. / noise_scale, orders)
      rdp_sqrd = rdp_query ** 2
      pr_answered = 1
    elif mechanism == 'gnmax':
      logq_gmax = pate.compute_logq_gaussian(v, noise_scale)
      rdp_query = pate.rdp_gaussian(logq_gmax, noise_scale, orders)
      rdp_sqrd = rdp_query ** 2
      pr_answered = 1
    else:  # 'gnmax_conf' -- the name was validated above.
      logq_step1 = pate.compute_logpr_answered(params['t'], params['sigma1'], v)
      logq_step2 = pate.compute_logq_gaussian(v, noise_scale)
      q_step1 = np.exp(logq_step1)
      # Use the smaller of log(q) and log(1 - q) for the selection step.
      logq_step1_min = min(logq_step1, math.log1p(-q_step1))
      rdp_gnmax_step1 = pate.rdp_gaussian(logq_step1_min,
                                          2 ** .5 * params['sigma1'], orders)
      rdp_gnmax_step2 = pate.rdp_gaussian(logq_step2, noise_scale, orders)
      rdp_query = rdp_gnmax_step1 + q_step1 * rdp_gnmax_step2
      # The expression below evaluates
      #   E[(cost_of_step_1 + Bernoulli(pr_of_step_2) * cost_of_step_2)^2]
      rdp_sqrd = (
          rdp_gnmax_step1 ** 2 + 2 * rdp_gnmax_step1 * q_step1 * rdp_gnmax_step2
          + q_step1 * rdp_gnmax_step2 ** 2)
      rdp_select_cum += rdp_gnmax_step1
      pr_answered = q_step1

    rdp_cum += rdp_query
    rdp_sqrd_cum += rdp_sqrd
    answered_sum += pr_answered

    answered[i] = answered_sum
    eps_total[i], order_opt[i] = pate.compute_eps_from_delta(
        orders, rdp_cum, delta)
    partition[i] = compute_partition(order_opt[i], eps_total[i])

    if i > 0 and (i + 1) % 1000 == 0:
      # NOTE(review): the moments are divided by i although i + 1 queries have
      # been accumulated at this point -- confirm that this is intended.
      rdp_var = rdp_sqrd_cum / i - (
          rdp_cum / i) ** 2  # Ignore Bessel's correction.
      order_opt_idx = np.searchsorted(orders, order_opt[i])
      eps_std = ((i + 1) * rdp_var[order_opt_idx]) ** .5  # Std of the sum.
      print(
          'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} (std = {:.5f}) '
          'at order = {:.2f} (contribution from delta = {:.3f})'.format(
              i + 1, answered_sum, eps_total[i], eps_std, order_opt[i],
              -math.log(delta) / (order_opt[i] - 1)))
      sys.stdout.flush()

  return eps_total, partition, answered, order_opt
def print_plot_small(figures_dir, eps_lap, eps_gnmax, answered_gnmax):
  """Plots a graph of LNMax vs GNMax.

  Both curves are sampled at 0, 10, ..., 5990 answered queries. Sample points
  for which no data is available are left as NaN instead of raising. The
  figure is saved as 'lnmax_vs_gnmax.pdf' in figures_dir and then shown.

  Args:
    figures_dir: A name of the directory where to save the plot.
    eps_lap: The cumulative privacy costs of the Laplace mechanism.
    eps_gnmax: The cumulative privacy costs of the Gaussian mechanism
    answered_gnmax: The cumulative count of queries answered.
  """
  xlim = 6000
  x_axis = range(0, int(xlim), 10)
  # NaN-initialised so that missing samples show up as gaps in the curves.
  y_lap = np.full(len(x_axis), np.nan, dtype=float)
  y_gnmax = np.full(len(x_axis), np.nan, dtype=float)

  for i, x in enumerate(x_axis):
    # eps_lap is indexed directly by x; skip points past the analysed queries.
    if x < len(eps_lap):
      y_lap[i] = eps_lap[x]
    # First query index at which the cumulative answered count reaches x.
    idx = np.searchsorted(answered_gnmax, x)
    if idx < len(eps_gnmax):
      y_gnmax[i] = eps_gnmax[idx]

  fig, ax = plt.subplots()
  fig.set_figheight(4.5)
  fig.set_figwidth(4.7)
  ax.plot(
      x_axis, y_lap, color='r', ls='--', label='LNMax', alpha=.5, linewidth=5)
  ax.plot(
      x_axis,
      y_gnmax,
      color='g',
      ls='-',
      label='Confident-GNMax',
      alpha=.5,
      linewidth=5)
  plt.xticks(np.arange(0, 7000, 1000))
  plt.xlim([0, xlim])
  plt.ylim([0, 6.])
  plt.xlabel('Number of queries answered', fontsize=16)
  plt.ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16)
  plt.legend(loc=2, fontsize=13)  # loc=2 -- upper left
  ax.tick_params(labelsize=14)
  fout_name = os.path.join(figures_dir, 'lnmax_vs_gnmax.pdf')
  print('Saving the graph to ' + fout_name)
  fig.savefig(fout_name, bbox_inches='tight')
  plt.show()
def print_plot_large(figures_dir, eps_lap, eps_gnmax1, answered_gnmax1,
                     eps_gnmax2, partition_gnmax2, answered_gnmax2):
  """Plots a graph of LNMax vs GNMax with two parameters.

  All curves are sampled at 0, 10, ..., 5990 answered queries. Sample points
  for which no data is available are left as NaN instead of raising. For set 2
  the area under the curve is additionally split into the share of the first
  partition component and the remainder. The figure is saved as
  'lnmax_vs_2xgnmax_large.pdf' in figures_dir and then shown.

  Args:
    figures_dir: A name of the directory where to save the plot.
    eps_lap: The cumulative privacy costs of the Laplace mechanism.
    eps_gnmax1: The cumulative privacy costs of the Gaussian mechanism (set 1).
    answered_gnmax1: The cumulative count of queries answered (set 1).
    eps_gnmax2: The cumulative privacy costs of the Gaussian mechanism (set 2).
    partition_gnmax2: Allocation of eps for set 2 (three fractions per query).
    answered_gnmax2: The cumulative count of queries answered (set 2).
  """
  xlim = 6000
  x_axis = range(0, int(xlim), 10)
  lenx = len(x_axis)
  # NaN-initialised so that missing samples show up as gaps in the curves.
  y_lap = np.full(lenx, np.nan, dtype=float)
  y_gnmax1 = np.full(lenx, np.nan, dtype=float)
  y_gnmax2 = np.full(lenx, np.nan, dtype=float)
  y1_gnmax2 = np.full(lenx, np.nan, dtype=float)

  for i, x in enumerate(x_axis):
    # eps_lap is indexed directly by x; skip points past the analysed queries.
    if x < len(eps_lap):
      y_lap[i] = eps_lap[x]
    # First query index at which the cumulative answered count reaches x.
    idx1 = np.searchsorted(answered_gnmax1, x)
    if idx1 < len(eps_gnmax1):
      y_gnmax1[i] = eps_gnmax1[idx1]
    idx2 = np.searchsorted(answered_gnmax2, x)
    if idx2 < len(eps_gnmax2):
      y_gnmax2[i] = eps_gnmax2[idx2]
      fraction_step1, fraction_step2, _ = partition_gnmax2[idx2]
      # Share of eps attributed to the first component, with the third
      # (ignored) component excluded from the normalisation.
      y1_gnmax2[i] = eps_gnmax2[idx2] * fraction_step1 / (
          fraction_step1 + fraction_step2)

  fig, ax = plt.subplots()
  fig.set_figheight(4.5)
  fig.set_figwidth(4.7)
  ax.plot(
      x_axis,
      y_lap,
      color='r',
      ls='dashed',
      label='LNMax',
      alpha=.5,
      linewidth=5)
  ax.plot(
      x_axis,
      y_gnmax1,
      color='g',
      ls='-',
      label='Confident-GNMax (moderate)',
      alpha=.5,
      linewidth=5)
  ax.plot(
      x_axis,
      y_gnmax2,
      color='b',
      ls='-',
      label='Confident-GNMax (aggressive)',
      alpha=.5,
      linewidth=5)
  # Hatched area: from zero up to the first-component share of set 2.
  ax.fill_between(
      x_axis, [0] * lenx,
      y1_gnmax2.tolist(),
      facecolor='b',
      alpha=.3,
      hatch='\\')
  # Thin boundary line between the two shaded areas; kept out of the legend.
  ax.plot(
      x_axis,
      y1_gnmax2,
      color='b',
      ls='-',
      label='_nolegend_',
      alpha=.5,
      linewidth=1)
  # Plain area: from the first-component share up to the total for set 2.
  ax.fill_between(
      x_axis, y1_gnmax2.tolist(), y_gnmax2.tolist(), facecolor='b', alpha=.3)
  plt.xticks(np.arange(0, 7000, 1000))
  plt.xlim([0, xlim])
  plt.ylim([0, 1.])
  plt.xlabel('Number of queries answered', fontsize=16)
  plt.ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16)
  plt.legend(loc=2, fontsize=13)  # loc=2 -- upper left
  ax.tick_params(labelsize=14)
  fout_name = os.path.join(figures_dir, 'lnmax_vs_2xgnmax_large.pdf')
  print('Saving the graph to ' + fout_name)
  fig.savefig(fout_name, bbox_inches='tight')
  plt.show()
def run_all_analyses(votes, lambda_laplace, gnmax_parameters, sigma2):
  """Runs the LNMax analysis, then Confident-GNMax once per parameter set.

  Args:
    votes: A matrix of votes, where each row contains votes in one instance.
    lambda_laplace: The scale of the Laplace noise (lambda).
    gnmax_parameters: A list of parameters for GNMax.
    sigma2: Shared parameter for the GNMax mechanisms.

  Returns:
    A tuple (eps_lap, eps_gnmax, partition_gmax, answered, order_opt). eps_lap
    is the first result of the 'lnmax' analysis; each of the remaining four is
    a list holding one run_analysis result per entry of gnmax_parameters.
  """
  print('=== Laplace Mechanism ===')
  eps_lap, _, _, _ = run_analysis(votes, 'lnmax', lambda_laplace, None)
  print()

  # The plain 'gnmax' analysis is not run: its result does not go anywhere,
  # for now.

  eps_gnmax = []
  partition_gmax = []
  answered = []
  order_opt = []
  for gnmax_params in gnmax_parameters:
    print('=== Gaussian Mechanism (confident) {}: ==='.format(gnmax_params))
    eps_run, partition_run, answered_run, order_run = run_analysis(
        votes, 'gnmax_conf', sigma2, gnmax_params)
    eps_gnmax.append(eps_run)
    partition_gmax.append(partition_run)
    answered.append(answered_run)
    order_opt.append(order_run)
    print()

  return eps_lap, eps_gnmax, partition_gmax, answered, order_opt
def main(argv):
  """Computes (or loads from cache) the privacy costs and draws both figures.

  Args:
    argv: Command-line arguments left over after flag parsing; ignored.
  """
  del argv  # Unused.

  lambda_laplace = 50.  # corresponds to eps = 1. / lambda_laplace

  # Parameters of the GNMax
  gnmax_parameters = (
      {'t': 1000, 'sigma1': 500},
      {'t': 3500, 'sigma1': 1500},
      {'t': 5000, 'sigma1': 1500},
  )
  sigma2 = 100  # GNMax parameters differ only in Step 1 (selection).

  cache_path = '/tmp/precomputed.pkl'
  figures_dir = os.path.expanduser(FLAGS.figures_dir)

  if FLAGS.cache and os.path.isfile(cache_path):
    print('Reading from cache ' + cache_path)
    # NOTE(review): this unpickles a file from a fixed location under /tmp;
    # only use --cache when that file is trusted.
    with open(cache_path, 'rb') as cache_file:
      results = pickle.load(cache_file)
  else:
    counts_path = os.path.expanduser(FLAGS.counts_file)
    print('Reading raw votes from ' + counts_path)
    sys.stdout.flush()

    votes = np.load(counts_path)
    results = tuple(
        run_all_analyses(votes, lambda_laplace, gnmax_parameters, sigma2))

    print('Writing to cache ' + cache_path)
    with open(cache_path, 'wb') as cache_file:
      pickle.dump(results, cache_file)

  # The fifth element (optimal orders) is cached but not needed for plotting.
  eps_lap, eps_gnmax, partition_gmax, answered_gnmax, _ = results

  # First parameter set goes to the small plot, the other two to the large one.
  print_plot_small(figures_dir, eps_lap, eps_gnmax[0], answered_gnmax[0])
  print_plot_large(figures_dir, eps_lap, eps_gnmax[1], answered_gnmax[1],
                   eps_gnmax[2], partition_gmax[2], answered_gnmax[2])
  plt.close('all')
# Script entry point: hand main() to absl's app runner when executed directly.
if __name__ == '__main__':
  app.run(main)