forked from 626_privacy/tensorflow_privacy
379 lines
13 KiB
Python
379 lines
13 KiB
Python
|
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
|
||
|
#
|
||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
# you may not use this file except in compliance with the License.
|
||
|
# You may obtain a copy of the License at
|
||
|
#
|
||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
#
|
||
|
# Unless required by applicable law or agreed to in writing, software
|
||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
# See the License for the specific language governing permissions and
|
||
|
# limitations under the License.
|
||
|
# ==============================================================================
|
||
|
|
||
|
"""Plots two graphs illustrating cost of privacy per answered query.
|
||
|
|
||
|
A script in support of the paper "Scalable Private Learning with PATE" by
|
||
|
Nicolas Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar,
|
||
|
Ulfar Erlingsson (https://arxiv.org/abs/1802.08908).
|
||
|
|
||
|
The input is a file containing a numpy array of votes, one query per row, one
|
||
|
class per column. Ex:
|
||
|
43, 1821, ..., 3
|
||
|
31, 16, ..., 0
|
||
|
...
|
||
|
0, 86, ..., 438
|
||
|
The output is written to a specified directory and consists of two pdf files.
|
||
|
"""
|
||
|
from __future__ import absolute_import
|
||
|
from __future__ import division
|
||
|
from __future__ import print_function
|
||
|
|
||
|
import math
|
||
|
import os
|
||
|
import pickle
|
||
|
import sys
|
||
|
|
||
|
sys.path.append('..') # Main modules reside in the parent directory.
|
||
|
|
||
|
from absl import app
|
||
|
from absl import flags
|
||
|
import matplotlib
|
||
|
|
||
|
matplotlib.use('TkAgg')
|
||
|
import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top
|
||
|
import numpy as np
|
||
|
import core as pate
|
||
|
|
||
|
# Use the ggplot look for every figure produced by this script.
plt.style.use('ggplot')

# Command-line configuration (absl flags).
FLAGS = flags.FLAGS

flags.DEFINE_boolean(
    name='cache',
    default=False,
    help='Read results of privacy analysis from cache.')
flags.DEFINE_string(
    name='counts_file',
    default=None,
    help='Counts file.')
flags.DEFINE_string(
    name='figures_dir',
    default='',
    help='Path where figures are written to.')

# The votes file has no sensible default, so the flag is mandatory.
flags.mark_flag_as_required('counts_file')
|
||
|
|
||
|
def run_analysis(votes, mechanism, noise_scale, params):
  """Computes data-dependent privacy.

  Queries are processed in row order; after each one the cumulative RDP cost
  is converted into an (eps, delta)-guarantee with delta fixed at 1e-8. A
  progress line is printed after every 1000 queries.

  Args:
    votes: A matrix of votes, where each row contains votes in one instance.
    mechanism: A name of the mechanism ('lnmax', 'gnmax', or 'gnmax_conf')
    noise_scale: A mechanism privacy parameter.
    params: Other privacy parameters. Only read for 'gnmax_conf', where it
      must provide the keys 't' and 'sigma1'.

  Returns:
    A tuple (eps_total, partition, answered, order_opt), each with one entry
    per query: cumulative privacy cost epsilon (array), how the privacy budget
    is split (list of lists of fractions), expected number of queries answered
    so far (array), and the optimal order (array).

  Raises:
    ValueError: If `mechanism` is not one of the supported names (raised when
      the first query is processed).
  """

  def compute_partition(order_opt, eps):
    """Splits eps into fractions: [selection,] answering, delta term."""
    # NOTE(review): `orders` below is not monotonic around 100 (the arange
    # part ends at 100.5 while the logspace part restarts at 100), whereas
    # np.searchsorted assumes a sorted array -- confirm this lookup is still
    # exact for optimal orders in that neighbourhood.
    order_opt_idx = np.searchsorted(orders, order_opt)
    delta_term = -math.log(delta) / (order_opt - 1)
    if mechanism == 'gnmax_conf':
      p = (rdp_select_cum[order_opt_idx],
           rdp_cum[order_opt_idx] - rdp_select_cum[order_opt_idx],
           delta_term)
    else:
      p = (rdp_cum[order_opt_idx], delta_term)
    return [x / eps for x in p]  # Ensures that sum(x) == 1

  # Short list of orders.
  # orders = np.round(np.concatenate((np.arange(2, 50 + 1, 1),
  #   np.logspace(np.log10(50), np.log10(1000), num=20))))

  # Long list of orders.
  orders = np.concatenate((np.arange(2, 100 + 1, .5),
                           np.logspace(np.log10(100), np.log10(500), num=100)))
  delta = 1e-8

  n = votes.shape[0]
  eps_total = np.zeros(n)
  partition = [None] * n
  order_opt = np.full(n, np.nan, dtype=float)
  answered = np.zeros(n, dtype=float)

  # Running sums over the queries seen so far, one slot per order.
  rdp_cum = np.zeros(len(orders))
  rdp_sqrd_cum = np.zeros(len(orders))
  rdp_select_cum = np.zeros(len(orders))
  answered_sum = 0

  for i in range(n):
    v = votes[i,]
    if mechanism == 'lnmax':
      logq_lnmax = pate.compute_logq_laplace(v, noise_scale)
      rdp_query = pate.rdp_pure_eps(logq_lnmax, 2. / noise_scale, orders)
      rdp_sqrd = rdp_query ** 2
      pr_answered = 1
    elif mechanism == 'gnmax':
      logq_gmax = pate.compute_logq_gaussian(v, noise_scale)
      rdp_query = pate.rdp_gaussian(logq_gmax, noise_scale, orders)
      rdp_sqrd = rdp_query ** 2
      pr_answered = 1
    elif mechanism == 'gnmax_conf':
      logq_step1 = pate.compute_logpr_answered(params['t'], params['sigma1'], v)
      logq_step2 = pate.compute_logq_gaussian(v, noise_scale)
      q_step1 = np.exp(logq_step1)
      # Use the log-probability of the less likely outcome of Step 1.
      logq_step1_min = min(logq_step1, math.log1p(-q_step1))
      rdp_gnmax_step1 = pate.rdp_gaussian(logq_step1_min,
                                          2 ** .5 * params['sigma1'], orders)
      rdp_gnmax_step2 = pate.rdp_gaussian(logq_step2, noise_scale, orders)
      # Step 2 is only paid for when the query is answered (prob. q_step1).
      rdp_query = rdp_gnmax_step1 + q_step1 * rdp_gnmax_step2
      # The expression below evaluates
      #   E[(cost_of_step_1 + Bernoulli(pr_of_step_2) * cost_of_step_2)^2]
      rdp_sqrd = (
          rdp_gnmax_step1 ** 2 + 2 * rdp_gnmax_step1 * q_step1 * rdp_gnmax_step2
          + q_step1 * rdp_gnmax_step2 ** 2)
      rdp_select_cum += rdp_gnmax_step1
      pr_answered = q_step1
    else:
      raise ValueError(
          'Mechanism must be one of ["lnmax", "gnmax", "gnmax_conf"]')

    rdp_cum += rdp_query
    rdp_sqrd_cum += rdp_sqrd
    answered_sum += pr_answered

    answered[i] = answered_sum
    eps_total[i], order_opt[i] = pate.compute_eps_from_delta(
        orders, rdp_cum, delta)
    partition[i] = compute_partition(order_opt[i], eps_total[i])

    num_queries = i + 1  # Number of queries accumulated in the running sums.
    if num_queries % 1000 == 0:
      # The sums cover num_queries terms, so the means divide by num_queries
      # (not by the zero-based index i). Ignore Bessel's correction.
      rdp_var = rdp_sqrd_cum / num_queries - (rdp_cum / num_queries) ** 2
      order_opt_idx = np.searchsorted(orders, order_opt[i])
      # Rounding can push a near-zero variance slightly below zero; clamp it
      # so that the square root does not turn into NaN.
      eps_std = (num_queries *
                 max(rdp_var[order_opt_idx], 0.)) ** .5  # Std of the sum.
      print(
          'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} (std = {:.5f}) '
          'at order = {:.2f} (contribution from delta = {:.3f})'.format(
              num_queries, answered_sum, eps_total[i], eps_std, order_opt[i],
              -math.log(delta) / (order_opt[i] - 1)))
      sys.stdout.flush()

  return eps_total, partition, answered, order_opt
|
||
|
|
||
|
|
||
|
def print_plot_small(figures_dir, eps_lap, eps_gnmax, answered_gnmax):
  """Plots a graph of LNMax vs GNMax.

  The x axis is the number of answered queries, sampled every 10 queries up to
  6000. The figure is saved as 'lnmax_vs_gnmax.pdf' in `figures_dir` and then
  shown on screen.

  Args:
    figures_dir: A name of the directory where to save the plot.
    eps_lap: The cumulative privacy costs of the Laplace mechanism.
    eps_gnmax: The cumulative privacy costs of the Gaussian mechanism
    answered_gnmax: The cumulative count of queries answered.
  """
  xlim = 6000
  x_axis = range(0, int(xlim), 10)
  # Both curves start out as NaN and are filled only where data exist, so an
  # input with fewer queries than the x range yields a shorter curve instead
  # of an IndexError.
  y_lap = np.full(len(x_axis), np.nan, dtype=float)
  y_gnmax = np.full(len(x_axis), np.nan, dtype=float)

  for i, x in enumerate(x_axis):
    if x < len(eps_lap):
      y_lap[i] = eps_lap[x]
    # First query index at which at least x queries have been answered.
    idx = np.searchsorted(answered_gnmax, x)
    if idx < len(eps_gnmax):
      y_gnmax[i] = eps_gnmax[idx]

  fig, ax = plt.subplots()
  fig.set_figheight(4.5)
  fig.set_figwidth(4.7)
  ax.plot(
      x_axis, y_lap, color='r', ls='--', label='LNMax', alpha=.5, linewidth=5)
  ax.plot(
      x_axis,
      y_gnmax,
      color='g',
      ls='-',
      label='Confident-GNMax',
      alpha=.5,
      linewidth=5)
  plt.xticks(np.arange(0, 7000, 1000))
  plt.xlim([0, xlim])
  plt.ylim([0, 6.])
  plt.xlabel('Number of queries answered', fontsize=16)
  plt.ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16)
  plt.legend(loc=2, fontsize=13)  # loc=2 -- upper left
  ax.tick_params(labelsize=14)
  fout_name = os.path.join(figures_dir, 'lnmax_vs_gnmax.pdf')
  print('Saving the graph to ' + fout_name)
  fig.savefig(fout_name, bbox_inches='tight')
  plt.show()
|
||
|
|
||
|
|
||
|
def print_plot_large(figures_dir, eps_lap, eps_gnmax1, answered_gnmax1,
                     eps_gnmax2, partition_gnmax2, answered_gnmax2):
  """Plots a graph of LNMax vs GNMax with two parameters.

  The x axis is the number of answered queries, sampled every 10 queries up to
  6000. For parameter set 2 the area under the curve is additionally split
  into the share attributed to the first entry of the partition (hatched) and
  the remainder. The figure is saved as 'lnmax_vs_2xgnmax_large.pdf' in
  `figures_dir` and then shown on screen.

  Args:
    figures_dir: A name of the directory where to save the plot.
    eps_lap: The cumulative privacy costs of the Laplace mechanism.
    eps_gnmax1: The cumulative privacy costs of the Gaussian mechanism (set 1).
    answered_gnmax1: The cumulative count of queries answered (set 1).
    eps_gnmax2: The cumulative privacy costs of the Gaussian mechanism (set 2).
    partition_gnmax2: Allocation of eps for set 2; each entry must unpack into
      three fractions.
    answered_gnmax2: The cumulative count of queries answered (set 2).
  """
  xlim = 6000
  x_axis = range(0, int(xlim), 10)
  lenx = len(x_axis)
  # All curves start out as NaN and are filled only where data exist, so an
  # input with fewer queries than the x range yields shorter curves instead
  # of an IndexError.
  y_lap = np.full(lenx, np.nan, dtype=float)
  y_gnmax1 = np.full(lenx, np.nan, dtype=float)
  y_gnmax2 = np.full(lenx, np.nan, dtype=float)
  y1_gnmax2 = np.full(lenx, np.nan, dtype=float)

  for i, x in enumerate(x_axis):
    if x < len(eps_lap):
      y_lap[i] = eps_lap[x]
    # First query index at which at least x queries have been answered.
    idx1 = np.searchsorted(answered_gnmax1, x)
    if idx1 < len(eps_gnmax1):
      y_gnmax1[i] = eps_gnmax1[idx1]
    idx2 = np.searchsorted(answered_gnmax2, x)
    if idx2 < len(eps_gnmax2):
      y_gnmax2[i] = eps_gnmax2[idx2]
      fraction_step1, fraction_step2, _ = partition_gnmax2[idx2]
      # Share of eps due to step 1, relative to steps 1 and 2 only (the third
      # partition entry is left out of the ratio).
      y1_gnmax2[i] = eps_gnmax2[idx2] * fraction_step1 / (
          fraction_step1 + fraction_step2)

  fig, ax = plt.subplots()
  fig.set_figheight(4.5)
  fig.set_figwidth(4.7)
  ax.plot(
      x_axis,
      y_lap,
      color='r',
      ls='dashed',
      label='LNMax',
      alpha=.5,
      linewidth=5)
  ax.plot(
      x_axis,
      y_gnmax1,
      color='g',
      ls='-',
      label='Confident-GNMax (moderate)',
      alpha=.5,
      linewidth=5)
  ax.plot(
      x_axis,
      y_gnmax2,
      color='b',
      ls='-',
      label='Confident-GNMax (aggressive)',
      alpha=.5,
      linewidth=5)
  # Hatched band: the step-1 share of the set-2 cost.
  ax.fill_between(
      x_axis, [0] * lenx,
      y1_gnmax2.tolist(),
      facecolor='b',
      alpha=.3,
      hatch='\\')
  ax.plot(
      x_axis,
      y1_gnmax2,
      color='b',
      ls='-',
      label='_nolegend_',
      alpha=.5,
      linewidth=1)
  # Plain band: the rest of the set-2 cost.
  ax.fill_between(
      x_axis, y1_gnmax2.tolist(), y_gnmax2.tolist(), facecolor='b', alpha=.3)
  plt.xticks(np.arange(0, 7000, 1000))
  plt.xlim([0, xlim])
  plt.ylim([0, 1.])
  plt.xlabel('Number of queries answered', fontsize=16)
  plt.ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16)
  plt.legend(loc=2, fontsize=13)  # loc=2 -- upper left
  ax.tick_params(labelsize=14)
  fout_name = os.path.join(figures_dir, 'lnmax_vs_2xgnmax_large.pdf')
  print('Saving the graph to ' + fout_name)
  fig.savefig(fout_name, bbox_inches='tight')
  plt.show()
|
||
|
|
||
|
|
||
|
def run_all_analyses(votes, lambda_laplace, gnmax_parameters, sigma2):
  """Sequentially runs all analyses.

  The Laplace mechanism is analysed first, followed by one confident-GNMax
  analysis per entry of `gnmax_parameters`.

  Args:
    votes: A matrix of votes, where each row contains votes in one instance.
    lambda_laplace: The scale of the Laplace noise (lambda).
    gnmax_parameters: A list of parameters for GNMax.
    sigma2: Shared parameter for the GNMax mechanisms.

  Returns:
    A tuple (eps_lap, eps_gnmax, partition_gmax, answered, order_opt). eps_lap
    holds the cumulative Laplace cost per query; each of the other four is a
    list with one run_analysis result per entry of `gnmax_parameters`.
  """
  print('=== Laplace Mechanism ===')
  eps_lap = run_analysis(votes, 'lnmax', lambda_laplace, None)[0]
  print()

  # Does not go anywhere, for now
  # print('=== Gaussian Mechanism (simple) ===')
  # eps, _, _, _ = run_analysis(votes[:n,], 'gnmax', sigma1, None)

  # One slot per GNMax parameter set, appended in input order.
  eps_gnmax = []
  partition_gmax = []
  answered = []
  order_opt = []
  for gnmax_params in gnmax_parameters:
    print('=== Gaussian Mechanism (confident) {}: ==='.format(gnmax_params))
    eps, split, n_answered, best_order = run_analysis(
        votes, 'gnmax_conf', sigma2, gnmax_params)
    eps_gnmax.append(eps)
    partition_gmax.append(split)
    answered.append(n_answered)
    order_opt.append(best_order)
    print()

  return eps_lap, eps_gnmax, partition_gmax, answered, order_opt
|
||
|
|
||
|
|
||
|
def main(argv):
  """Runs (or loads from cache) the privacy analyses and draws both figures.

  Results are read from the pickle cache only when --cache is set and the
  cache file exists; otherwise they are recomputed from --counts_file and the
  cache file is (re)written.

  Args:
    argv: Command-line arguments left over after flag parsing (unused).
  """
  del argv  # Unused.

  lambda_laplace = 50.  # corresponds to eps = 1. / lambda_laplace
  sigma2 = 100  # GNMax parameters differ only in Step 1 (selection).

  # Parameters of the GNMax
  gnmax_parameters = (
      {'t': 1000, 'sigma1': 500},
      {'t': 3500, 'sigma1': 1500},
      {'t': 5000, 'sigma1': 1500},
  )

  # NOTE(review): the cache path is fixed and not keyed to the counts file, so
  # --cache can return results computed from a different input. The file is
  # also unpickled as-is; only use it when /tmp contents are trusted.
  ftemp_name = '/tmp/precomputed.pkl'

  figures_dir = os.path.expanduser(FLAGS.figures_dir)

  use_cache = FLAGS.cache and os.path.isfile(ftemp_name)
  if not use_cache:
    fin_name = os.path.expanduser(FLAGS.counts_file)
    print('Reading raw votes from ' + fin_name)
    sys.stdout.flush()

    votes = np.load(fin_name)

    results = run_all_analyses(votes, lambda_laplace, gnmax_parameters, sigma2)
    (eps_lap, eps_gnmax, partition_gmax, answered_gnmax,
     orders_opt_gnmax) = results

    print('Writing to cache ' + ftemp_name)
    with open(ftemp_name, 'wb') as cache_file:
      pickle.dump((eps_lap, eps_gnmax, partition_gmax, answered_gnmax,
                   orders_opt_gnmax), cache_file)
  else:
    print('Reading from cache ' + ftemp_name)
    with open(ftemp_name, 'rb') as cache_file:
      (eps_lap, eps_gnmax, partition_gmax, answered_gnmax,
       orders_opt_gnmax) = pickle.load(cache_file)

  # Small plot: LNMax against the first GNMax parameter set.
  print_plot_small(figures_dir, eps_lap, eps_gnmax[0], answered_gnmax[0])
  # Large plot: LNMax against the second and third GNMax parameter sets.
  print_plot_large(figures_dir, eps_lap, eps_gnmax[1], answered_gnmax[1],
                   eps_gnmax[2], partition_gmax[2], answered_gnmax[2])
  plt.close('all')
|
||
|
|
||
|
|
||
|
if __name__ == '__main__':
|
||
|
app.run(main)
|