From 93e9585f185a2b9b10f6f4c98433526bb7300a78 Mon Sep 17 00:00:00 2001 From: Nicolas Papernot Date: Mon, 14 Jan 2019 13:18:31 -0800 Subject: [PATCH] Add missing licenses. PiperOrigin-RevId: 229241117 --- research/README.md | 9 + research/pate_2017/README.md | 123 ++++ research/pate_2017/__init__.py | 1 + research/pate_2017/aggregation.py | 130 ++++ research/pate_2017/analysis.py | 304 +++++++++ research/pate_2017/deep_cnn.py | 603 ++++++++++++++++++ research/pate_2017/input.py | 424 ++++++++++++ research/pate_2017/metrics.py | 49 ++ research/pate_2017/train_student.py | 208 ++++++ ...nt_mnist_250_lap_20_count_50_epochs_600.sh | 25 + research/pate_2017/train_teachers.py | 104 +++ research/pate_2017/utils.py | 35 + research/pate_2018/ICLR2018/README.md | 61 ++ research/pate_2018/ICLR2018/download.py | 43 ++ .../pate_2018/ICLR2018/generate_figures.sh | 43 ++ research/pate_2018/ICLR2018/generate_table.sh | 93 +++ .../generate_table_data_independent.sh | 99 +++ research/pate_2018/ICLR2018/plot_ls_q.py | 105 +++ research/pate_2018/ICLR2018/plot_partition.py | 412 ++++++++++++ .../pate_2018/ICLR2018/plots_for_slides.py | 283 ++++++++ research/pate_2018/ICLR2018/rdp_bucketized.py | 263 ++++++++ research/pate_2018/ICLR2018/rdp_cumulative.py | 378 +++++++++++ .../ICLR2018/smooth_sensitivity_table.py | 358 +++++++++++ .../ICLR2018/utility_queries_answered.py | 90 +++ research/pate_2018/README.md | 71 +++ research/pate_2018/core.py | 370 +++++++++++ research/pate_2018/core_test.py | 124 ++++ research/pate_2018/smooth_sensitivity.py | 419 ++++++++++++ research/pate_2018/smooth_sensitivity_test.py | 126 ++++ 29 files changed, 5353 insertions(+) create mode 100644 research/README.md create mode 100644 research/pate_2017/README.md create mode 100644 research/pate_2017/__init__.py create mode 100644 research/pate_2017/aggregation.py create mode 100644 research/pate_2017/analysis.py create mode 100644 research/pate_2017/deep_cnn.py create mode 100644 research/pate_2017/input.py create mode 100644 research/pate_2017/metrics.py create mode 100644 research/pate_2017/train_student.py create mode 100644 research/pate_2017/train_student_mnist_250_lap_20_count_50_epochs_600.sh create mode 100644 research/pate_2017/train_teachers.py create mode 100644 research/pate_2017/utils.py create mode 100644 research/pate_2018/ICLR2018/README.md create mode 100644 research/pate_2018/ICLR2018/download.py create mode 100644 research/pate_2018/ICLR2018/generate_figures.sh create mode 100644 research/pate_2018/ICLR2018/generate_table.sh create mode 100644 research/pate_2018/ICLR2018/generate_table_data_independent.sh create mode 100644 research/pate_2018/ICLR2018/plot_ls_q.py create mode 100644 research/pate_2018/ICLR2018/plot_partition.py create mode 100644 research/pate_2018/ICLR2018/plots_for_slides.py create mode 100644 research/pate_2018/ICLR2018/rdp_bucketized.py create mode 100644 research/pate_2018/ICLR2018/rdp_cumulative.py create mode 100644 research/pate_2018/ICLR2018/smooth_sensitivity_table.py create mode 100644 research/pate_2018/ICLR2018/utility_queries_answered.py create mode 100644 research/pate_2018/README.md create mode 100644 research/pate_2018/core.py create mode 100644 research/pate_2018/core_test.py create mode 100644 research/pate_2018/smooth_sensitivity.py create mode 100644 research/pate_2018/smooth_sensitivity_test.py diff --git a/research/README.md b/research/README.md new file mode 100644 index 0000000..84ac2a9 --- /dev/null +++ b/research/README.md @@ -0,0 +1,9 @@ +# Research + +This folder contains 
code to reproduce results from research papers. Currently, +the following papers are included: + +* Semi-supervised Knowledge Transfer for Deep Learning from Private Training + Data (ICLR 2017): `pate_2017` + +* Scalable Private Learning with PATE (ICLR 2018): `pate_2018` diff --git a/research/pate_2017/README.md b/research/pate_2017/README.md new file mode 100644 index 0000000..b08d63a --- /dev/null +++ b/research/pate_2017/README.md @@ -0,0 +1,123 @@ +# Learning private models with multiple teachers + +This repository contains code to create a setup for learning privacy-preserving +student models by transferring knowledge from an ensemble of teachers trained +on disjoint subsets of the data for which privacy guarantees are to be provided. + +Knowledge acquired by teachers is transferred to the student in a differentially +private manner by noisily aggregating the teacher decisions before feeding them +to the student during training. + +The paper describing the approach is [arXiv:1610.05755](https://arxiv.org/abs/1610.05755) + +## Dependencies + +This model uses `TensorFlow` to perform numerical computations associated with +machine learning models, as well as common Python libraries like: `numpy`, +`scipy`, and `six`. Instructions to install these can be found in their +respective documentations. + +## How to run + +This repository supports the MNIST and SVHN datasets. The following +instructions are given for MNIST but can easily be adapted by replacing the +flag `--dataset=mnist` by `--dataset=svhn`. +There are 2 steps: teacher training and student training. Data will be +automatically downloaded when you start the teacher training. + +The following is a two-step process: first we train an ensemble of teacher +models and second we train a student using predictions made by this ensemble. + +**Training the teachers:** first run the `train_teachers.py` file with at least +three flags specifying (1) the number of teachers, (2) the ID of the teacher +you are training among these teachers, and (3) the dataset on which to train. +For instance, to train teacher number 10 among an ensemble of 100 teachers for +MNIST, you use the following command: + +``` +python train_teachers.py --nb_teachers=100 --teacher_id=10 --dataset=mnist +``` + +Other flags like `train_dir` and `data_dir` should optionally be set to +respectively point to the directory where model checkpoints and temporary data +(like the dataset) should be saved. The flag `max_steps` (default at 3000) +controls the length of training. See `train_teachers.py` and `deep_cnn.py` +to find available flags and their descriptions. + +**Training the student:** once the teachers are all trained, e.g., teachers +with IDs `0` to `99` are trained for `nb_teachers=100`, we are ready to train +the student. The student is trained by labeling some of the test data with +predictions from the teachers. The predictions are aggregated by counting the +votes assigned to each class among the ensemble of teachers, adding Laplacian +noise to these votes, and assigning the label with the maximum noisy vote count +to the sample. This is detailed in function `noisy_max` in the file +`aggregation.py`. To learn the student, use the following command: + +``` +python train_student.py --nb_teachers=100 --dataset=mnist --stdnt_share=5000 +``` + +The flag `--stdnt_share=5000` indicates that the student should be able to +use the first `5000` samples of the dataset's test subset as unlabeled +training points (they will be labeled using the teacher predictions). 
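+
+For illustration, the noisy aggregation used to label these student training
+points can be sketched as follows. This is a simplified, hypothetical
+`noisy_max_sketch` that mirrors `noisy_max` in `aggregation.py`; the real
+implementation can additionally return the clean vote counts needed for the
+privacy analysis.
+
+```
+import numpy as np
+
+def noisy_max_sketch(teacher_labels, lap_scale, nb_labels=10):
+  # teacher_labels: integer array of shape (nb_teachers, nb_samples).
+  nb_samples = teacher_labels.shape[1]
+  result = np.zeros(nb_samples, dtype=np.int32)
+  for i in range(nb_samples):
+    # Count the votes received by each class for this sample.
+    counts = np.bincount(teacher_labels[:, i], minlength=nb_labels)
+    # Perturb each count with independent Laplacian noise.
+    counts = counts.astype(np.float32)
+    counts += np.random.laplace(loc=0.0, scale=float(lap_scale), size=nb_labels)
+    # The noisy argmax becomes the label given to the student.
+    result[i] = np.argmax(counts)
+  return result
+```
+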
The +remaining samples are used for evaluation of the student's accuracy, which +is displayed upon completion of training. + +## Using semi-supervised GANs to train the student + +In the paper, we describe how to train the student in a semi-supervised +fashion using Generative Adversarial Networks. This can be reproduced for MNIST +by cloning the [improved-gan](https://github.com/openai/improved-gan) +repository and adding to your `PATH` variable before running the shell +script `train_student_mnist_250_lap_20_count_50_epochs_600.sh`. + +``` +export PATH="/path/to/improved-gan/mnist_svhn_cifar10":$PATH +sh train_student_mnist_250_lap_20_count_50_epochs_600.sh +``` + + +## Alternative deeper convolutional architecture + +Note that a deeper convolutional model is available. Both the default and +deeper models graphs are defined in `deep_cnn.py`, respectively by +functions `inference` and `inference_deeper`. Use the flag `--deeper=true` +to switch to that model when launching `train_teachers.py` and +`train_student.py`. + +## Privacy analysis + +In the paper, we detail how data-dependent differential privacy bounds can be +computed to estimate the cost of training the student. In order to reproduce +the bounds given in the paper, we include the label predicted by our two +teacher ensembles: MNIST and SVHN. You can run the privacy analysis for each +dataset with the following commands: + +``` +python analysis.py --counts_file=mnist_250_teachers_labels.npy --indices_file=mnist_250_teachers_100_indices_used_by_student.npy + +python analysis.py --counts_file=svhn_250_teachers_labels.npy --max_examples=1000 --delta=1e-6 +``` + +To expedite experimentation with the privacy analysis of student training, +the `analysis.py` file is configured to download the labels produced by 250 +teacher models, for MNIST and SVHN when running the two commands included +above. These 250 teacher models were trained using the following command lines, +where `XXX` takes values between `0` and `249`: + +``` +python train_teachers.py --nb_teachers=250 --teacher_id=XXX --dataset=mnist +python train_teachers.py --nb_teachers=250 --teacher_id=XXX --dataset=svhn +``` + +Note that these labels may also be used in lieu of function `ensemble_preds` +in `train_student.py`, to compare the performance of alternative student model +architectures and learning techniques. This facilitates future work, by +removing the need for training the MNIST and SVHN teacher ensembles when +proposing new student training approaches. + +## Contact + +To ask questions, please email `nicolas@papernot.fr` or open an issue on +the `tensorflow/models` issues tracker. Please assign issues to +[@npapernot](https://github.com/npapernot). diff --git a/research/pate_2017/__init__.py b/research/pate_2017/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/research/pate_2017/__init__.py @@ -0,0 +1 @@ + diff --git a/research/pate_2017/aggregation.py b/research/pate_2017/aggregation.py new file mode 100644 index 0000000..5cad35c --- /dev/null +++ b/research/pate_2017/aggregation.py @@ -0,0 +1,130 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from six.moves import xrange + + +def labels_from_probs(probs): + """ + Helper function: computes argmax along last dimension of array to obtain + labels (max prob or max logit value) + :param probs: numpy array where probabilities or logits are on last dimension + :return: array with same shape as input besides last dimension with shape 1 + now containing the labels + """ + # Compute last axis index + last_axis = len(np.shape(probs)) - 1 + + # Label is argmax over last dimension + labels = np.argmax(probs, axis=last_axis) + + # Return as np.int32 + return np.asarray(labels, dtype=np.int32) + + +def noisy_max(logits, lap_scale, return_clean_votes=False): + """ + This aggregation mechanism takes the softmax/logit output of several models + resulting from inference on identical inputs and computes the noisy-max of + the votes for candidate classes to select a label for each sample: it + adds Laplacian noise to label counts and returns the most frequent label. + :param logits: logits or probabilities for each sample + :param lap_scale: scale of the Laplacian noise to be added to counts + :param return_clean_votes: if set to True, also returns clean votes (without + Laplacian noise). This can be used to perform the + privacy analysis of this aggregation mechanism. + :return: pair of result and (if clean_votes is set to True) the clean counts + for each class per sample and the original labels produced by + the teachers. 
+ """ + + # Compute labels from logits/probs and reshape array properly + labels = labels_from_probs(logits) + labels_shape = np.shape(labels) + labels = labels.reshape((labels_shape[0], labels_shape[1])) + + # Initialize array to hold final labels + result = np.zeros(int(labels_shape[1])) + + if return_clean_votes: + # Initialize array to hold clean votes for each sample + clean_votes = np.zeros((int(labels_shape[1]), 10)) + + # Parse each sample + for i in xrange(int(labels_shape[1])): + # Count number of votes assigned to each class + label_counts = np.bincount(labels[:, i], minlength=10) + + if return_clean_votes: + # Store vote counts for export + clean_votes[i] = label_counts + + # Cast in float32 to prepare before addition of Laplacian noise + label_counts = np.asarray(label_counts, dtype=np.float32) + + # Sample independent Laplacian noise for each class + for item in xrange(10): + label_counts[item] += np.random.laplace(loc=0.0, scale=float(lap_scale)) + + # Result is the most frequent label + result[i] = np.argmax(label_counts) + + # Cast labels to np.int32 for compatibility with deep_cnn.py feed dictionaries + result = np.asarray(result, dtype=np.int32) + + if return_clean_votes: + # Returns several array, which are later saved: + # result: labels obtained from the noisy aggregation + # clean_votes: the number of teacher votes assigned to each sample and class + # labels: the labels assigned by teachers (before the noisy aggregation) + return result, clean_votes, labels + else: + # Only return labels resulting from noisy aggregation + return result + + +def aggregation_most_frequent(logits): + """ + This aggregation mechanism takes the softmax/logit output of several models + resulting from inference on identical inputs and computes the most frequent + label. It is deterministic (no noise injection like noisy_max() above. + :param logits: logits or probabilities for each sample + :return: + """ + # Compute labels from logits/probs and reshape array properly + labels = labels_from_probs(logits) + labels_shape = np.shape(labels) + labels = labels.reshape((labels_shape[0], labels_shape[1])) + + # Initialize array to hold final labels + result = np.zeros(int(labels_shape[1])) + + # Parse each sample + for i in xrange(int(labels_shape[1])): + # Count number of votes assigned to each class + label_counts = np.bincount(labels[:, i], minlength=10) + + label_counts = np.asarray(label_counts, dtype=np.int32) + + # Result is the most frequent label + result[i] = np.argmax(label_counts) + + return np.asarray(result, dtype=np.int32) diff --git a/research/pate_2017/analysis.py b/research/pate_2017/analysis.py new file mode 100644 index 0000000..8a291f3 --- /dev/null +++ b/research/pate_2017/analysis.py @@ -0,0 +1,304 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +""" +This script computes bounds on the privacy cost of training the +student model from noisy aggregation of labels predicted by teachers. +It should be used only after training the student (and therefore the +teachers as well). We however include the label files required to +reproduce key results from our paper (https://arxiv.org/abs/1610.05755): +the epsilon bounds for MNIST and SVHN students. + +The command that computes the epsilon bound associated +with the training of the MNIST student model (100 label queries +with a (1/20)*2=0.1 epsilon bound each) is: + +python analysis.py + --counts_file=mnist_250_teachers_labels.npy + --indices_file=mnist_250_teachers_100_indices_used_by_student.npy + +The command that computes the epsilon bound associated +with the training of the SVHN student model (1000 label queries +with a (1/20)*2=0.1 epsilon bound each) is: + +python analysis.py + --counts_file=svhn_250_teachers_labels.npy + --max_examples=1000 + --delta=1e-6 +""" +import os +import math +import numpy as np +from six.moves import xrange +import tensorflow as tf + +from differential_privacy.multiple_teachers.input import maybe_download + +# These parameters can be changed to compute bounds for different failure rates +# or different model predictions. + +tf.flags.DEFINE_integer("moments",8, "Number of moments") +tf.flags.DEFINE_float("noise_eps", 0.1, "Eps value for each call to noisymax.") +tf.flags.DEFINE_float("delta", 1e-5, "Target value of delta.") +tf.flags.DEFINE_float("beta", 0.09, "Value of beta for smooth sensitivity") +tf.flags.DEFINE_string("counts_file","","Numpy matrix with raw counts") +tf.flags.DEFINE_string("indices_file","", + "File containting a numpy matrix with indices used." + "Optional. Use the first max_examples indices if this is not provided.") +tf.flags.DEFINE_integer("max_examples",1000, + "Number of examples to use. We will use the first" + " max_examples many examples from the counts_file" + " or indices_file to do the privacy cost estimate") +tf.flags.DEFINE_float("too_small", 1e-10, "Small threshold to avoid log of 0") +tf.flags.DEFINE_bool("input_is_counts", False, "False if labels, True if counts") + +FLAGS = tf.flags.FLAGS + + +def compute_q_noisy_max(counts, noise_eps): + """returns ~ Pr[outcome != winner]. + + Args: + counts: a list of scores + noise_eps: privacy parameter for noisy_max + Returns: + q: the probability that outcome is different from true winner. + """ + # For noisy max, we only get an upper bound. + # Pr[ j beats i*] \leq (2+gap(j,i*))/ 4 exp(gap(j,i*) + # proof at http://mathoverflow.net/questions/66763/ + # tight-bounds-on-probability-of-sum-of-laplace-random-variables + + winner = np.argmax(counts) + counts_normalized = noise_eps * (counts - counts[winner]) + counts_rest = np.array( + [counts_normalized[i] for i in xrange(len(counts)) if i != winner]) + q = 0.0 + for c in counts_rest: + gap = -c + q += (gap + 2.0) / (4.0 * math.exp(gap)) + return min(q, 1.0 - (1.0/len(counts))) + + +def compute_q_noisy_max_approx(counts, noise_eps): + """returns ~ Pr[outcome != winner]. + + Args: + counts: a list of scores + noise_eps: privacy parameter for noisy_max + Returns: + q: the probability that outcome is different from true winner. + """ + # For noisy max, we only get an upper bound. 
+ # Pr[ j beats i*] \leq (2+gap(j,i*))/ 4 exp(gap(j,i*) + # proof at http://mathoverflow.net/questions/66763/ + # tight-bounds-on-probability-of-sum-of-laplace-random-variables + # This code uses an approximation that is faster and easier + # to get local sensitivity bound on. + + winner = np.argmax(counts) + counts_normalized = noise_eps * (counts - counts[winner]) + counts_rest = np.array( + [counts_normalized[i] for i in xrange(len(counts)) if i != winner]) + gap = -max(counts_rest) + q = (len(counts) - 1) * (gap + 2.0) / (4.0 * math.exp(gap)) + return min(q, 1.0 - (1.0/len(counts))) + + +def logmgf_exact(q, priv_eps, l): + """Computes the logmgf value given q and privacy eps. + + The bound used is the min of three terms. The first term is from + https://arxiv.org/pdf/1605.02065.pdf. + The second term is based on the fact that when event has probability (1-q) for + q close to zero, q can only change by exp(eps), which corresponds to a + much smaller multiplicative change in (1-q) + The third term comes directly from the privacy guarantee. + Args: + q: pr of non-optimal outcome + priv_eps: eps parameter for DP + l: moment to compute. + Returns: + Upper bound on logmgf + """ + if q < 0.5: + t_one = (1-q) * math.pow((1-q) / (1 - math.exp(priv_eps) * q), l) + t_two = q * math.exp(priv_eps * l) + t = t_one + t_two + try: + log_t = math.log(t) + except ValueError: + print("Got ValueError in math.log for values :" + str((q, priv_eps, l, t))) + log_t = priv_eps * l + else: + log_t = priv_eps * l + + return min(0.5 * priv_eps * priv_eps * l * (l + 1), log_t, priv_eps * l) + + +def logmgf_from_counts(counts, noise_eps, l): + """ + ReportNoisyMax mechanism with noise_eps with 2*noise_eps-DP + in our setting where one count can go up by one and another + can go down by 1. + """ + + q = compute_q_noisy_max(counts, noise_eps) + return logmgf_exact(q, 2.0 * noise_eps, l) + + +def sens_at_k(counts, noise_eps, l, k): + """Return sensitivity at distane k. + + Args: + counts: an array of scores + noise_eps: noise parameter used + l: moment whose sensitivity is being computed + k: distance + Returns: + sensitivity: at distance k + """ + counts_sorted = sorted(counts, reverse=True) + if 0.5 * noise_eps * l > 1: + print("l too large to compute sensitivity") + return 0 + # Now we can assume that at k, gap remains positive + # or we have reached the point where logmgf_exact is + # determined by the first term and ind of q. + if counts[0] < counts[1] + k: + return 0 + counts_sorted[0] -= k + counts_sorted[1] += k + val = logmgf_from_counts(counts_sorted, noise_eps, l) + counts_sorted[0] -= 1 + counts_sorted[1] += 1 + val_changed = logmgf_from_counts(counts_sorted, noise_eps, l) + return val_changed - val + + +def smoothed_sens(counts, noise_eps, l, beta): + """Compute beta-smooth sensitivity. 
+ + Args: + counts: array of scors + noise_eps: noise parameter + l: moment of interest + beta: smoothness parameter + Returns: + smooth_sensitivity: a beta smooth upper bound + """ + k = 0 + smoothed_sensitivity = sens_at_k(counts, noise_eps, l, k) + while k < max(counts): + k += 1 + sensitivity_at_k = sens_at_k(counts, noise_eps, l, k) + smoothed_sensitivity = max( + smoothed_sensitivity, + math.exp(-beta * k) * sensitivity_at_k) + if sensitivity_at_k == 0.0: + break + return smoothed_sensitivity + + +def main(unused_argv): + ################################################################## + # If we are reproducing results from paper https://arxiv.org/abs/1610.05755, + # download the required binaries with label information. + ################################################################## + + # Binaries for MNIST results + paper_binaries_mnist = \ + ["https://github.com/npapernot/multiple-teachers-for-privacy/blob/master/mnist_250_teachers_labels.npy?raw=true", + "https://github.com/npapernot/multiple-teachers-for-privacy/blob/master/mnist_250_teachers_100_indices_used_by_student.npy?raw=true"] + if FLAGS.counts_file == "mnist_250_teachers_labels.npy" \ + or FLAGS.indices_file == "mnist_250_teachers_100_indices_used_by_student.npy": + maybe_download(paper_binaries_mnist, os.getcwd()) + + # Binaries for SVHN results + paper_binaries_svhn = ["https://github.com/npapernot/multiple-teachers-for-privacy/blob/master/svhn_250_teachers_labels.npy?raw=true"] + if FLAGS.counts_file == "svhn_250_teachers_labels.npy": + maybe_download(paper_binaries_svhn, os.getcwd()) + + input_mat = np.load(FLAGS.counts_file) + if FLAGS.input_is_counts: + counts_mat = input_mat + else: + # In this case, the input is the raw predictions. Transform + num_teachers, n = input_mat.shape + counts_mat = np.zeros((n, 10)).astype(np.int32) + for i in range(n): + for j in range(num_teachers): + counts_mat[i, int(input_mat[j, i])] += 1 + n = counts_mat.shape[0] + num_examples = min(n, FLAGS.max_examples) + + if not FLAGS.indices_file: + indices = np.array(range(num_examples)) + else: + index_list = np.load(FLAGS.indices_file) + indices = index_list[:num_examples] + + l_list = 1.0 + np.array(xrange(FLAGS.moments)) + beta = FLAGS.beta + total_log_mgf_nm = np.array([0.0 for _ in l_list]) + total_ss_nm = np.array([0.0 for _ in l_list]) + noise_eps = FLAGS.noise_eps + + for i in indices: + total_log_mgf_nm += np.array( + [logmgf_from_counts(counts_mat[i], noise_eps, l) + for l in l_list]) + total_ss_nm += np.array( + [smoothed_sens(counts_mat[i], noise_eps, l, beta) + for l in l_list]) + delta = FLAGS.delta + + # We want delta = exp(alpha - eps l). + # Solving gives eps = (alpha - ln (delta))/l + eps_list_nm = (total_log_mgf_nm - math.log(delta)) / l_list + + print("Epsilons (Noisy Max): " + str(eps_list_nm)) + print("Smoothed sensitivities (Noisy Max): " + str(total_ss_nm / l_list)) + + # If beta < eps / 2 ln (1/delta), then adding noise Lap(1) * 2 SS/eps + # is eps,delta DP + # Also if beta < eps / 2(gamma +1), then adding noise 2(gamma+1) SS eta / eps + # where eta has density proportional to 1 / (1+|z|^gamma) is eps-DP + # Both from Corolloary 2.4 in + # http://www.cse.psu.edu/~ads22/pubs/NRS07/NRS07-full-draft-v1.pdf + # Print the first one's scale + ss_eps = 2.0 * beta * math.log(1/delta) + ss_scale = 2.0 / ss_eps + print("To get an " + str(ss_eps) + "-DP estimate of epsilon, ") + print("..add noise ~ " + str(ss_scale)) + print("... 
times " + str(total_ss_nm / l_list)) + print("Epsilon = " + str(min(eps_list_nm)) + ".") + if min(eps_list_nm) == eps_list_nm[-1]: + print("Warning: May not have used enough values of l") + + # Data independent bound, as mechanism is + # 2*noise_eps DP. + data_ind_log_mgf = np.array([0.0 for _ in l_list]) + data_ind_log_mgf += num_examples * np.array( + [logmgf_exact(1.0, 2.0 * noise_eps, l) for l in l_list]) + + data_ind_eps_list = (data_ind_log_mgf - math.log(delta)) / l_list + print("Data independent bound = " + str(min(data_ind_eps_list)) + ".") + + return + + +if __name__ == "__main__": + tf.app.run() diff --git a/research/pate_2017/deep_cnn.py b/research/pate_2017/deep_cnn.py new file mode 100644 index 0000000..917ddc3 --- /dev/null +++ b/research/pate_2017/deep_cnn.py @@ -0,0 +1,603 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from datetime import datetime +import math +import numpy as np +from six.moves import xrange +import tensorflow as tf +import time + +from differential_privacy.multiple_teachers import utils + +FLAGS = tf.app.flags.FLAGS + +# Basic model parameters. +tf.app.flags.DEFINE_integer('dropout_seed', 123, """seed for dropout.""") +tf.app.flags.DEFINE_integer('batch_size', 128, """Nb of images in a batch.""") +tf.app.flags.DEFINE_integer('epochs_per_decay', 350, """Nb epochs per decay""") +tf.app.flags.DEFINE_integer('learning_rate', 5, """100 * learning rate""") +tf.app.flags.DEFINE_boolean('log_device_placement', False, """see TF doc""") + + +# Constants describing the training process. +MOVING_AVERAGE_DECAY = 0.9999 # The decay to use for the moving average. +LEARNING_RATE_DECAY_FACTOR = 0.1 # Learning rate decay factor. + + +def _variable_on_cpu(name, shape, initializer): + """Helper to create a Variable stored on CPU memory. + + Args: + name: name of the variable + shape: list of ints + initializer: initializer for Variable + + Returns: + Variable Tensor + """ + with tf.device('/cpu:0'): + var = tf.get_variable(name, shape, initializer=initializer) + return var + + +def _variable_with_weight_decay(name, shape, stddev, wd): + """Helper to create an initialized Variable with weight decay. + + Note that the Variable is initialized with a truncated normal distribution. + A weight decay is added only if one is specified. + + Args: + name: name of the variable + shape: list of ints + stddev: standard deviation of a truncated Gaussian + wd: add L2Loss weight decay multiplied by this float. If None, weight + decay is not added for this Variable. 
+ + Returns: + Variable Tensor + """ + var = _variable_on_cpu(name, shape, + tf.truncated_normal_initializer(stddev=stddev)) + if wd is not None: + weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss') + tf.add_to_collection('losses', weight_decay) + return var + + +def inference(images, dropout=False): + """Build the CNN model. + Args: + images: Images returned from distorted_inputs() or inputs(). + dropout: Boolean controlling whether to use dropout or not + Returns: + Logits + """ + if FLAGS.dataset == 'mnist': + first_conv_shape = [5, 5, 1, 64] + else: + first_conv_shape = [5, 5, 3, 64] + + # conv1 + with tf.variable_scope('conv1') as scope: + kernel = _variable_with_weight_decay('weights', + shape=first_conv_shape, + stddev=1e-4, + wd=0.0) + conv = tf.nn.conv2d(images, kernel, [1, 1, 1, 1], padding='SAME') + biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0)) + bias = tf.nn.bias_add(conv, biases) + conv1 = tf.nn.relu(bias, name=scope.name) + if dropout: + conv1 = tf.nn.dropout(conv1, 0.3, seed=FLAGS.dropout_seed) + + + # pool1 + pool1 = tf.nn.max_pool(conv1, + ksize=[1, 3, 3, 1], + strides=[1, 2, 2, 1], + padding='SAME', + name='pool1') + + # norm1 + norm1 = tf.nn.lrn(pool1, + 4, + bias=1.0, + alpha=0.001 / 9.0, + beta=0.75, + name='norm1') + + # conv2 + with tf.variable_scope('conv2') as scope: + kernel = _variable_with_weight_decay('weights', + shape=[5, 5, 64, 128], + stddev=1e-4, + wd=0.0) + conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME') + biases = _variable_on_cpu('biases', [128], tf.constant_initializer(0.1)) + bias = tf.nn.bias_add(conv, biases) + conv2 = tf.nn.relu(bias, name=scope.name) + if dropout: + conv2 = tf.nn.dropout(conv2, 0.3, seed=FLAGS.dropout_seed) + + + # norm2 + norm2 = tf.nn.lrn(conv2, + 4, + bias=1.0, + alpha=0.001 / 9.0, + beta=0.75, + name='norm2') + + # pool2 + pool2 = tf.nn.max_pool(norm2, + ksize=[1, 3, 3, 1], + strides=[1, 2, 2, 1], + padding='SAME', + name='pool2') + + # local3 + with tf.variable_scope('local3') as scope: + # Move everything into depth so we can perform a single matrix multiply. + reshape = tf.reshape(pool2, [FLAGS.batch_size, -1]) + dim = reshape.get_shape()[1].value + weights = _variable_with_weight_decay('weights', + shape=[dim, 384], + stddev=0.04, + wd=0.004) + biases = _variable_on_cpu('biases', [384], tf.constant_initializer(0.1)) + local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name) + if dropout: + local3 = tf.nn.dropout(local3, 0.5, seed=FLAGS.dropout_seed) + + # local4 + with tf.variable_scope('local4') as scope: + weights = _variable_with_weight_decay('weights', + shape=[384, 192], + stddev=0.04, + wd=0.004) + biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1)) + local4 = tf.nn.relu(tf.matmul(local3, weights) + biases, name=scope.name) + if dropout: + local4 = tf.nn.dropout(local4, 0.5, seed=FLAGS.dropout_seed) + + # compute logits + with tf.variable_scope('softmax_linear') as scope: + weights = _variable_with_weight_decay('weights', + [192, FLAGS.nb_labels], + stddev=1/192.0, + wd=0.0) + biases = _variable_on_cpu('biases', + [FLAGS.nb_labels], + tf.constant_initializer(0.0)) + logits = tf.add(tf.matmul(local4, weights), biases, name=scope.name) + + return logits + + +def inference_deeper(images, dropout=False): + """Build a deeper CNN model. + Args: + images: Images returned from distorted_inputs() or inputs(). 
+ dropout: Boolean controlling whether to use dropout or not + Returns: + Logits + """ + if FLAGS.dataset == 'mnist': + first_conv_shape = [3, 3, 1, 96] + else: + first_conv_shape = [3, 3, 3, 96] + + # conv1 + with tf.variable_scope('conv1') as scope: + kernel = _variable_with_weight_decay('weights', + shape=first_conv_shape, + stddev=0.05, + wd=0.0) + conv = tf.nn.conv2d(images, kernel, [1, 1, 1, 1], padding='SAME') + biases = _variable_on_cpu('biases', [96], tf.constant_initializer(0.0)) + bias = tf.nn.bias_add(conv, biases) + conv1 = tf.nn.relu(bias, name=scope.name) + + # conv2 + with tf.variable_scope('conv2') as scope: + kernel = _variable_with_weight_decay('weights', + shape=[3, 3, 96, 96], + stddev=0.05, + wd=0.0) + conv = tf.nn.conv2d(conv1, kernel, [1, 1, 1, 1], padding='SAME') + biases = _variable_on_cpu('biases', [96], tf.constant_initializer(0.0)) + bias = tf.nn.bias_add(conv, biases) + conv2 = tf.nn.relu(bias, name=scope.name) + + # conv3 + with tf.variable_scope('conv3') as scope: + kernel = _variable_with_weight_decay('weights', + shape=[3, 3, 96, 96], + stddev=0.05, + wd=0.0) + conv = tf.nn.conv2d(conv2, kernel, [1, 2, 2, 1], padding='SAME') + biases = _variable_on_cpu('biases', [96], tf.constant_initializer(0.0)) + bias = tf.nn.bias_add(conv, biases) + conv3 = tf.nn.relu(bias, name=scope.name) + if dropout: + conv3 = tf.nn.dropout(conv3, 0.5, seed=FLAGS.dropout_seed) + + # conv4 + with tf.variable_scope('conv4') as scope: + kernel = _variable_with_weight_decay('weights', + shape=[3, 3, 96, 192], + stddev=0.05, + wd=0.0) + conv = tf.nn.conv2d(conv3, kernel, [1, 1, 1, 1], padding='SAME') + biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.0)) + bias = tf.nn.bias_add(conv, biases) + conv4 = tf.nn.relu(bias, name=scope.name) + + # conv5 + with tf.variable_scope('conv5') as scope: + kernel = _variable_with_weight_decay('weights', + shape=[3, 3, 192, 192], + stddev=0.05, + wd=0.0) + conv = tf.nn.conv2d(conv4, kernel, [1, 1, 1, 1], padding='SAME') + biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.0)) + bias = tf.nn.bias_add(conv, biases) + conv5 = tf.nn.relu(bias, name=scope.name) + + # conv6 + with tf.variable_scope('conv6') as scope: + kernel = _variable_with_weight_decay('weights', + shape=[3, 3, 192, 192], + stddev=0.05, + wd=0.0) + conv = tf.nn.conv2d(conv5, kernel, [1, 2, 2, 1], padding='SAME') + biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.0)) + bias = tf.nn.bias_add(conv, biases) + conv6 = tf.nn.relu(bias, name=scope.name) + if dropout: + conv6 = tf.nn.dropout(conv6, 0.5, seed=FLAGS.dropout_seed) + + + # conv7 + with tf.variable_scope('conv7') as scope: + kernel = _variable_with_weight_decay('weights', + shape=[5, 5, 192, 192], + stddev=1e-4, + wd=0.0) + conv = tf.nn.conv2d(conv6, kernel, [1, 1, 1, 1], padding='SAME') + biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1)) + bias = tf.nn.bias_add(conv, biases) + conv7 = tf.nn.relu(bias, name=scope.name) + + + # local1 + with tf.variable_scope('local1') as scope: + # Move everything into depth so we can perform a single matrix multiply. 
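+    # conv7 has shape [batch_size, height, width, 192]; flattening it to
+    # [batch_size, height * width * 192] lets the fully connected layer be
+    # expressed as a single matmul.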
+ reshape = tf.reshape(conv7, [FLAGS.batch_size, -1]) + dim = reshape.get_shape()[1].value + weights = _variable_with_weight_decay('weights', + shape=[dim, 192], + stddev=0.05, + wd=0) + biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1)) + local1 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name) + + # local2 + with tf.variable_scope('local2') as scope: + weights = _variable_with_weight_decay('weights', + shape=[192, 192], + stddev=0.05, + wd=0) + biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1)) + local2 = tf.nn.relu(tf.matmul(local1, weights) + biases, name=scope.name) + if dropout: + local2 = tf.nn.dropout(local2, 0.5, seed=FLAGS.dropout_seed) + + # compute logits + with tf.variable_scope('softmax_linear') as scope: + weights = _variable_with_weight_decay('weights', + [192, FLAGS.nb_labels], + stddev=0.05, + wd=0.0) + biases = _variable_on_cpu('biases', + [FLAGS.nb_labels], + tf.constant_initializer(0.0)) + logits = tf.add(tf.matmul(local2, weights), biases, name=scope.name) + + return logits + + +def loss_fun(logits, labels): + """Add L2Loss to all the trainable variables. + + Add summary for "Loss" and "Loss/avg". + Args: + logits: Logits from inference(). + labels: Labels from distorted_inputs or inputs(). 1-D tensor + of shape [batch_size] + distillation: if set to True, use probabilities and not class labels to + compute softmax loss + + Returns: + Loss tensor of type float. + """ + + # Calculate the cross entropy between labels and predictions + labels = tf.cast(labels, tf.int64) + cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=logits, labels=labels, name='cross_entropy_per_example') + + # Calculate the average cross entropy loss across the batch. + cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy') + + # Add to TF collection for losses + tf.add_to_collection('losses', cross_entropy_mean) + + # The total loss is defined as the cross entropy loss plus all of the weight + # decay terms (L2 loss). + return tf.add_n(tf.get_collection('losses'), name='total_loss') + + +def moving_av(total_loss): + """ + Generates moving average for all losses + + Args: + total_loss: Total loss from loss(). + Returns: + loss_averages_op: op for generating moving averages of losses. + """ + # Compute the moving average of all individual losses and the total loss. + loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg') + losses = tf.get_collection('losses') + loss_averages_op = loss_averages.apply(losses + [total_loss]) + + return loss_averages_op + + +def train_op_fun(total_loss, global_step): + """Train model. + + Create an optimizer and apply to all trainable variables. Add moving + average for all trainable variables. + + Args: + total_loss: Total loss from loss(). + global_step: Integer Variable counting the number of training steps + processed. + Returns: + train_op: op for training. + """ + # Variables that affect learning rate. + nb_ex_per_train_epoch = int(60000 / FLAGS.nb_teachers) + + num_batches_per_epoch = nb_ex_per_train_epoch / FLAGS.batch_size + decay_steps = int(num_batches_per_epoch * FLAGS.epochs_per_decay) + + initial_learning_rate = float(FLAGS.learning_rate) / 100.0 + + # Decay the learning rate exponentially based on the number of steps. 
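+  # With staircase=True, tf.train.exponential_decay computes
+  #   lr = initial_learning_rate * LEARNING_RATE_DECAY_FACTOR ** floor(global_step / decay_steps)
+  # i.e. with the default decay factor of 0.1, the learning rate is divided by
+  # 10 every decay_steps training steps.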
+ lr = tf.train.exponential_decay(initial_learning_rate, + global_step, + decay_steps, + LEARNING_RATE_DECAY_FACTOR, + staircase=True) + tf.summary.scalar('learning_rate', lr) + + # Generate moving averages of all losses and associated summaries. + loss_averages_op = moving_av(total_loss) + + # Compute gradients. + with tf.control_dependencies([loss_averages_op]): + opt = tf.train.GradientDescentOptimizer(lr) + grads = opt.compute_gradients(total_loss) + + # Apply gradients. + apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) + + # Add histograms for trainable variables. + for var in tf.trainable_variables(): + tf.summary.histogram(var.op.name, var) + + # Track the moving averages of all trainable variables. + variable_averages = tf.train.ExponentialMovingAverage( + MOVING_AVERAGE_DECAY, global_step) + variables_averages_op = variable_averages.apply(tf.trainable_variables()) + + with tf.control_dependencies([apply_gradient_op, variables_averages_op]): + train_op = tf.no_op(name='train') + + return train_op + + +def _input_placeholder(): + """ + This helper function declares a TF placeholder for the graph input data + :return: TF placeholder for the graph input data + """ + if FLAGS.dataset == 'mnist': + image_size = 28 + num_channels = 1 + else: + image_size = 32 + num_channels = 3 + + # Declare data placeholder + train_node_shape = (FLAGS.batch_size, image_size, image_size, num_channels) + return tf.placeholder(tf.float32, shape=train_node_shape) + + +def train(images, labels, ckpt_path, dropout=False): + """ + This function contains the loop that actually trains the model. + :param images: a numpy array with the input data + :param labels: a numpy array with the output labels + :param ckpt_path: a path (including name) where model checkpoints are saved + :param dropout: Boolean, whether to use dropout or not + :return: True if everything went well + """ + + # Check training data + assert len(images) == len(labels) + assert images.dtype == np.float32 + assert labels.dtype == np.int32 + + # Set default TF graph + with tf.Graph().as_default(): + global_step = tf.Variable(0, trainable=False) + + # Declare data placeholder + train_data_node = _input_placeholder() + + # Create a placeholder to hold labels + train_labels_shape = (FLAGS.batch_size,) + train_labels_node = tf.placeholder(tf.int32, shape=train_labels_shape) + + print("Done Initializing Training Placeholders") + + # Build a Graph that computes the logits predictions from the placeholder + if FLAGS.deeper: + logits = inference_deeper(train_data_node, dropout=dropout) + else: + logits = inference(train_data_node, dropout=dropout) + + # Calculate loss + loss = loss_fun(logits, train_labels_node) + + # Build a Graph that trains the model with one batch of examples and + # updates the model parameters. + train_op = train_op_fun(loss, global_step) + + # Create a saver. + saver = tf.train.Saver(tf.global_variables()) + + print("Graph constructed and saver created") + + # Build an initialization operation to run below. 
+ init = tf.global_variables_initializer() + + # Create and init sessions + sess = tf.Session(config=tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)) #NOLINT(long-line) + sess.run(init) + + print("Session ready, beginning training loop") + + # Initialize the number of batches + data_length = len(images) + nb_batches = math.ceil(data_length / FLAGS.batch_size) + + for step in xrange(FLAGS.max_steps): + # for debug, save start time + start_time = time.time() + + # Current batch number + batch_nb = step % nb_batches + + # Current batch start and end indices + start, end = utils.batch_indices(batch_nb, data_length, FLAGS.batch_size) + + # Prepare dictionnary to feed the session with + feed_dict = {train_data_node: images[start:end], + train_labels_node: labels[start:end]} + + # Run training step + _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict) + + # Compute duration of training step + duration = time.time() - start_time + + # Sanity check + assert not np.isnan(loss_value), 'Model diverged with loss = NaN' + + # Echo loss once in a while + if step % 100 == 0: + num_examples_per_step = FLAGS.batch_size + examples_per_sec = num_examples_per_step / duration + sec_per_batch = float(duration) + + format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' + 'sec/batch)') + print (format_str % (datetime.now(), step, loss_value, + examples_per_sec, sec_per_batch)) + + # Save the model checkpoint periodically. + if step % 1000 == 0 or (step + 1) == FLAGS.max_steps: + saver.save(sess, ckpt_path, global_step=step) + + return True + + +def softmax_preds(images, ckpt_path, return_logits=False): + """ + Compute softmax activations (probabilities) with the model saved in the path + specified as an argument + :param images: a np array of images + :param ckpt_path: a TF model checkpoint + :param logits: if set to True, return logits instead of probabilities + :return: probabilities (or logits if logits is set to True) + """ + # Compute nb samples and deduce nb of batches + data_length = len(images) + nb_batches = math.ceil(len(images) / FLAGS.batch_size) + + # Declare data placeholder + train_data_node = _input_placeholder() + + # Build a Graph that computes the logits predictions from the placeholder + if FLAGS.deeper: + logits = inference_deeper(train_data_node) + else: + logits = inference(train_data_node) + + if return_logits: + # We are returning the logits directly (no need to apply softmax) + output = logits + else: + # Add softmax predictions to graph: will return probabilities + output = tf.nn.softmax(logits) + + # Restore the moving average version of the learned variables for eval. 
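+  # train() applied an ExponentialMovingAverage to all trainable variables, so
+  # shadow (averaged) copies exist in the checkpoint; restoring those averaged
+  # values instead of the raw weights typically gives slightly better
+  # predictions at inference time.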
+ variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY) + variables_to_restore = variable_averages.variables_to_restore() + saver = tf.train.Saver(variables_to_restore) + + # Will hold the result + preds = np.zeros((data_length, FLAGS.nb_labels), dtype=np.float32) + + # Create TF session + with tf.Session() as sess: + # Restore TF session from checkpoint file + saver.restore(sess, ckpt_path) + + # Parse data by batch + for batch_nb in xrange(0, int(nb_batches+1)): + # Compute batch start and end indices + start, end = utils.batch_indices(batch_nb, data_length, FLAGS.batch_size) + + # Prepare feed dictionary + feed_dict = {train_data_node: images[start:end]} + + # Run session ([0] because run returns a batch with len 1st dim == 1) + preds[start:end, :] = sess.run([output], feed_dict=feed_dict)[0] + + # Reset graph to allow multiple calls + tf.reset_default_graph() + + return preds diff --git a/research/pate_2017/input.py b/research/pate_2017/input.py new file mode 100644 index 0000000..0a1d89f --- /dev/null +++ b/research/pate_2017/input.py @@ -0,0 +1,424 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import cPickle +import gzip +import math +import numpy as np +import os +from scipy.io import loadmat as loadmat +from six.moves import urllib +from six.moves import xrange +import sys +import tarfile + +import tensorflow as tf + +FLAGS = tf.flags.FLAGS + + +def create_dir_if_needed(dest_directory): + """ + Create directory if doesn't exist + :param dest_directory: + :return: True if everything went well + """ + if not tf.gfile.IsDirectory(dest_directory): + tf.gfile.MakeDirs(dest_directory) + + return True + + +def maybe_download(file_urls, directory): + """ + Download a set of files in temporary local folder + :param directory: the directory where to download + :return: a tuple of filepaths corresponding to the files given as input + """ + # Create directory if doesn't exist + assert create_dir_if_needed(directory) + + # This list will include all URLS of the local copy of downloaded files + result = [] + + # For each file of the dataset + for file_url in file_urls: + # Extract filename + filename = file_url.split('/')[-1] + + # If downloading from GitHub, remove suffix ?raw=True from local filename + if filename.endswith("?raw=true"): + filename = filename[:-9] + + # Deduce local file url + #filepath = os.path.join(directory, filename) + filepath = directory + '/' + filename + + # Add to result list + result.append(filepath) + + # Test if file already exists + if not tf.gfile.Exists(filepath): + def _progress(count, block_size, total_size): + sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename, + float(count * block_size) / float(total_size) * 100.0)) + sys.stdout.flush() + filepath, _ = 
urllib.request.urlretrieve(file_url, filepath, _progress) + print() + statinfo = os.stat(filepath) + print('Successfully downloaded', filename, statinfo.st_size, 'bytes.') + + return result + + +def image_whitening(data): + """ + Subtracts mean of image and divides by adjusted standard variance (for + stability). Operations are per image but performed for the entire array. + :param image: 4D array (ID, Height, Weight, Channel) + :return: 4D array (ID, Height, Weight, Channel) + """ + assert len(np.shape(data)) == 4 + + # Compute number of pixels in image + nb_pixels = np.shape(data)[1] * np.shape(data)[2] * np.shape(data)[3] + + # Subtract mean + mean = np.mean(data, axis=(1,2,3)) + + ones = np.ones(np.shape(data)[1:4], dtype=np.float32) + for i in xrange(len(data)): + data[i, :, :, :] -= mean[i] * ones + + # Compute adjusted standard variance + adj_std_var = np.maximum(np.ones(len(data), dtype=np.float32) / math.sqrt(nb_pixels), np.std(data, axis=(1,2,3))) #NOLINT(long-line) + + # Divide image + for i in xrange(len(data)): + data[i, :, :, :] = data[i, :, :, :] / adj_std_var[i] + + print(np.shape(data)) + + return data + + +def extract_svhn(local_url): + """ + Extract a MATLAB matrix into two numpy arrays with data and labels + :param local_url: + :return: + """ + + with tf.gfile.Open(local_url, mode='r') as file_obj: + # Load MATLAB matrix using scipy IO + dict = loadmat(file_obj) + + # Extract each dictionary (one for data, one for labels) + data, labels = dict["X"], dict["y"] + + # Set np type + data = np.asarray(data, dtype=np.float32) + labels = np.asarray(labels, dtype=np.int32) + + # Transpose data to match TF model input format + data = data.transpose(3, 0, 1, 2) + + # Fix the SVHN labels which label 0s as 10s + labels[labels == 10] = 0 + + # Fix label dimensions + labels = labels.reshape(len(labels)) + + return data, labels + + +def unpickle_cifar_dic(file): + """ + Helper function: unpickles a dictionary (used for loading CIFAR) + :param file: filename of the pickle + :return: tuple of (images, labels) + """ + fo = open(file, 'rb') + dict = cPickle.load(fo) + fo.close() + return dict['data'], dict['labels'] + + +def extract_cifar10(local_url, data_dir): + """ + Extracts the CIFAR-10 dataset and return numpy arrays with the different sets + :param local_url: where the tar.gz archive is located locally + :param data_dir: where to extract the archive's file + :return: a tuple (train data, train labels, test data, test labels) + """ + # These numpy dumps can be reloaded to avoid performing the pre-processing + # if they exist in the working directory. + # Changing the order of this list will ruin the indices below. 
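+  # Index map for the list below: 0 = train data, 1 = train labels,
+  # 2 = test data, 3 = test labels.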
+ preprocessed_files = ['/cifar10_train.npy', + '/cifar10_train_labels.npy', + '/cifar10_test.npy', + '/cifar10_test_labels.npy'] + + all_preprocessed = True + for file in preprocessed_files: + if not tf.gfile.Exists(data_dir + file): + all_preprocessed = False + break + + if all_preprocessed: + # Reload pre-processed training data from numpy dumps + with tf.gfile.Open(data_dir + preprocessed_files[0], mode='r') as file_obj: + train_data = np.load(file_obj) + with tf.gfile.Open(data_dir + preprocessed_files[1], mode='r') as file_obj: + train_labels = np.load(file_obj) + + # Reload pre-processed testing data from numpy dumps + with tf.gfile.Open(data_dir + preprocessed_files[2], mode='r') as file_obj: + test_data = np.load(file_obj) + with tf.gfile.Open(data_dir + preprocessed_files[3], mode='r') as file_obj: + test_labels = np.load(file_obj) + + else: + # Do everything from scratch + # Define lists of all files we should extract + train_files = ["data_batch_" + str(i) for i in xrange(1,6)] + test_file = ["test_batch"] + cifar10_files = train_files + test_file + + # Check if all files have already been extracted + need_to_unpack = False + for file in cifar10_files: + if not tf.gfile.Exists(file): + need_to_unpack = True + break + + # We have to unpack the archive + if need_to_unpack: + tarfile.open(local_url, 'r:gz').extractall(data_dir) + + # Load training images and labels + images = [] + labels = [] + for file in train_files: + # Construct filename + filename = data_dir + "/cifar-10-batches-py/" + file + + # Unpickle dictionary and extract images and labels + images_tmp, labels_tmp = unpickle_cifar_dic(filename) + + # Append to lists + images.append(images_tmp) + labels.append(labels_tmp) + + # Convert to numpy arrays and reshape in the expected format + train_data = np.asarray(images, dtype=np.float32).reshape((50000,3,32,32)) + train_data = np.swapaxes(train_data, 1, 3) + train_labels = np.asarray(labels, dtype=np.int32).reshape(50000) + + # Save so we don't have to do this again + np.save(data_dir + preprocessed_files[0], train_data) + np.save(data_dir + preprocessed_files[1], train_labels) + + # Construct filename for test file + filename = data_dir + "/cifar-10-batches-py/" + test_file[0] + + # Load test images and labels + test_data, test_images = unpickle_cifar_dic(filename) + + # Convert to numpy arrays and reshape in the expected format + test_data = np.asarray(test_data,dtype=np.float32).reshape((10000,3,32,32)) + test_data = np.swapaxes(test_data, 1, 3) + test_labels = np.asarray(test_images, dtype=np.int32).reshape(10000) + + # Save so we don't have to do this again + np.save(data_dir + preprocessed_files[2], test_data) + np.save(data_dir + preprocessed_files[3], test_labels) + + return train_data, train_labels, test_data, test_labels + + +def extract_mnist_data(filename, num_images, image_size, pixel_depth): + """ + Extract the images into a 4D tensor [image index, y, x, channels]. + + Values are rescaled from [0, 255] down to [-0.5, 0.5]. 
+ """ + # if not os.path.exists(file): + if not tf.gfile.Exists(filename+".npy"): + with gzip.open(filename) as bytestream: + bytestream.read(16) + buf = bytestream.read(image_size * image_size * num_images) + data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32) + data = (data - (pixel_depth / 2.0)) / pixel_depth + data = data.reshape(num_images, image_size, image_size, 1) + np.save(filename, data) + return data + else: + with tf.gfile.Open(filename+".npy", mode='r') as file_obj: + return np.load(file_obj) + + +def extract_mnist_labels(filename, num_images): + """ + Extract the labels into a vector of int64 label IDs. + """ + # if not os.path.exists(file): + if not tf.gfile.Exists(filename+".npy"): + with gzip.open(filename) as bytestream: + bytestream.read(8) + buf = bytestream.read(1 * num_images) + labels = np.frombuffer(buf, dtype=np.uint8).astype(np.int32) + np.save(filename, labels) + return labels + else: + with tf.gfile.Open(filename+".npy", mode='r') as file_obj: + return np.load(file_obj) + + +def ld_svhn(extended=False, test_only=False): + """ + Load the original SVHN data + :param extended: include extended training data in the returned array + :param test_only: disables loading of both train and extra -> large speed up + :return: tuple of arrays which depend on the parameters + """ + # Define files to be downloaded + # WARNING: changing the order of this list will break indices (cf. below) + file_urls = ['http://ufldl.stanford.edu/housenumbers/train_32x32.mat', + 'http://ufldl.stanford.edu/housenumbers/test_32x32.mat', + 'http://ufldl.stanford.edu/housenumbers/extra_32x32.mat'] + + # Maybe download data and retrieve local storage urls + local_urls = maybe_download(file_urls, FLAGS.data_dir) + + # Extra Train, Test, and Extended Train data + if not test_only: + # Load and applying whitening to train data + train_data, train_labels = extract_svhn(local_urls[0]) + train_data = image_whitening(train_data) + + # Load and applying whitening to extended train data + ext_data, ext_labels = extract_svhn(local_urls[2]) + ext_data = image_whitening(ext_data) + + # Load and applying whitening to test data + test_data, test_labels = extract_svhn(local_urls[1]) + test_data = image_whitening(test_data) + + if test_only: + return test_data, test_labels + else: + if extended: + # Stack train data with the extended training data + train_data = np.vstack((train_data, ext_data)) + train_labels = np.hstack((train_labels, ext_labels)) + + return train_data, train_labels, test_data, test_labels + else: + # Return training and extended training data separately + return train_data,train_labels, test_data,test_labels, ext_data,ext_labels + + +def ld_cifar10(test_only=False): + """ + Load the original CIFAR10 data + :param extended: include extended training data in the returned array + :param test_only: disables loading of both train and extra -> large speed up + :return: tuple of arrays which depend on the parameters + """ + # Define files to be downloaded + file_urls = ['https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'] + + # Maybe download data and retrieve local storage urls + local_urls = maybe_download(file_urls, FLAGS.data_dir) + + # Extract archives and return different sets + dataset = extract_cifar10(local_urls[0], FLAGS.data_dir) + + # Unpack tuple + train_data, train_labels, test_data, test_labels = dataset + + # Apply whitening to input data + train_data = image_whitening(train_data) + test_data = image_whitening(test_data) + + if test_only: + return test_data, test_labels 
+ else: + return train_data, train_labels, test_data, test_labels + + +def ld_mnist(test_only=False): + """ + Load the MNIST dataset + :param extended: include extended training data in the returned array + :param test_only: disables loading of both train and extra -> large speed up + :return: tuple of arrays which depend on the parameters + """ + # Define files to be downloaded + # WARNING: changing the order of this list will break indices (cf. below) + file_urls = ['http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', + 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz', + 'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', + 'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', + ] + + # Maybe download data and retrieve local storage urls + local_urls = maybe_download(file_urls, FLAGS.data_dir) + + # Extract it into np arrays. + train_data = extract_mnist_data(local_urls[0], 60000, 28, 1) + train_labels = extract_mnist_labels(local_urls[1], 60000) + test_data = extract_mnist_data(local_urls[2], 10000, 28, 1) + test_labels = extract_mnist_labels(local_urls[3], 10000) + + if test_only: + return test_data, test_labels + else: + return train_data, train_labels, test_data, test_labels + + +def partition_dataset(data, labels, nb_teachers, teacher_id): + """ + Simple partitioning algorithm that returns the right portion of the data + needed by a given teacher out of a certain nb of teachers + :param data: input data to be partitioned + :param labels: output data to be partitioned + :param nb_teachers: number of teachers in the ensemble (affects size of each + partition) + :param teacher_id: id of partition to retrieve + :return: + """ + + # Sanity check + assert len(data) == len(labels) + assert int(teacher_id) < int(nb_teachers) + + # This will floor the possible number of batches + batch_len = int(len(data) / nb_teachers) + + # Compute start, end indices of partition + start = teacher_id * batch_len + end = (teacher_id+1) * batch_len + + # Slice partition off + partition_data = data[start:end] + partition_labels = labels[start:end] + + return partition_data, partition_labels diff --git a/research/pate_2017/metrics.py b/research/pate_2017/metrics.py new file mode 100644 index 0000000..d9c7119 --- /dev/null +++ b/research/pate_2017/metrics.py @@ -0,0 +1,49 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + + +def accuracy(logits, labels): + """ + Return accuracy of the array of logits (or label predictions) wrt the labels + :param logits: this can either be logits, probabilities, or a single label + :param labels: the correct labels to match against + :return: the accuracy as a float + """ + assert len(logits) == len(labels) + + if len(np.shape(logits)) > 1: + # Predicted labels are the argmax over axis 1 + predicted_labels = np.argmax(logits, axis=1) + else: + # Input was already labels + assert len(np.shape(logits)) == 1 + predicted_labels = logits + + # Check against correct labels to compute correct guesses + correct = np.sum(predicted_labels == labels.reshape(len(labels))) + + # Divide by number of labels to obtain accuracy + accuracy = float(correct) / len(labels) + + # Return float value + return accuracy + + diff --git a/research/pate_2017/train_student.py b/research/pate_2017/train_student.py new file mode 100644 index 0000000..3b84b18 --- /dev/null +++ b/research/pate_2017/train_student.py @@ -0,0 +1,208 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange
+import tensorflow as tf
+
+from differential_privacy.multiple_teachers import aggregation
+from differential_privacy.multiple_teachers import deep_cnn
+from differential_privacy.multiple_teachers import input
+from differential_privacy.multiple_teachers import metrics
+
+FLAGS = tf.flags.FLAGS
+
+tf.flags.DEFINE_string('dataset', 'svhn', 'The name of the dataset to use')
+tf.flags.DEFINE_integer('nb_labels', 10, 'Number of output classes')
+
+tf.flags.DEFINE_string('data_dir', '/tmp', 'Temporary storage')
+tf.flags.DEFINE_string('train_dir', '/tmp/train_dir',
+                       'Where model checkpoints are saved')
+tf.flags.DEFINE_string('teachers_dir', '/tmp/train_dir',
+                       'Directory where teacher checkpoints are stored.')
+
+tf.flags.DEFINE_integer('teachers_max_steps', 3000,
+                        'Number of steps the teachers were run for.')
+tf.flags.DEFINE_integer('max_steps', 3000, 'Number of steps to run student.')
+tf.flags.DEFINE_integer('nb_teachers', 10, 'Teachers in the ensemble.')
+tf.flags.DEFINE_integer('stdnt_share', 1000,
+                        'Student share (last index) of the test data')
+tf.flags.DEFINE_integer('lap_scale', 10,
+                        'Scale of the Laplacian noise added for privacy')
+tf.flags.DEFINE_boolean('save_labels', False,
+                        'Dump numpy arrays of labels and clean teacher votes')
+tf.flags.DEFINE_boolean('deeper', False, 'Activate deeper CNN model')
+
+
+def ensemble_preds(dataset, nb_teachers, stdnt_data):
+  """
+  Given a dataset, a number of teachers, and some input data, this helper
+  function queries each teacher for predictions on the data and returns
+  all predictions in a single array. These can then be aggregated into
+  a single prediction per input using aggregation.py (cf. 
function + prepare_student_data() below) + :param dataset: string corresponding to mnist, cifar10, or svhn + :param nb_teachers: number of teachers (in the ensemble) to learn from + :param stdnt_data: unlabeled student training data + :return: 3d array (teacher id, sample id, probability per class) + """ + + # Compute shape of array that will hold probabilities produced by each + # teacher, for each training point, and each output class + result_shape = (nb_teachers, len(stdnt_data), FLAGS.nb_labels) + + # Create array that will hold result + result = np.zeros(result_shape, dtype=np.float32) + + # Get predictions from each teacher + for teacher_id in xrange(nb_teachers): + # Compute path of checkpoint file for teacher model with ID teacher_id + if FLAGS.deeper: + ckpt_path = FLAGS.teachers_dir + '/' + str(dataset) + '_' + str(nb_teachers) + '_teachers_' + str(teacher_id) + '_deep.ckpt-' + str(FLAGS.teachers_max_steps - 1) #NOLINT(long-line) + else: + ckpt_path = FLAGS.teachers_dir + '/' + str(dataset) + '_' + str(nb_teachers) + '_teachers_' + str(teacher_id) + '.ckpt-' + str(FLAGS.teachers_max_steps - 1) # NOLINT(long-line) + + # Get predictions on our training data and store in result array + result[teacher_id] = deep_cnn.softmax_preds(stdnt_data, ckpt_path) + + # This can take a while when there are a lot of teachers so output status + print("Computed Teacher " + str(teacher_id) + " softmax predictions") + + return result + + +def prepare_student_data(dataset, nb_teachers, save=False): + """ + Takes a dataset name and the size of the teacher ensemble and prepares + training data for the student model, according to parameters indicated + in flags above. + :param dataset: string corresponding to mnist, cifar10, or svhn + :param nb_teachers: number of teachers (in the ensemble) to learn from + :param save: if set to True, will dump student training labels predicted by + the ensemble of teachers (with Laplacian noise) as npy files. 
+ It also dumps the clean votes for each class (without noise) and + the labels assigned by teachers + :return: pairs of (data, labels) to be used for student training and testing + """ + assert input.create_dir_if_needed(FLAGS.train_dir) + + # Load the dataset + if dataset == 'svhn': + test_data, test_labels = input.ld_svhn(test_only=True) + elif dataset == 'cifar10': + test_data, test_labels = input.ld_cifar10(test_only=True) + elif dataset == 'mnist': + test_data, test_labels = input.ld_mnist(test_only=True) + else: + print("Check value of dataset flag") + return False + + # Make sure there is data leftover to be used as a test set + assert FLAGS.stdnt_share < len(test_data) + + # Prepare [unlabeled] student training data (subset of test set) + stdnt_data = test_data[:FLAGS.stdnt_share] + + # Compute teacher predictions for student training data + teachers_preds = ensemble_preds(dataset, nb_teachers, stdnt_data) + + # Aggregate teacher predictions to get student training labels + if not save: + stdnt_labels = aggregation.noisy_max(teachers_preds, FLAGS.lap_scale) + else: + # Request clean votes and clean labels as well + stdnt_labels, clean_votes, labels_for_dump = aggregation.noisy_max(teachers_preds, FLAGS.lap_scale, return_clean_votes=True) #NOLINT(long-line) + + # Prepare filepath for numpy dump of clean votes + filepath = FLAGS.data_dir + "/" + str(dataset) + '_' + str(nb_teachers) + '_student_clean_votes_lap_' + str(FLAGS.lap_scale) + '.npy' # NOLINT(long-line) + + # Prepare filepath for numpy dump of clean labels + filepath_labels = FLAGS.data_dir + "/" + str(dataset) + '_' + str(nb_teachers) + '_teachers_labels_lap_' + str(FLAGS.lap_scale) + '.npy' # NOLINT(long-line) + + # Dump clean_votes array + with tf.gfile.Open(filepath, mode='w') as file_obj: + np.save(file_obj, clean_votes) + + # Dump labels_for_dump array + with tf.gfile.Open(filepath_labels, mode='w') as file_obj: + np.save(file_obj, labels_for_dump) + + # Print accuracy of aggregated labels + ac_ag_labels = metrics.accuracy(stdnt_labels, test_labels[:FLAGS.stdnt_share]) + print("Accuracy of the aggregated labels: " + str(ac_ag_labels)) + + # Store unused part of test set for use as a test set after student training + stdnt_test_data = test_data[FLAGS.stdnt_share:] + stdnt_test_labels = test_labels[FLAGS.stdnt_share:] + + if save: + # Prepare filepath for numpy dump of labels produced by noisy aggregation + filepath = FLAGS.data_dir + "/" + str(dataset) + '_' + str(nb_teachers) + '_student_labels_lap_' + str(FLAGS.lap_scale) + '.npy' #NOLINT(long-line) + + # Dump student noisy labels array + with tf.gfile.Open(filepath, mode='w') as file_obj: + np.save(file_obj, stdnt_labels) + + return stdnt_data, stdnt_labels, stdnt_test_data, stdnt_test_labels + + +def train_student(dataset, nb_teachers): + """ + This function trains a student using predictions made by an ensemble of + teachers. The student and teacher models are trained using the same + neural network architecture. 
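+  Student training labels are obtained from prepare_student_data() above,
+  i.e. from the Laplacian-noised aggregation of the teacher votes.
+  Illustrative invocation (flag values are only an example):
+    python train_student.py --dataset=mnist --nb_teachers=250 --lap_scale=20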
+ :param dataset: string corresponding to mnist, cifar10, or svhn + :param nb_teachers: number of teachers (in the ensemble) to learn from + :return: True if student training went well + """ + assert input.create_dir_if_needed(FLAGS.train_dir) + + # Call helper function to prepare student data using teacher predictions + stdnt_dataset = prepare_student_data(dataset, nb_teachers, save=True) + + # Unpack the student dataset + stdnt_data, stdnt_labels, stdnt_test_data, stdnt_test_labels = stdnt_dataset + + # Prepare checkpoint filename and path + if FLAGS.deeper: + ckpt_path = FLAGS.train_dir + '/' + str(dataset) + '_' + str(nb_teachers) + '_student_deeper.ckpt' #NOLINT(long-line) + else: + ckpt_path = FLAGS.train_dir + '/' + str(dataset) + '_' + str(nb_teachers) + '_student.ckpt' # NOLINT(long-line) + + # Start student training + assert deep_cnn.train(stdnt_data, stdnt_labels, ckpt_path) + + # Compute final checkpoint name for student (with max number of steps) + ckpt_path_final = ckpt_path + '-' + str(FLAGS.max_steps - 1) + + # Compute student label predictions on remaining chunk of test set + student_preds = deep_cnn.softmax_preds(stdnt_test_data, ckpt_path_final) + + # Compute teacher accuracy + precision = metrics.accuracy(student_preds, stdnt_test_labels) + print('Precision of student after training: ' + str(precision)) + + return True + +def main(argv=None): # pylint: disable=unused-argument + # Run student training according to values specified in flags + assert train_student(FLAGS.dataset, FLAGS.nb_teachers) + +if __name__ == '__main__': + tf.app.run() diff --git a/research/pate_2017/train_student_mnist_250_lap_20_count_50_epochs_600.sh b/research/pate_2017/train_student_mnist_250_lap_20_count_50_epochs_600.sh new file mode 100644 index 0000000..de81e9b --- /dev/null +++ b/research/pate_2017/train_student_mnist_250_lap_20_count_50_epochs_600.sh @@ -0,0 +1,25 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + + +# Be sure to clone https://github.com/openai/improved-gan +# and add improved-gan/mnist_svhn_cifar10 to your PATH variable + +# Download labels used to train the student +wget https://github.com/npapernot/multiple-teachers-for-privacy/blob/master/mnist_250_student_labels_lap_20.npy + +# Train the student using improved-gan +THEANO_FLAGS='floatX=float32,device=gpu,lib.cnmem=1' train_mnist_fm_custom_labels.py --labels mnist_250_student_labels_lap_20.npy --count 50 --epochs 600 + diff --git a/research/pate_2017/train_teachers.py b/research/pate_2017/train_teachers.py new file mode 100644 index 0000000..fdb7634 --- /dev/null +++ b/research/pate_2017/train_teachers.py @@ -0,0 +1,104 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from differential_privacy.multiple_teachers import deep_cnn +from differential_privacy.multiple_teachers import input +from differential_privacy.multiple_teachers import metrics + + +tf.flags.DEFINE_string('dataset', 'svhn', 'The name of the dataset to use') +tf.flags.DEFINE_integer('nb_labels', 10, 'Number of output classes') + +tf.flags.DEFINE_string('data_dir','/tmp','Temporary storage') +tf.flags.DEFINE_string('train_dir','/tmp/train_dir', + 'Where model ckpt are saved') + +tf.flags.DEFINE_integer('max_steps', 3000, 'Number of training steps to run.') +tf.flags.DEFINE_integer('nb_teachers', 50, 'Teachers in the ensemble.') +tf.flags.DEFINE_integer('teacher_id', 0, 'ID of teacher being trained.') + +tf.flags.DEFINE_boolean('deeper', False, 'Activate deeper CNN model') + +FLAGS = tf.flags.FLAGS + + +def train_teacher(dataset, nb_teachers, teacher_id): + """ + This function trains a teacher (teacher id) among an ensemble of nb_teachers + models for the dataset specified. + :param dataset: string corresponding to dataset (svhn, cifar10) + :param nb_teachers: total number of teachers in the ensemble + :param teacher_id: id of the teacher being trained + :return: True if everything went well + """ + # If working directories do not exist, create them + assert input.create_dir_if_needed(FLAGS.data_dir) + assert input.create_dir_if_needed(FLAGS.train_dir) + + # Load the dataset + if dataset == 'svhn': + train_data,train_labels,test_data,test_labels = input.ld_svhn(extended=True) + elif dataset == 'cifar10': + train_data, train_labels, test_data, test_labels = input.ld_cifar10() + elif dataset == 'mnist': + train_data, train_labels, test_data, test_labels = input.ld_mnist() + else: + print("Check value of dataset flag") + return False + + # Retrieve subset of data for this teacher + data, labels = input.partition_dataset(train_data, + train_labels, + nb_teachers, + teacher_id) + + print("Length of training data: " + str(len(labels))) + + # Define teacher checkpoint filename and full path + if FLAGS.deeper: + filename = str(nb_teachers) + '_teachers_' + str(teacher_id) + '_deep.ckpt' + else: + filename = str(nb_teachers) + '_teachers_' + str(teacher_id) + '.ckpt' + ckpt_path = FLAGS.train_dir + '/' + str(dataset) + '_' + filename + + # Perform teacher training + assert deep_cnn.train(data, labels, ckpt_path) + + # Append final step value to checkpoint for evaluation + ckpt_path_final = ckpt_path + '-' + str(FLAGS.max_steps - 1) + + # Retrieve teacher probability estimates on the test data + teacher_preds = deep_cnn.softmax_preds(test_data, ckpt_path_final) + + # Compute teacher accuracy + precision = metrics.accuracy(teacher_preds, test_labels) + print('Precision of teacher after training: ' + str(precision)) + + return True + + +def main(argv=None): # pylint: disable=unused-argument + # Make a call to train_teachers with values specified in flags + assert 
train_teacher(FLAGS.dataset, FLAGS.nb_teachers, FLAGS.teacher_id) + +if __name__ == '__main__': + tf.app.run() diff --git a/research/pate_2017/utils.py b/research/pate_2017/utils.py new file mode 100644 index 0000000..9e3db83 --- /dev/null +++ b/research/pate_2017/utils.py @@ -0,0 +1,35 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + + +def batch_indices(batch_nb, data_length, batch_size): + """ + This helper function computes a batch start and end index + :param batch_nb: the batch number + :param data_length: the total length of the data being parsed by batches + :param batch_size: the number of inputs in each batch + :return: pair of (start, end) indices + """ + # Batch start and end index + start = int(batch_nb * batch_size) + end = int((batch_nb + 1) * batch_size) + + # When there are not enough inputs left, we reuse some to complete the batch + if end > data_length: + shift = end - data_length + start -= shift + end -= shift + + return start, end diff --git a/research/pate_2018/ICLR2018/README.md b/research/pate_2018/ICLR2018/README.md new file mode 100644 index 0000000..baa1db5 --- /dev/null +++ b/research/pate_2018/ICLR2018/README.md @@ -0,0 +1,61 @@ +Scripts in support of the paper "Scalable Private Learning with PATE" by Nicolas +Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar, Ulfar +Erlingsson (ICLR 2018, https://arxiv.org/abs/1802.08908). + + +### Requirements + +* Python, version ≥ 2.7 +* absl (see [here](https://github.com/abseil/abseil-py), or just type `pip install absl-py`) +* matplotlib +* numpy +* scipy +* sympy (for smooth sensitivity analysis) +* write access to the current directory (otherwise, output directories in download.py and *.sh +scripts must be changed) + +## Reproducing Figures 1 and 5, and Table 2 + +Before running any of the analysis scripts, create the data/ directory and download votes files by running\ +`$ python download.py` + +To generate Figures 1 and 5 run\ +`$ sh generate_figures.sh`\ +The output is written to the figures/ directory. + +For Table 2 run (may take several hours)\ +`$ sh generate_table.sh`\ +The output is written to the console. + +For data-independent bounds (for comparison with Table 2), run\ +`$ sh generate_table_data_independent.sh`\ +The output is written to the console. + +## Files in this directory + +* generate_figures.sh — Master script for generating Figures 1 and 5. + +* generate_table.sh — Master script for generating Table 2. + +* generate_table_data_independent.sh — Master script for computing data-independent + bounds. + +* rdp_bucketized.py — Script for producing Figure 1 (right) and Figure 5 (right). + +* rdp_cumulative.py — Script for producing Figure 1 (middle) and Figure 5 (left). + +* smooth_sensitivity_table.py — Script for generating Table 2. + +* utility_queries_answered — Script for producing Figure 1 (left). 
+ +* plot_partition.py — Script for producing partition.pdf, a detailed breakdown of privacy +costs for Confident-GNMax with smooth sensitivity analysis (takes ~50 hours). + +* plots_for_slides.py — Script for producing several plots for the slide deck. + +* download.py — Utility script for populating the data/ directory. + +* plot_ls_q.py is not used. + + +All Python files take flags. Run script_name.py --help for help on flags. diff --git a/research/pate_2018/ICLR2018/download.py b/research/pate_2018/ICLR2018/download.py new file mode 100644 index 0000000..022df1d --- /dev/null +++ b/research/pate_2018/ICLR2018/download.py @@ -0,0 +1,43 @@ +# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Script to download votes files to the data/ directory. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from six.moves import urllib +import os +import tarfile + +FILE_URI = 'https://storage.googleapis.com/pate-votes/votes.gz' +DATA_DIR = 'data/' + + +def download(): + print('Downloading ' + FILE_URI) + tar_filename, _ = urllib.request.urlretrieve(FILE_URI) + print('Unpacking ' + tar_filename) + with tarfile.open(tar_filename, "r:gz") as tar: + tar.extractall(DATA_DIR) + print('Done!') + + +if __name__ == '__main__': + if not os.path.exists(DATA_DIR): + print('Data directory does not exist. Creating ' + DATA_DIR) + os.makedirs(DATA_DIR) + download() diff --git a/research/pate_2018/ICLR2018/generate_figures.sh b/research/pate_2018/ICLR2018/generate_figures.sh new file mode 100644 index 0000000..cbcf248 --- /dev/null +++ b/research/pate_2018/ICLR2018/generate_figures.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + + +counts_file="data/glyph_5000_teachers.npy" +output_dir="figures/" + +mkdir -p $output_dir + +if [ ! -d "$output_dir" ]; then + echo "Directory $output_dir does not exist." 
+ exit 1 +fi + +python rdp_bucketized.py \ + --plot=small \ + --counts_file=$counts_file \ + --plot_file=$output_dir"noisy_thresholding_check_perf.pdf" + +python rdp_bucketized.py \ + --plot=large \ + --counts_file=$counts_file \ + --plot_file=$output_dir"noisy_thresholding_check_perf_details.pdf" + +python rdp_cumulative.py \ + --cache=False \ + --counts_file=$counts_file \ + --figures_dir=$output_dir + +python utility_queries_answered.py --plot_file=$output_dir"utility_queries_answered.pdf" \ No newline at end of file diff --git a/research/pate_2018/ICLR2018/generate_table.sh b/research/pate_2018/ICLR2018/generate_table.sh new file mode 100644 index 0000000..7625bd4 --- /dev/null +++ b/research/pate_2018/ICLR2018/generate_table.sh @@ -0,0 +1,93 @@ +#!/bin/bash +# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + + +echo "Reproducing Table 2. Takes a couple of hours." + +executable="python smooth_sensitivity_table.py" +data_dir="data" + +echo +echo "######## MNIST ########" +echo + +$executable \ + --counts_file=$data_dir"/mnist_250_teachers.npy" \ + --threshold=200 \ + --sigma1=150 \ + --sigma2=40 \ + --queries=640 \ + --delta=1e-5 + +echo +echo "######## SVHN ########" +echo + +$executable \ + --counts_file=$data_dir"/svhn_250_teachers.npy" \ + --threshold=300 \ + --sigma1=200 \ + --sigma2=40 \ + --queries=8500 \ + --delta=1e-6 + +echo +echo "######## Adult ########" +echo + +$executable \ + --counts_file=$data_dir"/adult_250_teachers.npy" \ + --threshold=300 \ + --sigma1=200 \ + --sigma2=40 \ + --queries=1500 \ + --delta=1e-5 + +echo +echo "######## Glyph (Confident) ########" +echo + +$executable \ + --counts_file=$data_dir"/glyph_5000_teachers.npy" \ + --threshold=1000 \ + --sigma1=500 \ + --sigma2=100 \ + --queries=12000 \ + --delta=1e-8 + +echo +echo "######## Glyph (Interactive, Round 1) ########" +echo + +$executable \ + --counts_file=$data_dir"/glyph_round1.npy" \ + --threshold=3500 \ + --sigma1=1500 \ + --sigma2=100 \ + --delta=1e-8 + +echo +echo "######## Glyph (Interactive, Round 2) ########" +echo + +$executable \ + --counts_file=$data_dir"/glyph_round2.npy" \ + --baseline_file=$data_dir"/glyph_round2_student.npy" \ + --threshold=3500 \ + --sigma1=2000 \ + --sigma2=200 \ + --teachers=5000 \ + --delta=1e-8 diff --git a/research/pate_2018/ICLR2018/generate_table_data_independent.sh b/research/pate_2018/ICLR2018/generate_table_data_independent.sh new file mode 100644 index 0000000..3ac3ef7 --- /dev/null +++ b/research/pate_2018/ICLR2018/generate_table_data_independent.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + + +echo "Table 2 with data-independent analysis." + +executable="python smooth_sensitivity_table.py" +data_dir="data" + +echo +echo "######## MNIST ########" +echo + +$executable \ + --counts_file=$data_dir"/mnist_250_teachers.npy" \ + --threshold=200 \ + --sigma1=150 \ + --sigma2=40 \ + --queries=640 \ + --delta=1e-5 \ + --data_independent +echo +echo "######## SVHN ########" +echo + +$executable \ + --counts_file=$data_dir"/svhn_250_teachers.npy" \ + --threshold=300 \ + --sigma1=200 \ + --sigma2=40 \ + --queries=8500 \ + --delta=1e-6 \ + --data_independent + +echo +echo "######## Adult ########" +echo + +$executable \ + --counts_file=$data_dir"/adult_250_teachers.npy" \ + --threshold=300 \ + --sigma1=200 \ + --sigma2=40 \ + --queries=1500 \ + --delta=1e-5 \ + --data_independent + +echo +echo "######## Glyph (Confident) ########" +echo + +$executable \ + --counts_file=$data_dir"/glyph_5000_teachers.npy" \ + --threshold=1000 \ + --sigma1=500 \ + --sigma2=100 \ + --queries=12000 \ + --delta=1e-8 \ + --data_independent + +echo +echo "######## Glyph (Interactive, Round 1) ########" +echo + +$executable \ + --counts_file=$data_dir"/glyph_round1.npy" \ + --threshold=3500 \ + --sigma1=1500 \ + --sigma2=100 \ + --delta=1e-8 \ + --data_independent + +echo +echo "######## Glyph (Interactive, Round 2) ########" +echo + +$executable \ + --counts_file=$data_dir"/glyph_round2.npy" \ + --baseline_file=$data_dir"/glyph_round2_student.npy" \ + --threshold=3500 \ + --sigma1=2000 \ + --sigma2=200 \ + --teachers=5000 \ + --delta=1e-8 \ + --order=8.5 \ + --data_independent diff --git a/research/pate_2018/ICLR2018/plot_ls_q.py b/research/pate_2018/ICLR2018/plot_ls_q.py new file mode 100644 index 0000000..a1e0a49 --- /dev/null +++ b/research/pate_2018/ICLR2018/plot_ls_q.py @@ -0,0 +1,105 @@ +# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Plots LS(q). + +A script in support of the PATE2 paper. NOT PRESENTLY USED. + +The output is written to a specified directory as a pdf file. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import os +import sys + +sys.path.append('..') # Main modules reside in the parent directory. 
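+# Note: '..' is resolved relative to the current working directory, so the
+# `import smooth_sensitivity as pate_ss` below assumes this script is run from
+# within the ICLR2018/ directory, where pate_2018/smooth_sensitivity.py sits
+# one level up.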
+ + +from absl import app +from absl import flags +import matplotlib +matplotlib.use('TkAgg') +import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top +import numpy as np +import smooth_sensitivity as pate_ss + +plt.style.use('ggplot') + +FLAGS = flags.FLAGS + +flags.DEFINE_string('figures_dir', '', 'Path where the output is written to.') + + +def compute_ls_q(sigma, order, num_classes): + + def beta(q): + return pate_ss._compute_rdp_gnmax(sigma, math.log(q), order) + + def bu(q): + return pate_ss._compute_bu_gnmax(q, sigma, order) + + def bl(q): + return pate_ss._compute_bl_gnmax(q, sigma, order) + + def delta_beta(q): + if q == 0 or q > .8: + return 0 + beta_q = beta(q) + beta_bu_q = beta(bu(q)) + beta_bl_q = beta(bl(q)) + assert beta_bl_q <= beta_q <= beta_bu_q + return beta_bu_q - beta_q # max(beta_bu_q - beta_q, beta_q - beta_bl_q) + + logq0 = pate_ss.compute_logq0_gnmax(sigma, order) + logq1 = pate_ss._compute_logq1(sigma, order, num_classes) + print(math.exp(logq1), math.exp(logq0)) + xs = np.linspace(0, .1, num=1000, endpoint=True) + ys = [delta_beta(x) for x in xs] + return xs, ys + + +def main(argv): + del argv # Unused. + + sigma = 20 + order = 20. + num_classes = 10 + + # sigma = 20 + # order = 25. + # num_classes = 10 + + x_axis, ys = compute_ls_q(sigma, order, num_classes) + + fig, ax = plt.subplots() + fig.set_figheight(4.5) + fig.set_figwidth(4.7) + + ax.plot(x_axis, ys, alpha=.8, linewidth=5) + plt.xlabel('Number of queries answered', fontsize=16) + plt.ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16) + ax.tick_params(labelsize=14) + fout_name = os.path.join(FLAGS.figures_dir, 'ls_of_q.pdf') + print('Saving the graph to ' + fout_name) + plt.show() + + plt.close('all') + + +if __name__ == '__main__': + app.run(main) diff --git a/research/pate_2018/ICLR2018/plot_partition.py b/research/pate_2018/ICLR2018/plot_partition.py new file mode 100644 index 0000000..ed07a17 --- /dev/null +++ b/research/pate_2018/ICLR2018/plot_partition.py @@ -0,0 +1,412 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Produces two plots. One compares aggregators and their analyses. The other +illustrates sources of privacy loss for Confident-GNMax. + +A script in support of the paper "Scalable Private Learning with PATE" by +Nicolas Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar, +Ulfar Erlingsson (https://arxiv.org/abs/1802.08908). + +The input is a file containing a numpy array of votes, one query per row, one +class per column. Ex: + 43, 1821, ..., 3 + 31, 16, ..., 0 + ... + 0, 86, ..., 438 +The output is written to a specified directory and consists of two files. 
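+The two files are comparison.pdf (produced by plot_comparison below) and
+partition.pdf (produced by plot_partition below).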
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import os +import pickle +import sys + +sys.path.append('..') # Main modules reside in the parent directory. + +from absl import app +from absl import flags +from collections import namedtuple +import matplotlib + +matplotlib.use('TkAgg') +import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top +import numpy as np +import core as pate +import smooth_sensitivity as pate_ss + +plt.style.use('ggplot') + +FLAGS = flags.FLAGS +flags.DEFINE_boolean('cache', False, + 'Read results of privacy analysis from cache.') +flags.DEFINE_string('counts_file', None, 'Counts file.') +flags.DEFINE_string('figures_dir', '', 'Path where figures are written to.') +flags.DEFINE_float('threshold', None, 'Threshold for step 1 (selection).') +flags.DEFINE_float('sigma1', None, 'Sigma for step 1 (selection).') +flags.DEFINE_float('sigma2', None, 'Sigma for step 2 (argmax).') +flags.DEFINE_integer('queries', None, 'Number of queries made by the student.') +flags.DEFINE_float('delta', 1e-8, 'Target delta.') + +flags.mark_flag_as_required('counts_file') +flags.mark_flag_as_required('threshold') +flags.mark_flag_as_required('sigma1') +flags.mark_flag_as_required('sigma2') + +Partition = namedtuple('Partition', ['step1', 'step2', 'ss', 'delta']) + + +def analyze_gnmax_conf_data_ind(votes, threshold, sigma1, sigma2, delta): + orders = np.logspace(np.log10(1.5), np.log10(500), num=100) + n = votes.shape[0] + + rdp_total = np.zeros(len(orders)) + answered_total = 0 + answered = np.zeros(n) + eps_cum = np.full(n, None, dtype=float) + + for i in range(n): + v = votes[i,] + if threshold is not None and sigma1 is not None: + q_step1 = np.exp(pate.compute_logpr_answered(threshold, sigma1, v)) + rdp_total += pate.rdp_data_independent_gaussian(sigma1, orders) + else: + q_step1 = 1. # always answer + + answered_total += q_step1 + answered[i] = answered_total + + rdp_total += q_step1 * pate.rdp_data_independent_gaussian(sigma2, orders) + + eps_cum[i], order_opt = pate.compute_eps_from_delta(orders, rdp_total, + delta) + + if i > 0 and (i + 1) % 1000 == 0: + print('queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} ' + 'at order = {:.2f}.'.format( + i + 1, + answered[i], + eps_cum[i], + order_opt)) + sys.stdout.flush() + + return eps_cum, answered + + +def analyze_gnmax_conf_data_dep(votes, threshold, sigma1, sigma2, delta): + # Short list of orders. + # orders = np.round(np.logspace(np.log10(20), np.log10(200), num=20)) + + # Long list of orders. 
+ orders = np.concatenate((np.arange(20, 40, .2), + np.arange(40, 75, .5), + np.logspace(np.log10(75), np.log10(200), num=20))) + + n = votes.shape[0] + num_classes = votes.shape[1] + num_teachers = int(sum(votes[0,])) + + if threshold is not None and sigma1 is not None: + is_data_ind_step1 = pate.is_data_independent_always_opt_gaussian( + num_teachers, num_classes, sigma1, orders) + else: + is_data_ind_step1 = [True] * len(orders) + + is_data_ind_step2 = pate.is_data_independent_always_opt_gaussian( + num_teachers, num_classes, sigma2, orders) + + eps_partitioned = np.full(n, None, dtype=Partition) + order_opt = np.full(n, None, dtype=float) + ss_std_opt = np.full(n, None, dtype=float) + answered = np.zeros(n) + + rdp_step1_total = np.zeros(len(orders)) + rdp_step2_total = np.zeros(len(orders)) + + ls_total = np.zeros((len(orders), num_teachers)) + answered_total = 0 + + for i in range(n): + v = votes[i,] + + if threshold is not None and sigma1 is not None: + logq_step1 = pate.compute_logpr_answered(threshold, sigma1, v) + rdp_step1_total += pate.compute_rdp_threshold(logq_step1, sigma1, orders) + else: + logq_step1 = 0. # always answer + + pr_answered = np.exp(logq_step1) + logq_step2 = pate.compute_logq_gaussian(v, sigma2) + rdp_step2_total += pr_answered * pate.rdp_gaussian(logq_step2, sigma2, + orders) + + answered_total += pr_answered + + rdp_ss = np.zeros(len(orders)) + ss_std = np.zeros(len(orders)) + + for j, order in enumerate(orders): + if not is_data_ind_step1[j]: + ls_step1 = pate_ss.compute_local_sensitivity_bounds_threshold(v, + num_teachers, threshold, sigma1, order) + else: + ls_step1 = np.full(num_teachers, 0, dtype=float) + + if not is_data_ind_step2[j]: + ls_step2 = pate_ss.compute_local_sensitivity_bounds_gnmax( + v, num_teachers, sigma2, order) + else: + ls_step2 = np.full(num_teachers, 0, dtype=float) + + ls_total[j,] += ls_step1 + pr_answered * ls_step2 + + beta_ss = .49 / order + + ss = pate_ss.compute_discounted_max(beta_ss, ls_total[j,]) + sigma_ss = ((order * math.exp(2 * beta_ss)) / ss) ** (1 / 3) + rdp_ss[j] = pate_ss.compute_rdp_of_smooth_sensitivity_gaussian( + beta_ss, sigma_ss, order) + ss_std[j] = ss * sigma_ss + + rdp_total = rdp_step1_total + rdp_step2_total + rdp_ss + + answered[i] = answered_total + _, order_opt[i] = pate.compute_eps_from_delta(orders, rdp_total, delta) + order_idx = np.searchsorted(orders, order_opt[i]) + + # Since optimal orders are always non-increasing, shrink orders array + # and all cumulative arrays to speed up computation. + if order_idx < len(orders): + orders = orders[:order_idx + 1] + rdp_step1_total = rdp_step1_total[:order_idx + 1] + rdp_step2_total = rdp_step2_total[:order_idx + 1] + + eps_partitioned[i] = Partition(step1=rdp_step1_total[order_idx], + step2=rdp_step2_total[order_idx], + ss=rdp_ss[order_idx], + delta=-math.log(delta) / (order_opt[i] - 1)) + ss_std_opt[i] = ss_std[order_idx] + if i > 0 and (i + 1) % 1 == 0: + print('queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} +/- {:.3f} ' + 'at order = {:.2f}. Contributions: delta = {:.3f}, step1 = {:.3f}, ' + 'step2 = {:.3f}, ss = {:.3f}'.format( + i + 1, + answered[i], + sum(eps_partitioned[i]), + ss_std_opt[i], + order_opt[i], + eps_partitioned[i].delta, + eps_partitioned[i].step1, + eps_partitioned[i].step2, + eps_partitioned[i].ss)) + sys.stdout.flush() + + return eps_partitioned, answered, ss_std_opt, order_opt + + +def plot_comparison(figures_dir, simple_ind, conf_ind, simple_dep, conf_dep): + """Plots variants of GNMax algorithm and their analyses. 
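+
+  Args:
+    figures_dir: Directory where the output (comparison.pdf) is written.
+    simple_ind: (eps, answered) from analyze_gnmax_conf_data_ind for Simple
+      GNMax (no thresholding step).
+    conf_ind: (eps, answered) from analyze_gnmax_conf_data_ind for Confident
+      GNMax.
+    simple_dep: Output of analyze_gnmax_conf_data_dep for Simple GNMax.
+    conf_dep: Output of analyze_gnmax_conf_data_dep for Confident GNMax.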
+ """ + + def pivot(x_axis, eps, answered): + y = np.full(len(x_axis), None, dtype=float) # delta + for i, x in enumerate(x_axis): + idx = np.searchsorted(answered, x) + if idx < len(eps): + y[i] = eps[idx] + return y + + def pivot_dep(x_axis, data_dep): + eps_partitioned, answered, _, _ = data_dep + eps = [sum(p) for p in eps_partitioned] # Flatten eps + return pivot(x_axis, eps, answered) + + xlim = 10000 + x_axis = range(0, xlim, 10) + + y_simple_ind = pivot(x_axis, *simple_ind) + y_conf_ind = pivot(x_axis, *conf_ind) + + y_simple_dep = pivot_dep(x_axis, simple_dep) + y_conf_dep = pivot_dep(x_axis, conf_dep) + + # plt.close('all') + fig, ax = plt.subplots() + fig.set_figheight(4.5) + fig.set_figwidth(4.7) + + ax.plot(x_axis, y_simple_ind, ls='--', color='r', lw=3, label=r'Simple GNMax, data-ind analysis') + ax.plot(x_axis, y_conf_ind, ls='--', color='b', lw=3, label=r'Confident GNMax, data-ind analysis') + ax.plot(x_axis, y_simple_dep, ls='-', color='r', lw=3, label=r'Simple GNMax, data-dep analysis') + ax.plot(x_axis, y_conf_dep, ls='-', color='b', lw=3, label=r'Confident GNMax, data-dep analysis') + + plt.xticks(np.arange(0, xlim + 1000, 2000)) + plt.xlim([0, xlim]) + plt.ylim(bottom=0) + plt.legend(fontsize=16) + ax.set_xlabel('Number of queries answered', fontsize=16) + ax.set_ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16) + + ax.tick_params(labelsize=14) + plot_filename = os.path.join(figures_dir, 'comparison.pdf') + print('Saving the graph to ' + plot_filename) + fig.savefig(plot_filename, bbox_inches='tight') + plt.show() + + +def plot_partition(figures_dir, gnmax_conf, print_order): + """Plots an expert version of the privacy-per-answered-query graph. + + Args: + figures_dir: A name of the directory where to save the plot. + eps: The cumulative privacy cost. + partition: Allocation of the privacy cost. + answered: Cumulative number of queries answered. + order_opt: The list of optimal orders. 
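+
+  (In the code these are passed packed together as the gnmax_conf tuple, i.e.
+  (eps_partitioned, answered, ss_std_opt, order_opt); print_order toggles the
+  optimal-order overlay on the right-hand axis.)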
+ """ + eps_partitioned, answered, ss_std_opt, order_opt = gnmax_conf + + xlim = 10000 + x = range(0, int(xlim), 10) + lenx = len(x) + y0 = np.full(lenx, np.nan, dtype=float) # delta + y1 = np.full(lenx, np.nan, dtype=float) # delta + step1 + y2 = np.full(lenx, np.nan, dtype=float) # delta + step1 + step2 + y3 = np.full(lenx, np.nan, dtype=float) # delta + step1 + step2 + ss + noise_std = np.full(lenx, np.nan, dtype=float) + + y_right = np.full(lenx, np.nan, dtype=float) + + for i in range(lenx): + idx = np.searchsorted(answered, x[i]) + if idx < len(eps_partitioned): + y0[i] = eps_partitioned[idx].delta + y1[i] = y0[i] + eps_partitioned[idx].step1 + y2[i] = y1[i] + eps_partitioned[idx].step2 + y3[i] = y2[i] + eps_partitioned[idx].ss + + noise_std[i] = ss_std_opt[idx] + y_right[i] = order_opt[idx] + + # plt.close('all') + fig, ax = plt.subplots() + fig.set_figheight(4.5) + fig.set_figwidth(4.7) + fig.patch.set_alpha(0) + + l1 = ax.plot( + x, y3, color='b', ls='-', label=r'Total privacy cost', linewidth=1).pop() + + for y in (y0, y1, y2): + ax.plot(x, y, color='b', ls='-', label=r'_nolegend_', alpha=.5, linewidth=1) + + ax.fill_between(x, [0] * lenx, y0.tolist(), facecolor='b', alpha=.5) + ax.fill_between(x, y0.tolist(), y1.tolist(), facecolor='b', alpha=.4) + ax.fill_between(x, y1.tolist(), y2.tolist(), facecolor='b', alpha=.3) + ax.fill_between(x, y2.tolist(), y3.tolist(), facecolor='b', alpha=.2) + + ax.fill_between(x, (y3 - noise_std).tolist(), (y3 + noise_std).tolist(), + facecolor='r', alpha=.5) + + + plt.xticks(np.arange(0, xlim + 1000, 2000)) + plt.xlim([0, xlim]) + ax.set_ylim([0, 3.]) + + ax.set_xlabel('Number of queries answered', fontsize=16) + ax.set_ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16) + + # Merging legends. + if print_order: + ax2 = ax.twinx() + l2 = ax2.plot( + x, y_right, 'r', ls='-', label=r'Optimal order', linewidth=5, + alpha=.5).pop() + ax2.grid(False) + # ax2.set_ylabel(r'Optimal Renyi order', fontsize=16) + ax2.set_ylim([0, 200.]) + # ax.legend((l1, l2), (l1.get_label(), l2.get_label()), loc=0, fontsize=13) + + ax.tick_params(labelsize=14) + plot_filename = os.path.join(figures_dir, 'partition.pdf') + print('Saving the graph to ' + plot_filename) + fig.savefig(plot_filename, bbox_inches='tight', dpi=800) + plt.show() + + +def run_all_analyses(votes, threshold, sigma1, sigma2, delta): + simple_ind = analyze_gnmax_conf_data_ind(votes, None, None, sigma2, + delta) + + conf_ind = analyze_gnmax_conf_data_ind(votes, threshold, sigma1, sigma2, + delta) + + simple_dep = analyze_gnmax_conf_data_dep(votes, None, None, sigma2, + delta) + + conf_dep = analyze_gnmax_conf_data_dep(votes, threshold, sigma1, sigma2, + delta) + + return (simple_ind, conf_ind, simple_dep, conf_dep) + + +def run_or_load_all_analyses(): + temp_filename = os.path.expanduser('~/tmp/partition_cached.pkl') + + if FLAGS.cache and os.path.isfile(temp_filename): + print('Reading from cache ' + temp_filename) + with open(temp_filename, 'rb') as f: + all_analyses = pickle.load(f) + else: + fin_name = os.path.expanduser(FLAGS.counts_file) + print('Reading raw votes from ' + fin_name) + sys.stdout.flush() + + votes = np.load(fin_name) + + if FLAGS.queries is not None: + if votes.shape[0] < FLAGS.queries: + raise ValueError('Expect {} rows, got {} in {}'.format( + FLAGS.queries, votes.shape[0], fin_name)) + # Truncate the votes matrix to the number of queries made. 
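+      # (The counts file must contain at least FLAGS.queries rows; e.g.
+      # --queries=12000 keeps only the first 12000 of them.)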
+ votes = votes[:FLAGS.queries, ] + + all_analyses = run_all_analyses(votes, FLAGS.threshold, FLAGS.sigma1, + FLAGS.sigma2, FLAGS.delta) + + print('Writing to cache ' + temp_filename) + with open(temp_filename, 'wb') as f: + pickle.dump(all_analyses, f) + + return all_analyses + + +def main(argv): + del argv # Unused. + + simple_ind, conf_ind, simple_dep, conf_dep = run_or_load_all_analyses() + + figures_dir = os.path.expanduser(FLAGS.figures_dir) + + plot_comparison(figures_dir, simple_ind, conf_ind, simple_dep, conf_dep) + plot_partition(figures_dir, conf_dep, True) + plt.close('all') + + +if __name__ == '__main__': + app.run(main) diff --git a/research/pate_2018/ICLR2018/plots_for_slides.py b/research/pate_2018/ICLR2018/plots_for_slides.py new file mode 100644 index 0000000..52c36b7 --- /dev/null +++ b/research/pate_2018/ICLR2018/plots_for_slides.py @@ -0,0 +1,283 @@ +# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Plots graphs for the slide deck. + +A script in support of the PATE2 paper. The input is a file containing a numpy +array of votes, one query per row, one class per column. Ex: + 43, 1821, ..., 3 + 31, 16, ..., 0 + ... + 0, 86, ..., 438 +The output graphs are visualized using the TkAgg backend. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import os +import sys + +sys.path.append('..') # Main modules reside in the parent directory. 
+ +from absl import app +from absl import flags +import matplotlib + +matplotlib.use('TkAgg') +import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top +import numpy as np +import core as pate +import random + +plt.style.use('ggplot') + +FLAGS = flags.FLAGS +flags.DEFINE_string('counts_file', None, 'Counts file.') +flags.DEFINE_string('figures_dir', '', 'Path where figures are written to.') +flags.DEFINE_boolean('transparent', False, 'Set background to transparent.') + +flags.mark_flag_as_required('counts_file') + + +def setup_plot(): + fig, ax = plt.subplots() + fig.set_figheight(4.5) + fig.set_figwidth(4.7) + + if FLAGS.transparent: + fig.patch.set_alpha(0) + + return fig, ax + + +def plot_rdp_curve_per_example(votes, sigmas): + orders = np.linspace(1., 100., endpoint=True, num=1000) + orders[0] = 1.001 + fig, ax = setup_plot() + + for i in range(votes.shape[0]): + for sigma in sigmas: + logq = pate.compute_logq_gaussian(votes[i,], sigma) + rdp = pate.rdp_gaussian(logq, sigma, orders) + ax.plot( + orders, + rdp, + alpha=1., + label=r'Data-dependent bound, $\sigma$={}'.format(int(sigma)), + linewidth=5) + + for sigma in sigmas: + ax.plot( + orders, + pate.rdp_data_independent_gaussian(sigma, orders), + alpha=.3, + label=r'Data-independent bound, $\sigma$={}'.format(int(sigma)), + linewidth=10) + + plt.xlim(xmin=1, xmax=100) + plt.ylim(ymin=0) + plt.xticks([1, 20, 40, 60, 80, 100]) + plt.yticks([0, .0025, .005, .0075, .01]) + plt.xlabel(r'Order $\alpha$', fontsize=16) + plt.ylabel(r'RDP value $\varepsilon$ at $\alpha$', fontsize=16) + ax.tick_params(labelsize=14) + + plt.legend(loc=0, fontsize=13) + plt.show() + + +def plot_rdp_of_sigma(v, order): + sigmas = np.linspace(1., 1000., endpoint=True, num=1000) + fig, ax = setup_plot() + + y = np.zeros(len(sigmas)) + + for i, sigma in enumerate(sigmas): + logq = pate.compute_logq_gaussian(v, sigma) + y[i] = pate.rdp_gaussian(logq, sigma, order) + + ax.plot(sigmas, y, alpha=.8, linewidth=5) + + plt.xlim(xmin=1, xmax=1000) + plt.ylim(ymin=0) + # plt.yticks([0, .0004, .0008, .0012]) + ax.tick_params(labelleft='off') + plt.xlabel(r'Noise $\sigma$', fontsize=16) + plt.ylabel(r'RDP at order $\alpha={}$'.format(order), fontsize=16) + ax.tick_params(labelsize=14) + + # plt.legend(loc=0, fontsize=13) + plt.show() + + +def compute_rdp_curve(votes, threshold, sigma1, sigma2, orders, + target_answered): + rdp_cum = np.zeros(len(orders)) + answered = 0 + for i, v in enumerate(votes): + v = sorted(v, reverse=True) + q_step1 = math.exp(pate.compute_logpr_answered(threshold, sigma1, v)) + logq_step2 = pate.compute_logq_gaussian(v, sigma2) + rdp = pate.rdp_gaussian(logq_step2, sigma2, orders) + rdp_cum += q_step1 * rdp + + answered += q_step1 + if answered >= target_answered: + print('Processed {} queries to answer {}.'.format(i, target_answered)) + return rdp_cum + + assert False, 'Never reached {} answered queries.'.format(target_answered) + + +def plot_rdp_total(votes, sigmas): + orders = np.linspace(1., 100., endpoint=True, num=100) + orders[0] = 1.1 + + fig, ax = setup_plot() + + target_answered = 2000 + + for sigma in sigmas: + rdp = compute_rdp_curve(votes, 5000, 1000, sigma, orders, target_answered) + ax.plot( + orders, + rdp, + alpha=.8, + label=r'Data-dependent bound, $\sigma$={}'.format(int(sigma)), + linewidth=5) + + # for sigma in sigmas: + # ax.plot( + # orders, + # target_answered * pate.rdp_data_independent_gaussian(sigma, orders), + # alpha=.3, + # label=r'Data-independent bound, $\sigma$={}'.format(int(sigma)), + # linewidth=10) + 
+ plt.xlim(xmin=1, xmax=100) + plt.ylim(ymin=0) + plt.xticks([1, 20, 40, 60, 80, 100]) + plt.yticks([0, .0005, .001, .0015, .002]) + + plt.xlabel(r'Order $\alpha$', fontsize=16) + plt.ylabel(r'RDP value $\varepsilon$ at $\alpha$', fontsize=16) + ax.tick_params(labelsize=14) + + plt.legend(loc=0, fontsize=13) + plt.show() + + +def plot_data_ind_curve(): + fig, ax = setup_plot() + + orders = np.linspace(1., 10., endpoint=True, num=1000) + orders[0] = 1.01 + + ax.plot( + orders, + pate.rdp_data_independent_gaussian(1., orders), + alpha=.5, + color='gray', + linewidth=10) + + # plt.yticks([]) + plt.xlim(xmin=1, xmax=10) + plt.ylim(ymin=0) + plt.xticks([1, 3, 5, 7, 9]) + ax.tick_params(labelsize=14) + plt.show() + + +def plot_two_data_ind_curves(): + orders = np.linspace(1., 100., endpoint=True, num=1000) + orders[0] = 1.001 + + fig, ax = setup_plot() + + for sigma in [100, 150]: + ax.plot( + orders, + pate.rdp_data_independent_gaussian(sigma, orders), + alpha=.3, + label=r'Data-independent bound, $\sigma$={}'.format(int(sigma)), + linewidth=10) + + plt.xlim(xmin=1, xmax=100) + plt.ylim(ymin=0) + plt.xticks([1, 20, 40, 60, 80, 100]) + plt.yticks([0, .0025, .005, .0075, .01]) + plt.xlabel(r'Order $\alpha$', fontsize=16) + plt.ylabel(r'RDP value $\varepsilon$ at $\alpha$', fontsize=16) + ax.tick_params(labelsize=14) + + plt.legend(loc=0, fontsize=13) + plt.show() + + +def scatter_plot(votes, threshold, sigma1, sigma2, order): + fig, ax = setup_plot() + x = [] + y = [] + for i, v in enumerate(votes): + if threshold is not None and sigma1 is not None: + q_step1 = math.exp(pate.compute_logpr_answered(threshold, sigma1, v)) + else: + q_step1 = 1. + if random.random() < q_step1: + logq_step2 = pate.compute_logq_gaussian(v, sigma2) + x.append(max(v)) + y.append(pate.rdp_gaussian(logq_step2, sigma2, order)) + + print('Selected {} queries.'.format(len(x))) + # Plot the data-independent curve: + # data_ind = pate.rdp_data_independent_gaussian(sigma, order) + # plt.plot([0, 5000], [data_ind, data_ind], color='tab:blue', linestyle='-', linewidth=2) + ax.set_yscale('log') + plt.xlim(xmin=0, xmax=5000) + plt.ylim(ymin=1e-300, ymax=1) + plt.yticks([1, 1e-100, 1e-200, 1e-300]) + plt.scatter(x, y, s=1, alpha=0.5) + plt.ylabel(r'RDP at $\alpha={}$'.format(order), fontsize=16) + plt.xlabel(r'max count', fontsize=16) + ax.tick_params(labelsize=14) + plt.show() + + +def main(argv): + del argv # Unused. + fin_name = os.path.expanduser(FLAGS.counts_file) + print('Reading raw votes from ' + fin_name) + sys.stdout.flush() + + plot_data_ind_curve() + plot_two_data_ind_curves() + + v1 = [2550, 2200, 250] # based on votes[2,] + # v2 = [2600, 2200, 200] # based on votes[381,] + plot_rdp_curve_per_example(np.array([v1]), (100., 150.)) + + plot_rdp_of_sigma(np.array(v1), 20.) + + votes = np.load(fin_name) + + plot_rdp_total(votes[:12000, ], (100., 150.)) + scatter_plot(votes[:6000, ], None, None, 100, 20) # w/o thresholding + scatter_plot(votes[:6000, ], 3500, 1500, 100, 20) # with thresholding + + +if __name__ == '__main__': + app.run(main) diff --git a/research/pate_2018/ICLR2018/rdp_bucketized.py b/research/pate_2018/ICLR2018/rdp_bucketized.py new file mode 100644 index 0000000..a5e7785 --- /dev/null +++ b/research/pate_2018/ICLR2018/rdp_bucketized.py @@ -0,0 +1,263 @@ +# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Illustrates how noisy thresholding check changes distribution of queries. + +A script in support of the paper "Scalable Private Learning with PATE" by +Nicolas Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar, +Ulfar Erlingsson (https://arxiv.org/abs/1802.08908). + +The input is a file containing a numpy array of votes, one query per row, one +class per column. Ex: + 43, 1821, ..., 3 + 31, 16, ..., 0 + ... + 0, 86, ..., 438 +The output is one of two graphs depending on the setting of the plot variable. +The output is written to a pdf file. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import os +import sys + +sys.path.append('..') # Main modules reside in the parent directory. + +from absl import app +from absl import flags +import matplotlib +matplotlib.use('TkAgg') +import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top +import numpy as np +import core as pate + +plt.style.use('ggplot') + +FLAGS = flags.FLAGS +flags.DEFINE_enum('plot', 'small', ['small', 'large'], 'Selects which of' + 'the two plots is produced.') +flags.DEFINE_string('counts_file', None, 'Counts file.') +flags.DEFINE_string('plot_file', '', 'Plot file to write.') + +flags.mark_flag_as_required('counts_file') + + +def compute_count_per_bin(bin_num, votes): + """Tabulates number of examples in each bin. + + Args: + bin_num: Number of bins. + votes: A matrix of votes, where each row contains votes in one instance. + + Returns: + Array of counts of length bin_num. + """ + sums = np.sum(votes, axis=1) + # Check that all rows contain the same number of votes. + assert max(sums) == min(sums) + + s = max(sums) + + counts = np.zeros(bin_num) + n = votes.shape[0] + + for i in xrange(n): + v = votes[i,] + bin_idx = int(math.floor(max(v) * bin_num / s)) + assert 0 <= bin_idx < bin_num + counts[bin_idx] += 1 + + return counts + + +def compute_privacy_cost_per_bins(bin_num, votes, sigma2, order): + """Outputs average privacy cost per bin. + + Args: + bin_num: Number of bins. + votes: A matrix of votes, where each row contains votes in one instance. + sigma2: The scale (std) of the Gaussian noise. (Same as sigma_2 in + Algorithms 1 and 2.) + order: The Renyi order for which privacy cost is computed. + + Returns: + Expected eps of RDP (ignoring delta) per example in each bin. + """ + n = votes.shape[0] + + bin_counts = np.zeros(bin_num) + bin_rdp = np.zeros(bin_num) # RDP at order=order + + for i in xrange(n): + v = votes[i,] + logq = pate.compute_logq_gaussian(v, sigma2) + rdp_at_order = pate.rdp_gaussian(logq, sigma2, order) + + bin_idx = int(math.floor(max(v) * bin_num / sum(v))) + assert 0 <= bin_idx < bin_num + bin_counts[bin_idx] += 1 + bin_rdp[bin_idx] += rdp_at_order + if (i + 1) % 1000 == 0: + print('example {}'.format(i + 1)) + sys.stdout.flush() + + return bin_rdp / bin_counts + + +def compute_expected_answered_per_bin(bin_num, votes, threshold, sigma1): + """Computes expected number of answers per bin. 
+ + Args: + bin_num: Number of bins. + votes: A matrix of votes, where each row contains votes in one instance. + threshold: The threshold against which check is performed. + sigma1: The std of the Gaussian noise with which check is performed. (Same + as sigma_1 in Algorithms 1 and 2.) + + Returns: + Expected number of queries answered per bin. + """ + n = votes.shape[0] + + bin_answered = np.zeros(bin_num) + + for i in xrange(n): + v = votes[i,] + p = math.exp(pate.compute_logpr_answered(threshold, sigma1, v)) + bin_idx = int(math.floor(max(v) * bin_num / sum(v))) + assert 0 <= bin_idx < bin_num + bin_answered[bin_idx] += p + if (i + 1) % 1000 == 0: + print('example {}'.format(i + 1)) + sys.stdout.flush() + + return bin_answered + + +def main(argv): + del argv # Unused. + fin_name = os.path.expanduser(FLAGS.counts_file) + print('Reading raw votes from ' + fin_name) + sys.stdout.flush() + + votes = np.load(fin_name) + votes = votes[:4000,] # truncate to 4000 samples + + if FLAGS.plot == 'small': + bin_num = 5 + m_check = compute_expected_answered_per_bin(bin_num, votes, 3500, 1500) + elif FLAGS.plot == 'large': + bin_num = 10 + m_check = compute_expected_answered_per_bin(bin_num, votes, 3500, 1500) + a_check = compute_expected_answered_per_bin(bin_num, votes, 5000, 1500) + eps = compute_privacy_cost_per_bins(bin_num, votes, 100, 50) + else: + raise ValueError('--plot flag must be one of ["small", "large"]') + + counts = compute_count_per_bin(bin_num, votes) + bins = np.linspace(0, 100, num=bin_num, endpoint=False) + + plt.close('all') + fig, ax = plt.subplots() + if FLAGS.plot == 'small': + fig.set_figheight(5) + fig.set_figwidth(5) + ax.bar( + bins, + counts, + 20, + color='orangered', + linestyle='dotted', + linewidth=5, + edgecolor='red', + fill=False, + alpha=.5, + align='edge', + label='LNMax answers') + ax.bar( + bins, + m_check, + 20, + color='g', + alpha=.5, + linewidth=0, + edgecolor='g', + align='edge', + label='Confident-GNMax\nanswers') + elif FLAGS.plot == 'large': + fig.set_figheight(4.7) + fig.set_figwidth(7) + ax.bar( + bins, + counts, + 10, + linestyle='dashed', + linewidth=5, + edgecolor='red', + fill=False, + alpha=.5, + align='edge', + label='LNMax answers') + ax.bar( + bins, + m_check, + 10, + color='g', + alpha=.5, + linewidth=0, + edgecolor='g', + align='edge', + label='Confident-GNMax\nanswers (moderate)') + ax.bar( + bins, + a_check, + 10, + color='b', + alpha=.5, + align='edge', + label='Confident-GNMax\nanswers (aggressive)') + ax2 = ax.twinx() + bin_centers = [x + 5 for x in bins] + ax2.plot(bin_centers, eps, 'ko', alpha=.8) + ax2.set_ylim([1e-200, 1.]) + ax2.set_yscale('log') + ax2.grid(False) + ax2.set_yticks([1e-3, 1e-50, 1e-100, 1e-150, 1e-200]) + plt.tick_params(which='minor', right='off') + ax2.set_ylabel(r'Per query privacy cost $\varepsilon$', fontsize=16) + + plt.xlim([0, 100]) + ax.set_ylim([0, 2500]) + # ax.set_yscale('log') + ax.set_xlabel('Percentage of teachers that agree', fontsize=16) + ax.set_ylabel('Number of queries answered', fontsize=16) + vals = ax.get_xticks() + ax.set_xticklabels([str(int(x)) + '%' for x in vals]) + ax.tick_params(labelsize=14, bottom=True, top=True, left=True, right=True) + ax.legend(loc=2, prop={'size': 16}) + + # simple: 'figures/noisy_thresholding_check_perf.pdf') + # detailed: 'figures/noisy_thresholding_check_perf_details.pdf' + + print('Saving the graph to ' + FLAGS.plot_file) + plt.savefig(os.path.expanduser(FLAGS.plot_file), bbox_inches='tight') + plt.show() + + +if __name__ == '__main__': + app.run(main) diff 
--git a/research/pate_2018/ICLR2018/rdp_cumulative.py b/research/pate_2018/ICLR2018/rdp_cumulative.py new file mode 100644 index 0000000..d4b1c65 --- /dev/null +++ b/research/pate_2018/ICLR2018/rdp_cumulative.py @@ -0,0 +1,378 @@ +# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Plots three graphs illustrating cost of privacy per answered query. + +A script in support of the paper "Scalable Private Learning with PATE" by +Nicolas Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar, +Ulfar Erlingsson (https://arxiv.org/abs/1802.08908). + +The input is a file containing a numpy array of votes, one query per row, one +class per column. Ex: + 43, 1821, ..., 3 + 31, 16, ..., 0 + ... + 0, 86, ..., 438 +The output is written to a specified directory and consists of three pdf files. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import os +import pickle +import sys + +sys.path.append('..') # Main modules reside in the parent directory. + +from absl import app +from absl import flags +import matplotlib + +matplotlib.use('TkAgg') +import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top +import numpy as np +import core as pate + +plt.style.use('ggplot') + +FLAGS = flags.FLAGS +flags.DEFINE_boolean('cache', False, + 'Read results of privacy analysis from cache.') +flags.DEFINE_string('counts_file', None, 'Counts file.') +flags.DEFINE_string('figures_dir', '', 'Path where figures are written to.') + +flags.mark_flag_as_required('counts_file') + +def run_analysis(votes, mechanism, noise_scale, params): + """Computes data-dependent privacy. + + Args: + votes: A matrix of votes, where each row contains votes in one instance. + mechanism: A name of the mechanism ('lnmax', 'gnmax', or 'gnmax_conf') + noise_scale: A mechanism privacy parameter. + params: Other privacy parameters. + + Returns: + Four lists: cumulative privacy cost epsilon, how privacy budget is split, + how many queries were answered, optimal order. + """ + + def compute_partition(order_opt, eps): + order_opt_idx = np.searchsorted(orders, order_opt) + if mechanism == 'gnmax_conf': + p = (rdp_select_cum[order_opt_idx], + rdp_cum[order_opt_idx] - rdp_select_cum[order_opt_idx], + -math.log(delta) / (order_opt - 1)) + else: + p = (rdp_cum[order_opt_idx], -math.log(delta) / (order_opt - 1)) + return [x / eps for x in p] # Ensures that sum(x) == 1 + + # Short list of orders. + # orders = np.round(np.concatenate((np.arange(2, 50 + 1, 1), + # np.logspace(np.log10(50), np.log10(1000), num=20)))) + + # Long list of orders. 
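+  # (A finer grid makes the optimal-order search in compute_eps_from_delta
+  # tighter, at the cost of proportionally more work per query.)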
+ orders = np.concatenate((np.arange(2, 100 + 1, .5), + np.logspace(np.log10(100), np.log10(500), num=100))) + delta = 1e-8 + + n = votes.shape[0] + eps_total = np.zeros(n) + partition = [None] * n + order_opt = np.full(n, np.nan, dtype=float) + answered = np.zeros(n, dtype=float) + + rdp_cum = np.zeros(len(orders)) + rdp_sqrd_cum = np.zeros(len(orders)) + rdp_select_cum = np.zeros(len(orders)) + answered_sum = 0 + + for i in range(n): + v = votes[i,] + if mechanism == 'lnmax': + logq_lnmax = pate.compute_logq_laplace(v, noise_scale) + rdp_query = pate.rdp_pure_eps(logq_lnmax, 2. / noise_scale, orders) + rdp_sqrd = rdp_query ** 2 + pr_answered = 1 + elif mechanism == 'gnmax': + logq_gmax = pate.compute_logq_gaussian(v, noise_scale) + rdp_query = pate.rdp_gaussian(logq_gmax, noise_scale, orders) + rdp_sqrd = rdp_query ** 2 + pr_answered = 1 + elif mechanism == 'gnmax_conf': + logq_step1 = pate.compute_logpr_answered(params['t'], params['sigma1'], v) + logq_step2 = pate.compute_logq_gaussian(v, noise_scale) + q_step1 = np.exp(logq_step1) + logq_step1_min = min(logq_step1, math.log1p(-q_step1)) + rdp_gnmax_step1 = pate.rdp_gaussian(logq_step1_min, + 2 ** .5 * params['sigma1'], orders) + rdp_gnmax_step2 = pate.rdp_gaussian(logq_step2, noise_scale, orders) + rdp_query = rdp_gnmax_step1 + q_step1 * rdp_gnmax_step2 + # The expression below evaluates + # E[(cost_of_step_1 + Bernoulli(pr_of_step_2) * cost_of_step_2)^2] + rdp_sqrd = ( + rdp_gnmax_step1 ** 2 + 2 * rdp_gnmax_step1 * q_step1 * rdp_gnmax_step2 + + q_step1 * rdp_gnmax_step2 ** 2) + rdp_select_cum += rdp_gnmax_step1 + pr_answered = q_step1 + else: + raise ValueError( + 'Mechanism must be one of ["lnmax", "gnmax", "gnmax_conf"]') + + rdp_cum += rdp_query + rdp_sqrd_cum += rdp_sqrd + answered_sum += pr_answered + + answered[i] = answered_sum + eps_total[i], order_opt[i] = pate.compute_eps_from_delta( + orders, rdp_cum, delta) + partition[i] = compute_partition(order_opt[i], eps_total[i]) + + if i > 0 and (i + 1) % 1000 == 0: + rdp_var = rdp_sqrd_cum / i - ( + rdp_cum / i) ** 2 # Ignore Bessel's correction. + order_opt_idx = np.searchsorted(orders, order_opt[i]) + eps_std = ((i + 1) * rdp_var[order_opt_idx]) ** .5 # Std of the sum. + print( + 'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} (std = {:.5f}) ' + 'at order = {:.2f} (contribution from delta = {:.3f})'.format( + i + 1, answered_sum, eps_total[i], eps_std, order_opt[i], + -math.log(delta) / (order_opt[i] - 1))) + sys.stdout.flush() + + return eps_total, partition, answered, order_opt + + +def print_plot_small(figures_dir, eps_lap, eps_gnmax, answered_gnmax): + """Plots a graph of LNMax vs GNMax. + + Args: + figures_dir: A name of the directory where to save the plot. + eps_lap: The cumulative privacy costs of the Laplace mechanism. + eps_gnmax: The cumulative privacy costs of the Gaussian mechanism + answered_gnmax: The cumulative count of queries answered. 
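+
+  The curves are aligned by the (expected) number of answered queries, so the
+  two mechanisms are compared at equal utility for the student.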
+ """ + xlim = 6000 + x_axis = range(0, int(xlim), 10) + y_lap = np.zeros(len(x_axis), dtype=float) + y_gnmax = np.full(len(x_axis), np.nan, dtype=float) + + for i in range(len(x_axis)): + x = x_axis[i] + y_lap[i] = eps_lap[x] + idx = np.searchsorted(answered_gnmax, x) + if idx < len(eps_gnmax): + y_gnmax[i] = eps_gnmax[idx] + + fig, ax = plt.subplots() + fig.set_figheight(4.5) + fig.set_figwidth(4.7) + ax.plot( + x_axis, y_lap, color='r', ls='--', label='LNMax', alpha=.5, linewidth=5) + ax.plot( + x_axis, + y_gnmax, + color='g', + ls='-', + label='Confident-GNMax', + alpha=.5, + linewidth=5) + plt.xticks(np.arange(0, 7000, 1000)) + plt.xlim([0, 6000]) + plt.ylim([0, 6.]) + plt.xlabel('Number of queries answered', fontsize=16) + plt.ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16) + plt.legend(loc=2, fontsize=13) # loc=2 -- upper left + ax.tick_params(labelsize=14) + fout_name = os.path.join(figures_dir, 'lnmax_vs_gnmax.pdf') + print('Saving the graph to ' + fout_name) + fig.savefig(fout_name, bbox_inches='tight') + plt.show() + + +def print_plot_large(figures_dir, eps_lap, eps_gnmax1, answered_gnmax1, + eps_gnmax2, partition_gnmax2, answered_gnmax2): + """Plots a graph of LNMax vs GNMax with two parameters. + + Args: + figures_dir: A name of the directory where to save the plot. + eps_lap: The cumulative privacy costs of the Laplace mechanism. + eps_gnmax1: The cumulative privacy costs of the Gaussian mechanism (set 1). + answered_gnmax1: The cumulative count of queries answered (set 1). + eps_gnmax2: The cumulative privacy costs of the Gaussian mechanism (set 2). + partition_gnmax2: Allocation of eps for set 2. + answered_gnmax2: The cumulative count of queries answered (set 2). + """ + xlim = 6000 + x_axis = range(0, int(xlim), 10) + lenx = len(x_axis) + y_lap = np.zeros(lenx) + y_gnmax1 = np.full(lenx, np.nan, dtype=float) + y_gnmax2 = np.full(lenx, np.nan, dtype=float) + y1_gnmax2 = np.full(lenx, np.nan, dtype=float) + + for i in range(lenx): + x = x_axis[i] + y_lap[i] = eps_lap[x] + idx1 = np.searchsorted(answered_gnmax1, x) + if idx1 < len(eps_gnmax1): + y_gnmax1[i] = eps_gnmax1[idx1] + idx2 = np.searchsorted(answered_gnmax2, x) + if idx2 < len(eps_gnmax2): + y_gnmax2[i] = eps_gnmax2[idx2] + fraction_step1, fraction_step2, _ = partition_gnmax2[idx2] + y1_gnmax2[i] = eps_gnmax2[idx2] * fraction_step1 / ( + fraction_step1 + fraction_step2) + + fig, ax = plt.subplots() + fig.set_figheight(4.5) + fig.set_figwidth(4.7) + ax.plot( + x_axis, + y_lap, + color='r', + ls='dashed', + label='LNMax', + alpha=.5, + linewidth=5) + ax.plot( + x_axis, + y_gnmax1, + color='g', + ls='-', + label='Confident-GNMax (moderate)', + alpha=.5, + linewidth=5) + ax.plot( + x_axis, + y_gnmax2, + color='b', + ls='-', + label='Confident-GNMax (aggressive)', + alpha=.5, + linewidth=5) + ax.fill_between( + x_axis, [0] * lenx, + y1_gnmax2.tolist(), + facecolor='b', + alpha=.3, + hatch='\\') + ax.plot( + x_axis, + y1_gnmax2, + color='b', + ls='-', + label='_nolegend_', + alpha=.5, + linewidth=1) + ax.fill_between( + x_axis, y1_gnmax2.tolist(), y_gnmax2.tolist(), facecolor='b', alpha=.3) + plt.xticks(np.arange(0, 7000, 1000)) + plt.xlim([0, xlim]) + plt.ylim([0, 1.]) + plt.xlabel('Number of queries answered', fontsize=16) + plt.ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16) + plt.legend(loc=2, fontsize=13) # loc=2 -- upper left + ax.tick_params(labelsize=14) + fout_name = os.path.join(figures_dir, 'lnmax_vs_2xgnmax_large.pdf') + print('Saving the graph to ' + fout_name) + 
fig.savefig(fout_name, bbox_inches='tight') + plt.show() + + +def run_all_analyses(votes, lambda_laplace, gnmax_parameters, sigma2): + """Sequentially runs all analyses. + + Args: + votes: A matrix of votes, where each row contains votes in one instance. + lambda_laplace: The scale of the Laplace noise (lambda). + gnmax_parameters: A list of parameters for GNMax. + sigma2: Shared parameter for the GNMax mechanisms. + + Returns: + Five lists whose length is the number of queries. + """ + print('=== Laplace Mechanism ===') + eps_lap, _, _, _ = run_analysis(votes, 'lnmax', lambda_laplace, None) + print() + + # Does not go anywhere, for now + # print('=== Gaussian Mechanism (simple) ===') + # eps, _, _, _ = run_analysis(votes[:n,], 'gnmax', sigma1, None) + + eps_gnmax = [[] for p in gnmax_parameters] + partition_gmax = [[] for p in gnmax_parameters] + answered = [[] for p in gnmax_parameters] + order_opt = [[] for p in gnmax_parameters] + for i, p in enumerate(gnmax_parameters): + print('=== Gaussian Mechanism (confident) {}: ==='.format(p)) + eps_gnmax[i], partition_gmax[i], answered[i], order_opt[i] = run_analysis( + votes, 'gnmax_conf', sigma2, p) + print() + + return eps_lap, eps_gnmax, partition_gmax, answered, order_opt + + +def main(argv): + del argv # Unused. + lambda_laplace = 50. # corresponds to eps = 1. / lambda_laplace + + # Paramaters of the GNMax + gnmax_parameters = ({ + 't': 1000, + 'sigma1': 500 + }, { + 't': 3500, + 'sigma1': 1500 + }, { + 't': 5000, + 'sigma1': 1500 + }) + sigma2 = 100 # GNMax parameters differ only in Step 1 (selection). + ftemp_name = '/tmp/precomputed.pkl' + + figures_dir = os.path.expanduser(FLAGS.figures_dir) + + if FLAGS.cache and os.path.isfile(ftemp_name): + print('Reading from cache ' + ftemp_name) + with open(ftemp_name, 'rb') as f: + (eps_lap, eps_gnmax, partition_gmax, answered_gnmax, + orders_opt_gnmax) = pickle.load(f) + else: + fin_name = os.path.expanduser(FLAGS.counts_file) + print('Reading raw votes from ' + fin_name) + sys.stdout.flush() + + votes = np.load(fin_name) + + (eps_lap, eps_gnmax, partition_gmax, + answered_gnmax, orders_opt_gnmax) = run_all_analyses( + votes, lambda_laplace, gnmax_parameters, sigma2) + + print('Writing to cache ' + ftemp_name) + with open(ftemp_name, 'wb') as f: + pickle.dump((eps_lap, eps_gnmax, partition_gmax, answered_gnmax, + orders_opt_gnmax), f) + + print_plot_small(figures_dir, eps_lap, eps_gnmax[0], answered_gnmax[0]) + print_plot_large(figures_dir, eps_lap, eps_gnmax[1], answered_gnmax[1], + eps_gnmax[2], partition_gmax[2], answered_gnmax[2]) + plt.close('all') + + +if __name__ == '__main__': + app.run(main) diff --git a/research/pate_2018/ICLR2018/smooth_sensitivity_table.py b/research/pate_2018/ICLR2018/smooth_sensitivity_table.py new file mode 100644 index 0000000..89d4c28 --- /dev/null +++ b/research/pate_2018/ICLR2018/smooth_sensitivity_table.py @@ -0,0 +1,358 @@ +# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Performs privacy analysis of GNMax with smooth sensitivity. + +A script in support of the paper "Scalable Private Learning with PATE" by +Nicolas Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar, +Ulfar Erlingsson (https://arxiv.org/abs/1802.08908). + +Several flavors of the GNMax algorithm can be analyzed. + - Plain GNMax (argmax w/ Gaussian noise) is assumed when arguments threshold + and sigma2 are missing. + - Confident GNMax (thresholding + argmax w/ Gaussian noise) is used when + threshold, sigma1, and sigma2 are given. + - Interactive GNMax (two- or multi-round) is triggered by specifying + baseline_file, which provides baseline values for votes selection in Step 1. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import os +import sys + +sys.path.append('..') # Main modules reside in the parent directory. + +from absl import app +from absl import flags +import numpy as np +import core as pate +import smooth_sensitivity as pate_ss + +FLAGS = flags.FLAGS + +flags.DEFINE_string('counts_file', None, 'Counts file.') +flags.DEFINE_string('baseline_file', None, 'File with baseline scores.') +flags.DEFINE_boolean('data_independent', False, + 'Force data-independent bounds.') +flags.DEFINE_float('threshold', None, 'Threshold for step 1 (selection).') +flags.DEFINE_float('sigma1', None, 'Sigma for step 1 (selection).') +flags.DEFINE_float('sigma2', None, 'Sigma for step 2 (argmax).') +flags.DEFINE_integer('queries', None, 'Number of queries made by the student.') +flags.DEFINE_float('delta', 1e-8, 'Target delta.') +flags.DEFINE_float( + 'order', None, + 'Fixes a Renyi DP order (if unspecified, finds an optimal order from a ' + 'hardcoded list).') +flags.DEFINE_integer( + 'teachers', None, + 'Number of teachers (if unspecified, derived from the counts file).') + +flags.mark_flag_as_required('counts_file') +flags.mark_flag_as_required('sigma2') + + +def _check_conditions(sigma, num_classes, orders): + """Symbolic-numeric verification of conditions C5 and C6. + + The conditions on the beta function are verified by constructing the beta + function symbolically, and then checking that its derivative (computed + symbolically) is non-negative within the interval of conjectured monotonicity. + The last check is performed numerically. + """ + + print('Checking conditions C5 and C6 for all orders.') + sys.stdout.flush() + conditions_hold = True + + for order in orders: + cond5, cond6 = pate_ss.check_conditions(sigma, num_classes, order) + conditions_hold &= cond5 and cond6 + if not cond5: + print('Condition C5 does not hold for order =', order) + elif not cond6: + print('Condition C6 does not hold for order =', order) + + if conditions_hold: + print('Conditions C5-C6 hold for all orders.') + sys.stdout.flush() + return conditions_hold + + +def _compute_rdp(votes, baseline, threshold, sigma1, sigma2, delta, orders, + data_ind): + """Computes the (data-dependent) RDP curve for Confident GNMax.""" + rdp_cum = np.zeros(len(orders)) + rdp_sqrd_cum = np.zeros(len(orders)) + answered = 0 + + for i, v in enumerate(votes): + if threshold is None: + logq_step1 = 0 # No thresholding, always proceed to step 2. 
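+      # (With logq_step1 = 0, q_step1 below equals 1, so every query reaches
+      # step 2 and the thresholding step adds no privacy cost.)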
+ rdp_step1 = np.zeros(len(orders)) + else: + logq_step1 = pate.compute_logpr_answered(threshold, sigma1, + v - baseline[i,]) + if data_ind: + rdp_step1 = pate.compute_rdp_data_independent_threshold(sigma1, orders) + else: + rdp_step1 = pate.compute_rdp_threshold(logq_step1, sigma1, orders) + + if data_ind: + rdp_step2 = pate.rdp_data_independent_gaussian(sigma2, orders) + else: + logq_step2 = pate.compute_logq_gaussian(v, sigma2) + rdp_step2 = pate.rdp_gaussian(logq_step2, sigma2, orders) + + q_step1 = np.exp(logq_step1) + rdp = rdp_step1 + rdp_step2 * q_step1 + # The expression below evaluates + # E[(cost_of_step_1 + Bernoulli(pr_of_step_2) * cost_of_step_2)^2] + rdp_sqrd = ( + rdp_step1**2 + 2 * rdp_step1 * q_step1 * rdp_step2 + + q_step1 * rdp_step2**2) + rdp_sqrd_cum += rdp_sqrd + + rdp_cum += rdp + answered += q_step1 + if ((i + 1) % 1000 == 0) or (i == votes.shape[0] - 1): + rdp_var = rdp_sqrd_cum / i - ( + rdp_cum / i)**2 # Ignore Bessel's correction. + eps_total, order_opt = pate.compute_eps_from_delta(orders, rdp_cum, delta) + order_opt_idx = np.searchsorted(orders, order_opt) + eps_std = ((i + 1) * rdp_var[order_opt_idx])**.5 # Std of the sum. + print( + 'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} (std = {:.5f}) ' + 'at order = {:.2f} (contribution from delta = {:.3f})'.format( + i + 1, answered, eps_total, eps_std, order_opt, + -math.log(delta) / (order_opt - 1))) + sys.stdout.flush() + + _, order_opt = pate.compute_eps_from_delta(orders, rdp_cum, delta) + + return order_opt + + +def _find_optimal_smooth_sensitivity_parameters( + votes, baseline, num_teachers, threshold, sigma1, sigma2, delta, ind_step1, + ind_step2, order): + """Optimizes smooth sensitivity parameters by minimizing a cost function. + + The cost function is + exact_eps + cost of GNSS + two stds of noise, + which captures that upper bound of the confidence interval of the sanitized + privacy budget. + + Since optimization is done with full view of sensitive data, the results + cannot be released. + """ + rdp_cum = 0 + answered_cum = 0 + ls_cum = 0 + + # Define a plausible range for the beta values. + betas = np.arange(.3 / order, .495 / order, .01 / order) + cost_delta = math.log(1 / delta) / (order - 1) + + for i, v in enumerate(votes): + if threshold is None: + log_pr_answered = 0 + rdp1 = 0 + ls_step1 = np.zeros(num_teachers) + else: + log_pr_answered = pate.compute_logpr_answered(threshold, sigma1, + v - baseline[i,]) + if ind_step1: # apply data-independent bound for step 1 (thresholding). + rdp1 = pate.compute_rdp_data_independent_threshold(sigma1, order) + ls_step1 = np.zeros(num_teachers) + else: + rdp1 = pate.compute_rdp_threshold(log_pr_answered, sigma1, order) + ls_step1 = pate_ss.compute_local_sensitivity_bounds_threshold( + v - baseline[i,], num_teachers, threshold, sigma1, order) + + pr_answered = math.exp(log_pr_answered) + answered_cum += pr_answered + + if ind_step2: # apply data-independent bound for step 2 (GNMax). + rdp2 = pate.rdp_data_independent_gaussian(sigma2, order) + ls_step2 = np.zeros(num_teachers) + else: + logq_step2 = pate.compute_logq_gaussian(v, sigma2) + rdp2 = pate.rdp_gaussian(logq_step2, sigma2, order) + # Compute smooth sensitivity. + ls_step2 = pate_ss.compute_local_sensitivity_bounds_gnmax( + v, num_teachers, sigma2, order) + + rdp_cum += rdp1 + pr_answered * rdp2 + ls_cum += ls_step1 + pr_answered * ls_step2 # Expected local sensitivity. + + if ind_step1 and ind_step2: + # Data-independent bounds. 
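+      # (All local sensitivities are zero here, so no smooth-sensitivity noise
+      # is needed: beta and ss are fixed at 0 and sigma_ss at infinity.)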
+ cost_opt, beta_opt, ss_opt, sigma_ss_opt = None, 0., 0., np.inf + else: + # Data-dependent bounds. + cost_opt, beta_opt, ss_opt, sigma_ss_opt = np.inf, None, None, None + + for beta in betas: + ss = pate_ss.compute_discounted_max(beta, ls_cum) + + # Solution to the minimization problem: + # min_sigma {order * exp(2 * beta)/ sigma^2 + 2 * ss * sigma} + sigma_ss = ((order * math.exp(2 * beta)) / ss)**(1 / 3) + cost_ss = pate_ss.compute_rdp_of_smooth_sensitivity_gaussian( + beta, sigma_ss, order) + + # Cost captures exact_eps + cost of releasing SS + two stds of noise. + cost = rdp_cum + cost_ss + 2 * ss * sigma_ss + if cost < cost_opt: + cost_opt, beta_opt, ss_opt, sigma_ss_opt = cost, beta, ss, sigma_ss + + if ((i + 1) % 100 == 0) or (i == votes.shape[0] - 1): + eps_before_ss = rdp_cum + cost_delta + eps_with_ss = ( + eps_before_ss + pate_ss.compute_rdp_of_smooth_sensitivity_gaussian( + beta_opt, sigma_ss_opt, order)) + print('{}: E[answered queries] = {:.1f}, RDP at {} goes from {:.3f} to ' + '{:.3f} +/- {:.3f} (ss = {:.4}, beta = {:.4f}, sigma_ss = {:.3f})'. + format(i + 1, answered_cum, order, eps_before_ss, eps_with_ss, + ss_opt * sigma_ss_opt, ss_opt, beta_opt, sigma_ss_opt)) + sys.stdout.flush() + + # Return optimal parameters for the last iteration. + return beta_opt, ss_opt, sigma_ss_opt + + +#################### +# HELPER FUNCTIONS # +#################### + + +def _load_votes(counts_file, baseline_file, queries): + counts_file_expanded = os.path.expanduser(counts_file) + print('Reading raw votes from ' + counts_file_expanded) + sys.stdout.flush() + + votes = np.load(counts_file_expanded) + print('Shape of the votes matrix = {}'.format(votes.shape)) + + if baseline_file is not None: + baseline_file_expanded = os.path.expanduser(baseline_file) + print('Reading baseline values from ' + baseline_file_expanded) + sys.stdout.flush() + baseline = np.load(baseline_file_expanded) + if votes.shape != baseline.shape: + raise ValueError( + 'Counts file and baseline file must have the same shape. Got {} and ' + '{} instead.'.format(votes.shape, baseline.shape)) + else: + baseline = np.zeros_like(votes) + + if queries is not None: + if votes.shape[0] < queries: + raise ValueError('Expect {} rows, got {} in {}'.format( + queries, votes.shape[0], counts_file)) + # Truncate the votes matrix to the number of queries made. + votes = votes[:queries,] + baseline = baseline[:queries,] + else: + print('Process all {} input rows. (Use --queries flag to truncate.)'.format( + votes.shape[0])) + + return votes, baseline + + +def _count_teachers(votes): + s = np.sum(votes, axis=1) + num_teachers = int(max(s)) + if min(s) != num_teachers: + raise ValueError( + 'Matrix of votes is malformed: the number of votes is not the same ' + 'across rows.') + return num_teachers + + +def _is_data_ind_step1(num_teachers, threshold, sigma1, orders): + if threshold is None: + return True + return np.all( + pate.is_data_independent_always_opt_threshold(num_teachers, threshold, + sigma1, orders)) + + +def _is_data_ind_step2(num_teachers, num_classes, sigma, orders): + return np.all( + pate.is_data_independent_always_opt_gaussian(num_teachers, num_classes, + sigma, orders)) + + +def main(argv): + del argv # Unused. + + if (FLAGS.threshold is None) != (FLAGS.sigma1 is None): + raise ValueError( + '--threshold flag and --sigma1 flag must be present or absent ' + 'simultaneously.') + + if FLAGS.order is None: + # Long list of orders. 
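+    # (The RDP analysis below is run over this whole grid; smooth-sensitivity
+    # analysis then uses the single optimal order that it reports.)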
+ orders = np.concatenate((np.arange(2, 100 + 1, .5), + np.logspace(np.log10(100), np.log10(500), + num=100))) + # Short list of orders. + # orders = np.round( + # np.concatenate((np.arange(2, 50 + 1, 1), + # np.logspace(np.log10(50), np.log10(1000), num=20)))) + else: + orders = np.array([FLAGS.order]) + + votes, baseline = _load_votes(FLAGS.counts_file, FLAGS.baseline_file, + FLAGS.queries) + + if FLAGS.teachers is None: + num_teachers = _count_teachers(votes) + else: + num_teachers = FLAGS.teachers + + num_classes = votes.shape[1] + + order = _compute_rdp(votes, baseline, FLAGS.threshold, FLAGS.sigma1, + FLAGS.sigma2, FLAGS.delta, orders, + FLAGS.data_independent) + + ind_step1 = _is_data_ind_step1(num_teachers, FLAGS.threshold, FLAGS.sigma1, + order) + + ind_step2 = _is_data_ind_step2(num_teachers, num_classes, FLAGS.sigma2, order) + + if FLAGS.data_independent or (ind_step1 and ind_step2): + print('Nothing to do here, all analyses are data-independent.') + return + + if not _check_conditions(FLAGS.sigma2, num_classes, [order]): + return # Quit early: sufficient conditions for correctness fail to hold. + + beta_opt, ss_opt, sigma_ss_opt = _find_optimal_smooth_sensitivity_parameters( + votes, baseline, num_teachers, FLAGS.threshold, FLAGS.sigma1, + FLAGS.sigma2, FLAGS.delta, ind_step1, ind_step2, order) + + print('Optimal beta = {:.4f}, E[SS_beta] = {:.4}, sigma_ss = {:.2f}'.format( + beta_opt, ss_opt, sigma_ss_opt)) + + +if __name__ == '__main__': + app.run(main) diff --git a/research/pate_2018/ICLR2018/utility_queries_answered.py b/research/pate_2018/ICLR2018/utility_queries_answered.py new file mode 100644 index 0000000..d8663ad --- /dev/null +++ b/research/pate_2018/ICLR2018/utility_queries_answered.py @@ -0,0 +1,90 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl import app +from absl import flags +import matplotlib +import os + +matplotlib.use('TkAgg') +import matplotlib.pyplot as plt + +plt.style.use('ggplot') + +FLAGS = flags.FLAGS +flags.DEFINE_string('plot_file', '', 'Output file name.') + +qa_lnmax = [500, 750] + range(1000, 12500, 500) + +acc_lnmax = [43.3, 52.3, 59.8, 66.7, 68.8, 70.5, 71.6, 72.3, 72.6, 72.9, 73.4, + 73.4, 73.7, 73.9, 74.2, 74.4, 74.5, 74.7, 74.8, 75, 75.1, 75.1, + 75.4, 75.4, 75.4] + +qa_gnmax = [456, 683, 908, 1353, 1818, 2260, 2702, 3153, 3602, 4055, 4511, 4964, + 5422, 5875, 6332, 6792, 7244, 7696, 8146, 8599, 9041, 9496, 9945, + 10390, 10842] + +acc_gnmax = [39.6, 52.2, 59.6, 66.6, 69.6, 70.5, 71.8, 72, 72.7, 72.9, 73.3, + 73.4, 73.4, 73.8, 74, 74.2, 74.4, 74.5, 74.5, 74.7, 74.8, 75, 75.1, + 75.1, 75.4] + +qa_gnmax_aggressive = [167, 258, 322, 485, 647, 800, 967, 1133, 1282, 1430, + 1573, 1728, 1889, 2028, 2190, 2348, 2510, 2668, 2950, + 3098, 3265, 3413, 3581, 3730] + +acc_gnmax_aggressive = [17.8, 26.8, 39.3, 48, 55.7, 61, 62.8, 64.8, 65.4, 66.7, + 66.2, 68.3, 68.3, 68.7, 69.1, 70, 70.2, 70.5, 70.9, + 70.7, 71.3, 71.3, 71.3, 71.8] + + +def main(argv): + del argv # Unused. + + plt.close('all') + fig, ax = plt.subplots() + fig.set_figheight(4.7) + fig.set_figwidth(5) + ax.plot(qa_lnmax, acc_lnmax, color='r', ls='--', linewidth=5., marker='o', + alpha=.5, label='LNMax') + ax.plot(qa_gnmax, acc_gnmax, color='g', ls='-', linewidth=5., marker='o', + alpha=.5, label='Confident-GNMax') + # ax.plot(qa_gnmax_aggressive, acc_gnmax_aggressive, color='b', ls='-', marker='o', alpha=.5, label='Confident-GNMax (aggressive)') + plt.xticks([0, 2000, 4000, 6000]) + plt.xlim([0, 6000]) + # ax.set_yscale('log') + plt.ylim([65, 76]) + ax.tick_params(labelsize=14) + plt.xlabel('Number of queries answered', fontsize=16) + plt.ylabel('Student test accuracy (%)', fontsize=16) + plt.legend(loc=2, prop={'size': 16}) + + x = [400, 2116, 4600, 4680] + y = [69.5, 68.5, 74, 72.5] + annotations = [0.76, 2.89, 1.42, 5.76] + color_annotations = ['g', 'r', 'g', 'r'] + for i, txt in enumerate(annotations): + ax.annotate(r'${\varepsilon=}$' + str(txt), (x[i], y[i]), fontsize=16, + color=color_annotations[i]) + + plot_filename = os.path.expanduser(FLAGS.plot_file) + plt.savefig(plot_filename, bbox_inches='tight') + plt.show() + +if __name__ == '__main__': + app.run(main) diff --git a/research/pate_2018/README.md b/research/pate_2018/README.md new file mode 100644 index 0000000..decd633 --- /dev/null +++ b/research/pate_2018/README.md @@ -0,0 +1,71 @@ +Implementation of an RDP privacy accountant and smooth sensitivity analysis for +the PATE framework. The underlying theory and supporting experiments appear in +"Scalable Private Learning with PATE" by Nicolas Papernot, Shuang Song, Ilya +Mironov, Ananth Raghunathan, Kunal Talwar, Ulfar Erlingsson (ICLR 2018, +https://arxiv.org/abs/1802.08908). + +## Overview + +The PATE ('Private Aggregation of Teacher Ensembles') framework was introduced +by Papernot et al. in "Semi-supervised Knowledge Transfer for Deep Learning from +Private Training Data" (ICLR 2017, https://arxiv.org/abs/1610.05755). The +framework enables model-agnostic training that provably provides [differential +privacy](https://en.wikipedia.org/wiki/Differential_privacy) of the training +dataset. 
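+
+As a rough illustration, one of the aggregation mechanisms analyzed here,
+GNMax, labels a query by taking the teachers' per-class vote counts, adding
+Gaussian noise, and returning the argmax. A simplified sketch follows (for
+intuition only; this directory implements the privacy analysis of such
+aggregators, not the aggregation itself, and the vote counts below are made
+up):
+
+```python
+import numpy as np
+
+def gnmax_label(votes, sigma):
+  """Noisy argmax over per-class teacher vote counts (GNMax aggregation)."""
+  noisy = votes + np.random.normal(scale=sigma, size=len(votes))
+  return int(np.argmax(noisy))
+
+print(gnmax_label(np.array([480., 15., 5.]), sigma=40.))
+```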
+ +The framework consists of _teachers_, the _student_ model, and the _aggregator_. The +teachers are models trained on disjoint subsets of the training datasets. The student +model has access to an insensitive (e.g., public) unlabelled dataset, which is labelled by +interacting with the ensemble of teachers via the _aggregator_. The aggregator tallies +outputs of the teacher models, and either forwards a (noisy) aggregate to the student, or +refuses to answer. + +Differential privacy is enforced by the aggregator. The privacy guarantees can be _data-independent_, +which means that they are solely the function of the aggregator's parameters. Alternatively, privacy +analysis can be _data-dependent_, which allows for finer reasoning where, under certain conditions on +the input distribution, the final privacy guarantees can be improved relative to the data-independent +analysis. Data-dependent privacy guarantees may, by themselves, be a function of sensitive data and +therefore publishing these guarantees requires its own sanitization procedure. In our case +sanitization of data-dependent privacy guarantees proceeds via _smooth sensitivity_ analysis. + +The common machinery used for all privacy analyses in this repository is the +Rényi differential privacy, or RDP (see https://arxiv.org/abs/1702.07476). + +This repository contains implementations of privacy accountants and smooth +sensitivity analysis for several data-independent and data-dependent mechanism that together +comprise the PATE framework. + + +### Requirements + +* Python, version ≥ 2.7 +* absl (see [here](https://github.com/abseil/abseil-py), or just type `pip install absl-py`) +* numpy +* scipy +* sympy (for smooth sensitivity analysis) +* unittest (for testing) + + +### Self-testing + +To verify the installation run +```bash +$ python core_test.py +$ python smooth_sensitivity_test.py +``` + + +## Files in this directory + +* core.py — RDP privacy accountant for several vote aggregators (GNMax, + Threshold, Laplace). + +* smooth_sensitivity.py — Smooth sensitivity analysis for GNMax and + Threshold mechanisms. + +* core_test.py and smooth_sensitivity_test.py — Unit tests for the + files above. + +## Contact information + +You may direct your comments to mironov@google.com and PR to @ilyamironov. diff --git a/research/pate_2018/core.py b/research/pate_2018/core.py new file mode 100644 index 0000000..84c79dc --- /dev/null +++ b/research/pate_2018/core.py @@ -0,0 +1,370 @@ +# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Core functions for RDP analysis in PATE framework. + +This library comprises the core functions for doing differentially private +analysis of the PATE architecture and its various Noisy Max and other +mechanisms. 
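+
+A minimal usage sketch for a single GNMax query (the vote counts, sigma and
+delta below are made up for illustration):
+
+  import numpy as np
+  orders = np.arange(2., 200., .5)
+  logq = compute_logq_gaussian(np.array([900, 50, 50]), sigma=100.)
+  rdp = rdp_gaussian(logq, 100., orders)
+  eps, order = compute_eps_from_delta(orders, rdp, delta=1e-8)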
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +from absl import app +import numpy as np +import scipy.stats + + +def _logaddexp(x): + """Addition in the log space. Analogue of numpy.logaddexp for a list.""" + m = max(x) + return m + math.log(sum(np.exp(x - m))) + + +def _log1mexp(x): + """Numerically stable computation of log(1-exp(x)).""" + if x < -1: + return math.log1p(-math.exp(x)) + elif x < 0: + return math.log(-math.expm1(x)) + elif x == 0: + return -np.inf + else: + raise ValueError("Argument must be non-positive.") + + +def compute_eps_from_delta(orders, rdp, delta): + """Translates between RDP and (eps, delta)-DP. + + Args: + orders: A list (or a scalar) of orders. + rdp: A list of RDP guarantees (of the same length as orders). + delta: Target delta. + + Returns: + Pair of (eps, optimal_order). + + Raises: + ValueError: If input is malformed. + """ + if len(orders) != len(rdp): + raise ValueError("Input lists must have the same length.") + eps = np.array(rdp) - math.log(delta) / (np.array(orders) - 1) + idx_opt = np.argmin(eps) + return eps[idx_opt], orders[idx_opt] + + +##################### +# RDP FOR THE GNMAX # +##################### + + +def compute_logq_gaussian(counts, sigma): + """Returns an upper bound on ln Pr[outcome != argmax] for GNMax. + + Implementation of Proposition 7. + + Args: + counts: A numpy array of scores. + sigma: The standard deviation of the Gaussian noise in the GNMax mechanism. + + Returns: + logq: Natural log of the probability that outcome is different from argmax. + """ + n = len(counts) + variance = sigma**2 + idx_max = np.argmax(counts) + counts_normalized = counts[idx_max] - counts + counts_rest = counts_normalized[np.arange(n) != idx_max] # exclude one index + # Upper bound q via a union bound rather than a more precise calculation. + logq = _logaddexp( + scipy.stats.norm.logsf(counts_rest, scale=math.sqrt(2 * variance))) + + # A sketch of a more accurate estimate, which is currently disabled for two + # reasons: + # 1. Numerical instability; + # 2. Not covered by smooth sensitivity analysis. + # covariance = variance * (np.ones((n - 1, n - 1)) + np.identity(n - 1)) + # logq = np.log1p(-statsmodels.sandbox.distributions.extras.mvnormcdf( + # counts_rest, np.zeros(n - 1), covariance, maxpts=1e4)) + + return min(logq, math.log(1 - (1 / n))) + + +def rdp_data_independent_gaussian(sigma, orders): + """Computes a data-independent RDP curve for GNMax. + + Implementation of Proposition 8. + + Args: + sigma: Standard deviation of Gaussian noise. + orders: An array_like list of Renyi orders. + + Returns: + Upper bound on RPD for all orders. A scalar if orders is a scalar. + + Raises: + ValueError: If the input is malformed. + """ + if sigma < 0 or np.any(orders <= 1): # not defined for alpha=1 + raise ValueError("Inputs are malformed.") + + variance = sigma**2 + if np.isscalar(orders): + return orders / variance + else: + return np.atleast_1d(orders) / variance + + +def rdp_gaussian(logq, sigma, orders): + """Bounds RDP from above of GNMax given an upper bound on q (Theorem 6). + + Args: + logq: Natural logarithm of the probability of a non-argmax outcome. + sigma: Standard deviation of Gaussian noise. + orders: An array_like list of Renyi orders. + + Returns: + Upper bound on RPD for all orders. A scalar if orders is a scalar. + + Raises: + ValueError: If the input is malformed. 
+ """ + if logq > 0 or sigma < 0 or np.any(orders <= 1): # not defined for alpha=1 + raise ValueError("Inputs are malformed.") + + if np.isneginf(logq): # If the mechanism's output is fixed, it has 0-DP. + if np.isscalar(orders): + return 0. + else: + return np.full_like(orders, 0., dtype=np.float) + + variance = sigma**2 + + # Use two different higher orders: mu_hi1 and mu_hi2 computed according to + # Proposition 10. + mu_hi2 = math.sqrt(variance * -logq) + mu_hi1 = mu_hi2 + 1 + + orders_vec = np.atleast_1d(orders) + + ret = orders_vec / variance # baseline: data-independent bound + + # Filter out entries where data-dependent bound does not apply. + mask = np.logical_and(mu_hi1 > orders_vec, mu_hi2 > 1) + + rdp_hi1 = mu_hi1 / variance + rdp_hi2 = mu_hi2 / variance + + log_a2 = (mu_hi2 - 1) * rdp_hi2 + + # Make sure q is in the increasing wrt q range and A is positive. + if (np.any(mask) and logq <= log_a2 - mu_hi2 * + (math.log(1 + 1 / (mu_hi1 - 1)) + math.log(1 + 1 / (mu_hi2 - 1))) and + -logq > rdp_hi2): + # Use log1p(x) = log(1 + x) to avoid catastrophic cancellations when x ~ 0. + log1q = _log1mexp(logq) # log1q = log(1-q) + log_a = (orders - 1) * ( + log1q - _log1mexp((logq + rdp_hi2) * (1 - 1 / mu_hi2))) + log_b = (orders - 1) * (rdp_hi1 - logq / (mu_hi1 - 1)) + + # Use logaddexp(x, y) = log(e^x + e^y) to avoid overflow for large x, y. + log_s = np.logaddexp(log1q + log_a, logq + log_b) + ret[mask] = np.minimum(ret, log_s / (orders - 1))[mask] + + assert np.all(ret >= 0) + + if np.isscalar(orders): + return np.asscalar(ret) + else: + return ret + + +def is_data_independent_always_opt_gaussian(num_teachers, num_classes, sigma, + orders): + """Tests whether data-ind bound is always optimal for GNMax. + + Args: + num_teachers: Number of teachers. + num_classes: Number of classes. + sigma: Standard deviation of the Gaussian noise. + orders: An array_like list of Renyi orders. + + Returns: + Boolean array of length |orders| (a scalar if orders is a scalar). True if + the data-independent bound is always the same as the data-dependent bound. + + """ + unanimous = np.array([num_teachers] + [0] * (num_classes - 1)) + logq = compute_logq_gaussian(unanimous, sigma) + + rdp_dep = rdp_gaussian(logq, sigma, orders) + rdp_ind = rdp_data_independent_gaussian(sigma, orders) + return np.isclose(rdp_dep, rdp_ind) + + +################################### +# RDP FOR THE THRESHOLD MECHANISM # +################################### + + +def compute_logpr_answered(t, sigma, counts): + """Computes log of the probability that a noisy threshold is crossed. + + Args: + t: The threshold. + sigma: The stdev of the Gaussian noise added to the threshold. + counts: An array of votes. + + Returns: + Natural log of the probability that max is larger than a noisy threshold. + """ + # Compared to the paper, max(counts) is rounded to the nearest integer. This + # is done to facilitate computation of smooth sensitivity for the case of + # the interactive mechanism, where votes are not necessarily integer. + return scipy.stats.norm.logsf(t - round(max(counts)), scale=sigma) + + +def compute_rdp_data_independent_threshold(sigma, orders): + # The input to the threshold mechanism has stability 1, compared to + # GNMax, which has stability = 2. Hence the sqrt(2) factor below. 
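+  # (So the bound returned below is orders / (2 * sigma**2).)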
+ return rdp_data_independent_gaussian(2**.5 * sigma, orders) + + +def compute_rdp_threshold(log_pr_answered, sigma, orders): + logq = min(log_pr_answered, _log1mexp(log_pr_answered)) + # The input to the threshold mechanism has stability 1, compared to + # GNMax, which has stability = 2. Hence the sqrt(2) factor below. + return rdp_gaussian(logq, 2**.5 * sigma, orders) + + +def is_data_independent_always_opt_threshold(num_teachers, threshold, sigma, + orders): + """Tests whether data-ind bound is always optimal for the threshold mechanism. + + Args: + num_teachers: Number of teachers. + threshold: The cut-off threshold. + sigma: Standard deviation of the Gaussian noise. + orders: An array_like list of Renyi orders. + + Returns: + Boolean array of length |orders| (a scalar if orders is a scalar). True if + the data-independent bound is always the same as the data-dependent bound. + """ + + # Since the data-dependent bound depends only on max(votes), it suffices to + # check whether the data-dependent bounds are better than data-independent + # bounds in the extreme cases when max(votes) is minimal or maximal. + # For both Confident GNMax and Interactive GNMax it holds that + # 0 <= max(votes) <= num_teachers. + # The upper bound is trivial in both cases. + # The lower bound is trivial for Confident GNMax (and a stronger one, based on + # the pigeonhole principle, is possible). + # For Interactive GNMax (Algorithm 2), the lower bound follows from the + # following argument. Since the votes vector is the difference between the + # actual teachers' votes and the student's baseline, we need to argue that + # max(n_j - M * p_j) >= 0. + # The bound holds because sum_j n_j = sum M * p_j = M. Thus, + # sum_j (n_j - M * p_j) = 0, and max_j (n_j - M * p_j) >= 0 as needed. + logq1 = compute_logpr_answered(threshold, sigma, [0]) + logq2 = compute_logpr_answered(threshold, sigma, [num_teachers]) + + rdp_dep1 = compute_rdp_threshold(logq1, sigma, orders) + rdp_dep2 = compute_rdp_threshold(logq2, sigma, orders) + + rdp_ind = compute_rdp_data_independent_threshold(sigma, orders) + return np.isclose(rdp_dep1, rdp_ind) and np.isclose(rdp_dep2, rdp_ind) + + +############################# +# RDP FOR THE LAPLACE NOISE # +############################# + + +def compute_logq_laplace(counts, lmbd): + """Computes an upper bound on log Pr[outcome != argmax] for LNMax. + + Args: + counts: A list of scores. + lmbd: The lambda parameter of the Laplace distribution ~exp(-|x| / lambda). + + Returns: + logq: Natural log of the probability that outcome is different from argmax. + """ + # For noisy max, we only get an upper bound via the union bound. See Lemma 4 + # in https://arxiv.org/abs/1610.05755. + # + # Pr[ j beats i*] = (2+gap(j,i*))/ 4 exp(gap(j,i*) + # proof at http://mathoverflow.net/questions/66763/ + + idx_max = np.argmax(counts) + counts_normalized = (counts - counts[idx_max]) / lmbd + counts_rest = np.array( + [counts_normalized[i] for i in range(len(counts)) if i != idx_max]) + + logq = _logaddexp(np.log(2 - counts_rest) + math.log(.25) + counts_rest) + + return min(logq, math.log(1 - (1 / len(counts)))) + + +def rdp_pure_eps(logq, pure_eps, orders): + """Computes the RDP value given logq and pure privacy eps. + + Implementation of https://arxiv.org/abs/1610.05755, Theorem 3. + + The bound used is the min of three terms. The first term is from + https://arxiv.org/pdf/1605.02065.pdf. 
+ The second term is based on the fact that when event has probability (1-q) for + q close to zero, q can only change by exp(eps), which corresponds to a + much smaller multiplicative change in (1-q) + The third term comes directly from the privacy guarantee. + + Args: + logq: Natural logarithm of the probability of a non-optimal outcome. + pure_eps: eps parameter for DP + orders: array_like list of moments to compute. + + Returns: + Array of upper bounds on rdp (a scalar if orders is a scalar). + """ + orders_vec = np.atleast_1d(orders) + q = math.exp(logq) + log_t = np.full_like(orders_vec, np.inf) + if q <= 1 / (math.exp(pure_eps) + 1): + logt_one = math.log1p(-q) + ( + math.log1p(-q) - _log1mexp(pure_eps + logq)) * ( + orders_vec - 1) + logt_two = logq + pure_eps * (orders_vec - 1) + log_t = np.logaddexp(logt_one, logt_two) + + ret = np.minimum( + np.minimum(0.5 * pure_eps * pure_eps * orders_vec, + log_t / (orders_vec - 1)), pure_eps) + if np.isscalar(orders): + return np.asscalar(ret) + else: + return ret + + +def main(argv): + del argv # Unused. + + +if __name__ == "__main__": + app.run(main) diff --git a/research/pate_2018/core_test.py b/research/pate_2018/core_test.py new file mode 100644 index 0000000..933f5c2 --- /dev/null +++ b/research/pate_2018/core_test.py @@ -0,0 +1,124 @@ +# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for pate.core.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys +import unittest +import numpy as np + +import core as pate + + +class PateTest(unittest.TestCase): + + def _test_rdp_gaussian_value_errors(self): + # Test for ValueErrors. + with self.assertRaises(ValueError): + pate.rdp_gaussian(1.0, 1.0, np.array([2, 3, 4])) + with self.assertRaises(ValueError): + pate.rdp_gaussian(np.log(0.5), -1.0, np.array([2, 3, 4])) + with self.assertRaises(ValueError): + pate.rdp_gaussian(np.log(0.5), 1.0, np.array([1, 3, 4])) + + def _test_rdp_gaussian_as_function_of_q(self): + # Test for data-independent and data-dependent ranges over q. + # The following corresponds to orders 1.1, 2.5, 32, 250 + # sigmas 1.5, 15, 1500, 15000. + # Hand calculated -log(q0)s arranged in a 'sigma major' ordering. + neglogq0s = [ + 2.8, 2.6, 427, None, 4.8, 4.0, 4.7, 275, 9.6, 8.8, 6.0, 4, 12, 11.2, + 8.6, 6.4 + ] + idx_neglogq0s = 0 # To iterate through neglogq0s. + orders = [1.1, 2.5, 32, 250] + sigmas = [1.5, 15, 1500, 15000] + for sigma in sigmas: + for order in orders: + curr_neglogq0 = neglogq0s[idx_neglogq0s] + idx_neglogq0s += 1 + if curr_neglogq0 is None: # sigma == 1.5 and order == 250: + continue + + rdp_at_q0 = pate.rdp_gaussian(-curr_neglogq0, sigma, order) + + # Data-dependent range. (Successively halve the value of q.) 
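+        # (Each halving of q should strictly decrease the data-dependent RDP
+        # bound, which the assertions below verify.)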
+ logq_dds = (-curr_neglogq0 - np.array( + [0, np.log(2), np.log(4), np.log(8)])) + # Check that in q_dds, rdp is decreasing. + for idx in range(len(logq_dds) - 1): + self.assertGreater( + pate.rdp_gaussian(logq_dds[idx], sigma, order), + pate.rdp_gaussian(logq_dds[idx + 1], sigma, order)) + + # Data-independent range. + q_dids = np.exp(-curr_neglogq0) + np.array([0.1, 0.2, 0.3, 0.4]) + # Check that in q_dids, rdp is constant. + for q in q_dids: + self.assertEqual(rdp_at_q0, pate.rdp_gaussian( + np.log(q), sigma, order)) + + def _test_compute_eps_from_delta_value_error(self): + # Test for ValueError. + with self.assertRaises(ValueError): + pate.compute_eps_from_delta([1.1, 2, 3, 4], [1, 2, 3], 0.001) + + def _test_compute_eps_from_delta_monotonicity(self): + # Test for monotonicity with respect to delta. + orders = [1.1, 2.5, 250.0] + sigmas = [1e-3, 1.0, 1e5] + deltas = [1e-60, 1e-6, 0.1, 0.999] + for sigma in sigmas: + list_of_eps = [] + rdps_for_gaussian = np.array(orders) / (2 * sigma**2) + for delta in deltas: + list_of_eps.append( + pate.compute_eps_from_delta(orders, rdps_for_gaussian, delta)[0]) + + # Check that in list_of_eps, epsilons are decreasing (as delta increases). + sorted_list_of_eps = list(list_of_eps) + sorted_list_of_eps.sort(reverse=True) + self.assertEqual(list_of_eps, sorted_list_of_eps) + + def _test_compute_q0(self): + # Stub code to search a logq space and figure out logq0 by eyeballing + # results. This code does not run with the tests. Remove underscore to run. + sigma = 15 + order = 250 + logqs = np.arange(-290, -270, 1) + count = 0 + for logq in logqs: + count += 1 + sys.stdout.write("\t%0.5g: %0.10g" % + (logq, pate.rdp_gaussian(logq, sigma, order))) + sys.stdout.flush() + if count % 5 == 0: + print("") + + def test_rdp_gaussian(self): + self._test_rdp_gaussian_value_errors() + self._test_rdp_gaussian_as_function_of_q() + + def test_compute_eps_from_delta(self): + self._test_compute_eps_from_delta_value_error() + self._test_compute_eps_from_delta_monotonicity() + + +if __name__ == "__main__": + unittest.main() diff --git a/research/pate_2018/smooth_sensitivity.py b/research/pate_2018/smooth_sensitivity.py new file mode 100644 index 0000000..4bf399e --- /dev/null +++ b/research/pate_2018/smooth_sensitivity.py @@ -0,0 +1,419 @@ +# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Functions for smooth sensitivity analysis for PATE mechanisms. + +This library implements functionality for doing smooth sensitivity analysis +for Gaussian Noise Max (GNMax), Threshold with Gaussian noise, and Gaussian +Noise with Smooth Sensitivity (GNSS) mechanisms. 
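+
+A minimal usage sketch for a single GNMax query (the vote counts and
+parameters below are made up for illustration):
+
+  votes = np.array([900, 50, 50])
+  ls = compute_local_sensitivity_bounds_gnmax(votes, num_teachers=1000,
+                                              sigma=100., order=20.)
+  ss = compute_discounted_max(beta=0.02, a=ls)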
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +from absl import app +import numpy as np +import scipy +import sympy as sp + +import core as pate + +################################ +# SMOOTH SENSITIVITY FOR GNMAX # +################################ + +# Global dictionary for storing cached q0 values keyed by (sigma, order). +_logq0_cache = {} + + +def _compute_logq0(sigma, order): + key = (sigma, order) + if key in _logq0_cache: + return _logq0_cache[key] + + logq0 = compute_logq0_gnmax(sigma, order) + + _logq0_cache[key] = logq0 # Update the global variable. + return logq0 + + +def _compute_logq1(sigma, order, num_classes): + logq0 = _compute_logq0(sigma, order) # Most likely already cached. + logq1 = math.log(_compute_bl_gnmax(math.exp(logq0), sigma, num_classes)) + assert logq1 <= logq0 + return logq1 + + +def _compute_mu1_mu2_gnmax(sigma, logq): + # Computes mu1, mu2 according to Proposition 10. + mu2 = sigma * math.sqrt(-logq) + mu1 = mu2 + 1 + return mu1, mu2 + + +def _compute_data_dep_bound_gnmax(sigma, logq, order): + # Applies Theorem 6 in Appendix without checking that logq satisfies necessary + # constraints. The pre-conditions must be assured by comparing logq against + # logq0 by the caller. + variance = sigma**2 + mu1, mu2 = _compute_mu1_mu2_gnmax(sigma, logq) + eps1 = mu1 / variance + eps2 = mu2 / variance + + log1q = np.log1p(-math.exp(logq)) # log1q = log(1-q) + log_a = (order - 1) * ( + log1q - (np.log1p(-math.exp((logq + eps2) * (1 - 1 / mu2))))) + log_b = (order - 1) * (eps1 - logq / (mu1 - 1)) + + return np.logaddexp(log1q + log_a, logq + log_b) / (order - 1) + + +def _compute_rdp_gnmax(sigma, logq, order): + logq0 = _compute_logq0(sigma, order) + if logq >= logq0: + return pate.rdp_data_independent_gaussian(sigma, order) + else: + return _compute_data_dep_bound_gnmax(sigma, logq, order) + + +def compute_logq0_gnmax(sigma, order): + """Computes the point where we start using data-independent bounds. + + Args: + sigma: std of the Gaussian noise + order: Renyi order lambda + + Returns: + logq0: the point above which the data-ind bound overtakes data-dependent + bound. + """ + + def _check_validity_conditions(logq): + # Function returns true iff logq is in the range where data-dependent bound + # is valid. (Theorem 6 in Appendix.) + mu1, mu2 = _compute_mu1_mu2_gnmax(sigma, logq) + if mu1 < order: + return False + eps2 = mu2 / sigma**2 + # Do computation in the log space. The condition below comes from Lemma 9 + # from Appendix. + return (logq <= (mu2 - 1) * eps2 - mu2 * math.log(mu1 / (mu1 - 1) * mu2 / + (mu2 - 1))) + + def _compare_dep_vs_ind(logq): + return (_compute_data_dep_bound_gnmax(sigma, logq, order) - + pate.rdp_data_independent_gaussian(sigma, order)) + + # Natural upper bounds on q0. + logub = min(-(1 + 1. / sigma)**2, -((order - .99) / sigma)**2, -1 / sigma**2) + assert _check_validity_conditions(logub) + + # If data-dependent bound is already better, we are done already. + if _compare_dep_vs_ind(logub) < 0: + return logub + + # Identifying a reasonable lower bound to bracket logq0. + loglb = 2 * logub # logub is negative, and thus loglb < logub. + while _compare_dep_vs_ind(loglb) > 0: + assert loglb > -10000, "The lower bound on q0 is way too low." + loglb *= 1.5 + + logq0, r = scipy.optimize.brentq( + _compare_dep_vs_ind, loglb, logub, full_output=True) + assert r.converged, "The root finding procedure failed to converge." 
+  assert _check_validity_conditions(logq0)  # just in case.
+
+  return logq0
+
+
+def _compute_bl_gnmax(q, sigma, num_classes):
+  return ((num_classes - 1) / 2 * scipy.special.erfc(
+      1 / sigma + scipy.special.erfcinv(2 * q / (num_classes - 1))))
+
+
+def _compute_bu_gnmax(q, sigma, num_classes):
+  return min(1, (num_classes - 1) / 2 * scipy.special.erfc(
+      -1 / sigma + scipy.special.erfcinv(2 * q / (num_classes - 1))))
+
+
+def _compute_local_sens_gnmax(logq, sigma, num_classes, order):
+  """Implements Algorithm 3 (computes an upper bound on local sensitivity).
+
+  (See Proposition 13 for proof of correctness.)
+  """
+  logq0 = _compute_logq0(sigma, order)
+  logq1 = _compute_logq1(sigma, order, num_classes)
+  if logq1 <= logq <= logq0:
+    logq = logq1
+
+  beta = _compute_rdp_gnmax(sigma, logq, order)
+  beta_bu_q = _compute_rdp_gnmax(
+      sigma, math.log(_compute_bu_gnmax(math.exp(logq), sigma, num_classes)),
+      order)
+  beta_bl_q = _compute_rdp_gnmax(
+      sigma, math.log(_compute_bl_gnmax(math.exp(logq), sigma, num_classes)),
+      order)
+  return max(beta_bu_q - beta, beta - beta_bl_q)
+
+
+def compute_local_sensitivity_bounds_gnmax(votes, num_teachers, sigma, order):
+  """Computes a list of max-LS-at-distance-d for the GNMax mechanism.
+
+  A more efficient implementation of Algorithms 4 and 5, running in time
+  O(teachers*classes). A naive implementation is O(teachers^2*classes) or worse.
+
+  Args:
+    votes: A numpy array of votes.
+    num_teachers: Total number of voting teachers.
+    sigma: Standard deviation of the Gaussian noise.
+    order: The Renyi order.
+
+  Returns:
+    A numpy array of local sensitivities at distances d, 0 <= d < num_teachers.
+  """
+
+  num_classes = len(votes)  # Called m in the paper.
+
+  logq0 = _compute_logq0(sigma, order)
+  logq1 = _compute_logq1(sigma, order, num_classes)
+  logq = pate.compute_logq_gaussian(votes, sigma)
+  plateau = _compute_local_sens_gnmax(logq1, sigma, num_classes, order)
+
+  res = np.full(num_teachers, plateau)
+
+  if logq1 <= logq <= logq0:
+    return res
+
+  # Invariant: votes is sorted in non-increasing order.
+  votes = sorted(votes, reverse=True)
+
+  res[0] = _compute_local_sens_gnmax(logq, sigma, num_classes, order)
+  curr_d = 0
+
+  go_left = logq > logq0  # Otherwise logq < logq1 and we go right.
+
+  # Iterate while the following is true:
+  #   1. If we are going left, logq is still larger than logq0 and we may still
+  #      increase the gap between votes[0] and votes[1].
+  #   2. If we are going right, logq is still smaller than logq1.
+  while ((go_left and logq > logq0 and votes[1] > 0) or
+         (not go_left and logq < logq1)):
+    curr_d += 1
+    if go_left:  # Try decreasing logq.
+      votes[0] += 1
+      votes[1] -= 1
+      idx = 1
+      # Restore the invariant. (Can be implemented more efficiently by keeping
+      # track of the range of indices equal to votes[1]. Does not seem to
+      # matter for the overall running time.)
+      while idx < len(votes) - 1 and votes[idx] < votes[idx + 1]:
+        votes[idx], votes[idx + 1] = votes[idx + 1], votes[idx]
+        idx += 1
+    else:  # Go right, i.e., try increasing logq.
+      votes[0] -= 1
+      votes[1] += 1  # The invariant holds since otherwise logq >= logq1.
+
+    logq = pate.compute_logq_gaussian(votes, sigma)
+    res[curr_d] = _compute_local_sens_gnmax(logq, sigma, num_classes, order)
+
+  return res
+
+
+##################################################
+# SMOOTH SENSITIVITY FOR THE THRESHOLD MECHANISM #
+##################################################
+
+# A global dictionary of RDPs for various threshold values. Indexed by a 4-tuple
+# (num_teachers, threshold, sigma, order).
+_rdp_thresholds = {}
+
+
+def _compute_rdp_list_threshold(num_teachers, threshold, sigma, order):
+  key = (num_teachers, threshold, sigma, order)
+  if key in _rdp_thresholds:
+    return _rdp_thresholds[key]
+
+  res = np.zeros(num_teachers + 1)
+  for v in range(0, num_teachers + 1):
+    logp = scipy.stats.norm.logsf(threshold - v, scale=sigma)
+    res[v] = pate.compute_rdp_threshold(logp, sigma, order)
+
+  _rdp_thresholds[key] = res
+  return res
+
+
+def compute_local_sensitivity_bounds_threshold(counts, num_teachers, threshold,
+                                               sigma, order):
+  """Computes a list of max-LS-at-distance-d for the threshold mechanism."""
+
+  def _compute_ls(v):
+    ls_step_up, ls_step_down = None, None
+    if v > 0:
+      ls_step_down = abs(rdp_list[v - 1] - rdp_list[v])
+    if v < num_teachers:
+      ls_step_up = abs(rdp_list[v + 1] - rdp_list[v])
+    return max(ls_step_down, ls_step_up)  # Rely on max(x, None) = x.
+
+  cur_max = int(round(max(counts)))
+  rdp_list = _compute_rdp_list_threshold(num_teachers, threshold, sigma, order)
+
+  ls = np.zeros(num_teachers)
+  for d in range(max(cur_max, num_teachers - cur_max)):
+    ls_up, ls_down = None, None
+    if cur_max + d <= num_teachers:
+      ls_up = _compute_ls(cur_max + d)
+    if cur_max - d >= 0:
+      ls_down = _compute_ls(cur_max - d)
+    ls[d] = max(ls_up, ls_down)
+  return ls
+
+
+#############################################
+# PROCEDURES FOR SMOOTH SENSITIVITY RELEASE #
+#############################################
+
+# A global dictionary of exponentially decaying arrays. Indexed by beta.
+dict_beta_discount = {}
+
+
+def compute_discounted_max(beta, a):
+  """Computes max over d of exp(-beta * d) * a[d]."""
+  n = len(a)
+
+  if beta not in dict_beta_discount or (len(dict_beta_discount[beta]) < n):
+    dict_beta_discount[beta] = np.exp(-beta * np.arange(n))
+
+  return max(a * dict_beta_discount[beta][:n])
+
+
+def compute_smooth_sensitivity_gnmax(beta, counts, num_teachers, sigma, order):
+  """Computes smooth sensitivity of a single application of GNMax."""
+
+  ls = compute_local_sensitivity_bounds_gnmax(counts, num_teachers, sigma,
+                                              order)
+  return compute_discounted_max(beta, ls)
+
+
+def compute_rdp_of_smooth_sensitivity_gaussian(beta, sigma, order):
+  """Computes the RDP curve for the GNSS mechanism.
+
+  Implements Theorem 23 (https://arxiv.org/pdf/1802.08908.pdf).
+  """
+  if beta > 0 and not 1 < order < 1 / (2 * beta):
+    raise ValueError("Order outside the (1, 1/(2*beta)) range.")
+
+  return order * math.exp(2 * beta) / sigma**2 + (
+      -.5 * math.log(1 - 2 * order * beta) + beta * order) / (
+          order - 1)
+
+
+def compute_params_for_ss_release(eps, delta):
+  """Computes parameters for Gaussian noise scaled by smooth sensitivity.
+
+  Presently not used. (We proceed via RDP analysis.)
+
+  Computes alpha and beta for applying Lemma 2.6 (full version of Nissim et
+  al.) via Lemma 2.10.
+  """
+  # Rather than applying Lemma 2.10 directly, which would give a suboptimal
+  # alpha (see http://www.cse.psu.edu/~ads22/pubs/NRS07/NRS07-full-draft-v1.pdf),
+  # we extract a sufficient condition on alpha from its proof.
+  #
+  # Let a = rho_(delta/2)(Z_1). Then solve for alpha such that
+  #   2 alpha a + alpha^2 = eps/2.
+  a = scipy.special.ndtri(1 - delta / 2)
+  alpha = math.sqrt(a**2 + eps / 2) - a
+
+  beta = eps / (2 * scipy.special.chdtri(1, delta / 2))
+
+  return alpha, beta
+
+
+#######################################################
+# SYMBOLIC-NUMERIC VERIFICATION OF CONDITIONS C5--C6.
# +####################################################### + + +def _construct_symbolic_beta(q, sigma, order): + mu2 = sigma * sp.sqrt(sp.log(1 / q)) + mu1 = mu2 + 1 + eps1 = mu1 / sigma**2 + eps2 = mu2 / sigma**2 + a = (1 - q) / (1 - (q * sp.exp(eps2))**(1 - 1 / mu2)) + b = sp.exp(eps1) / q**(1 / (mu1 - 1)) + s = (1 - q) * a**(order - 1) + q * b**(order - 1) + return (1 / (order - 1)) * sp.log(s) + + +def _construct_symbolic_bu(q, sigma, m): + return (m - 1) / 2 * sp.erfc(sp.erfcinv(2 * q / (m - 1)) - 1 / sigma) + + +def _is_non_decreasing(fn, q, bounds): + """Verifies whether the function is non-decreasing within a range. + + Args: + fn: Symbolic function of a single variable. + q: The name of f's variable. + bounds: Pair of (lower_bound, upper_bound) reals. + + Returns: + True iff the function is non-decreasing in the range. + """ + diff_fn = sp.diff(fn, q) # Symbolically compute the derivative. + diff_fn_lambdified = sp.lambdify( + q, + diff_fn, + modules=[ + "numpy", { + "erfc": scipy.special.erfc, + "erfcinv": scipy.special.erfcinv + } + ]) + r = scipy.optimize.minimize_scalar( + diff_fn_lambdified, bounds=bounds, method="bounded") + assert r.success, "Minimizer failed to converge." + return r.fun >= 0 # Check whether the derivative is non-negative. + + +def check_conditions(sigma, m, order): + """Checks conditions C5 and C6 (Section B.4.2 in Appendix).""" + q = sp.symbols("q", positive=True, real=True) + + beta = _construct_symbolic_beta(q, sigma, order) + q0 = math.exp(compute_logq0_gnmax(sigma, order)) + + cond5 = _is_non_decreasing(beta, q, (0, q0)) + + if cond5: + bl_q0 = _compute_bl_gnmax(q0, sigma, m) + + bu = _construct_symbolic_bu(q, sigma, m) + delta_beta = beta.subs(q, bu) - beta + + cond6 = _is_non_decreasing(delta_beta, q, (0, bl_q0)) + else: + cond6 = False # Skip the check, since Condition 5 is false already. + + return (cond5, cond6) + + +def main(argv): + del argv # Unused. + + +if __name__ == "__main__": + app.run(main) diff --git a/research/pate_2018/smooth_sensitivity_test.py b/research/pate_2018/smooth_sensitivity_test.py new file mode 100644 index 0000000..c1f371a --- /dev/null +++ b/research/pate_2018/smooth_sensitivity_test.py @@ -0,0 +1,126 @@ +# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for pate.smooth_sensitivity.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import numpy as np + +import smooth_sensitivity as pate_ss + + +class PateSmoothSensitivityTest(unittest.TestCase): + + def test_check_conditions(self): + self.assertEqual(pate_ss.check_conditions(20, 10, 25.), (True, False)) + self.assertEqual(pate_ss.check_conditions(30, 10, 25.), (True, True)) + + def _assert_all_close(self, x, y): + """Asserts that two numpy arrays are close.""" + self.assertEqual(len(x), len(y)) + self.assertTrue(np.allclose(x, y, rtol=1e-8, atol=0)) + + def test_compute_local_sensitivity_bounds_gnmax(self): + counts1 = np.array([10, 0, 0]) + sigma1 = .5 + order1 = 1.5 + + answer1 = np.array( + [3.13503646e-17, 1.60178280e-08, 5.90681786e-03] + [5.99981308e+00] * 7) + + # Test for "going right" in the smooth sensitivity computation. + out1 = pate_ss.compute_local_sensitivity_bounds_gnmax( + counts1, 10, sigma1, order1) + + self._assert_all_close(out1, answer1) + + counts2 = np.array([1000, 500, 300, 200, 0]) + sigma2 = 250. + order2 = 10. + + # Test for "going left" in the smooth sensitivity computation. + out2 = pate_ss.compute_local_sensitivity_bounds_gnmax( + counts2, 2000, sigma2, order2) + + answer2 = np.array([0.] * 298 + [2.77693450548e-7, 2.10853979548e-6] + + [2.73113623988e-6] * 1700) + self._assert_all_close(out2, answer2) + + def test_compute_local_sensitivity_bounds_threshold(self): + counts1_3 = np.array([20, 10, 0]) + num_teachers = sum(counts1_3) + t1 = 16 # high threshold + sigma = 2 + order = 10 + + out1 = pate_ss.compute_local_sensitivity_bounds_threshold( + counts1_3, num_teachers, t1, sigma, order) + answer1 = np.array([0] * 3 + [ + 1.48454129e-04, 1.47826870e-02, 3.94153241e-02, 6.45775697e-02, + 9.01543247e-02, 1.16054002e-01, 1.42180452e-01, 1.42180452e-01, + 1.48454129e-04, 1.47826870e-02, 3.94153241e-02, 6.45775697e-02, + 9.01543266e-02, 1.16054000e-01, 1.42180452e-01, 1.68302106e-01, + 1.93127860e-01 + ] + [0] * 10) + self._assert_all_close(out1, answer1) + + t2 = 2 # low threshold + + out2 = pate_ss.compute_local_sensitivity_bounds_threshold( + counts1_3, num_teachers, t2, sigma, order) + answer2 = np.array([ + 1.60212079e-01, 2.07021132e-01, 2.07021132e-01, 1.93127860e-01, + 1.68302106e-01, 1.42180452e-01, 1.16054002e-01, 9.01543247e-02, + 6.45775697e-02, 3.94153241e-02, 1.47826870e-02, 1.48454129e-04 + ] + [0] * 18) + self._assert_all_close(out2, answer2) + + t3 = 50 # very high threshold (larger than the number of teachers). + + out3 = pate_ss.compute_local_sensitivity_bounds_threshold( + counts1_3, num_teachers, t3, sigma, order) + + answer3 = np.array([ + 1.35750725752e-19, 1.88990500499e-17, 2.05403154065e-15, + 1.74298153642e-13, 1.15489723995e-11, 5.97584949325e-10, + 2.41486826748e-08, 7.62150641922e-07, 1.87846248741e-05, + 0.000360973025976, 0.000360973025976, 2.76377015215e-50, + 1.00904975276e-53, 2.87254164748e-57, 6.37583360761e-61, + 1.10331620211e-64, 1.48844393335e-68, 1.56535552444e-72, + 1.28328011060e-76, 8.20047697109e-81 + ] + [0] * 10) + + self._assert_all_close(out3, answer3) + + # Fractional values. 
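+    # Neither the counts nor the threshold are integers here (and one count is
+    # negative); the sensitivity computation should handle such inputs as well.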
+ counts4 = np.array([19.5, -5.1, 0]) + t4 = 10.1 + out4 = pate_ss.compute_local_sensitivity_bounds_threshold( + counts4, num_teachers, t4, sigma, order) + + answer4 = np.array([ + 0.0620410301, 0.0875807131, 0.113451958, 0.139561671, 0.1657074530, + 0.1908244840, 0.2070270720, 0.207027072, 0.169718100, 0.0575152142, + 0.00678695871 + ] + [0] * 6 + [0.000536304908, 0.0172181073, 0.041909870] + [0] * 10) + self._assert_all_close(out4, answer4) + + +if __name__ == "__main__": + unittest.main()
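
As a quick orientation to the functions added above, here is a minimal usage
sketch. It is only illustrative: the vote histogram and the sigma, order, and
beta values are made-up assumptions, and it presumes core.py and
smooth_sensitivity.py are on the import path (e.g., run from
research/pate_2018/).

  import numpy as np

  import core as pate
  import smooth_sensitivity as pate_ss

  # Hypothetical vote histogram for a single query: 3 classes, 1000 teachers.
  votes = np.array([600, 300, 100])
  num_teachers = int(votes.sum())
  sigma = 100.  # Std of the Gaussian noise in GNMax.
  order = 5.    # Renyi order lambda.
  beta = 0.02   # Smoothing parameter; GNSS needs 1 < order < 1 / (2 * beta).

  # Data-dependent RDP bound for one GNMax query at this order.
  logq = pate.compute_logq_gaussian(votes, sigma)
  rdp = pate.rdp_gaussian(logq, sigma, order)

  # Beta-discounted (smooth) upper bound on the local sensitivity of that bound.
  ls = pate_ss.compute_local_sensitivity_bounds_gnmax(
      votes, num_teachers, sigma, order)
  ss = pate_ss.compute_discounted_max(beta, ls)

  # RDP cost of releasing the bound via the GNSS mechanism (Theorem 23).
  ss_release_rdp = pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
      beta, sigma, order)

  print(rdp, ss, ss_release_rdp)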