diff --git a/privacy/analysis/privacy_ledger.py b/privacy/analysis/privacy_ledger.py
index 5ebf38e..1dc995f 100644
--- a/privacy/analysis/privacy_ledger.py
+++ b/privacy/analysis/privacy_ledger.py
@@ -11,9 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-"""PrivacyLedger class for keeping a record of private queries.
-"""
+"""PrivacyLedger class for keeping a record of private queries."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -65,36 +63,39 @@ class PrivacyLedger(object):
   for the purpose of computing privacy guarantees.
   """
 
-  def __init__(
-      self,
-      population_size,
-      selection_probability,
-      max_samples,
-      max_queries):
+  def __init__(self,
+               population_size,
+               selection_probability=None,
+               max_samples=None,
+               max_queries=None):
     """Initialize the PrivacyLedger.
 
     Args:
       population_size: An integer (may be variable) specifying the size of the
-        population.
+        population, i.e. the size of the training data used in each epoch.
       selection_probability: A float (may be variable) specifying the
         probability each record is included in a sample.
-      max_samples: The maximum number of samples. An exception is thrown if
-        more than this many samples are recorded.
-      max_queries: The maximum number of queries. An exception is thrown if
-        more than this many queries are recorded.
+      max_samples: The maximum number of samples. An exception is thrown if more
+        than this many samples are recorded.
+      max_queries: The maximum number of queries. An exception is thrown if more
+        than this many queries are recorded.
     """
     self._population_size = population_size
    self._selection_probability = selection_probability
+    if max_samples is None:
+      max_samples = 1000 * population_size
+    if max_queries is None:
+      max_queries = 1000 * population_size
 
     # The query buffer stores rows corresponding to GaussianSumQueryEntries.
-    self._query_buffer = tensor_buffer.TensorBuffer(
-        max_queries, [3], tf.float32, 'query')
+    self._query_buffer = tensor_buffer.TensorBuffer(max_queries, [3],
+                                                    tf.float32, 'query')
     self._sample_var = tf.Variable(
         initial_value=tf.zeros([3]), trainable=False, name='sample')
 
     # The sample buffer stores rows corresponding to SampleEntries.
-    self._sample_buffer = tensor_buffer.TensorBuffer(
-        max_samples, [3], tf.float32, 'sample')
+    self._sample_buffer = tensor_buffer.TensorBuffer(max_samples, [3],
+                                                     tf.float32, 'sample')
     self._sample_count = tf.Variable(
         initial_value=0.0, trainable=False, name='sample_count')
     self._query_count = tf.Variable(
@@ -116,9 +117,10 @@ class PrivacyLedger(object):
     Returns:
       An operation recording the sum query to the ledger.
""" + def _do_record_query(): - with tf.control_dependencies([ - tf.assign(self._query_count, self._query_count + 1)]): + with tf.control_dependencies( + [tf.assign(self._query_count, self._query_count + 1)]): return self._query_buffer.append( [self._sample_count, l2_norm_bound, noise_stddev]) @@ -127,14 +129,15 @@ class PrivacyLedger(object): def finalize_sample(self): """Finalizes sample and records sample ledger entry.""" with tf.control_dependencies([ - tf.assign( - self._sample_var, - [self._population_size, - self._selection_probability, - self._query_count])]): + tf.assign(self._sample_var, [ + self._population_size, self._selection_probability, + self._query_count + ]) + ]): with tf.control_dependencies([ tf.assign(self._sample_count, self._sample_count + 1), - tf.assign(self._query_count, 0)]): + tf.assign(self._query_count, 0) + ]): return self._sample_buffer.append(self._sample_var) def get_unformatted_ledger(self): @@ -165,6 +168,10 @@ class PrivacyLedger(object): return format_ledger(sample_array, query_array) + def set_sample_size(self, batch_size): + self._selection_probability = tf.cast(batch_size, + tf.float32) / self._population_size + class DummyLedger(object): """A ledger that records nothing. @@ -212,8 +219,8 @@ class QueryWithLedger(dp_query.DPQuery): Args: query: The query whose events should be recorded to the ledger. Any - subqueries (including those in the leaves of a nested query) should - also contain a reference to the same ledger given here. + subqueries (including those in the leaves of a nested query) should also + contain a reference to the same ledger given here. ledger: A PrivacyLedger to which privacy events should be recorded. """ self._query = query @@ -240,3 +247,7 @@ class QueryWithLedger(dp_query.DPQuery): with tf.control_dependencies(nest.flatten(sample_state)): with tf.control_dependencies([self._ledger.finalize_sample()]): return self._query.get_noised_result(sample_state, global_state) + + def set_denominator(self, num_microbatches, microbatch_size=1): + self._query.set_denominator(num_microbatches) + self._ledger.set_sample_size(num_microbatches * microbatch_size) diff --git a/privacy/dp_query/gaussian_query.py b/privacy/dp_query/gaussian_query.py index 8dbd8c9..bb2f6f8 100644 --- a/privacy/dp_query/gaussian_query.py +++ b/privacy/dp_query/gaussian_query.py @@ -167,4 +167,4 @@ class GaussianAverageQuery(normalized_query.NormalizedQuery): """ super(GaussianAverageQuery, self).__init__( numerator_query=GaussianSumQuery(l2_norm_clip, sum_stddev, ledger), - denominator=tf.cast(denominator, tf.float32)) + denominator=denominator) diff --git a/privacy/dp_query/normalized_query.py b/privacy/dp_query/normalized_query.py index 57e96d0..8d2df58 100644 --- a/privacy/dp_query/normalized_query.py +++ b/privacy/dp_query/normalized_query.py @@ -41,7 +41,8 @@ class NormalizedQuery(dp_query.DPQuery): denominator: A value for the denominator. 
""" self._numerator = numerator_query - self._denominator = tf.cast(denominator, tf.float32) + self._denominator = tf.cast(denominator, + tf.float32) if denominator is not None else None def initial_global_state(self): """Returns the initial global state for the NormalizedQuery.""" @@ -103,4 +104,5 @@ class NormalizedQuery(dp_query.DPQuery): return nest.map_structure(normalize, noised_sum), new_sum_global_state - + def set_denominator(self, denominator): + self._denominator = tf.cast(denominator, tf.float32) diff --git a/privacy/optimizers/dp_optimizer.py b/privacy/optimizers/dp_optimizer.py index 19d49ff..5e7c6df 100644 --- a/privacy/optimizers/dp_optimizer.py +++ b/privacy/optimizers/dp_optimizer.py @@ -47,10 +47,22 @@ def make_optimizer_class(cls): def __init__( self, dp_average_query, - num_microbatches, + num_microbatches=None, unroll_microbatches=False, - *args, # pylint: disable=keyword-arg-before-vararg + *args, # pylint: disable=keyword-arg-before-vararg, g-doc-args **kwargs): + """Initialize the DPOptimizerClass. + + Args: + dp_average_query: DPQuery object, specifying differential privacy + mechanism to use. + num_microbatches: How many microbatches into which the minibatch is + split. If None, will default to the size of the minibatch, and + per-example gradients will be computed. + unroll_microbatches: If true, processes microbatches within a Python + loop instead of a tf.while_loop. Can be used if using a tf.while_loop + raises an exception. + """ super(DPOptimizerClass, self).__init__(*args, **kwargs) self._dp_average_query = dp_average_query self._num_microbatches = num_microbatches @@ -74,6 +86,9 @@ def make_optimizer_class(cls): raise ValueError('When in Eager mode, a tape needs to be passed.') vector_loss = loss() + if self._num_microbatches is None: + self._num_microbatches = tf.shape(vector_loss)[0] + self._dp_average_query.set_denominator(self._num_microbatches) sample_state = self._dp_average_query.initial_sample_state( self._global_state, var_list) microbatches_losses = tf.reshape(vector_loss, @@ -109,6 +124,9 @@ def make_optimizer_class(cls): # we sampled each microbatch from the appropriate binomial distribution, # although that still wouldn't be quite correct because it would be # sampling from the dataset without replacement. 
+        if self._num_microbatches is None:
+          self._num_microbatches = tf.shape(loss)[0]
+          self._dp_average_query.set_denominator(self._num_microbatches)
         microbatches_losses = tf.reshape(loss, [self._num_microbatches, -1])
         sample_params = (
             self._dp_average_query.derive_sample_params(self._global_state))
diff --git a/privacy/optimizers/dp_optimizer_test.py b/privacy/optimizers/dp_optimizer_test.py
index 05496df..e8bd635 100644
--- a/privacy/optimizers/dp_optimizer_test.py
+++ b/privacy/optimizers/dp_optimizer_test.py
@@ -46,14 +46,15 @@ class DPOptimizerTest(tf.test.TestCase, parameterized.TestCase):
       ('DPAdagrad 4', dp_optimizer.DPAdagradOptimizer, 4, [-2.5, -2.5]),
       ('DPAdam 1', dp_optimizer.DPAdamOptimizer, 1, [-2.5, -2.5]),
       ('DPAdam 2', dp_optimizer.DPAdamOptimizer, 2, [-2.5, -2.5]),
-      ('DPAdam 4', dp_optimizer.DPAdamOptimizer, 4, [-2.5, -2.5]))
+      ('DPAdam 4', dp_optimizer.DPAdamOptimizer, 4, [-2.5, -2.5]),
+      ('DPAdam None', dp_optimizer.DPAdamOptimizer, None, [-2.5, -2.5]))
   def testBaseline(self, cls, num_microbatches, expected_answer):
     with self.cached_session() as sess:
       var0 = tf.Variable([1.0, 2.0])
       data0 = tf.Variable([[3.0, 4.0], [5.0, 6.0], [7.0, 8.0], [-1.0, 0.0]])
 
       ledger = privacy_ledger.PrivacyLedger(
-          1e6, num_microbatches / 1e6, 50, 50)
+          1e6, num_microbatches / 1e6 if num_microbatches else None, 50, 50)
       dp_average_query = gaussian_query.GaussianAverageQuery(
           1.0e9, 0.0, num_microbatches, ledger)
       dp_average_query = privacy_ledger.QueryWithLedger(
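
For reviewers, a minimal usage sketch of the new defaults, modeled on `testBaseline` above (graph-mode TF1 API; the population size, clip norm, noise stddev, and learning rate are the test's illustrative values, not recommendations). With `num_microbatches=None` and `denominator=None`, the optimizer derives both the averaging denominator and the ledger's selection probability from the batch size on the first `compute_gradients` call:

```python
import tensorflow as tf

from privacy.analysis import privacy_ledger
from privacy.dp_query import gaussian_query
from privacy.optimizers import dp_optimizer

# selection_probability is left as None; set_sample_size() fills it in once
# the batch size is known. Buffer sizes (50, 50) are the test's values.
ledger = privacy_ledger.PrivacyLedger(1e6, None, 50, 50)

# denominator=None: NormalizedQuery now tolerates this until set_denominator()
# is called from compute_gradients(). Clip 1.0e9 / stddev 0.0 disable clipping
# and noise so the example is easy to check by hand.
dp_average_query = gaussian_query.GaussianAverageQuery(1.0e9, 0.0, None, ledger)
dp_average_query = privacy_ledger.QueryWithLedger(dp_average_query, ledger)

# num_microbatches=None: defaults to the batch size, i.e. per-example clipping.
opt = dp_optimizer.DPAdamOptimizer(
    dp_average_query, num_microbatches=None, learning_rate=2.0)

var0 = tf.Variable([1.0, 2.0])
data0 = tf.Variable([[3.0, 4.0], [5.0, 6.0], [7.0, 8.0], [-1.0, 0.0]])
# One loss value per example, so the vector loss has shape [batch_size].
vector_loss = 0.5 * tf.reduce_sum(tf.squared_difference(data0, var0), axis=1)

grads_and_vars = opt.compute_gradients(vector_loss, [var0])
train_op = opt.apply_gradients(grads_and_vars)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(train_op)
```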