Update PrivacyLedger and DPOptimizer to make certain arguments optional.

PiperOrigin-RevId: 246235646
Steve Chien 2019-05-01 18:07:11 -07:00 committed by A. Unique TensorFlower
parent c09ec4c22b
commit beb86c6e18
5 changed files with 67 additions and 35 deletions
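In practice this means a ledger, query and optimizer can now be built with only the arguments known up front. The snippet below is a minimal sketch of the new calling pattern, not part of the commit: the module aliases (privacy_ledger, gaussian_query, dp_optimizer) match the test code at the bottom of this diff, and the population size, clip norm, noise stddev and learning rate are made-up example values.

# Sketch only: assumes privacy_ledger, gaussian_query and dp_optimizer are
# imported under the aliases used in the test file below.

# max_samples and max_queries now default to 1000 * population_size, and
# selection_probability may be omitted and filled in later.
ledger = privacy_ledger.PrivacyLedger(population_size=60000)

# The denominator of the averaging query may also be deferred.
query = gaussian_query.GaussianAverageQuery(
    l2_norm_clip=1.0, sum_stddev=1.0, denominator=None, ledger=ledger)
query = privacy_ledger.QueryWithLedger(query, ledger)

# num_microbatches=None: the optimizer later uses the minibatch size and
# computes per-example gradients.
opt = dp_optimizer.DPAdamOptimizer(query, num_microbatches=None,
                                   learning_rate=0.001)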

privacy_ledger.py

@@ -11,9 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""PrivacyLedger class for keeping a record of private queries.
-"""
+"""PrivacyLedger class for keeping a record of private queries."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -65,36 +63,39 @@ class PrivacyLedger(object):
   for the purpose of computing privacy guarantees.
   """
 
-  def __init__(
-      self,
-      population_size,
-      selection_probability,
-      max_samples,
-      max_queries):
+  def __init__(self,
+               population_size,
+               selection_probability=None,
+               max_samples=None,
+               max_queries=None):
     """Initialize the PrivacyLedger.
 
     Args:
       population_size: An integer (may be variable) specifying the size of the
-        population.
+        population, i.e. size of the training data used in each epoch.
      selection_probability: A float (may be variable) specifying the
        probability each record is included in a sample.
-      max_samples: The maximum number of samples. An exception is thrown if
-        more than this many samples are recorded.
-      max_queries: The maximum number of queries. An exception is thrown if
-        more than this many queries are recorded.
+      max_samples: The maximum number of samples. An exception is thrown if more
+        than this many samples are recorded.
+      max_queries: The maximum number of queries. An exception is thrown if more
+        than this many queries are recorded.
     """
     self._population_size = population_size
     self._selection_probability = selection_probability
+    if max_samples is None:
+      max_samples = 1000 * population_size
+    if max_queries is None:
+      max_queries = 1000 * population_size
 
     # The query buffer stores rows corresponding to GaussianSumQueryEntries.
-    self._query_buffer = tensor_buffer.TensorBuffer(
-        max_queries, [3], tf.float32, 'query')
+    self._query_buffer = tensor_buffer.TensorBuffer(max_queries, [3],
+                                                    tf.float32, 'query')
     self._sample_var = tf.Variable(
         initial_value=tf.zeros([3]), trainable=False, name='sample')
 
     # The sample buffer stores rows corresponding to SampleEntries.
-    self._sample_buffer = tensor_buffer.TensorBuffer(
-        max_samples, [3], tf.float32, 'sample')
+    self._sample_buffer = tensor_buffer.TensorBuffer(max_samples, [3],
+                                                     tf.float32, 'sample')
     self._sample_count = tf.Variable(
         initial_value=0.0, trainable=False, name='sample_count')
     self._query_count = tf.Variable(
@@ -116,9 +117,10 @@ class PrivacyLedger(object):
     Returns:
       An operation recording the sum query to the ledger.
     """
+
     def _do_record_query():
-      with tf.control_dependencies([
-          tf.assign(self._query_count, self._query_count + 1)]):
+      with tf.control_dependencies(
+          [tf.assign(self._query_count, self._query_count + 1)]):
        return self._query_buffer.append(
            [self._sample_count, l2_norm_bound, noise_stddev])
@@ -127,14 +129,15 @@ class PrivacyLedger(object):
   def finalize_sample(self):
     """Finalizes sample and records sample ledger entry."""
     with tf.control_dependencies([
-        tf.assign(
-            self._sample_var,
-            [self._population_size,
-             self._selection_probability,
-             self._query_count])]):
+        tf.assign(self._sample_var, [
+            self._population_size, self._selection_probability,
+            self._query_count
+        ])
+    ]):
       with tf.control_dependencies([
           tf.assign(self._sample_count, self._sample_count + 1),
-          tf.assign(self._query_count, 0)]):
+          tf.assign(self._query_count, 0)
+      ]):
         return self._sample_buffer.append(self._sample_var)
 
   def get_unformatted_ledger(self):
@@ -165,6 +168,10 @@ class PrivacyLedger(object):
     return format_ledger(sample_array, query_array)
 
+  def set_sample_size(self, batch_size):
+    self._selection_probability = tf.cast(batch_size,
+                                          tf.float32) / self._population_size
+
 
 class DummyLedger(object):
   """A ledger that records nothing.
@@ -212,8 +219,8 @@ class QueryWithLedger(dp_query.DPQuery):
     Args:
       query: The query whose events should be recorded to the ledger. Any
-        subqueries (including those in the leaves of a nested query) should
-        also contain a reference to the same ledger given here.
+        subqueries (including those in the leaves of a nested query) should also
+        contain a reference to the same ledger given here.
       ledger: A PrivacyLedger to which privacy events should be recorded.
     """
     self._query = query
@@ -240,3 +247,7 @@ class QueryWithLedger(dp_query.DPQuery):
     with tf.control_dependencies(nest.flatten(sample_state)):
       with tf.control_dependencies([self._ledger.finalize_sample()]):
         return self._query.get_noised_result(sample_state, global_state)
+
+  def set_denominator(self, num_microbatches, microbatch_size=1):
+    self._query.set_denominator(num_microbatches)
+    self._ledger.set_sample_size(num_microbatches * microbatch_size)
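The set_sample_size and set_denominator methods added above are the plumbing that lets the now-optional arguments be filled in after construction: QueryWithLedger.set_denominator forwards the denominator to the wrapped query and derives the ledger's selection probability from the effective batch size. A minimal sketch with made-up numbers:

# Sketch: `ledger` and `query_with_ledger` are assumed to be a PrivacyLedger
# (population_size=10000) and a QueryWithLedger wrapping an averaging query,
# as in the example near the top of this page.

# Fill in the selection probability once the batch size is known:
ledger.set_sample_size(batch_size=256)
# internally: selection_probability = tf.cast(256, tf.float32) / 10000

# QueryWithLedger.set_denominator does both updates in one call, e.g. for
# 32 microbatches of 8 examples each:
query_with_ledger.set_denominator(num_microbatches=32, microbatch_size=8)
# internally: query.set_denominator(32); ledger.set_sample_size(32 * 8)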

gaussian_query.py

@@ -167,4 +167,4 @@ class GaussianAverageQuery(normalized_query.NormalizedQuery):
     """
     super(GaussianAverageQuery, self).__init__(
         numerator_query=GaussianSumQuery(l2_norm_clip, sum_stddev, ledger),
-        denominator=tf.cast(denominator, tf.float32))
+        denominator=denominator)

normalized_query.py

@@ -41,7 +41,8 @@ class NormalizedQuery(dp_query.DPQuery):
       denominator: A value for the denominator.
     """
     self._numerator = numerator_query
-    self._denominator = tf.cast(denominator, tf.float32)
+    self._denominator = tf.cast(denominator,
+                                tf.float32) if denominator is not None else None
 
   def initial_global_state(self):
     """Returns the initial global state for the NormalizedQuery."""
@@ -103,4 +104,5 @@ class NormalizedQuery(dp_query.DPQuery):
     return nest.map_structure(normalize, noised_sum), new_sum_global_state
+
+  def set_denominator(self, denominator):
+    self._denominator = tf.cast(denominator, tf.float32)
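Together with the gaussian_query.py change above, which now passes the denominator through uncast, the float32 cast lives only in NormalizedQuery and is skipped when the denominator is not yet known. A small sketch of the resulting contract (the numerator_query placeholder is an assumption, not part of the diff):

# Sketch: `numerator_query` stands in for any sum-producing DPQuery
# (an assumption, not part of this diff).
query = normalized_query.NormalizedQuery(
    numerator_query=numerator_query, denominator=None)
# self._denominator stays None until a denominator is supplied...
query.set_denominator(32)
# ...after which it is held as a float32 tensor, just as if 32 had been
# passed to the constructor.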

dp_optimizer.py

@@ -47,10 +47,22 @@ def make_optimizer_class(cls):
     def __init__(
         self,
         dp_average_query,
-        num_microbatches,
+        num_microbatches=None,
         unroll_microbatches=False,
-        *args,  # pylint: disable=keyword-arg-before-vararg
+        *args,  # pylint: disable=keyword-arg-before-vararg, g-doc-args
         **kwargs):
+      """Initialize the DPOptimizerClass.
+
+      Args:
+        dp_average_query: DPQuery object, specifying differential privacy
+          mechanism to use.
+        num_microbatches: How many microbatches into which the minibatch is
+          split. If None, will default to the size of the minibatch, and
+          per-example gradients will be computed.
+        unroll_microbatches: If true, processes microbatches within a Python
+          loop instead of a tf.while_loop. Can be used if using a tf.while_loop
+          raises an exception.
+      """
       super(DPOptimizerClass, self).__init__(*args, **kwargs)
       self._dp_average_query = dp_average_query
       self._num_microbatches = num_microbatches
@@ -74,6 +86,9 @@ def make_optimizer_class(cls):
           raise ValueError('When in Eager mode, a tape needs to be passed.')
 
         vector_loss = loss()
+        if self._num_microbatches is None:
+          self._num_microbatches = tf.shape(vector_loss)[0]
+          self._dp_average_query.set_denominator(self._num_microbatches)
         sample_state = self._dp_average_query.initial_sample_state(
             self._global_state, var_list)
         microbatches_losses = tf.reshape(vector_loss,
@@ -109,6 +124,9 @@ def make_optimizer_class(cls):
       # we sampled each microbatch from the appropriate binomial distribution,
       # although that still wouldn't be quite correct because it would be
      # sampling from the dataset without replacement.
+      if self._num_microbatches is None:
+        self._num_microbatches = tf.shape(loss)[0]
+        self._dp_average_query.set_denominator(self._num_microbatches)
       microbatches_losses = tf.reshape(loss, [self._num_microbatches, -1])
       sample_params = (
           self._dp_average_query.derive_sample_params(self._global_state))
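Both the eager branch and the graph branch now resolve a missing num_microbatches from the first dimension of the per-example loss before reshaping it into microbatches. A rough sketch of what that enables; the optimizer and loss tensor here are assumed, as in the earlier example:

# Sketch: `opt` is a DP optimizer constructed with num_microbatches=None and
# `vector_loss` is an assumed per-example loss tensor of shape [batch_size].
train_op = opt.minimize(vector_loss)
# Internally (see the hunks above), roughly:
#   num_microbatches = tf.shape(vector_loss)[0]          # the minibatch size
#   dp_average_query.set_denominator(num_microbatches)   # average per example
# so each microbatch contains exactly one example and gradients are clipped
# per example before noise is added.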

dp_optimizer_test.py

@@ -46,14 +46,15 @@ class DPOptimizerTest(tf.test.TestCase, parameterized.TestCase):
       ('DPAdagrad 4', dp_optimizer.DPAdagradOptimizer, 4, [-2.5, -2.5]),
       ('DPAdam 1', dp_optimizer.DPAdamOptimizer, 1, [-2.5, -2.5]),
       ('DPAdam 2', dp_optimizer.DPAdamOptimizer, 2, [-2.5, -2.5]),
-      ('DPAdam 4', dp_optimizer.DPAdamOptimizer, 4, [-2.5, -2.5]))
+      ('DPAdam 4', dp_optimizer.DPAdamOptimizer, 4, [-2.5, -2.5]),
+      ('DPAdam None', dp_optimizer.DPAdamOptimizer, None, [-2.5, -2.5]))
   def testBaseline(self, cls, num_microbatches, expected_answer):
     with self.cached_session() as sess:
       var0 = tf.Variable([1.0, 2.0])
       data0 = tf.Variable([[3.0, 4.0], [5.0, 6.0], [7.0, 8.0], [-1.0, 0.0]])
       ledger = privacy_ledger.PrivacyLedger(
-          1e6, num_microbatches / 1e6, 50, 50)
+          1e6, num_microbatches / 1e6 if num_microbatches else None, 50, 50)
       dp_average_query = gaussian_query.GaussianAverageQuery(
           1.0e9, 0.0, num_microbatches, ledger)
       dp_average_query = privacy_ledger.QueryWithLedger(