tensorflow_privacy/privacy/analysis/privacy_ledger.py

# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PrivacyLedger class for keeping a record of private queries."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections

from distutils.version import LooseVersion
import numpy as np
import tensorflow as tf

from privacy.analysis import tensor_buffer
from privacy.dp_query import dp_query

# TensorFlow releases older than 2.0.0 are pointed at the contrib copy of the
# nest utilities; every other release uses tf.nest.
nest = (
    tf.contrib.framework.nest
    if LooseVersion(tf.__version__) < LooseVersion('2.0.0') else tf.nest)

# One finalized sample: its population size, its selection probability and the
# list of queries recorded against it.
SampleEntry = collections.namedtuple(  # pylint: disable=invalid-name
    'SampleEntry', ('population_size', 'selection_probability', 'queries'))

# Parameters of one recorded Gaussian sum query.
GaussianSumQueryEntry = collections.namedtuple(  # pylint: disable=invalid-name
    'GaussianSumQueryEntry', ('l2_norm_bound', 'noise_stddev'))


def format_ledger(sample_array, query_array):
  """Builds the list-of-SampleEntry view of a ledger from its raw arrays.

  Args:
    sample_array: Rows of (population_size, selection_probability,
      num_queries), one row per sample.
    query_array: Rows of (sample_index, l2_norm_bound, noise_stddev). Rows are
      consumed in order, `num_queries` at a time for each sample, and each
      row's sample_index must equal the position of the sample consuming it.

  Returns:
    A list with one SampleEntry per row of `sample_array`; the `queries` field
    of each entry is a list of GaussianSumQueryEntry.
  """
  ledger = []
  next_query = 0
  for sample_index, row in enumerate(sample_array):
    population_size, selection_probability, num_queries = row
    stop = next_query + int(num_queries)
    entries = []
    while next_query < stop:
      record = query_array[next_query]
      # The first column ties the query row to the sample that issued it.
      assert int(record[0]) == sample_index
      entries.append(GaussianSumQueryEntry(*record[1:]))
      next_query += 1
    ledger.append(SampleEntry(population_size, selection_probability, entries))
  return ledger


class PrivacyLedger(object):
  """Class for keeping a record of private queries.

  The PrivacyLedger keeps a record of all queries executed over a given dataset
  for the purpose of computing privacy guarantees.

  Queries are appended to a query buffer as they are recorded. Finalizing a
  sample appends one row (population size, selection probability, number of
  queries) to a sample buffer and resets the per-sample query counter.
  """

  def __init__(self,
               population_size,
               selection_probability=None,
               max_samples=None,
               max_queries=None):
    """Initialize the PrivacyLedger.

    Args:
      population_size: An integer (may be variable) specifying the size of the
        population, i.e. size of the training data used in each epoch.
      selection_probability: A float (may be variable) specifying the
        probability each record is included in a sample. If left as None it
        must be provided through `set_sample_size` before a sample is
        finalized.
      max_samples: The maximum number of samples. An exception is thrown if more
        than this many samples are recorded. Defaults to
        1000 * population_size.
      max_queries: The maximum number of queries. An exception is thrown if more
        than this many queries are recorded. Defaults to
        1000 * population_size.
    """
    self._population_size = population_size
    self._selection_probability = selection_probability
    if max_samples is None:
      max_samples = 1000 * population_size
    if max_queries is None:
      max_queries = 1000 * population_size

    # The query buffer stores rows corresponding to GaussianSumQueryEntries.
    self._query_buffer = tensor_buffer.TensorBuffer(max_queries, [3],
                                                    tf.float32, 'query')
    self._sample_var = tf.Variable(
        initial_value=tf.zeros([3]), trainable=False, name='sample')

    # The sample buffer stores rows corresponding to SampleEntries.
    self._sample_buffer = tensor_buffer.TensorBuffer(max_samples, [3],
                                                     tf.float32, 'sample')
    self._sample_count = tf.Variable(
        initial_value=0.0, trainable=False, name='sample_count')
    self._query_count = tf.Variable(
        initial_value=0.0, trainable=False, name='query_count')
    try:
      # Newer versions of TF
      self._cs = tf.CriticalSection()
    except AttributeError:
      # Older versions of TF
      self._cs = tf.contrib.framework.CriticalSection()

  def record_sum_query(self, l2_norm_bound, noise_stddev):
    """Records that a query was issued.

    Args:
      l2_norm_bound: The maximum l2 norm of the tensor group in the query.
      noise_stddev: The standard deviation of the noise applied to the sum.

    Returns:
      An operation recording the sum query to the ledger.
    """

    def _do_record_query():
      # Bump the per-sample query counter before appending the query row,
      # which is tagged with the index of the sample currently being built.
      with tf.control_dependencies(
          [tf.assign(self._query_count, self._query_count + 1)]):
        return self._query_buffer.append(
            [self._sample_count, l2_norm_bound, noise_stddev])

    # The counter update and the append run inside the ledger's
    # CriticalSection.
    return self._cs.execute(_do_record_query)

  def finalize_sample(self):
    """Finalizes sample and records sample ledger entry.

    Returns:
      The result of appending the finished sample row to the sample buffer.

    Raises:
      ValueError: If no selection probability is available, i.e. none was
        passed to the constructor and `set_sample_size` has not been called.
    """
    if self._selection_probability is None:
      # Without this check the failure surfaces during tensor conversion with
      # a message that does not mention the ledger at all.
      raise ValueError(
          'selection_probability is not set: pass it to the constructor or '
          'call set_sample_size() before finalizing a sample.')

    # Step 1: snapshot the row for this sample. The population size is cast
    # explicitly so that integer tensors/variables can be packed alongside the
    # float32 entries.
    with tf.control_dependencies([
        tf.assign(self._sample_var, [
            tf.cast(self._population_size, tf.float32),
            self._selection_probability,
            self._query_count
        ])
    ]):
      # Step 2: only after the snapshot, advance to the next sample and reset
      # the per-sample query counter.
      with tf.control_dependencies([
          tf.assign(self._sample_count, self._sample_count + 1),
          tf.assign(self._query_count, 0)
      ]):
        # Step 3: append the snapshotted row to the sample buffer.
        return self._sample_buffer.append(self._sample_var)

  def get_unformatted_ledger(self):
    """Returns the `values` of the sample buffer and of the query buffer."""
    return self._sample_buffer.values, self._query_buffer.values

  def get_formatted_ledger(self, sess):
    """Gets the formatted query ledger.

    Args:
      sess: The tensorflow session in which the ledger was created.

    Returns:
      The query ledger as a list of SampleEntries.
    """
    # Fetch both buffers in a single run call so that the two arrays come from
    # the same step and only one round trip to the session is needed.
    sample_array, query_array = sess.run(
        [self._sample_buffer.values, self._query_buffer.values])

    return format_ledger(sample_array, query_array)

  def get_formatted_ledger_eager(self):
    """Gets the formatted query ledger.

    Returns:
      The query ledger as a list of SampleEntries.
    """
    sample_array = self._sample_buffer.values.numpy()
    query_array = self._query_buffer.values.numpy()

    return format_ledger(sample_array, query_array)

  def set_sample_size(self, batch_size):
    """Sets the selection probability to batch_size / population_size.

    Args:
      batch_size: The number of records in each sample.
    """
    # Cast both operands: the population size may be an integer tensor or
    # variable, which cannot be divided into a float32 tensor directly.
    self._selection_probability = (
        tf.cast(batch_size, tf.float32) /
        tf.cast(self._population_size, tf.float32))


class DummyLedger(object):
  """A ledger that records nothing.

  This ledger may be passed in place of a normal PrivacyLedger in case privacy
  accounting is to be handled externally. It offers the same methods as
  PrivacyLedger; each one ignores its arguments.
  """

  def record_sum_query(self, l2_norm_bound, noise_stddev):
    """Ignores the query parameters and returns a no-op."""
    del l2_norm_bound
    del noise_stddev
    return tf.no_op()

  def finalize_sample(self):
    """Returns a no-op; there is no sample to record."""
    return tf.no_op()

  def get_unformatted_ledger(self):
    """Returns a pair of empty tensors of shape [0, 3]."""
    empty_array = tf.zeros(shape=[0, 3])
    return empty_array, empty_array

  def get_formatted_ledger(self, sess):
    """Returns a pair of empty numpy arrays of shape [0, 3].

    NOTE: unlike PrivacyLedger.get_formatted_ledger this is not a list of
    SampleEntries; the return value is kept as-is for existing callers.

    Args:
      sess: Unused.
    """
    del sess
    empty_array = np.zeros(shape=[0, 3])
    return empty_array, empty_array

  def get_formatted_ledger_eager(self):
    """Returns a pair of empty numpy arrays of shape [0, 3]."""
    empty_array = np.zeros(shape=[0, 3])
    return empty_array, empty_array

  def set_sample_size(self, batch_size):
    """Accepts and ignores the sample size.

    Present so that code which calls `set_sample_size` on its ledger (as
    QueryWithLedger.set_denominator does) also works with a DummyLedger.

    Args:
      batch_size: Unused.
    """
    del batch_size


class QueryWithLedger(dp_query.DPQuery):
  """A DP query wrapper that records events to a PrivacyLedger.

  QueryWithLedger is meant to sit at the top of a structure of queries (sum
  queries, nested queries, etc.). It wraps a single query and holds a reference
  to the ledger. Every contained query, including those at the leaves of a
  nested query, should hold a reference to that same ledger object.

  All methods except `get_noised_result` and `set_denominator` forward
  directly to the wrapped query.

  For example usage, see privacy_ledger_test.py.
  """

  def __init__(self, query, ledger):
    """Initializes the QueryWithLedger.

    Args:
      query: The query whose events should be recorded to the ledger. Any
        subqueries (including those in the leaves of a nested query) should also
        contain a reference to the same ledger given here.
      ledger: A PrivacyLedger to which privacy events should be recorded.
    """
    self._ledger = ledger
    self._query = query

  def initial_global_state(self):
    """See base class."""
    wrapped = self._query
    return wrapped.initial_global_state()

  def derive_sample_params(self, global_state):
    """See base class."""
    wrapped = self._query
    return wrapped.derive_sample_params(global_state)

  def initial_sample_state(self, global_state, template):
    """See base class."""
    wrapped = self._query
    return wrapped.initial_sample_state(global_state, template)

  def preprocess_record(self, params, record):
    """See base class."""
    wrapped = self._query
    return wrapped.preprocess_record(params, record)

  def accumulate_preprocessed_record(self, sample_state, preprocessed_record):
    """See base class."""
    wrapped = self._query
    return wrapped.accumulate_preprocessed_record(sample_state,
                                                  preprocessed_record)

  def merge_sample_states(self, sample_state_1, sample_state_2):
    """See base class."""
    wrapped = self._query
    return wrapped.merge_sample_states(sample_state_1, sample_state_2)

  def get_noised_result(self, sample_state, global_state):
    """Finalizes the ledger sample, then returns the wrapped noised result.

    Args:
      sample_state: The sample state passed on to the wrapped query.
      global_state: The global state passed on to the wrapped query.

    Returns:
      Whatever the wrapped query's `get_noised_result` returns.
    """
    state_tensors = nest.flatten(sample_state)
    # The ledger entry is finalized only after the sample state is available,
    # and the wrapped query's result is built only after finalization.
    with tf.control_dependencies(state_tensors):
      finalize_op = self._ledger.finalize_sample()
      with tf.control_dependencies([finalize_op]):
        return self._query.get_noised_result(sample_state, global_state)

  def set_denominator(self, num_microbatches, microbatch_size=1):
    """Sets the wrapped query's denominator and the ledger's sample size.

    Args:
      num_microbatches: Forwarded to the wrapped query's `set_denominator`.
      microbatch_size: Multiplied by `num_microbatches` to obtain the sample
        size given to the ledger; not forwarded to the wrapped query.
    """
    self._query.set_denominator(num_microbatches)
    sample_size = num_microbatches * microbatch_size
    self._ledger.set_sample_size(sample_size)