Sparsity Preserving DP-SGD in TF Privacy [2 of 5]
Adds sparse noise utilities to privately select sparse indices from contribution counts. See https://research.google/blog/sparsity-preserving-differentially-private-training/ for more details on the algorithm.

PiperOrigin-RevId: 654782588
parent 348895a7a3
commit 8747858b5b
3 changed files with 471 additions and 0 deletions
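
A minimal sketch of how these utilities are meant to fit together (the glue below is illustrative only, not part of this commit; the false-positive probability is a placeholder that the full algorithm calibrates to the selection noise and threshold):

import tensorflow as tf
from tensorflow_privacy.privacy.sparsity_preserving_noise import sparse_noise_utils

# One contribution-count SparseTensor per sparse (e.g. embedding) variable.
contribution_counts = tf.SparseTensor(
    indices=[[0], [3], [7]], values=[30.0, 1.0, 20.0], dense_shape=[8]
)

# Split the DP-SGD noise multiplier once, up front, between partition
# selection and gradient noise.
nm_selection, nm_gradient = sparse_noise_utils.split_noise_multiplier(
    noise_multiplier=1.0,
    sparse_selection_ratio=0.5,
    sparse_selection_contribution_counts=[contribution_counts],
)

# Per step: keep indices whose noised contribution count clears a threshold...
true_pos = sparse_noise_utils.sample_true_positive_indices(
    contribution_counts, nm_selection, threshold=10
)
# ...and add indices admitted purely at random, so that the selected support
# itself is differentially private.
false_pos = sparse_noise_utils.sample_false_positive_indices(
    max_index=7, probability=0.01  # placeholder probability
)
selected = tf.unique(tf.concat([true_pos, false_pos], axis=0)).y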

tensorflow_privacy/privacy/sparsity_preserving_noise/BUILD
@@ -2,6 +2,17 @@ package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])
 
+py_library(
+    name = "sparse_noise_utils",
+    srcs = ["sparse_noise_utils.py"],
+)
+
+py_test(
+    name = "sparse_noise_utils_test",
+    srcs = ["sparse_noise_utils_test.py"],
+    deps = [":sparse_noise_utils"],
+)
+
 py_library(
     name = "type_aliases",
     srcs = ["type_aliases.py"],

tensorflow_privacy/privacy/sparsity_preserving_noise/sparse_noise_utils.py (new file)
@@ -0,0 +1,171 @@
# Copyright 2024, The TensorFlow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utils for adding sparse noise to gradients.

For more details on the algorithm, refer to https://arxiv.org/abs/2311.08357.
"""

from typing import Optional, Sequence

import tensorflow as tf
import tensorflow_probability as tfp

def split_noise_multiplier(
    noise_multiplier: float,
    sparse_selection_ratio: float,
    sparse_selection_contribution_counts: Sequence[Optional[tf.SparseTensor]],
) -> tuple[float, float]:
  """Splits noise multiplier between partition selection and gradient noise.

  Returns one noise multiplier for gradient noise and one noise multiplier
  for each sparse partition selection layer such that composing all Gaussian
  mechanisms with these noise multipliers is equivalent to applying a single
  Gaussian mechanism with the original noise multiplier.

  Args:
    noise_multiplier: The original noise multiplier.
    sparse_selection_ratio: The ratio of partition selection noise to gradient
      noise.
    sparse_selection_contribution_counts: The contribution counts for each
      sparse selection variable. If a sparse selection count is None, it will
      be ignored.

  Returns:
    A tuple of noise multipliers for sparse selection and gradient noise.

  Raises:
    ValueError: If the sparse selection ratio is not strictly between 0 and 1,
      or if there are no non-None sparse selection contribution counts.
  """
  if sparse_selection_ratio <= 0.0 or sparse_selection_ratio >= 1.0:
    raise ValueError('Sparse selection ratio must be between 0 and 1.')
  num_sparse_selections = sum(
      1 for c in sparse_selection_contribution_counts if c is not None
  )
  if num_sparse_selections == 0:
    raise ValueError('No sparse selections contribution counts found.')

  ratio = (1.0 + sparse_selection_ratio**2.0) ** 0.5
  total_noise_multiplier_sparse = noise_multiplier * ratio
  noise_multiplier_partition_selection = (
      num_sparse_selections**0.5 * total_noise_multiplier_sparse
  )
  noise_multiplier_gradient_noise = (
      noise_multiplier * ratio / sparse_selection_ratio
  )

  return noise_multiplier_partition_selection, noise_multiplier_gradient_noise
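
# Why this split composes back to the original noise multiplier (this is the
# identity that test_split_noise_multiplier verifies). Write
# r = sparse_selection_ratio, s = noise_multiplier, n = num_sparse_selections.
# The code above sets
#
#   sigma_selection = sqrt(n) * s * sqrt(1 + r^2)   (shared by the n layers)
#   sigma_gradient  = s * sqrt(1 + r^2) / r
#
# Composing the n per-layer selection mechanisms is equivalent to a single
# Gaussian mechanism with sigma_s = sigma_selection / sqrt(n)
# = s * sqrt(1 + r^2) = r * sigma_gradient, i.e. the requested ratio holds.
# Composing that with the gradient mechanism gives
#
#   1 / sqrt(1 / sigma_s^2 + 1 / sigma_gradient^2)
#     = 1 / sqrt((1 + r^2) / (s^2 * (1 + r^2))) = s,
#
# recovering the original multiplier.
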
def _sample_sparse_indices_batch_size_heuristic(
    max_index: tf.Tensor,
    probability: float,
) -> tf.Tensor:
  """Returns a batch size for sampling, chosen by a rough heuristic.

  The heuristic is chosen so that a single batch suffices to sample all
  indices >95% of the time.

  Args:
    max_index: The maximum index to sample.
    probability: The probability of sampling each index.

  Returns:
    The batch size to use for sampling.
  """
  max_num_samples = tf.cast(max_index + 1, tf.float32)
  expected_num_samples = max_num_samples * probability
  # For expected samples > 50, choosing a batch size of 1.2 * expected samples
  # will allow for sampling only once to get all indices >95% of the time.
  min_batch_size = 50.0
  return tf.cast(
      tf.maximum(min_batch_size, 1.2 * expected_num_samples), tf.int32
  )
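
# A quick check of the >95% claim above (illustrative numbers, not part of the
# commit): the sampler below covers every index in a single batch exactly when
# batch_size geometric gaps sum past max_index, a negative-binomial tail
# event. For max_index = 999 and probability = 0.1 the heuristic picks
# batch_size = int(1.2 * 100) = 120, and
#
#   P(one batch suffices) = 1 - scipy.stats.nbinom.cdf(999 - 120, 120, 0.1)
#                         ~= 0.97,
#
# comfortably above 95%.
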
@tf.function
def sample_false_positive_indices(
    max_index: tf.Tensor, probability: float, batch_size: Optional[int] = None
) -> tf.Tensor:
  """Samples each index independently with probability `probability`.

  This function generates a list of indices in the range [0, max_index] where
  each index is sampled with probability `probability` independently. To
  achieve this efficiently, it uses the geometric distribution to sample a
  batch of inter-index gaps at a time, repeating until all indices have been
  covered.

  Args:
    max_index: The maximum index to sample.
    probability: The probability of sampling each index.
    batch_size: The batch size to use for sampling. If None, a heuristic will
      be used to determine the batch size.

  Returns:
    A tensor of sampled indices.
  """
  if probability <= 0.0:
    return tf.constant([], dtype=tf.int64)

  sampled_indices = tf.TensorArray(tf.int32, size=0, dynamic_size=True)

  batch_size = batch_size or _sample_sparse_indices_batch_size_heuristic(
      max_index, probability
  )

  geom = tfp.distributions.geometric.Geometric(probs=probability)

  i, current_max = tf.constant(0), tf.constant(-1)
  while current_max < max_index:
    # Geometric(probability) counts failures before the first success, so
    # adding 1 turns each draw into the gap to the next sampled index.
    sample = tf.cast(geom.sample(batch_size) + 1, tf.int32)
    indices = current_max + tf.cumsum(sample)
    current_max = indices[-1]
    sampled_indices = sampled_indices.write(i, indices)
    i += 1

  indices = tf.cast(sampled_indices.concat(), tf.int32)
  # The last batch may overshoot; drop indices past max_index.
  indices = indices[indices <= max_index]
  return tf.cast(indices, tf.int64)
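
# Note on the gap construction above: if each index is hit independently with
# probability p, the gap between successive hits is Geometric(p) on
# {1, 2, ...}, so cumulative sums of such gaps reproduce the iid Bernoulli
# process without materializing max_index + 1 coin flips. An equivalent NumPy
# sketch (illustrative only; NumPy's geometric already starts at 1, so no
# "+ 1" is needed there):
#
#   gaps = rng.geometric(p, size=k)
#   idx = np.cumsum(gaps) - 1          # shift so the first hit can be index 0
#   idx = idx[idx <= max_index]
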
def sample_true_positive_indices(
    contribution_counts: tf.SparseTensor,
    noise_multiplier: float,
    threshold: int,
) -> tf.Tensor:
  """Samples indices where the count + Gaussian noise is above a threshold.

  Args:
    contribution_counts: The contribution counts for each index.
    noise_multiplier: The noise multiplier to use for the Gaussian noise.
    threshold: The threshold to use for the selection.

  Returns:
    A tensor of sampled indices.
  """
  contribution_count_values = tf.reshape(contribution_counts.values, (-1,))
  noised_contribution_count_values = (
      contribution_count_values
      + tf.random.normal(
          tf.shape(contribution_count_values),
          mean=0.0,
          stddev=noise_multiplier,
          dtype=tf.float32,
      )
  )
  # Keep the (first-dimension) indices whose noised count clears the
  # threshold.
  noised_contribution_counts_indices = contribution_counts.indices[
      noised_contribution_count_values >= threshold
  ][:, 0]
  return tf.reshape(noised_contribution_counts_indices, (-1,))
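
For intuition, a tiny eager-mode demo of the true-positive selection (the values mirror test_sample_true_positive_indices_without_noise below; with zero noise the selection is deterministic):

import tensorflow as tf
from tensorflow_privacy.privacy.sparsity_preserving_noise import sparse_noise_utils

counts = tf.SparseTensor(
    indices=[[0], [3], [5], [7]],
    values=[3.0, 1.0, 1.0, 2.0],
    dense_shape=[8],
)
# With noise_multiplier=0.0, exactly the indices with count >= threshold
# survive.
print(sparse_noise_utils.sample_true_positive_indices(
    counts, noise_multiplier=0.0, threshold=2
))
# tf.Tensor([0 7], shape=(2,), dtype=int64)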

tensorflow_privacy/privacy/sparsity_preserving_noise/sparse_noise_utils_test.py (new file)
@@ -0,0 +1,289 @@
# Copyright 2024, The TensorFlow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for sparse_noise_utils."""

from absl.testing import parameterized
import numpy as np
from scipy import stats
import tensorflow as tf
from tensorflow_privacy.privacy.sparsity_preserving_noise import sparse_noise_utils


class SparseNoiseUtilsTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.named_parameters(
      dict(
          testcase_name='one_sparse_layer',
          noise_multiplier=1.0,
          sparse_selection_ratio=0.8,
          sparse_selection_contribution_counts=[
              tf.SparseTensor(
                  indices=[[0]],
                  values=[1],
                  dense_shape=[3],
              )
          ],
      ),
      dict(
          testcase_name='multiple_sparse_layer',
          noise_multiplier=1.0,
          sparse_selection_ratio=0.1,
          sparse_selection_contribution_counts=[
              tf.SparseTensor(
                  indices=[[0]],
                  values=[1],
                  dense_shape=[3],
              ),
              tf.SparseTensor(
                  indices=[[0]],
                  values=[1],
                  dense_shape=[3],
              ),
              tf.SparseTensor(
                  indices=[[0]],
                  values=[1],
                  dense_shape=[3],
              ),
          ],
      ),
  )
  def test_split_noise_multiplier(
      self,
      noise_multiplier,
      sparse_selection_ratio,
      sparse_selection_contribution_counts,
  ):
    noise_multiplier_sparse, noise_multiplier_dense = (
        sparse_noise_utils.split_noise_multiplier(
            noise_multiplier,
            sparse_selection_ratio,
            sparse_selection_contribution_counts,
        )
    )
    num_sparse_layers = len(sparse_selection_contribution_counts)

    # Undo the sqrt(num_layers) scaling to get the effective selection noise.
    total_noise_multiplier_sparse = (
        noise_multiplier_sparse / num_sparse_layers**0.5
    )
    self.assertAlmostEqual(
        total_noise_multiplier_sparse,
        sparse_selection_ratio * noise_multiplier_dense,
    )
    # Composing the two Gaussian mechanisms must recover the original
    # noise multiplier.
    total_noise_multiplier = (
        1.0
        / (
            1.0 / total_noise_multiplier_sparse**2
            + 1.0 / noise_multiplier_dense**2
        )
        ** 0.5
    )
    self.assertAlmostEqual(total_noise_multiplier, noise_multiplier)

  @parameterized.named_parameters(
      dict(
          testcase_name='no_sparse_layers',
          noise_multiplier=1.0,
          sparse_selection_ratio=0.5,
          sparse_selection_contribution_counts=[],
          error_message='No sparse selections contribution counts found.',
      ),
      dict(
          testcase_name='sparse_layers_none',
          noise_multiplier=1.0,
          sparse_selection_ratio=0.5,
          sparse_selection_contribution_counts=[None],
          error_message='No sparse selections contribution counts found.',
      ),
      dict(
          testcase_name='zero_ratio',
          noise_multiplier=1.0,
          sparse_selection_ratio=0.0,
          sparse_selection_contribution_counts=[
              tf.SparseTensor(
                  indices=[[0]],
                  values=[1],
                  dense_shape=[3],
              )
          ],
          error_message='Sparse selection ratio must be between 0 and 1.',
      ),
      dict(
          testcase_name='one_ratio',
          noise_multiplier=1.0,
          sparse_selection_ratio=1.0,
          sparse_selection_contribution_counts=[
              tf.SparseTensor(
                  indices=[[0]],
                  values=[1],
                  dense_shape=[3],
              )
          ],
          error_message='Sparse selection ratio must be between 0 and 1.',
      ),
  )
  def test_split_noise_multiplier_errors(
      self,
      noise_multiplier,
      sparse_selection_ratio,
      sparse_selection_contribution_counts,
      error_message,
  ):
    with self.assertRaisesRegex(ValueError, error_message):
      sparse_noise_utils.split_noise_multiplier(
          noise_multiplier,
          sparse_selection_ratio,
          sparse_selection_contribution_counts,
      )

  @parameterized.named_parameters(
      dict(
          testcase_name='max_index_0',
          max_index=0,
      ),
      dict(
          testcase_name='max_index_10',
          max_index=10,
      ),
  )
  def test_sample_false_positive_indices_one_prob(self, max_index):
    sampled_indices = (
        sparse_noise_utils.sample_false_positive_indices(max_index, 1.0)
        .numpy()
        .tolist()
    )
    expected_indices = list(range(max_index + 1))
    self.assertEqual(sampled_indices, expected_indices)

  @parameterized.named_parameters(
      dict(
          testcase_name='max_index_0',
          max_index=0,
      ),
      dict(
          testcase_name='max_index_10',
          max_index=10,
      ),
  )
  def test_sample_false_positive_indices_zero_prob(self, max_index):
    sampled_indices = (
        sparse_noise_utils.sample_false_positive_indices(max_index, 0.0)
        .numpy()
        .tolist()
    )
    self.assertEmpty(sampled_indices)

  @parameterized.named_parameters(
      dict(
          testcase_name='max_index_10_prob_50',
          prob=0.5,
          max_index=10,
      ),
      dict(
          testcase_name='max_index_20_prob_25',
          prob=0.25,
          max_index=20,
      ),
      dict(
          testcase_name='max_index_20_prob_75',
          prob=0.75,
          max_index=20,
      ),
  )
  def test_sample_false_positive_indices_random(self, max_index, prob):
    sampled_indices = sparse_noise_utils.sample_false_positive_indices(
        max_index, prob
    )
    sampled_indices = sampled_indices.numpy()

    self.assertLessEqual(np.max(sampled_indices), max_index)
    self.assertGreaterEqual(np.min(sampled_indices), 0)

    # The number of sampled indices is Binomial(max_index + 1, prob); reject
    # only wildly implausible draws to keep the test stable. Note the number
    # of trials is max_index + 1 (indices 0..max_index), so all indices being
    # sampled stays within the valid range of binomtest.
    self.assertGreater(
        stats.binomtest(
            k=len(sampled_indices), n=max_index + 1, p=prob
        ).pvalue,
        1e-10,
    )

    bins = np.arange(max_index + 1) + 1
    histogram, _ = np.histogram(sampled_indices, bins=bins)

    num_trials = 10000
    for _ in range(num_trials):
      sampled_indices = sparse_noise_utils.sample_false_positive_indices(
          max_index, prob
      ).numpy()
      histogram += np.histogram(sampled_indices, bins=bins)[0]

    # num_trials + 1 draws in total: the one above plus num_trials in the loop.
    min_pvalue = min(
        stats.binomtest(k=h.item(), n=num_trials + 1, p=prob).pvalue
        for h in histogram
    )
    self.assertGreater(min_pvalue, 1e-10)

  def test_sample_true_positive_indices_empty(self):
    contribution_counts = tf.SparseTensor(
        indices=np.zeros((0, 1), dtype=np.int64),
        values=[],
        dense_shape=[8],
    )
    noise_multiplier = 10.0
    threshold = 2
    sampled_indices = sparse_noise_utils.sample_true_positive_indices(
        contribution_counts, noise_multiplier, threshold
    )
    sampled_indices = list(sampled_indices.numpy())
    expected_indices = []
    self.assertEqual(sampled_indices, expected_indices)

  def test_sample_true_positive_indices_without_noise(self):
    contribution_counts = tf.SparseTensor(
        indices=[[0], [3], [5], [7]],
        values=[3.0, 1.0, 1.0, 2.0],
        dense_shape=[8],
    )
    noise_multiplier = 0.0
    threshold = 2
    sampled_indices = sparse_noise_utils.sample_true_positive_indices(
        contribution_counts, noise_multiplier, threshold
    )
    sampled_indices = list(sampled_indices.numpy())
    expected_indices = [0, 7]
    self.assertEqual(sampled_indices, expected_indices)

  def test_sample_true_positive_indices_with_noise(self):
    contribution_counts = tf.SparseTensor(
        indices=[[0], [3], [5], [7]],
        values=[30.0, 1.0, 1.0, 20.0],
        dense_shape=[8],
    )
    noise_multiplier = 1.0
    threshold = 10
    sampled_indices = sparse_noise_utils.sample_true_positive_indices(
        contribution_counts, noise_multiplier, threshold
    )
    sampled_indices = list(sampled_indices.numpy())
    expected_indices = [0, 7]
    self.assertEqual(sampled_indices, expected_indices)

  def test_batch_size_heuristic(self):
    max_index = 100
    prob = 0.5
    batch_size = sparse_noise_utils._sample_sparse_indices_batch_size_heuristic(
        max_index, prob
    )
    self.assertGreater(batch_size, 0)
    self.assertLess(batch_size, max_index + 1)


if __name__ == '__main__':
  tf.test.main()