diff --git a/tensorflow_privacy/privacy/sparsity_preserving_noise/BUILD b/tensorflow_privacy/privacy/sparsity_preserving_noise/BUILD
index e28361d..912b439 100644
--- a/tensorflow_privacy/privacy/sparsity_preserving_noise/BUILD
+++ b/tensorflow_privacy/privacy/sparsity_preserving_noise/BUILD
@@ -2,6 +2,17 @@ package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])
 
+py_library(
+    name = "sparse_noise_utils",
+    srcs = ["sparse_noise_utils.py"],
+)
+
+py_test(
+    name = "sparse_noise_utils_test",
+    srcs = ["sparse_noise_utils_test.py"],
+    deps = [":sparse_noise_utils"],
+)
+
 py_library(
     name = "type_aliases",
     srcs = ["type_aliases.py"],
diff --git a/tensorflow_privacy/privacy/sparsity_preserving_noise/sparse_noise_utils.py b/tensorflow_privacy/privacy/sparsity_preserving_noise/sparse_noise_utils.py
new file mode 100644
index 0000000..dd98a45
--- /dev/null
+++ b/tensorflow_privacy/privacy/sparsity_preserving_noise/sparse_noise_utils.py
@@ -0,0 +1,184 @@
+# Copyright 2024, The TensorFlow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utils for adding sparse noise to gradients.
+
+For more details on the algorithm, refer to https://arxiv.org/abs/2311.08357.
+"""
+
+from typing import Optional, Sequence
+
+import tensorflow as tf
+import tensorflow_probability as tfp
+
+
+def split_noise_multiplier(
+    noise_multiplier: float,
+    sparse_selection_ratio: float,
+    sparse_selection_contribution_counts: Sequence[Optional[tf.SparseTensor]],
+) -> tuple[float, float]:
+  """Splits noise multiplier between partition selection and gradient noise.
+
+  Returns one noise multiplier for gradient noise and one noise multiplier
+  for each sparse partition selection layer such that composing all gaussian
+  mechanisms with these noise multipliers is equivalent to applying a single
+  gaussian mechanism with the original noise multiplier.
+
+  Args:
+    noise_multiplier: The original noise multiplier.
+    sparse_selection_ratio: The ratio of partition selection noise and gradient
+      noise.
+    sparse_selection_contribution_counts: The contribution counts for each
+      sparse selection variable. If a sparse selection count is None, it will be
+      ignored.
+
+  Returns:
+    A tuple of noise multipliers for sparse selection and gradient noise.
+
+  Raises:
+    ValueError: If the sparse selection ratio is not strictly between 0 and 1,
+      or if there are no non-None sparse selection contribution counts (the
+      sequence is empty or every entry is None).
+  """
+  if sparse_selection_ratio <= 0.0 or sparse_selection_ratio >= 1.0:
+    raise ValueError('Sparse selection ratio must be between 0 and 1.')
+  num_sparse_selections = sum(
+      1 for c in sparse_selection_contribution_counts if c is not None
+  )
+  if num_sparse_selections == 0:
+    raise ValueError('No sparse selections contribution counts found.')
+
+  # With sigma_sparse = r * sigma_grad, the composition rule
+  # 1 / sigma^2 = 1 / sigma_sparse^2 + 1 / sigma_grad^2 yields
+  # sigma_sparse = sigma * sqrt(1 + r^2) and sigma_grad = sigma_sparse / r.
+  ratio = (1.0 + sparse_selection_ratio**2.0) ** 0.5
+  total_noise_multiplier_sparse = noise_multiplier * ratio
+  # The sparse budget is shared by every selection layer, so the per-layer
+  # multiplier grows by sqrt(number of layers).
+  noise_multiplier_partition_selection = (
+      num_sparse_selections**0.5 * total_noise_multiplier_sparse
+  )
+  noise_multiplier_gradient_noise = (
+      noise_multiplier * ratio / sparse_selection_ratio
+  )
+
+  return noise_multiplier_partition_selection, noise_multiplier_gradient_noise
+
+
+def _sample_sparse_indices_batch_size_heuristic(
+    max_index: tf.Tensor,
+    probability: float,
+) -> tf.Tensor:
+  """Returns a batch size using a rough heuristic to use for sampling.
+
+  This heuristic should roughly allow for the sampling to only use a single
+  batch to sample all indices >95% of the time.
+
+  Args:
+    max_index: The maximum index to sample.
+    probability: The probability of sampling each index.
+
+  Returns:
+    The batch size to use for sampling.
+  """
+  max_num_samples = tf.cast(max_index + 1, tf.float32)
+  expected_num_samples = max_num_samples * probability
+  # For expected samples > 50, choosing a batch size of 1.2 * expected samples
+  # will allow for sampling only once to get all indices >95% of the time.
+  min_batch_size = 50.0
+  return tf.cast(
+      tf.maximum(min_batch_size, 1.2 * expected_num_samples), tf.int32
+  )
+
+
+@tf.function
+def sample_false_positive_indices(
+    max_index: tf.Tensor, probability: float, batch_size: Optional[int] = None
+) -> tf.Tensor:
+  """Samples indices with probability `probability` iid sparsely.
+
+  This function generates a list of indices in the range of [0, max_index]
+  where each index is sampled with probability `probability` independently. To
+  achieve this efficiently, we use the geometric distribution to sample a batch
+  of indices at a time and repeat this process until all indices are sampled.
+
+  Args:
+    max_index: The maximum index to sample.
+    probability: The probability of sampling each index.
+    batch_size: The batch size to use for sampling. If None, a heuristic will be
+      used to determine the batch size.
+
+  Returns:
+    A tensor of sampled indices.
+  """
+  if probability <= 0.0:
+    return tf.constant([], dtype=tf.int64)
+
+  # NOTE(review): gaps and running indices are accumulated in int32; confirm
+  # `max_index` and the gaps implied by `probability` stay within that range.
+  sampled_indices = tf.TensorArray(tf.int32, size=0, dynamic_size=True)
+
+  batch_size = batch_size or _sample_sparse_indices_batch_size_heuristic(
+      max_index, probability
+  )
+
+  geom = tfp.distributions.geometric.Geometric(probs=probability)
+
+  i, current_max = tf.constant(0), tf.constant(-1)
+  while current_max < max_index:
+    # `sample` holds the gaps between consecutive selected indices; the running
+    # sum continues from the last index produced by the previous batch.
+    sample = tf.cast(geom.sample(batch_size) + 1, tf.int32)
+    indices = current_max + tf.cumsum(sample)
+    current_max = indices[-1]
+    sampled_indices = sampled_indices.write(i, indices)
+    i += 1
+
+  indices = tf.cast(sampled_indices.concat(), tf.int32)
+  indices = indices[indices <= max_index]
+  return tf.cast(indices, tf.int64)
+
+
+def sample_true_positive_indices(
+    contribution_counts: tf.SparseTensor,
+    noise_multiplier: float,
+    threshold: int,
+) -> tf.Tensor:
+  """Samples indices whose count + Gaussian noise is at least the threshold.
+
+  Args:
+    contribution_counts: The contribution counts for each index.
+    noise_multiplier: The noise multiplier to use for the gaussian noise.
+    threshold: The threshold to use for the selection.
+
+  Returns:
+    A tensor of sampled indices.
+  """
+  # Counts may be stored as integers; cast so float32 noise can be added.
+  contribution_count_values = tf.cast(
+      tf.reshape(contribution_counts.values, (-1,)), tf.float32
+  )
+  noised_contribution_count_values = (
+      contribution_count_values
+      + tf.random.normal(
+          tf.shape(contribution_count_values),
+          mean=0.0,
+          stddev=noise_multiplier,
+          dtype=tf.float32,
+      )
+  )
+  # Keep the first index coordinate of every entry that meets the threshold.
+  noised_contribution_counts_indices = contribution_counts.indices[
+      noised_contribution_count_values >= threshold
+  ][:, 0]
+  return tf.reshape(noised_contribution_counts_indices, (-1,))
diff --git a/tensorflow_privacy/privacy/sparsity_preserving_noise/sparse_noise_utils_test.py b/tensorflow_privacy/privacy/sparsity_preserving_noise/sparse_noise_utils_test.py
new file mode 100644
index 0000000..5941e74
--- /dev/null
+++ b/tensorflow_privacy/privacy/sparsity_preserving_noise/sparse_noise_utils_test.py
@@ -0,0 +1,289 @@
+# Copyright 2024, The TensorFlow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for sparse_noise_utils."""
+
+from absl.testing import parameterized
+import numpy as np
+from scipy import stats
+import tensorflow as tf
+from tensorflow_privacy.privacy.sparsity_preserving_noise import sparse_noise_utils
+
+
+class SparseNoiseUtilsTest(tf.test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      dict(
+          testcase_name='one_sparse_layer',
+          noise_multiplier=1.0,
+          sparse_selection_ratio=0.8,
+          sparse_selection_contribution_counts=[
+              tf.SparseTensor(
+                  indices=[[0]],
+                  values=[1],
+                  dense_shape=[3],
+              )
+          ],
+      ),
+      dict(
+          testcase_name='multiple_sparse_layer',
+          noise_multiplier=1.0,
+          sparse_selection_ratio=0.1,
+          sparse_selection_contribution_counts=[
+              tf.SparseTensor(
+                  indices=[[0]],
+                  values=[1],
+                  dense_shape=[3],
+              ),
+              tf.SparseTensor(
+                  indices=[[0]],
+                  values=[1],
+                  dense_shape=[3],
+              ),
+              tf.SparseTensor(
+                  indices=[[0]],
+                  values=[1],
+                  dense_shape=[3],
+              ),
+          ],
+      ),
+  )
+  def test_split_noise_multiplier(
+      self,
+      noise_multiplier,
+      sparse_selection_ratio,
+      sparse_selection_contribution_counts,
+  ):
+    noise_multiplier_sparse, noise_multiplier_dense = (
+        sparse_noise_utils.split_noise_multiplier(
+            noise_multiplier,
+            sparse_selection_ratio,
+            sparse_selection_contribution_counts,
+        )
+    )
+    num_sparse_layers = len(sparse_selection_contribution_counts)
+
+    total_noise_multiplier_sparse = (
+        noise_multiplier_sparse / num_sparse_layers**0.5
+    )
+    self.assertAlmostEqual(
+        total_noise_multiplier_sparse,
+        sparse_selection_ratio * noise_multiplier_dense,
+    )
+    total_noise_multiplier = (
+        1.0
+        / (
+            1.0 / total_noise_multiplier_sparse**2
+            + 1.0 / noise_multiplier_dense**2
+        )
+        ** 0.5
+    )
+    self.assertAlmostEqual(total_noise_multiplier, noise_multiplier)
+
+  @parameterized.named_parameters(
+      dict(
+          testcase_name='no_sparse_layers',
+          noise_multiplier=1.0,
+          sparse_selection_ratio=0.5,
+          sparse_selection_contribution_counts=[],
+          error_message='No sparse selections contribution counts found.',
+      ),
+      dict(
+          testcase_name='sparse_layers_none',
+          noise_multiplier=1.0,
+          sparse_selection_ratio=0.5,
+          sparse_selection_contribution_counts=[None],
+          error_message='No sparse selections contribution counts found.',
+      ),
+      dict(
+          testcase_name='zero_ratio',
+          noise_multiplier=1.0,
+          sparse_selection_ratio=0.0,
+          sparse_selection_contribution_counts=[
+              tf.SparseTensor(
+                  indices=[[0]],
+                  values=[1],
+                  dense_shape=[3],
+              )
+          ],
+          error_message='Sparse selection ratio must be between 0 and 1.',
+      ),
+      dict(
+          testcase_name='one_ratio',
+          noise_multiplier=1.0,
+          sparse_selection_ratio=1.0,
+          sparse_selection_contribution_counts=[
+              tf.SparseTensor(
+                  indices=[[0]],
+                  values=[1],
+                  dense_shape=[3],
+              )
+          ],
+          error_message='Sparse selection ratio must be between 0 and 1.',
+      ),
+  )
+  def test_split_noise_multiplier_errors(
+      self,
+      noise_multiplier,
+      sparse_selection_ratio,
+      sparse_selection_contribution_counts,
+      error_message,
+  ):
+    with self.assertRaisesRegex(ValueError, error_message):
+      sparse_noise_utils.split_noise_multiplier(
+          noise_multiplier,
+          sparse_selection_ratio,
+          sparse_selection_contribution_counts,
+      )
+
+  @parameterized.named_parameters(
+      dict(
+          testcase_name='max_index_0',
+          max_index=0,
+      ),
+      dict(
+          testcase_name='max_index_10',
+          max_index=10,
+      ),
+  )
+  def test_sample_false_positive_indices_one_prob(self, max_index):
+    sampled_indices = (
+        sparse_noise_utils.sample_false_positive_indices(max_index, 1.0)
+        .numpy()
+        .tolist()
+    )
+    expected_indices = list(range(max_index + 1))
+    self.assertEqual(sampled_indices, expected_indices)
+
+  @parameterized.named_parameters(
+      dict(
+          testcase_name='max_index_0',
+          max_index=0,
+      ),
+      dict(
+          testcase_name='max_index_10',
+          max_index=10,
+      ),
+  )
+  def test_sample_false_positive_indices_zero_prob(self, max_index):
+    sampled_indices = (
+        sparse_noise_utils.sample_false_positive_indices(max_index, 0.0)
+        .numpy()
+        .tolist()
+    )
+    self.assertEmpty(sampled_indices)
+
+  @parameterized.named_parameters(
+      dict(
+          testcase_name='max_index_10_prob_50',
+          prob=0.5,
+          max_index=10,
+      ),
+      dict(
+          testcase_name='max_index_20_prob_25',
+          prob=0.25,
+          max_index=20,
+      ),
+      dict(
+          testcase_name='max_index_20_prob_75',
+          prob=0.75,
+          max_index=20,
+      ),
+  )
+  def test_sample_false_positive_indices_random(self, max_index, prob):
+    sampled_indices = sparse_noise_utils.sample_false_positive_indices(
+        max_index, prob
+    )
+    sampled_indices = sampled_indices.numpy()
+
+    self.assertLessEqual(np.max(sampled_indices, initial=0), max_index)
+    self.assertGreaterEqual(np.min(sampled_indices, initial=max_index), 0)
+
+    self.assertGreater(
+        stats.binomtest(k=len(sampled_indices), n=max_index + 1, p=prob).pvalue,
+        1e-10,
+    )
+
+    bins = np.arange(max_index + 2)  # Unit-width bin per index 0..max_index.
+    histogram, _ = np.histogram(sampled_indices, bins=bins)
+
+    num_trials = 10000
+    for _ in range(num_trials - 1):  # The first draw is already counted.
+      sampled_indices = sparse_noise_utils.sample_false_positive_indices(
+          max_index, prob
+      ).numpy()
+      histogram += np.histogram(sampled_indices, bins=bins)[0]
+
+    min_pvalue = min(
+        stats.binomtest(k=h.item(), n=num_trials, p=prob).pvalue
+        for h in histogram
+    )
+    self.assertGreater(min_pvalue, 1e-10)
+
+  def test_sample_true_positive_indices_empty(self):
+    contribution_counts = tf.SparseTensor(
+        indices=np.zeros((0, 1), dtype=np.int64),
+        values=[],
+        dense_shape=[8],
+    )
+    noise_multiplier = 10.0
+    threshold = 2
+    sampled_indices = sparse_noise_utils.sample_true_positive_indices(
+        contribution_counts, noise_multiplier, threshold
+    )
+    sampled_indices = list(sampled_indices.numpy())
+    expected_indices = []
+    self.assertEqual(sampled_indices, expected_indices)
+
+  def test_sample_true_positive_indices_without_noise(self):
+    contribution_counts = tf.SparseTensor(
+        indices=[[0], [3], [5], [7]],
+        values=[3.0, 1.0, 1.0, 2.0],
+        dense_shape=[8],
+    )
+    noise_multiplier = 0.0
+    threshold = 2
+    sampled_indices = sparse_noise_utils.sample_true_positive_indices(
+        contribution_counts, noise_multiplier, threshold
+    )
+    sampled_indices = list(sampled_indices.numpy())
+    expected_indices = [0, 7]
+    self.assertEqual(sampled_indices, expected_indices)
+
+  def test_sample_true_positive_indices_with_noise(self):
+    contribution_counts = tf.SparseTensor(
+        indices=[[0], [3], [5], [7]],
+        values=[30.0, 1.0, 1.0, 20.0],
+        dense_shape=[8],
+    )
+    noise_multiplier = 1.0
+    threshold = 10
+    sampled_indices = sparse_noise_utils.sample_true_positive_indices(
+        contribution_counts, noise_multiplier, threshold
+    )
+    sampled_indices = list(sampled_indices.numpy())
+    expected_indices = [0, 7]
+    self.assertEqual(sampled_indices, expected_indices)
+
+  def test_batch_size_heuristic(self):
+    max_index = 100
+    prob = 0.5
+    batch_size = sparse_noise_utils._sample_sparse_indices_batch_size_heuristic(
+        max_index, prob
+    )
+    self.assertGreater(batch_size, 0)
+    self.assertLess(batch_size, max_index + 1)
+
+
+if __name__ == '__main__':
+  tf.test.main()