From 3055f4ad521343a82563dff6fa4f782d4f72da1f Mon Sep 17 00:00:00 2001
From: Shuang Song
Date: Tue, 29 Jun 2021 21:26:05 -0700
Subject: [PATCH] Add header and some minor comments to secret sharer colab.

PiperOrigin-RevId: 382225535
---
 .../secret_sharer/secret_sharer_example.ipynb | 544 ++++++++++++++++++
 1 file changed, 544 insertions(+)
 create mode 100644 tensorflow_privacy/privacy/privacy_tests/secret_sharer/secret_sharer_example.ipynb

diff --git a/tensorflow_privacy/privacy/privacy_tests/secret_sharer/secret_sharer_example.ipynb b/tensorflow_privacy/privacy/privacy_tests/secret_sharer/secret_sharer_example.ipynb
new file mode 100644
index 0000000..aab4085
--- /dev/null
+++ b/tensorflow_privacy/privacy/privacy_tests/secret_sharer/secret_sharer_example.ipynb
@@ -0,0 +1,544 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "RY553UYW3gVB"
+      },
+      "source": [
+        "Copyright 2021 The TensorFlow Authors."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "8rRnn3VN3myz"
+      },
+      "outputs": [],
+      "source": [
+        "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+        "# you may not use this file except in compliance with the License.\n",
+        "# You may obtain a copy of the License at\n",
+        "#\n",
+        "# https://www.apache.org/licenses/LICENSE-2.0\n",
+        "#\n",
+        "# Unless required by applicable law or agreed to in writing, software\n",
+        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+        "# See the License for the specific language governing permissions and\n",
+        "# limitations under the License."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "6JCBWY2_GP_e"
+      },
+      "source": [
+        "In this colab, we demonstrate how [secret sharer](https://arxiv.org/abs/1802.08232) works on a language model. We will train a model with \"secrets\", i.e., random sequences, inserted in the training data, and then evaluate whether the model has \"memorized\" those secrets.\n",
+        "\n",
+        "(The code is largely based on a TensorFlow tutorial for training a character-level LSTM on the Shakespeare dataset.)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "bGskLy4nCRGd"
+      },
+      "source": [
+        "# Setup"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "AdMDFipI5cBw"
+      },
+      "source": [
+        "You may set the runtime to use a GPU by Runtime \u003e Change runtime type \u003e Hardware accelerator.\n",
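+        "\n",
+        "If you want to confirm that TensorFlow can actually see the accelerator, the optional snippet below is a minimal sanity check (it only uses the standard `tf.config.list_physical_devices` API; nothing later in the colab depends on it):\n",
+        "\n",
+        "```python\n",
+        "import tensorflow as tf\n",
+        "\n",
+        "# An empty list means the runtime is CPU-only.\n",
+        "print(tf.config.list_physical_devices('GPU'))\n",
+        "```"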
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "epvtpPkE1Mkx"
+      },
+      "outputs": [],
+      "source": [
+        "# @title Install dependencies\n",
+        "# You may need to restart the runtime to use tensorflow-privacy.\n",
+        "from IPython.display import clear_output\n",
+        "\n",
+        "!pip install tf-models-official\n",
+        "# An editable VCS install needs an #egg fragment naming the package.\n",
+        "!pip install -e git+https://github.com/tensorflow/privacy.git#egg=tensorflow_privacy\n",
+        "clear_output()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "9DnsfO3DBwL0"
+      },
+      "outputs": [],
+      "source": [
+        "import functools\n",
+        "import os\n",
+        "import numpy as np\n",
+        "import tensorflow as tf\n",
+        "from official.utils.misc import keras_utils\n",
+        "\n",
+        "from tensorflow_privacy.privacy.privacy_tests.secret_sharer.generate_secrets import SecretConfig, generate_secrets_and_references, construct_secret\n",
+        "from tensorflow_privacy.privacy.privacy_tests.secret_sharer.exposures import compute_exposure_interpolation, compute_exposure_extrapolation"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "XsntG_DoJCcW"
+      },
+      "source": [
+        "# Define functions"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "ueYURd5kB7yk"
+      },
+      "outputs": [],
+      "source": [
+        "#@title Define some parameters for the model.\n",
+        "EMBEDDING_DIM = 256\n",
+        "RNN_UNITS = 1024\n",
+        "SEQ_LENGTH = 100\n",
+        "\n",
+        "BATCHES_PER_EPOCH = 11043\n",
+        "TRAIN_EPOCHS = 50\n",
+        "BATCH_SIZE = 64\n",
+        "LOG_STEPS = 100\n",
+        "\n",
+        "ENABLE_EAGER = True\n",
+        "\n",
+        "# We will download shakespeare.txt later.\n",
+        "TRAINING_DATA = 'shakespeare.txt'\n",
+        "# We will try multiple sets of secrets, so the model directory name contains\n",
+        "# a placeholder for the index of the secret set.\n",
+        "MODEL_DIR = 'secret{}'"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Sq5WkivECbJy"
+      },
+      "outputs": [],
+      "source": [
+        "#@title Function for getting dataset combined with secrets.\n",
+        "def get_dataset(path_to_file, secrets=None,\n",
+        "                batch_size=None, seq_length=SEQ_LENGTH):\n",
+        "  \"\"\"Creates a dataset from a given text file.\n",
+        "\n",
+        "  Args:\n",
+        "    path_to_file: The path to the training data.\n",
+        "    secrets: Secrets to be inserted.\n",
+        "    batch_size: Batch size to use.\n",
+        "    seq_length: The length of the LSTM sequence.\n",
+        "\n",
+        "  Returns:\n",
+        "    A tuple, consisting of the Dataset, the class-to-character mapping,\n",
+        "    and the character-to-class mapping.\n",
+        "  \"\"\"\n",
+        "  with tf.io.gfile.GFile(path_to_file, 'rb') as train_data:\n",
+        "    text = train_data.read().decode(encoding='utf-8')\n",
+        "\n",
+        "  # Create vocab\n",
+        "  vocab = sorted(set(text))\n",
+        "  char2idx = {u: i for i, u in enumerate(vocab)}\n",
+        "  idx2char = np.array(vocab)\n",
+        "\n",
+        "  # Gather all secrets\n",
+        "  if secrets:\n",
+        "    all_secrets = []\n",
+        "    for secret in secrets:\n",
+        "      secret.datasets = {}\n",
+        "      for r, seqs in secret.secrets.items():\n",
+        "        all_secrets += seqs * r\n",
+        "        indexed_seqs = np.array([[char2idx[c] for c in s] for s in seqs])\n",
+        "        secret.datasets[r] = {\n",
+        "            'data': tf.data.Dataset.from_tensor_slices(indexed_seqs).map(lambda chunk: chunk[:-1]).batch(1),\n",
+        "            'label': list(map(lambda chunk: chunk[1:], indexed_seqs))\n",
+        "        }\n",
+        "      indexed_seqs = np.array([[char2idx[c] for c in s] for s in secret.references])\n",
+        "      secret.datasets[0] = {\n",
+        "          'data': tf.data.Dataset.from_tensor_slices(indexed_seqs).map(lambda chunk: chunk[:-1]).batch(1),\n",
+        "          'label': list(map(lambda chunk: chunk[1:], indexed_seqs))\n",
+        "      }\n",
+        "    # Insert the secrets into the training data.\n",
+        "    idx = np.random.choice(len(text) - 1, len(all_secrets) - 1, replace=False) + 1\n",
+        "    # Split text into len(all_secrets) parts.\n",
+        "    text = np.split(list(text), np.sort(idx))\n",
+        "    text = [''.join(t) for t in text]\n",
+        "    text = ''.join(np.vstack([text, all_secrets]).transpose().reshape(-1))\n",
+        "\n",
+        "  # Split text into chunks of sequence length + 1 to create examples.\n",
+        "  def split_input_target(chunk):\n",
+        "    input_text = chunk[:-1]\n",
+        "    target_text = chunk[1:]\n",
+        "    return input_text, tf.one_hot(target_text, len(vocab))\n",
+        "\n",
+        "  text_as_int = np.array([char2idx[c] for c in text])\n",
+        "  char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)\n",
+        "  sequences = char_dataset.batch(seq_length+1, drop_remainder=True)\n",
+        "\n",
+        "  dataset = sequences.map(split_input_target)\n",
+        "  dataset = dataset.shuffle(10000).repeat()\n",
+        "  dataset = dataset.batch(batch_size, drop_remainder=True)\n",
+        "\n",
+        "  return dataset, idx2char, char2idx"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "id": "fhTg2wl7CdtG"
+      },
+      "outputs": [],
+      "source": [
+        "#@title Functions for building and training the model, and making predictions.\n",
+        "def build_model(vocab_size,\n",
+        "                embedding_dim=EMBEDDING_DIM,\n",
+        "                rnn_units=RNN_UNITS,\n",
+        "                batch_size=None,\n",
+        "                stateful=False,\n",
+        "                use_cudnn=True):\n",
+        "  \"\"\"Builds the Shakespeare model.\n",
+        "\n",
+        "  Args:\n",
+        "    vocab_size: The number of character classes in the input.\n",
+        "    embedding_dim: The dimension of the embedding space for each class.\n",
+        "    rnn_units: The number of RNN units in the layer.\n",
+        "    batch_size: When predicting, the batch size of the predictions.\n",
+        "    stateful: If true, the LSTM is stateful.\n",
+        "    use_cudnn: If false, forces the LSTM to run without the fused CuDNN kernel.\n",
+        "\n",
+        "  Returns:\n",
+        "    A Keras Model.\n",
+        "  \"\"\"\n",
+        "  LSTM = functools.partial(tf.keras.layers.LSTM, implementation=2)\n",
+        "\n",
+        "  # By indirecting the activation through a lambda layer, the logic to dispatch\n",
+        "  # to CuDNN in V2 doesn't trigger and we force the LSTM to run in non-CuDNN\n",
+        "  # mode.\n",
+        "  lstm_activation = ('tanh' if use_cudnn else\n",
+        "                     lambda x: tf.math.tanh(x))\n",
+        "\n",
+        "  batch_shape = [batch_size if stateful else None, None]\n",
+        "  return tf.keras.Sequential([\n",
+        "      tf.keras.layers.Embedding(vocab_size, embedding_dim,\n",
+        "                                batch_input_shape=batch_shape),\n",
+        "      LSTM(rnn_units,\n",
+        "           activation=lstm_activation,\n",
+        "           return_sequences=True,\n",
+        "           stateful=stateful,\n",
+        "           recurrent_initializer='glorot_uniform'),\n",
+        "      tf.keras.layers.Dense(vocab_size),\n",
+        "      tf.keras.layers.Softmax(dtype=tf.float32)])\n",
+        "\n",
+        "\n",
+        "def train_model(dataset, vocab_size, batch_size,\n",
+        "                checkpoint_dir=None,\n",
+        "                use_cudnn=True):\n",
+        "  \"\"\"Trains a Shakespeare model.\n",
+        "\n",
+        "  Args:\n",
+        "    dataset: the training data set.\n",
+        "    vocab_size: the number of unique character classes.\n",
+        "    batch_size: batch size of training.\n",
+        "    checkpoint_dir: if not None, the directory in which to make checkpoints.\n",
+        "    use_cudnn: if false, forces the LSTM to run without the fused CuDNN kernel.\n",
+        "\n",
+        "  Returns:\n",
+        "    The training history and callbacks.\n",
+        "  \"\"\"\n",
+        "  train_steps = BATCHES_PER_EPOCH // batch_size\n",
+        "\n",
+        "  model = build_model(vocab_size=vocab_size, batch_size=batch_size,\n",
+        "                      use_cudnn=use_cudnn)\n",
+        "\n",
+        "  # Model.fit() automatically applies loss scaling, so we don't need to\n",
+        "  # create a LossScaleOptimizer.\n",
+        "  model.compile(\n",
+        "      optimizer=tf.keras.optimizers.Adam(),\n",
+        "      loss=tf.keras.losses.CategoricalCrossentropy(),\n",
+        "      metrics=[tf.keras.metrics.Recall(top_k=1, name='RecallAt1'),\n",
+        "               tf.keras.metrics.Recall(top_k=5, name='RecallAt5')],\n",
+        "      run_eagerly=ENABLE_EAGER)\n",
+        "\n",
+        "  callbacks = []\n",
+        "  # Default to epoch 0 so that training also works without a checkpoint_dir.\n",
+        "  initial_epoch = 0\n",
+        "  if checkpoint_dir:\n",
+        "    checkpoint_filepath = os.path.join(checkpoint_dir, 'ckpt_{epoch}')\n",
+        "    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(\n",
+        "        filepath=checkpoint_filepath,\n",
+        "        save_weights_only=True)\n",
+        "    callbacks.append(checkpoint_callback)\n",
+        "    checkpoint_latest = tf.train.latest_checkpoint(checkpoint_dir)\n",
+        "    if checkpoint_latest:\n",
+        "      model.load_weights(checkpoint_latest)\n",
+        "      initial_epoch = int(os.path.basename(checkpoint_latest).split('.')[0].split('_')[-1])\n",
+        "      print(f'Loaded from {checkpoint_latest}. Will train from epoch {initial_epoch}.')\n",
+        "\n",
+        "  time_callback = keras_utils.TimeHistory(batch_size, LOG_STEPS)\n",
+        "  callbacks.append(time_callback)\n",
+        "  history = model.fit(dataset,\n",
+        "                      epochs=TRAIN_EPOCHS,\n",
+        "                      initial_epoch=initial_epoch,\n",
+        "                      steps_per_epoch=train_steps,\n",
+        "                      callbacks=callbacks,\n",
+        "                      verbose=2)\n",
+        "  return history, callbacks"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "id": "nuEvdB0MCjb6"
+      },
+      "outputs": [],
+      "source": [
+        "#@title Function to compute perplexity.\n",
+        "def compute_perplexity_for_secret(secrets, checkpoint_dir, vocab_size):\n",
+        "  \"\"\"Gets perplexity for given examples.\n",
+        "\n",
+        "  Args:\n",
+        "    secrets: a list, where each element is one set of secrets (as returned by\n",
+        "      generate_secrets_and_references, with the 'datasets' attribute attached\n",
+        "      by get_dataset).\n",
+        "    checkpoint_dir: the directory from which to load checkpoints.\n",
+        "    vocab_size: vocabulary size.\n",
+        "\n",
+        "  Returns:\n",
+        "    Perplexities of the inserted secrets and of the references.\n",
+        "  \"\"\"\n",
+        "  prediction_model = build_model(\n",
+        "      vocab_size=vocab_size, batch_size=1, stateful=True)\n",
+        "  prediction_model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))\n",
+        "  prediction_model.compile(loss=tf.keras.losses.CategoricalCrossentropy())\n",
+        "\n",
+        "  cce = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)\n",
+        "  ppls = [{} for _ in range(len(secrets))]\n",
+        "  ppls_reference = [None] * len(secrets)\n",
+        "  for i, secret in enumerate(secrets):\n",
+        "    for r in secret.datasets.keys():\n",
+        "      d = secret.datasets[r]\n",
+        "      loss = np.mean(cce(d['label'], prediction_model.predict(d['data'])),\n",
+        "                     axis=1)\n",
+        "      loss = list(loss)\n",
+        "      if r == 0:\n",
+        "        ppls_reference[i] = loss\n",
+        "      else:\n",
+        "        ppls[i][r] = loss\n",
+        "  return ppls, ppls_reference"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Tdd1Y_8TlWf2"
+      },
+      "source": [
+        "# Run training and secret sharer on the Shakespeare dataset.\n",
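+        "\n",
+        "The evaluation below reports the *exposure* of each secret. As a rough sketch of the rank-based (interpolation) estimate, assuming `ppl_secret` holds the model's perplexity on one inserted secret and `ppls_reference` holds the perplexities of the reference sequences (the actual results below come from the library's `compute_exposure_interpolation` and `compute_exposure_extrapolation`, imported above):\n",
+        "\n",
+        "```python\n",
+        "import numpy as np\n",
+        "\n",
+        "def exposure_sketch(ppl_secret, ppls_reference):\n",
+        "  # Rank of the secret among the references by perplexity; rank 1 means the\n",
+        "  # model is more confident on the secret than on every reference.\n",
+        "  rank = 1 + np.sum(np.array(ppls_reference) \u003c ppl_secret)\n",
+        "  # Exposure as in the secret sharer paper: log2(#candidates) - log2(rank).\n",
+        "  return np.log2(len(ppls_reference) + 1) - np.log2(rank)\n",
+        "```\n",
+        "\n",
+        "A higher exposure indicates stronger memorization of the secret."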
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "NV1pu5RdKZ-j"
+      },
+      "outputs": [],
+      "source": [
+        "#@title Download the Shakespeare dataset\n",
+        "!wget --no-check-certificate --no-verbose https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt\n",
+        "!head -n 20 shakespeare.txt"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "kqhI0ypXna63"
+      },
+      "outputs": [],
+      "source": [
+        "#@title Check the vocabulary of the dataset\n",
+        "# (to make sure the secrets' vocabulary is a subset)\n",
+        "with open('shakespeare.txt', 'rb') as f:\n",
+        "  text = f.read().decode(encoding='utf-8')\n",
+        "  print(sorted(set(text)))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "SDYu_vNyqCCh"
+      },
+      "outputs": [],
+      "source": [
+        "#@title Generate secrets\n",
+        "# Here we present just one example; you can try other combinations of\n",
+        "# patterns, vocabularies, and numbers of repetitions, e.g. different\n",
+        "# vocabularies for different patterns.\n",
+        "patterns = ['{}' * 5,\n",
+        "            'Info ' + '{}' * 5,\n",
+        "            '{}{}-{}{}-{}',\n",
+        "            '{}' * 10]\n",
+        "vocabs = [list('ABCDEFGHIJ')] * 4\n",
+        "num_repetitions = [1, 10, 100]\n",
+        "num_secrets_for_repetitions = [20] * len(num_repetitions)\n",
+        "num_references = 65536\n",
+        "secret_configs = [SecretConfig(vocab, pattern, num_repetitions,\n",
+        "                               num_secrets_for_repetitions, num_references)\n",
+        "                  for vocab, pattern in zip(vocabs, patterns)]\n",
+        "secrets = generate_secrets_and_references(secret_configs)\n",
+        "\n",
+        "# Let's look at the variable \"secrets\".\n",
+        "print(f'\"secrets\" is a list and its length is {len(secrets)} because we have four sets of secrets.')\n",
+        "print(f'The type of each element in \"secrets\" is {type(secrets[0])}.')\n",
+        "print(f'The field \"config\" contains its configuration: {secrets[0].config}')\n",
+        "print(f'The field \"secrets\" is a dictionary, with key being the number of repetitions and value being a list of secrets.',\n",
+        "      f'For example, the secrets that are going to be repeated 100 times are {secrets[0].secrets[100]}.')\n",
+        "print(f'The field \"references\" is a list of reference sequences that are not going to be injected. The first 10 elements are {secrets[0].references[:10]}.')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "DVjBe-6-3mPj"
+      },
+      "outputs": [],
+      "source": [
+        "#@title Combine secrets with training data and train the models.\n",
+        "# We're going to train four models corresponding to the four sets of secrets,\n",
+        "# to examine the effect of different ways of generating secrets.\n",
+        "for i in range(len(secrets)):\n",
+        "  dataset, idx2char, char2idx = get_dataset(\n",
+        "      TRAINING_DATA,\n",
+        "      [secrets[i]],\n",
+        "      batch_size=BATCH_SIZE)\n",
+        "\n",
+        "  print(f'Train model with secret set #{i}.')\n",
+        "  train_model(dataset, len(idx2char), BATCH_SIZE,\n",
+        "              checkpoint_dir=MODEL_DIR.format(i))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "executionInfo": {
+          "elapsed": 217464,
+          "status": "ok",
+          "timestamp": 1624485918894,
+          "user": {
+            "displayName": "",
+            "photoUrl": "",
+            "userId": ""
+          },
+          "user_tz": 420
+        },
+        "id": "6sf1GeayuELI",
+        "outputId": "198171d1-7d77-4812-9d68-03328e6d71c8"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Secrets #1, vocab=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'], pattern={}{}{}{}{}\n",
+            "  Interpolation: repetition=1, avg_exposure=1.40±1.05; repetition=10, avg_exposure=3.47±1.70; repetition=100, avg_exposure=6.40±2.97\n",
+            "  Extrapolation: repetition=1, avg_exposure=1.41±1.08; repetition=10, avg_exposure=3.59±1.82; repetition=100, avg_exposure=6.56±2.91\n",
+            "Secrets #2, vocab=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'], pattern=Info {}{}{}{}{}\n",
+            "  Interpolation: repetition=1, avg_exposure=2.82±1.75; repetition=10, avg_exposure=7.41±2.96; repetition=100, avg_exposure=8.49±3.27\n",
+            "  Extrapolation: repetition=1, avg_exposure=2.81±1.72; repetition=10, avg_exposure=7.07±2.54; repetition=100, avg_exposure=7.95±2.77\n",
+            "Secrets #3, vocab=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'], pattern={}{}-{}{}-{}\n",
+            "  Interpolation: repetition=1, avg_exposure=2.47±1.53; repetition=10, avg_exposure=3.42±2.41; repetition=100, avg_exposure=5.23±3.55\n",
+            "  Extrapolation: repetition=1, avg_exposure=2.49±1.52; repetition=10, avg_exposure=3.40±2.34; repetition=100, avg_exposure=5.01±3.07\n",
+            "Secrets #4, vocab=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'], pattern={}{}{}{}{}{}{}{}{}{}\n",
+            "  Interpolation: repetition=1, avg_exposure=1.82±1.67; repetition=10, avg_exposure=4.24±3.36; repetition=100, avg_exposure=6.95±4.61\n",
+            "  Extrapolation: repetition=1, avg_exposure=1.82±1.66; repetition=10, avg_exposure=4.15±3.20; repetition=100, avg_exposure=6.88±4.59\n"
+          ]
+        }
+      ],
+      "source": [
+        "#@title Secret sharer evaluation\n",
+        "ppls_all, ppls_reference_all = [], []\n",
+        "for i in range(len(secrets)):\n",
+        "  ppls, ppls_reference = compute_perplexity_for_secret(\n",
+        "      [secrets[i]], MODEL_DIR.format(i), len(idx2char))\n",
+        "  ppls_all += ppls\n",
+        "  ppls_reference_all += ppls_reference\n",
+        "\n",
+        "for i, (vocab, pattern, p, p_r) in enumerate(zip(vocabs, patterns, ppls_all, ppls_reference_all)):\n",
+        "  exposure_interpolation = compute_exposure_interpolation(p, p_r)\n",
+        "  exposure_extrapolation = compute_exposure_extrapolation(p, p_r)\n",
+        "\n",
+        "  # exposure_interpolation (or exposure_extrapolation) is a dictionary, where\n",
+        "  # the key is the number of repetitions and the value is a list of exposures\n",
+        "  # (not necessarily in the same order as the secrets). We will now take a\n",
+        "  # look at the average exposure (with standard deviation).\n",
+        "  print(f'Secrets #{i+1}, vocab={vocab}, pattern={pattern}')\n",
+        "  print('  Interpolation:', '; '.join([f'repetition={r}, avg_exposure={np.mean(exposure_interpolation[r]):.2f}±{np.std(exposure_interpolation[r]):.2f}' for r in num_repetitions]))\n",
+        "  print('  Extrapolation:', '; '.join([f'repetition={r}, avg_exposure={np.mean(exposure_extrapolation[r]):.2f}±{np.std(exposure_extrapolation[r]):.2f}' for r in num_repetitions]))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "E2EHuZ7HJEcl"
+      },
+      "source": [
+        "We can see the following:\n",
+        "1. Comparing #1 vs. #2, adding a prefix to the random sequences (#2) might help the model memorize them.\n",
+        "2. Comparing #1 vs. #3, adding a pattern (#3) did not significantly help memorization. This is different from the result in the [paper](https://arxiv.org/abs/1802.08232). The effect might be model or data specific.\n",
+        "3. Comparing #1 vs. #4, longer secrets (#4) show higher exposure.\n",
+        "\n",
+        "You can also try some other secret configurations to see if there is any difference."
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "collapsed_sections": [],
+      "last_runtime": {
+        "build_target": "//learning/deepmind/public/tools/ml_python:ml_notebook",
+        "kind": "private"
+      },
+      "name": "secret_sharer_example.ipynb",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "pycharm": {
+      "stem_cell": {
+        "cell_type": "raw",
+        "metadata": {
+          "collapsed": false
+        },
+        "source": []
+      }
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}