diff --git a/g3doc/README.md b/g3doc/README.md new file mode 100644 index 0000000..922266a --- /dev/null +++ b/g3doc/README.md @@ -0,0 +1 @@ +# Under construction diff --git a/g3doc/guide/_index.yaml b/g3doc/guide/_index.yaml new file mode 100644 index 0000000..701cccd --- /dev/null +++ b/g3doc/guide/_index.yaml @@ -0,0 +1,50 @@ +# TODO(b/181782485): Switch to the main book for launch - /responsible_ai/_book.yaml +book_path: /responsible_ai/privacy/_book.yaml +project_path: /responsible_ai/_project.yaml +description: > + Page description used for search and social. +landing_page: + nav: left + custom_css_path: /site-assets/css/style.css + rows: + - heading: TensorFlow Privacy does something great. + items: + - classname: devsite-landing-row-50 + description: > + This is a description of PROJECT_NAME. Lorem ipsum dolor sit amet, + consectetur adipiscing elit, sed do eiusmod tempor incididunt ut + labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud + exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. + + code_block: | +
+        import tensorflow as tf
+        import PROJECT_NAME
+
+        # This is a short code snippet that shows off your project.
+        # Launch a Colab notebook to run this example.
+        print("Hello, PROJECT_NAME")
+        
+ {% dynamic if request.tld != 'cn' %} + Run in a Notebook + {% dynamic endif %} + + - classname: devsite-landing-row-cards + items: + - heading: "Introducing PROJECT_NAME" + image_path: /resources/images/tf-logo-card-16x9.png + path: https://blog.tensorflow.org + buttons: + - label: "Read on TensorFlow blog" + path: https://blog.tensorflow.org + - heading: "PROJECT_NAME video" + youtube_id: 3d34Hkf7KXA + buttons: + - label: Watch the video + path: https://www.youtube.com/watch?v=3d34Hkf7KXA + - heading: "PROJECT_NAME on GitHub" + image_path: /resources/images/github-card-16x9.png + path: https://github.com/tensorflow/PROJECT_NAME + buttons: + - label: "View on GitHub" + path: https://github.com/tensorflow/PROJECT_NAME diff --git a/g3doc/guide/_toc.yaml b/g3doc/guide/_toc.yaml new file mode 100644 index 0000000..3ea85f2 --- /dev/null +++ b/g3doc/guide/_toc.yaml @@ -0,0 +1,9 @@ +toc: +- title: Overview + path: /responsible_ai/privacy/guide/ +- title: Install + path: /responsible_ai/privacy/guide/install +- title: Get Started + path: /responsible_ai/privacy/guide/get_started +- title: Measure Privacy + path: /responsible_ai/privacy/guide/measure_privacy diff --git a/g3doc/guide/get_started.md b/g3doc/guide/get_started.md new file mode 100644 index 0000000..903cdd6 --- /dev/null +++ b/g3doc/guide/get_started.md @@ -0,0 +1,3 @@ +# Get Started + +## Tips diff --git a/g3doc/guide/install.md b/g3doc/guide/install.md new file mode 100644 index 0000000..86249f5 --- /dev/null +++ b/g3doc/guide/install.md @@ -0,0 +1,3 @@ +# Installation Instructions + +## Tips diff --git a/g3doc/guide/measure_privacy.md b/g3doc/guide/measure_privacy.md new file mode 100644 index 0000000..0cda83e --- /dev/null +++ b/g3doc/guide/measure_privacy.md @@ -0,0 +1,5 @@ +# Measure Privacy + +[TOC] + +## Tips diff --git a/g3doc/tutorials/_toc.yaml b/g3doc/tutorials/_toc.yaml new file mode 100644 index 0000000..57272e2 --- /dev/null +++ b/g3doc/tutorials/_toc.yaml @@ -0,0 +1,7 @@ +toc: +- title: Overview + path: /responsible_ai/privacy/tutorials/ +- title: Compute privacy + path: /responsible_ai/privacy/tutorials/classification_privacy +- title: Assess privacy risk + path: /responsible_ai/privacy/tutorials/privacy_report diff --git a/tutorials/Classification_Privacy.ipynb b/g3doc/tutorials/classification_privacy.ipynb similarity index 79% rename from tutorials/Classification_Privacy.ipynb rename to g3doc/tutorials/classification_privacy.ipynb index 060c3f7..71cf3f2 100644 --- a/tutorials/Classification_Privacy.ipynb +++ b/g3doc/tutorials/classification_privacy.ipynb @@ -3,17 +3,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", - "id": "view-in-github" - }, - "source": [ - "\u003ca href=\"https://colab.research.google.com/github/tensorflow/privacy/blob/master/tutorials/Classification_Privacy.ipynb\" target=\"_parent\"\u003e\u003cimg src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/\u003e\u003c/a\u003e" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", "id": "XAVN6c8prKOL" }, "source": [ @@ -22,11 +11,9 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", - "colab": {}, - "colab_type": "code", "id": "SassPC7WQAUO" }, "outputs": [], @@ -47,7 +34,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "KwDK47gfLsYf" }, "source": [ @@ -57,24 +43,28 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "MfBg1C5NB3X0" }, "source": [ "\u003ctable 
class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", " \u003ctd\u003e\n", + " \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/responsible_ai/privacy/tutorials/classification_privacy\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n", + " \u003c/td\u003e\n", + " \u003ctd\u003e\n", " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/privacy/blob/master/tutorials/Classification_Privacy.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", " \u003c/td\u003e\n", " \u003ctd\u003e\n", " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/privacy/blob/master/tutorials/Classification_Privacy.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n", " \u003c/td\u003e\n", + " \u003ctd\u003e\n", + " \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/privacy/g3doc/tutorials/classification_privacy.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n", + " \u003c/td\u003e\n", "\u003c/table\u003e" ] }, { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "00fQV7e0Unz3" }, "source": [ @@ -84,25 +74,44 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", + "id": "vsCUvXP0W4j2" + }, + "source": [ + "[Differential privacy](https://en.wikipedia.org/wiki/Differential_privacy) (DP) is a framework for measuring the privacy guarantees provided by an algorithm. Through the lens of differential privacy, you can design machine learning algorithms that responsibly train models on private data. Learning with differential privacy provides provable guarantees of privacy, mitigating the risk of exposing sensitive training data in machine learning. Intuitively, a model trained with differential privacy should not be affected by any single training example, or small set of training examples, in its data set. This mitigates the risk of exposing sensitive training data in ML." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6vd8qUwEW5pP" + }, + "source": [ + "The basic idea of this approach, called differentially private stochastic gradient descent (DP-SGD), is to modify the gradients\n", + "used in stochastic gradient descent (SGD), which lies at the core of almost all deep learning algorithms. Models trained with DP-SGD provide provable differential privacy guarantees for their input data. There are two modifications made to the vanilla SGD algorithm:" + ] + }, + { + "cell_type": "markdown", + "metadata": { "id": "TUphKzYu01O9" }, "source": [ - "[Differential privacy](https://en.wikipedia.org/wiki/Differential_privacy) (DP) is a framework for measuring the privacy guarantees provided by an algorithm. Through the lens of differential privacy, we can design machine learning algorithms that responsibly train models on private data. Learning with differential privacy provides provable guarantees of privacy, mitigating the risk of exposing sensitive training data in machine learning. Intuitively, a model trained with differential privacy should not be affected by any single training example, or small set of training examples, in its data set. 
This mitigates the risk of exposing sensitive training data in ML.\n", - "\n", - "The basic idea of this approach, called differentially private stochastic gradient descent (DP-SGD), is to modify the gradients\n", - "used in stochastic gradient descent (SGD), which lies at the core of almost all deep learning algorithms. Models trained with DP-SGD provide provable differential privacy guarantees for their input data. There are two modifications made to the vanilla SGD algorithm:\n", - "\n", - "1. First, the sensitivity of each gradient needs to be bounded. In other words, we need to limit how much each individual training point sampled in a minibatch can influence gradient computations and the resulting updates applied to model parameters. This can be done by *clipping* each gradient computed on each training point.\n", - "2. *Random noise* is sampled and added to the clipped gradients to make it statistically impossible to know whether or not a particular data point was included in the training dataset by comparing the updates SGD applies when it operates with or without this particular data point in the training dataset.\n", - "\n", + "1. First, the sensitivity of each gradient needs to be bounded. In other words, you need to limit how much each individual training point sampled in a minibatch can influence gradient computations and the resulting updates applied to model parameters. This can be done by *clipping* each gradient computed on each training point.\n", + "2. *Random noise* is sampled and added to the clipped gradients to make it statistically impossible to know whether or not a particular data point was included in the training dataset by comparing the updates SGD applies when it operates with or without this particular data point in the training dataset.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jXU7MZhhW-aL" + }, + "source": [ "This tutorial uses [tf.keras](https://www.tensorflow.org/guide/keras) to train a convolutional neural network (CNN) to recognize handwritten digits with the DP-SGD optimizer provided by the TensorFlow Privacy library. TensorFlow Privacy provides code that wraps an existing TensorFlow optimizer to create a variant that implements DP-SGD." ] }, { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "ijJYKVc05DYX" }, "source": [ @@ -112,40 +121,31 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "CKuHPYQCsV-x" }, "source": [ - "First, set this notebook's runtime to use a GPU, under Runtime \u003e Change runtime type \u003e Hardware accelerator. Then, begin importing the necessary libraries." 
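To make the two DP-SGD modifications listed above concrete, here is a minimal sketch of how a single aggregate gradient could be formed. The function and argument names are assumptions for illustration only; the tutorial itself relies on the DP optimizer that TensorFlow Privacy provides rather than hand-rolled code like this:

```python
import numpy as np

def dp_sgd_gradient(per_example_grads, l2_norm_clip, noise_multiplier):
    """Illustrative DP-SGD aggregation: clip each gradient, then add noise."""
    clipped = []
    for g in per_example_grads:
        # Modification 1: bound each example's influence by scaling its
        # gradient down to an L2 norm of at most l2_norm_clip.
        norm = max(np.linalg.norm(g), 1e-12)
        clipped.append(g * min(1.0, l2_norm_clip / norm))
    # Modification 2: add Gaussian noise calibrated to the clipping bound, so
    # the averaged update reveals little about any single example.
    noise = np.random.normal(0.0, noise_multiplier * l2_norm_clip,
                             size=clipped[0].shape)
    return (np.sum(clipped, axis=0) + noise) / len(per_example_grads)
```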
+ "Begin by importing the necessary libraries:" ] }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "ef56gCUqrdVn" }, "outputs": [], "source": [ - "try:\n", - " # %tensorflow_version only exists in Colab.\n", - " %tensorflow_version 1.x\n", - "except Exception:\n", - " pass\n", - "\n", "import tensorflow as tf\n", + "tf.compat.v1.disable_v2_behavior()\n", "\n", "import numpy as np\n", "\n", - "tf.compat.v1.logging.set_verbosity(tf.logging.ERROR)" + "tf.get_logger().setLevel('ERROR')" ] }, { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "r_fVhfUyeI3d" }, "source": [ @@ -154,24 +154,31 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, + "metadata": { + "id": "r56BqqyEqA16" + }, + "outputs": [], + "source": [ + "!pip install tensorflow-privacy" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "RseeuA7veIHU" }, "outputs": [], "source": [ - "!pip install tensorflow_privacy\n", + "import tensorflow_privacy\n", "\n", - "from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy\n", - "from tensorflow_privacy.privacy.optimizers.dp_optimizer import DPGradientDescentGaussianOptimizer" + "from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy" ] }, { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "mU1p8N7M5Mmn" }, "source": [ @@ -182,10 +189,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "_1ML23FlueTr" }, "outputs": [], @@ -215,32 +220,28 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "xVDcswOCtlr3" }, "source": [ - "## Define and tune learning model hyperparameters\n", + "## Define the hyperparameters\n", "Set learning model hyperparamter values. \n" ] }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "E14tL1vUuTRV" }, "outputs": [], "source": [ - "epochs = 15\n", + "epochs = 3\n", "batch_size = 250" ] }, { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "qXNp_25y7JP2" }, "source": [ @@ -256,10 +257,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "pVw_r2Mq7ntd" }, "outputs": [], @@ -276,21 +275,18 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "wXAmHcNOmHc5" }, "source": [ - "## Build the learning model\n", + "## Build the model\n", "\n", "Define a convolutional neural network as the learning model. 
" ] }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "oCOo8aOLmFta" }, "outputs": [], @@ -309,14 +305,13 @@ " tf.keras.layers.MaxPool2D(2, 1),\n", " tf.keras.layers.Flatten(),\n", " tf.keras.layers.Dense(32, activation='relu'),\n", - " tf.keras.layers.Dense(10, activation='softmax')\n", + " tf.keras.layers.Dense(10)\n", "])" ] }, { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "FT4lByFg-I_r" }, "source": [ @@ -325,15 +320,13 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "bqBvjCf5-ZXy" }, "outputs": [], "source": [ - "optimizer = DPGradientDescentGaussianOptimizer(\n", + "optimizer = tensorflow_privacy.DPKerasSGDOptimizer(\n", " l2_norm_clip=l2_norm_clip,\n", " noise_multiplier=noise_multiplier,\n", " num_microbatches=num_microbatches,\n", @@ -346,19 +339,16 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "LI_3nXzEGmrP" }, "source": [ - "## Compile and train the learning model\n" + "## Train the model\n" ] }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "z4iV03VqG1Bo" }, "outputs": [], @@ -374,14 +364,20 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", - "id": "TL7_lX5sHCTI" + "id": "0kkzQH2LXNjF" }, "source": [ "## Measure the differential privacy guarantee\n", "\n", - "Perform a privacy analysis to measure the DP guarantee achieved by a training algorithm. Knowing the level of DP achieved enables the objective comparison of two training runs to determine which of the two is more privacy-preserving. At a high level, the privacy analysis measures how much a potential adversary can improve their guess about properties of any individual training point by observing the outcome of our training procedure (e.g., model updates and parameters). \n", - "\n", + "Perform a privacy analysis to measure the DP guarantee achieved by a training algorithm. Knowing the level of DP achieved enables the objective comparison of two training runs to determine which of the two is more privacy-preserving. At a high level, the privacy analysis measures how much a potential adversary can improve their guess about properties of any individual training point by observing the outcome of the training procedure (e.g., model updates and parameters). \n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TL7_lX5sHCTI" + }, + "source": [ "This guarantee is sometimes referred to as the **privacy budget**. A lower privacy budget bounds more tightly an adversary's ability to improve their guess. This ensures a stronger privacy guarantee. 
Intuitively, this is because it is harder for a single training point to affect the outcome of learning: for instance, the information contained in the training point cannot be memorized by the ML algorithm and the privacy of the individual who contributed this training point to the dataset is preserved.\n", "\n", "In this tutorial, the privacy analysis is performed in the framework of Rényi Differential Privacy (RDP), which is a relaxation of pure DP based on [this paper](https://arxiv.org/abs/1702.07476) that is particularly well suited for DP-SGD.\n" ] }, { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "wUEk25pgmnm-" }, "source": [ "Two metrics are used to express the DP guarantee of an ML algorithm:\n", "\n", "1. Delta ($\\delta$) - Bounds the probability of the privacy guarantee not holding. A rule of thumb is to set it to be less than the inverse of the size of the training dataset. In this tutorial, it is set to **10^-5** as the MNIST dataset has 60,000 training points.\n", - "2. Epsilon ($\\epsilon$) - This is the privacy budget. It measures the strength of the privacy guarantee by bounding how much the probability of a particular model output can vary by including (or excluding) a single training point. A smaller value for $\\epsilon$ implies a better privacy guarantee. However, the $\\epsilon$ value is only an upper bound and a large value could still mean good privacy in practice.\n", - "\n", - "Tensorflow Privacy provides a tool, `compute_dp_sgd_privacy.py`, to compute the value of $\\epsilon$ given a fixed value of $\\delta$ and the following hyperparameters from the training process:\n", - "\n", - "1. The total number of points in the training data, `n`.\n", - "2. The `batch_size`.\n", - "3. The `noise_multiplier`.\n", - "4. The number of `epochs` of training.\n" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab": {}, "colab_type": "code", "id": "ws8-nVuVDgtJ" }, "outputs": [], "source": [ "compute_dp_sgd_privacy.compute_dp_sgd_privacy(n=60000, batch_size=250, noise_multiplier=1.3, epochs=15, delta=1e-5)" + "2. Epsilon ($\\epsilon$) - This is the privacy budget. It measures the strength of the privacy guarantee by bounding how much the probability of a particular model output can vary by including (or excluding) a single training point. A smaller value for $\\epsilon$ implies a better privacy guarantee. However, the $\\epsilon$ value is only an upper bound and a large value could still mean good privacy in practice." ] }, { "cell_type": "markdown", "metadata": { "id": "PczVdKsGyRQM" }, "source": [ "TensorFlow Privacy provides a tool, `compute_dp_sgd_privacy`, to compute the value of $\\epsilon$ given a fixed value of $\\delta$ and the following hyperparameters from the training process:\n", "\n", "1. The total number of points in the training data, `n`.\n", "2. The `batch_size`.\n", "3. The `noise_multiplier`.\n", "4. The number of `epochs` of training."
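For reference, delta and epsilon fit together in the formal definition of $(\epsilon, \delta)$-differential privacy, stated here for orientation (the tutorial's accounting is performed via RDP as noted above). A randomized mechanism $M$ satisfies $(\epsilon, \delta)$-DP if, for every pair of datasets $d$ and $d'$ differing in a single training point and every set of outcomes $S$:

$$\Pr[M(d) \in S] \le e^{\epsilon} \cdot \Pr[M(d') \in S] + \delta$$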
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ws8-nVuVDgtJ" + }, + "outputs": [], + "source": [ + "compute_dp_sgd_privacy.compute_dp_sgd_privacy(n=train_data.shape[0],\n", + " batch_size=batch_size,\n", + " noise_multiplier=noise_multiplier,\n", + " epochs=epochs,\n", + " delta=1e-5)" ] }, { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "c-KyttEWFRDc" }, "source": [ @@ -433,7 +436,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "SA_9HMGBWFM3" }, "source": [ @@ -446,10 +448,10 @@ } ], "metadata": { + "accelerator": "GPU", "colab": { "collapsed_sections": [], - "name": "Classification_Privacy.ipynb", - "provenance": [], + "name": "classification_privacy.ipynb", "toc_visible": true }, "kernelspec": { diff --git a/g3doc/tutorials/index.md b/g3doc/tutorials/index.md new file mode 100644 index 0000000..f6b8051 --- /dev/null +++ b/g3doc/tutorials/index.md @@ -0,0 +1,3 @@ +# PROJECT_NAME tutorials + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. diff --git a/tensorflow_privacy/privacy/membership_inference_attack/codelabs/privacy_report_codelab.ipynb b/g3doc/tutorials/privacy_report.ipynb similarity index 60% rename from tensorflow_privacy/privacy/membership_inference_attack/codelabs/privacy_report_codelab.ipynb rename to g3doc/tutorials/privacy_report.ipynb index 8eacc2e..1e89828 100644 --- a/tensorflow_privacy/privacy/membership_inference_attack/codelabs/privacy_report_codelab.ipynb +++ b/g3doc/tutorials/privacy_report.ipynb @@ -13,7 +13,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "cellView": "both", + "cellView": "form", "id": "4rmwPgXeptiS" }, "outputs": [], @@ -48,11 +48,17 @@ "source": [ "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", " \u003ctd\u003e\n", + " \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/responsible_ai/privacy/tutorials/privacy_report\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n", + " \u003c/td\u003e\n", + " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/privacy/blob/master/tensorflow_privacy/privacy/membership_inference_attack/codelabs/privacy_report_codelab.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", + " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/privacy/blob/master/g3doc/tutorials/privacy_report.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", " \u003c/td\u003e\n", " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/privacy/blob/master/tensorflow_privacy/privacy/membership_inference_attack/codelabs/privacy_report_codelab.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n", + " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/privacy/blob/master/g3doc/tutorials/privacy_report.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n", " \u003c/td\u003e\n", + " \u003ctd\u003e\n", + " \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/privacy/g3doc/tutorials/privacy_report.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n", + " \u003c/td\u003e\n", "\u003c/table\u003e" ] }, @@ -63,7 +69,7 @@ }, "source": [ - "##Overview\n", - "In this codelab we'll train a simple image classification model on the CIFAR10 dataset, and then use the \"membership inference attack\" against this model to assess if the attacker is able to \"guess\" whether a particular sample was present in the training set. We will use the TF Privacy Report to visualize results from multiple models and model checkpoints." + "## Overview\n", + "In this codelab you'll train a simple image classification model on the CIFAR-10 dataset, and then use the \"membership inference attack\" against this model to assess if the attacker is able to \"guess\" whether a particular sample was present in the training set. You will use the TF Privacy Report to visualize results from multiple models and model checkpoints."
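As orientation for what `run_attacks` automates later in this codelab, consider the simplest attack it runs, the threshold attack: score each example by the model's loss and guess that low-loss examples were training members. A minimal sketch of that intuition follows; the helper name and inputs are assumptions for illustration, not the TF Privacy API.

```python
import numpy as np
from sklearn import metrics

def threshold_attack_auc(loss_train, loss_test):
    """Illustrative membership inference by loss thresholding.

    Training-set members tend to have lower loss, so -loss serves as a
    membership score. The ROC AUC over all possible thresholds summarizes
    attacker success: 0.5 is chance level, 1.0 is a perfect attack.
    """
    labels = np.concatenate([np.ones_like(loss_train), np.zeros_like(loss_test)])
    scores = -np.concatenate([loss_train, loss_test])
    return metrics.roc_auc_score(labels, scores)
```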
+ "In this codelab you'll train a simple image classification model on the CIFAR10 dataset, and then use the \"membership inference attack\" against this model to assess if the attacker is able to \"guess\" whether a particular sample was present in the training set. You will use the TF Privacy Report to visualize results from multiple models and model checkpoints." ] }, { @@ -72,34 +78,34 @@ "id": "FUWqArj_q8vs" }, "source": [ - "## Setup\n", - "First, set this notebook's runtime to use a GPU, under Runtime \u003e Change runtime type \u003e Hardware accelerator. Then, begin importing the necessary libraries." + "## Setup\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "cellView": "form", "id": "Lr1pwHcbralz" }, "outputs": [], "source": [ - "#@title Import statements.\n", "import numpy as np\n", - "from typing import Tuple, Text\n", + "from typing import Tuple\n", "from scipy import special\n", "from sklearn import metrics\n", "\n", "import tensorflow as tf\n", + "tf.compat.v1.disable_v2_behavior()\n", + "\n", "import tensorflow_datasets as tfds\n", "\n", "# Set verbosity.\n", "tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)\n", - "from warnings import simplefilter\n", "from sklearn.exceptions import ConvergenceWarning\n", - "simplefilter(action=\"ignore\", category=ConvergenceWarning)\n", - "simplefilter(action=\"ignore\", category=FutureWarning)" + "\n", + "import warnings\n", + "warnings.simplefilter(action=\"ignore\", category=ConvergenceWarning)\n", + "warnings.simplefilter(action=\"ignore\", category=FutureWarning)" ] }, { @@ -111,6 +117,17 @@ "### Install TensorFlow Privacy." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1n0K00S6zmfb" + }, + "outputs": [], + "source": [ + "!pip install tensorflow_privacy" + ] + }, { "cell_type": "code", "execution_count": null, @@ -120,8 +137,6 @@ }, "outputs": [], "source": [ - "!pip3 install git+https://github.com/tensorflow/privacy\n", - "\n", "from tensorflow_privacy.privacy.membership_inference_attack import membership_inference_attack as mia\n", "from tensorflow_privacy.privacy.membership_inference_attack.data_structures import AttackInputData\n", "from tensorflow_privacy.privacy.membership_inference_attack.data_structures import AttackResultsCollection\n", @@ -138,33 +153,90 @@ "id": "pBbcG86th_sW" }, "source": [ - "## Train a model" + "## Train two models, with privacy metrics\n", + "\n", + "This section trains a pair of `keras.Model` classifiers on the `CIFAR-10` dataset. During the training process it collects privacy metrics, that will be used to generate reports in the bext section.\n", + "\n", + "The first step is to define some hyperparameters:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "cellView": "both", - "id": "vCyOWyyhXLib" + "id": "al0QK7O-0lk7" }, "outputs": [], "source": [ - "#@markdown Train a simple model on CIFAR10 with Keras.\n", - "\n", "dataset = 'cifar10'\n", "num_classes = 10\n", "activation = 'relu'\n", "lr = 0.02\n", "momentum = 0.9\n", "batch_size = 250\n", - "epochs = 50 # Privacy risks are especially visible with lots of epochs.\n", + "epochs_per_report = 5\n", + "num_reports = 10\n", + "# Privacy risks are especially visible with lots of epochs.\n", + "total_epochs = epochs_per_report*num_reports " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pu5IEzW6B-Oh" + }, + "source": [ + "Next, load the dataset. There's nothing privacy-specific in this code." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "f1TT3ofN0qrq" + }, + "outputs": [], + "source": [ + "#@title Load the data\n", + "print('Loading the dataset.')\n", + "train_ds = tfds.as_numpy(\n", + " tfds.load(dataset, split=tfds.Split.TRAIN, batch_size=-1))\n", + "test_ds = tfds.as_numpy(\n", + " tfds.load(dataset, split=tfds.Split.TEST, batch_size=-1))\n", + "x_train = train_ds['image'].astype('float32') / 255.\n", + "y_train_indices = train_ds['label'][:, np.newaxis]\n", + "x_test = test_ds['image'].astype('float32') / 255.\n", + "y_test_indices = test_ds['label'][:, np.newaxis]\n", "\n", + "# Convert class vectors to binary class matrices.\n", + "y_train = tf.keras.utils.to_categorical(y_train_indices, num_classes)\n", + "y_test = tf.keras.utils.to_categorical(y_test_indices, num_classes)\n", "\n", + "input_shape = x_train.shape[1:]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9l-55vOLCWZM" + }, + "source": [ + "Next define a function to build the models." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { "id": "vCyOWyyhXLib" }, "outputs": [], "source": [ + "#@title Define the models\n", "def small_cnn(input_shape: Tuple[int],\n", " num_classes: int,\n", " num_conv: int,\n", - " activation: Text = 'relu') -\u003e tf.keras.models.Sequential:\n", + " activation: str = 'relu') -\u003e tf.keras.models.Sequential:\n", " \"\"\"Setup a small CNN for image classification.\n", "\n", " Args:\n", @@ -187,51 +259,75 @@ " model.add(tf.keras.layers.Flatten())\n", " model.add(tf.keras.layers.Dense(64, activation=activation))\n", " model.add(tf.keras.layers.Dense(num_classes))\n", - " return model\n", - "\n", - "\n", - "print('Loading the dataset.')\n", - "train_ds = tfds.as_numpy(\n", - " tfds.load(dataset, split=tfds.Split.TRAIN, batch_size=-1))\n", - "test_ds = tfds.as_numpy(\n", - " tfds.load(dataset, split=tfds.Split.TEST, batch_size=-1))\n", - "x_train = train_ds['image'].astype('float32') / 255.\n", - "y_train_indices = train_ds['label'][:, np.newaxis]\n", - "x_test = test_ds['image'].astype('float32') / 255.\n", - "y_test_indices = test_ds['label'][:, np.newaxis]\n", - "\n", - "# Convert class vectors to binary class matrices.\n", - "y_train = tf.keras.utils.to_categorical(y_train_indices, num_classes)\n", - "y_test = tf.keras.utils.to_categorical(y_test_indices, num_classes)\n", - "\n", - "input_shape = x_train.shape[1:]\n", + " return model\n" ] }, { "cell_type": "markdown", "metadata": { "id": "hs0Smn24Dty-" }, "source": [ + "Build a two-layer and a three-layer CNN model using that function. Again, there's nothing privacy-specific about this code. It uses standard models, layers, losses, and optimizers."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nexqXAjqDgad" + }, + "outputs": [], + "source": [ + "optimizer = tf.keras.optimizers.SGD(lr=lr, momentum=momentum)\n", + "loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)\n", "\n", "three_layer_model = small_cnn(\n", " input_shape, num_classes, num_conv=3, activation=activation)\n", - "optimizer = tf.keras.optimizers.SGD(lr=lr, momentum=momentum)\n", - "loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)\n", "three_layer_model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])\n", "\n", "two_layer_model = small_cnn(\n", " input_shape, num_classes, num_conv=2, activation=activation)\n", - "two_layer_model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])\n", + "two_layer_model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])" ] }, { "cell_type": "markdown", "metadata": { "id": "D9nrWjP9D65l" }, "source": [ + "### Define a callback to collect privacy metrics\n", "\n", + "Next, define a `keras.callbacks.Callback` to periodically run some privacy attacks against the model and log the results.\n", + "\n", + "The Keras `fit` method will call the `on_epoch_end` method after each training epoch. The `n` argument is the (0-based) epoch number.\n", + "\n", + "You could implement this procedure by writing a loop that repeatedly calls `Model.fit(..., epochs=epochs_per_report)` and runs the attack code. 
The callback is used here because it gives a clear separation between the training logic and the privacy evaluation logic.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "won3NecEmzzg" + }, + "outputs": [], + "source": [ + "class PrivacyMetrics(tf.keras.callbacks.Callback):\n", + " def __init__(self, epochs_per_report, model_name):\n", + " self.epochs_per_report = epochs_per_report\n", + " self.model_name = model_name\n", + " self.epochs = []\n", + " self.attack_results = [] \n", + "\n", + " def on_epoch_end(self, n, logs=None):\n", + " epoch = n + 1\n", + " if epoch % self.epochs_per_report != 0:\n", + " return\n", + " \n", + " print(f\"\\nRunning privacy report for epoch: {epoch}\")\n", + " self.epochs.append(epoch)\n", "\n", - " logits_train = model.predict(x_train, batch_size=batch_size)\n", - " logits_test = model.predict(x_test, batch_size=batch_size)\n", + " logits_train = self.model.predict(x_train, batch_size=batch_size)\n", + " logits_test = self.model.predict(x_test, batch_size=batch_size)\n", "\n", " prob_train = special.softmax(logits_train, axis=1)\n", " prob_test = special.softmax(logits_test, axis=1)\n", "\n", " # Add metadata to generate a privacy report.\n", " privacy_report_metadata = PrivacyReportMetadata(\n", " accuracy_train=metrics.accuracy_score(y_train_indices,\n", " np.argmax(prob_train, axis=1)),\n", " accuracy_test=metrics.accuracy_score(y_test_indices,\n", " np.argmax(prob_test, axis=1)),\n", - " epoch_num=num_epochs * i,\n", - " model_variant_label=model_name)\n", + " epoch_num=epoch,\n", + " model_variant_label=self.model_name)\n", "\n", " attack_results = mia.run_attacks(\n", " AttackInputData(\n", " logits_train=logits_train,\n", " logits_test=logits_test,\n", " attack_types=(AttackType.THRESHOLD_ATTACK,\n", " AttackType.LOGISTIC_REGRESSION),\n", " privacy_report_metadata=privacy_report_metadata)\n", - " epoch_results.append(attack_results)" + " self.attack_results.append(attack_results)\n" ] }, { "cell_type": "markdown", "metadata": { - "id": "0snqR0Gbv3qk" + "id": "zLPHj5ZtFhC9" }, "source": [ - "## Load attack results\n", - "We can load attack results from the model above or replace the filepath with our own results." + "### Train the models\n", + "\n", + "The next code block trains the two models. The `all_reports` list is used to collect all the results from all the models' training runs. The individual reports are tagged with the `model_name`, so there's no confusion about which model generated which report. " ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "UTZwiCAJt0R6" + "id": "Gywwxs6R1aLV" }, "outputs": [], "source": [ - "loaded_results = epoch_results\n", - "# Or load your own via\n", - "#loaded_results = AttackResultsCollection.load(filepath)" + "all_reports = []\n", + "\n", + "models = {\n", + " 'two layer model': two_layer_model,\n", + " 'three layer model': three_layer_model,\n", + "}\n", + "\n", + "for model_name, model in models.items():\n", + " print(f\"\\n\\n\\nFitting {model_name}\\n\")\n", + " callback = PrivacyMetrics(epochs_per_report, \n", + " model_name)\n", + "\n", + " model.fit(\n", + " x_train,\n", + " y_train,\n", + " batch_size=batch_size,\n", + " epochs=total_epochs,\n", + " validation_data=(x_test, y_test),\n", + " callbacks=[callback],\n", + " shuffle=True)\n", + " \n", + " all_reports.extend(callback.attack_results)\n" ] }, @@ -292,9 +408,20 @@ "source": [ "## Epoch Plots\n", "\n", - "We can visualize how privacy risks happen as we train models. By probing the model periodically (e.g. every 10 epochs), we can pick the point in time with the best performance / privacy trade-off.\n", + "You can visualize how privacy risks emerge as you train models. By probing the model periodically (e.g. every 5 epochs), you can pick the point in time with the best performance / privacy trade-off.\n", "\n", + "Use the TF Privacy Membership Inference Attack module to generate `AttackResults`. 
These AttackResults get combined into an AttackResultsCollection. The TF Privacy Report is designed to analyze the provided AttackResultsCollection." + "Use the TF Privacy Membership Inference Attack module to generate `AttackResults`. These `AttackResults` get combined into an `AttackResultsCollection`. The TF Privacy Report is designed to analyze the provided `AttackResultsCollection`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wT7zfUC8HXRI" + }, + "outputs": [], + "source": [ + "results = AttackResultsCollection(all_reports)" ] }, { @@ -308,7 +435,7 @@ "source": [ "privacy_metrics = (PrivacyMetric.AUC, PrivacyMetric.ATTACKER_ADVANTAGE)\n", "epoch_plot = privacy_report.plot_by_epochs(\n", - " loaded_results, privacy_metrics=privacy_metrics)" + " results, privacy_metrics=privacy_metrics)" ] }, { @@ -317,7 +444,7 @@ "id": "ijjwGgyixsFg" }, "source": [ - "We see that as a rule, privacy vulnerability tends to increase as the number of epochs goes up. This is true across model variants as well as different attacker types.\n", + "See that as a rule, privacy vulnerability tends to increase as the number of epochs goes up. This is true across model variants as well as different attacker types.\n", "\n", "Two layer models (with fewer convolutional layers) are generally more vulnerable than their three layer model counterparts.\n", "\n", @@ -343,7 +470,7 @@ "source": [ "privacy_metrics = (PrivacyMetric.AUC, PrivacyMetric.ATTACKER_ADVANTAGE)\n", "utility_privacy_plot = privacy_report.plot_privacy_vs_accuracy(\n", - " loaded_results, privacy_metrics=privacy_metrics)" + " results, privacy_metrics=privacy_metrics)" ] }, { @@ -354,7 +481,7 @@ "source": [ "Three layer models (perhaps due to too many parameters) only achieve a train accuracy of 0.85. The two layer models achieve roughly equal performance for that level of privacy risk but they continue to get better accuracy.\n", "\n", - "We can also see how the line for two layer models gets steeper. This means that additional marginal gains in train accuracy come at an expense of vast privacy vulnerabilities." + "You can also see how the line for two layer models gets steeper. This means that additional marginal gains in train accuracy come at an expense of vast privacy vulnerabilities." ] }, { @@ -369,31 +496,15 @@ } ], "metadata": { + "accelerator": "GPU", "colab": { "collapsed_sections": [], - "last_runtime": { - "build_target": "//learning/deepmind/public/tools/ml_python:ml_notebook", - "kind": "private" - }, - "name": "TF Privacy Report codelab", - "provenance": [] + "name": "privacy_report.ipynb", + "toc_visible": true }, "kernelspec": { "display_name": "Python 3", - "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.10" } }, "nbformat": 4,