diff --git a/tensorflow_privacy/privacy/membership_inference_attack/membership_inference_attack.py b/tensorflow_privacy/privacy/membership_inference_attack/membership_inference_attack.py deleted file mode 100644 index 5c14b36..0000000 --- a/tensorflow_privacy/privacy/membership_inference_attack/membership_inference_attack.py +++ /dev/null @@ -1,737 +0,0 @@ -# Copyright 2020, The TensorFlow Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Lint as: python3 -"""Code that runs membership inference attacks based on the model outputs. - -Warning: This file belongs to the old API for membership inference attacks. This -file will be removed soon. membership_inference_attack_new.py contains the new -API. -""" - -import collections -import io -import os -import re - -from typing import Text, Dict, Iterable, Tuple, Union, Any - -from absl import logging -import numpy as np -from scipy import special - -from tensorflow_privacy.privacy.membership_inference_attack import plotting -from tensorflow_privacy.privacy.membership_inference_attack import trained_attack_models -from tensorflow_privacy.privacy.membership_inference_attack import utils - -from os import mkdir - -ArrayDict = Dict[Text, np.ndarray] -FloatDict = Dict[Text, float] -AnyDict = Dict[Text, Any] -Dataset = Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]] -MetricNames = Union[Text, Iterable[Text]] - - -def _get_vulnerabilities(result: ArrayDict, metrics: MetricNames) -> FloatDict: - """Gets the vulnerabilities according to the chosen metrics for all attacks.""" - vulns = {} - if isinstance(metrics, str): - metrics = [metrics] - for k in result: - for metric in metrics: - if k.endswith(metric.lower()) or k.endswith('n_examples'): - vulns[k] = float(result[k]) - return vulns - - -def _get_maximum_vulnerability( - attack_result: FloatDict, - metrics: MetricNames, - filterby: Text = '') -> Dict[Text, Dict[Text, Union[Text, float]]]: - """Returns the worst vulnerability according to the chosen metrics of all attacks.""" - vulns = {} - if isinstance(metrics, str): - metrics = [metrics] - for metric in metrics: - best_attack_value = -np.inf - for k in attack_result: - if (k.startswith(filterby.lower()) and k.endswith(metric.lower()) and - 'train' not in k): - if float(attack_result[k]) > best_attack_value: - best_attack_value = attack_result[k] - best_attacker = k - if best_attack_value > -np.inf: - newkey = filterby + '-' + metric if filterby else metric - vulns[newkey] = {'value': best_attack_value, 'attacker': best_attacker} - return vulns - - -def _get_maximum_class_gap_or_none(result: FloatDict, - metrics: MetricNames) -> FloatDict: - """Returns the biggest and smallest vulnerability and the gap across classes.""" - gaps = {} - if isinstance(metrics, str): - metrics = [metrics] - for metric in metrics: - hi = -np.inf - lo = np.inf - hi_idx, lo_idx = -1, -1 - for k in result: - if (k.startswith('class') and k.endswith(metric.lower()) and - 'train' not in k): - if float(result[k]) > hi: - hi = float(result[k]) - 
hi_idx = int(re.findall(r'class_(\d+)_', k)[0]) - if float(result[k]) < lo: - lo = float(result[k]) - lo_idx = int(re.findall(r'class_(\d+)_', k)[0]) - if lo - hi < np.inf: - gaps['max_class_gap_' + metric] = hi - lo - gaps[f'class_{hi_idx}_' + metric] = hi - gaps[f'class_{lo_idx}_' + metric] = lo - gaps['max_vuln_class_' + metric] = hi_idx - gaps['min_vuln_class_' + metric] = lo_idx - return gaps - - -# ------------------------------------------------------------------------------ -# Attacks -# ------------------------------------------------------------------------------ - - -def _run_threshold_loss_attack(features: ArrayDict, - figure_file_prefix: Text = '', - figure_directory: Text = None) -> ArrayDict: - """Runs the threshold attack on the loss.""" - logging.info('Run threshold attack on loss...') - is_train = features['is_train'] - attack_prefix = 'thresh_loss' - tmp_results = utils.compute_performance_metrics(is_train, -features['loss']) - if figure_directory is not None: - figpath = os.path.join(figure_directory, - figure_file_prefix + attack_prefix + '.png') - plotting.save_plot( - plotting.plot_curve_with_area( - tmp_results['fpr'], tmp_results['tpr'], xlabel='FPR', ylabel='TPR'), - figpath) - figpath = os.path.join(figure_directory, - figure_file_prefix + attack_prefix + '_hist.png') - plotting.save_plot( - plotting.plot_histograms( - features['loss'][is_train == 1], - features['loss'][is_train == 0], - xlabel='loss'), figpath) - return utils.prepend_to_keys(tmp_results, attack_prefix + '_') - - -def _run_threshold_attack_maxlogit(features: ArrayDict, - figure_file_prefix: Text = '', - figure_directory: Text = None) -> ArrayDict: - """Runs the threshold attack on the maximum logit.""" - is_train = features['is_train'] - preds = np.max(features['logits'], axis=-1) - tmp_results = utils.compute_performance_metrics(is_train, preds) - attack_prefix = 'thresh_maxlogit' - if figure_directory is not None: - figpath = os.path.join(figure_directory, - figure_file_prefix + attack_prefix + '.png') - plotting.save_plot( - plotting.plot_curve_with_area( - tmp_results['fpr'], tmp_results['tpr'], xlabel='FPR', ylabel='TPR'), - figpath) - figpath = os.path.join(figure_directory, - figure_file_prefix + attack_prefix + '_hist.png') - plotting.save_plot( - plotting.plot_histograms( - preds[is_train == 1], preds[is_train == 0], xlabel='loss'), figpath) - return utils.prepend_to_keys(tmp_results, attack_prefix + '_') - - -def _run_trained_attack(attack_classifier: Text, - data: Dataset, - attack_prefix: Text, - figure_file_prefix: Text = '', - figure_directory: Text = None) -> ArrayDict: - """Train a classifier for attack and evaluate it.""" - # Train the attack classifier - (x_train, y_train), (x_test, y_test) = data - clf_model = trained_attack_models.choose_model(attack_classifier) - clf_model.fit(x_train, y_train) - - # Calculate training set metrics - pred_train = clf_model.predict_proba(x_train)[:, clf_model.classes_ == 1] - results = utils.prepend_to_keys( - utils.compute_performance_metrics(y_train, pred_train), - attack_prefix + 'train_') - - # Calculate test set metrics - pred_test = clf_model.predict_proba(x_test)[:, clf_model.classes_ == 1] - results.update( - utils.prepend_to_keys( - utils.compute_performance_metrics(y_test, pred_test), - attack_prefix + 'test_')) - - if figure_directory is not None: - figpath = os.path.join(figure_directory, - figure_file_prefix + attack_prefix[:-1] + '.png') - plotting.save_plot( - plotting.plot_curve_with_area( - results[attack_prefix + 'test_fpr'], 
- results[attack_prefix + 'test_tpr'], - xlabel='FPR', - ylabel='TPR'), figpath) - return results - - -def _run_attacks_and_plot(features: ArrayDict, - attacks: Iterable[Text], - attack_classifiers: Iterable[Text], - balance: bool, - test_size: float, - random_state: int, - figure_file_prefix: Text = '', - figure_directory: Text = None) -> ArrayDict: - """Runs the specified attacks on the provided data.""" - if balance: - try: - features = utils.subsample_to_balance(features, random_state) - except RuntimeError: - logging.info('Not enough remaining data for attack: Empty results.') - return {} - - result = {} - # -------------------- Simple threshold attacks - if 'thresh_loss' in attacks: - result.update( - _run_threshold_loss_attack(features, figure_file_prefix, - figure_directory)) - - if 'thresh_maxlogit' in attacks: - result.update( - _run_threshold_attack_maxlogit(features, figure_file_prefix, - figure_directory)) - - # -------------------- Run learned attacks - # TODO(b/157632603): Add a prefix (for example 'trained_') for attacks which - # use classifiers to distinguish from threshould attacks. - if 'logits' in attacks: - data = utils.get_train_test_split( - features, add_loss=False, test_size=test_size) - for clf in attack_classifiers: - logging.info('Train %s on %d logits', clf, data[0][0].shape[1]) - attack_prefix = f'{clf}_logits_' - result.update( - _run_trained_attack(clf, data, attack_prefix, figure_file_prefix, - figure_directory)) - - if 'logits_loss' in attacks: - data = utils.get_train_test_split( - features, add_loss=True, test_size=test_size) - for clf in attack_classifiers: - logging.info('Train %s on %d logits + loss', clf, data[0][0].shape[1]) - attack_prefix = f'{clf}_logits_loss_' - result.update( - _run_trained_attack(clf, data, attack_prefix, figure_file_prefix, - figure_directory)) - return result - - -def run_attack(loss_train: np.ndarray = None, - loss_test: np.ndarray = None, - logits_train: np.ndarray = None, - logits_test: np.ndarray = None, - labels_train: np.ndarray = None, - labels_test: np.ndarray = None, - attack_classifiers: Iterable[Text] = None, - only_misclassified: bool = False, - by_class: Union[bool, Iterable[int], int] = False, - by_percentile: Union[bool, Iterable[int], int] = False, - figure_directory: Text = None, - output_directory: Text = None, - metric: MetricNames = 'auc', - balance: bool = True, - test_size: float = 0.2, - random_state: int = 0) -> FloatDict: - """Run membership inference attack(s). - - Based only on specific outputs of a machine learning model on some examples - used for training (train) and some examples not used for training (test), run - membership inference attacks that try to discriminate training from test - inputs based only on the model outputs. - While all inputs are optional, at least one train/test pair is required to run - any attacks (either losses or logits/probabilities). - Note that one can equally provide output probabilities instead of logits in - the logits_train / logits_test arguments. - - We measure the vulnerability of the model via the area under the ROC-curve - (auc) or via max |fpr - tpr| (advantage) of the attack classifier. These - measures are very closely related and may look almost indistinguishable. - - This function provides relatively fine grained control and outputs detailed - results. For a higher-level wrapper with sane internal default settings and - distilled output results, see `run_all_attacks`. 
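[Illustrative sketch, not part of the deleted file] A minimal, hypothetical invocation of this deprecated `run_attack` entry point on per-example losses, mirroring the unit tests further down in this diff; the synthetic Gaussian losses and the pre-removal import path are assumptions for illustration only.

import numpy as np
from tensorflow_privacy.privacy.membership_inference_attack import membership_inference_attack as mia

rng = np.random.RandomState(0)
loss_train = rng.randn(1000) - 0.4  # per-example losses of training (member) examples
loss_test = rng.randn(500) + 0.4    # per-example losses of held-out (non-member) examples

# Threshold attack on the loss; returns e.g.
# {'thresh_loss_auc': ..., 'thresh_loss_advantage': ...}
results = mia.run_attack(loss_train=loss_train, loss_test=loss_test,
                         metric=('auc', 'advantage'))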
- - Via the `figure_directory` argument and the `output_directory` argument more - detailed information as well as roc-curve plots can optionally be stored to - disk. - - If `loss_train` and `loss_test` are provided we run: - - simple threshold attack on the loss - - If `logits_train` and `logits_test` are provided we run: - - simple threshold attack on the top logit - - if `attack_classifiers` is not None and no losses are provided: train the - specified classifiers on the top 10 logits (or all logits if there are - less than 10) - - if `attack_classifiers` is not None and losses are provided: train the - specified classifiers on the top 10 logits (or all logits if there are - less than 10) and the loss - - Args: - loss_train: A 1D array containing the individual scalar losses for examples - used during training. - loss_test: A 1D array containing the individual scalar losses for examples - not used during training. - logits_train: A 2D array (n_train, n_classes) of the individual logits or - output probabilities of examples used during training. - logits_test: A 2D array (n_test, n_classes) of the individual logits or - output probabilities of examples not used during training. - labels_train: The true labels of the training examples. Labels are only - needed when `by_class` is specified (i.e., not False). - labels_test: The true labels of the test examples. Labels are only needed - when `by_class` is specified (i.e., not False). - attack_classifiers: Attack classifiers to train beyond simple thresholding - that require training a simple binary ML classifier. This argument is - ignored if logits are not provided. Classifiers can be 'lr' for logistic - regression, 'mlp' for multi-layered perceptron, 'rf' for random forests, - or 'knn' for k-nearest-neighbors. If 'None', don't train classifiers - beyond simple thresholding. - only_misclassified: Run and evaluate attacks only on misclassified examples. - Must specify `labels_train`, `labels_test`, `logits_train` and - `logits_test` to use this. If this is True, `by_class` and `by_percentile` - are ignored. - by_class: This argument determines whether attacks are run on the entire - data, or on examples grouped by their class label. If `True`, all attacks - are run separately for each class. If `by_class` is a single integer, run - attacks for this class only. If `by_class` is an iterable of integers, run - all attacks for each of the specified class labels separately. Only used - if `labels_train` and `labels_test` are specified. If `by_class` is - specified (not False), `by_percentile` is ignored. Ignored if - `only_misclassified` is True. - by_percentile: This argument determines whether attacks are run on the - entire data, or separately for examples where the most likely class - prediction is within a given percentile of all maximum predicitons. If - `True`, all attacks are run separately for the examples with max - probabilities within the ten deciles. If `by_precentile` is a single int - between 0 and 100, run attacks only for examples with confidence within - this percentile. If `by_percentile` is an iterable of ints between 0 and - 100, run all attacks for each of the specified percentiles separately. - Ignored if `by_class` is specified. Ignored if `logits_train` and - `logits_test` are not specified. Ignored if `only_misclassified` is True. - figure_directory: Where to store ROC-curve plots and histograms. If `None`, - don't create plots. - output_directory: Where to store detailed result data for all run attacks. 
- If `None`, don't store detailed result data. - metric: Available vulnerability metrics are 'auc' or 'advantage' for the - area under the ROC curve or the advantage (max |tpr - fpr|). Specify - either one of them or both. - balance: Whether to use the same number of train and test samples (by - randomly subsampling whichever happens to be larger). - test_size: The fraction of the input data to use for the evaluation of - trained ML attacks. This argument is ignored, if either attack_classifiers - is None, or no logits are provided. - random_state: Random seed for reproducibility. Only used if attack models - are trained. - - Returns: - results: Dictionary with the chosen vulnerability metric(s) for all ran - attacks. - """ - print( - 'Deprecation warning: function run_attack is ' - 'deprecated and will be removed soon. ' - 'Please use membership_inference_attack_new.run_attacks' - ) - attacks = [] - features = {} - # ---------- Check available data ---------- - if ((loss_train is None or loss_test is None) and - (logits_train is None or logits_test is None)): - raise ValueError( - 'Need at least train and test for loss or train and test for logits.') - - # ---------- If losses are provided ---------- - if loss_train is not None and loss_test is not None: - if loss_train.ndim != 1 or loss_test.ndim != 1: - raise ValueError('Losses must be 1D arrays.') - features['is_train'] = np.concatenate( - (np.ones(len(loss_train)), np.zeros(len(loss_test))), - axis=0).astype(int) - features['loss'] = np.concatenate((loss_train.ravel(), loss_test.ravel()), - axis=0) - attacks.append('thresh_loss') - - # ---------- If logits are provided ---------- - if logits_train is not None and logits_test is not None: - assert logits_train.ndim == 2 and logits_test.ndim == 2, \ - 'Logits must be 2D arrays.' - assert logits_train.shape[1] == logits_test.shape[1], \ - 'Train and test logits must agree along axis 1 (number of classes).' - if 'is_train' in features: - assert (loss_train.shape[0] == logits_train.shape[0] and - loss_test.shape[0] == logits_test.shape[0]), \ - 'Number of examples must match between loss and logits.' - else: - features['is_train'] = np.concatenate( - (np.ones(logits_train.shape[0]), np.zeros(logits_test.shape[0])), - axis=0).astype(int) - attacks.append('thresh_maxlogit') - features['logits'] = np.concatenate((logits_train, logits_test), axis=0) - if attack_classifiers: - attacks.append('logits') - if 'loss' in features: - attacks.append('logits_loss') - - # ---------- If labels are provided ---------- - if labels_train is not None and labels_test is not None: - if labels_train.ndim != 1 or labels_test.ndim != 1: - raise ValueError('Labels must be 1D arrays.') - if 'loss' in features: - assert (loss_train.shape[0] == labels_train.shape[0] and - loss_test.shape[0] == labels_test.shape[0]), \ - 'Number of examples must match between loss and labels.' - else: - assert (logits_train.shape[0] == labels_train.shape[0] and - logits_test.shape[0] == labels_test.shape[0]), \ - 'Number of examples must match between logits and labels.' 
- features['label'] = np.concatenate((labels_train, labels_test), axis=0) - - # ---------- Data subsampling or filtering ---------- - filtertype = None - filtervals = [None] - if only_misclassified: - if (labels_train is None or labels_test is None or logits_train is None or - logits_test is None): - raise ValueError('Must specify labels_train, labels_test, logits_train, ' - 'and logits_test for the only_misclassified option.') - filtertype = 'misclassified' - elif by_class: - if labels_train is None or labels_test is None: - raise ValueError('Must specify labels_train and labels_test when using ' - 'the by_class option.') - if isinstance(by_class, bool): - filtervals = list(set(labels_train) | set(labels_test)) - elif isinstance(by_class, int): - filtervals = [by_class] - elif isinstance(by_class, collections.Iterable): - filtervals = list(by_class) - filtertype = 'class' - elif by_percentile: - if logits_train is None or logits_test is None: - raise ValueError('Must specify logits_train and logits_test when using ' - 'the by_percentile option.') - if isinstance(by_percentile, bool): - filtervals = list(range(10, 101, 10)) - elif isinstance(by_percentile, int): - filtervals = [by_percentile] - elif isinstance(by_percentile, collections.Iterable): - filtervals = [int(percentile) for percentile in by_percentile] - filtertype = 'percentile' - - # ---------- Need to create figure directory? ---------- - if figure_directory is not None: - mkdir(figure_directory) - - # ---------- Actually run attacks and plot if required ---------- - logging.info('Selecting %s with values %s', filtertype, filtervals) - num = None - result = {} - for filterval in filtervals: - if filtertype is None: - tmp_features = features - elif filtertype == 'misclassified': - idx = features['label'] != np.argmax(features['logits'], axis=-1) - tmp_features = utils.select_indices(features, idx) - num = np.sum(idx) - elif filtertype == 'class': - idx = features['label'] == filterval - tmp_features = utils.select_indices(features, idx) - num = np.sum(idx) - elif filtertype == 'percentile': - certainty = np.max(special.softmax(features['logits'], axis=-1), axis=-1) - idx = certainty <= np.percentile(certainty, filterval) - tmp_features = utils.select_indices(features, idx) - - prefix = f'{filtertype}_' if filtertype is not None else '' - prefix += f'{filterval}_' if filterval is not None else '' - tmp_result = _run_attacks_and_plot(tmp_features, attacks, - attack_classifiers, balance, test_size, - random_state, prefix, figure_directory) - if num is not None: - tmp_result['n_examples'] = float(num) - if tmp_result: - result.update(utils.prepend_to_keys(tmp_result, prefix)) - - # ---------- Store data ---------- - if output_directory is not None: - mkdir(output_directory) - resultpath = os.path.join(output_directory, 'attack_results.npz') - logging.info('Store aggregate results at %s.', resultpath) - with open(resultpath, 'wb') as fp: - io_buffer = io.BytesIO() - np.savez(io_buffer, **result) - fp.write(io_buffer.getvalue()) - - return _get_vulnerabilities(result, metric) - - -def run_all_attacks(loss_train: np.ndarray = None, - loss_test: np.ndarray = None, - logits_train: np.ndarray = None, - logits_test: np.ndarray = None, - labels_train: np.ndarray = None, - labels_test: np.ndarray = None, - attack_classifiers: Iterable[Text] = ('lr', 'mlp', 'rf', - 'knn'), - decimals: Union[int, None] = 4) -> FloatDict: - """Runs all possible membership inference attacks. 
- - Check 'run_attack' for detailed information of how attacks are performed - and evaluated. - - This function internally chooses sane default settings for all attacks and - returns all possible output combinations. - For fine grained control and partial attacks, please see `run_attack`. - - Args: - loss_train: A 1D array containing the individual scalar losses for examples - used during training. - loss_test: A 1D array containing the individual scalar losses for examples - not used during training. - logits_train: A 2D array (n_train, n_classes) of the individual logits or - output probabilities of examples used during training. - logits_test: A 2D array (n_test, n_classes) of the individual logits or - output probabilities of examples not used during training. - labels_train: The true labels of the training examples. Labels are only - needed when `by_class` is specified (i.e., not False). - labels_test: The true labels of the test examples. Labels are only needed - when `by_class` is specified (i.e., not False). - attack_classifiers: Which binary classifiers to train (in addition to simple - threshold attacks). This can include 'lr' (logistic regression), 'mlp' - (multi-layered perceptron), 'rf' (random forests), 'knn' (k-nearest - neighbors), which will be trained with cross validation to determine good - hyperparameters. - decimals: Round all float results to this number of decimals. If decimals is - None, don't round. - - Returns: - result: dictionary with all attack results - """ - print( - 'Deprecation warning: function run_all_attacks is ' - 'deprecated and will be removed soon. ' - 'Please use membership_inference_attack_new.run_attacks' - ) - metrics = ['auc', 'advantage'] - - # Entire data - result = run_attack( - loss_train, - loss_test, - logits_train, - logits_test, - attack_classifiers=attack_classifiers, - metric=metrics) - result = utils.prepend_to_keys(result, 'all_') - - # Misclassified examples - if (labels_train is not None and labels_test is not None and - logits_train is not None and logits_test is not None): - result.update( - run_attack( - loss_train, - loss_test, - logits_train, - logits_test, - labels_train, - labels_test, - attack_classifiers=attack_classifiers, - only_misclassified=True, - metric=metrics)) - - # Split per class - if labels_train is not None and labels_test is not None: - result.update( - run_attack( - loss_train, - loss_test, - logits_train, - logits_test, - labels_train, - labels_test, - by_class=True, - attack_classifiers=attack_classifiers, - metric=metrics)) - - # Different deciles - if logits_train is not None and logits_test is not None: - result.update( - run_attack( - loss_train, - loss_test, - logits_train, - logits_test, - by_percentile=True, - attack_classifiers=attack_classifiers, - metric=metrics)) - - if decimals is not None: - result = {k: round(v, decimals) for k, v in result.items()} - - return result - - -def run_all_attacks_and_create_summary( - loss_train: np.ndarray = None, - loss_test: np.ndarray = None, - logits_train: np.ndarray = None, - logits_test: np.ndarray = None, - labels_train: np.ndarray = None, - labels_test: np.ndarray = None, - return_dict: bool = True, - decimals: Union[int, None] = 4) -> Union[Text, Tuple[Text, AnyDict]]: - """Runs all possible membership inference attack(s) and distill results. - - Check 'run_attack' for detailed information of how attacks are performed - and evaluated. 
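[Illustrative sketch, not part of the deleted file] A hypothetical end-to-end call of the `run_all_attacks` wrapper defined above; all arrays are synthetic stand-ins and the import path is the old, pre-removal one.

import numpy as np
from tensorflow_privacy.privacy.membership_inference_attack import membership_inference_attack as mia

rng = np.random.RandomState(4)
loss_train, loss_test = rng.randn(500) - 0.4, rng.randn(500) + 0.4
logits_train, logits_test = rng.randn(500, 5) + 0.2, rng.randn(500, 5) - 0.2
labels_train = rng.randint(0, 5, size=500)
labels_test = rng.randint(0, 5, size=500)

# Threshold attacks plus trained 'lr'/'knn' attackers, evaluated on the full
# data, on misclassified examples, per class, and per confidence decile.
results = mia.run_all_attacks(
    loss_train, loss_test, logits_train, logits_test,
    labels_train, labels_test, attack_classifiers=('lr', 'knn'))
# Result keys are prefixed by the data slice, e.g. 'all_thresh_loss_auc',
# 'misclassified_thresh_loss_advantage', 'class_2_thresh_loss_auc'.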
- - This function internally chooses sane default settings for all attacks and - returns all possible output combinations. - For fine grained control and partial attacks, please see `run_attack`. - - Args: - loss_train: A 1D array containing the individual scalar losses for examples - used during training. - loss_test: A 1D array containing the individual scalar losses for examples - not used during training. - logits_train: A 2D array (n_train, n_classes) of the individual logits or - output probabilities of examples used during training. - logits_test: A 2D array (n_test, n_classes) of the individual logits or - output probabilities of examples not used during training. - labels_train: The true labels of the training examples. Labels are only - needed when `by_class` is specified (i.e., not False). - labels_test: The true labels of the test examples. Labels are only needed - when `by_class` is specified (i.e., not False). - return_dict: Whether to also return a dictionary with the results summarized - in the summary string. - decimals: Round all float results to this number of decimals. If decimals is - None, don't round. - - Returns: - summarystring: A string with natural language summary of the attacks. In the - summary string printed numbers will be rounded to `decimal` decimals if - provided, otherwise will round to 3 diits by default for readability. - result: a dictionary with all the distilled attack information summarized - in the summarystring - """ - print( - 'Deprecation warning: function run_all_attacks_and_create_summary is ' - 'deprecated and will be removed soon. ' - 'Please use membership_inference_attack_new.run_attacks' - ) - summary = [] - metrics = ['auc', 'advantage'] - attack_classifiers = ['lr', 'knn'] - results = run_all_attacks( - loss_train, - loss_test, - logits_train, - logits_test, - labels_train, - labels_test, - attack_classifiers=attack_classifiers, - decimals=None) - output = _get_maximum_vulnerability(results, metrics, filterby='all') - - if decimals is not None: - strdec = decimals - else: - strdec = 4 - - for metric in metrics: - summary.append(f'========== {metric.upper()} ==========') - best_value = output['all-' + metric]['value'] - best_attacker = output['all-' + metric]['attacker'] - summary.append(f'The best attack ({best_attacker}) achieved an {metric} of ' - f'{best_value:.{strdec}f}.') - summary.append('') - - classgap = _get_maximum_class_gap_or_none(results, metrics) - if classgap: - output.update(classgap) - for metric in metrics: - summary.append(f'========== {metric.upper()} per class ==========') - hi_idx = output[f'max_vuln_class_{metric}'] - lo_idx = output[f'min_vuln_class_{metric}'] - hi = output[f'class_{hi_idx}_{metric}'] - lo = output[f'class_{lo_idx}_{metric}'] - gap = output[f'max_class_gap_{metric}'] - summary.append(f'The most vulnerable class {hi_idx} has {metric} of ' - f'{hi:.{strdec}f}.') - summary.append(f'The least vulnerable class {lo_idx} has {metric} of ' - f'{lo:.{strdec}f}.') - summary.append(f'=> The maximum gap between class vulnerabilities is ' - f'{gap:.{strdec}f}.') - summary.append('') - - misclassified = _get_maximum_vulnerability( - results, metrics, filterby='misclassified') - if misclassified: - for metric in metrics: - best_value = misclassified['misclassified-' + metric]['value'] - best_attacker = misclassified['misclassified-' + metric]['attacker'] - summary.append(f'========== {metric.upper()} for misclassified ' - '==========') - summary.append('Among misclassified examples, the best attack ' - 
f'({best_attacker}) achieved an {metric} of ' - f'{best_value:.{strdec}f}.') - summary.append('') - output.update(misclassified) - - n_examples = {k: v for k, v in results.items() if k.endswith('n_examples')} - if n_examples: - output.update(n_examples) - - # Flatten remaining dicts in output - fresh_output = {} - for k, v in output.items(): - if isinstance(v, dict): - if k.startswith('all'): - fresh_output[k[4:]] = v['value'] - fresh_output['best_attacker_' + k[4:]] = v['attacker'] - else: - fresh_output[k] = v - output = fresh_output - - if decimals is not None: - for k, v in output.items(): - if isinstance(v, float): - output[k] = round(v, decimals) - - summary = '\n'.join(summary) - if return_dict: - return summary, output - else: - return summary diff --git a/tensorflow_privacy/privacy/membership_inference_attack/membership_inference_attack_test.py b/tensorflow_privacy/privacy/membership_inference_attack/membership_inference_attack_test.py deleted file mode 100644 index 8e571ad..0000000 --- a/tensorflow_privacy/privacy/membership_inference_attack/membership_inference_attack_test.py +++ /dev/null @@ -1,307 +0,0 @@ -# Copyright 2020, The TensorFlow Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Lint as: python3 -"""Tests for tensorflow_privacy.privacy.membership_inference_attack.utils.""" -from absl.testing import absltest - -import numpy as np - -from tensorflow_privacy.privacy.membership_inference_attack import membership_inference_attack as mia - - -def get_result_dict(): - """Get an example result dictionary.""" - return { - 'test_n_examples': np.ones(1), - 'test_examples': np.zeros(1), - 'test_auc': np.ones(1), - 'test_advantage': np.ones(1), - 'all_0-metric': np.array([1]), - 'all_1-metric': np.array([2]), - 'test_2-metric': np.array([3]), - 'test_score': np.array([4]), - } - - -def get_test_inputs(): - """Get example inputs for attacks.""" - n_train = n_test = 500 - rng = np.random.RandomState(4) - loss_train = rng.randn(n_train) - 0.4 - loss_test = rng.randn(n_test) + 0.4 - logits_train = rng.randn(n_train, 5) + 0.2 - logits_test = rng.randn(n_test, 5) - 0.2 - labels_train = np.array([i % 5 for i in range(n_train)]) - labels_test = np.array([(3 * i) % 5 for i in range(n_test)]) - return (loss_train, loss_test, logits_train, logits_test, - labels_train, labels_test) - - -class GetVulnerabilityTest(absltest.TestCase): - - def test_get_vulnerabilities(self): - """Test extraction of vulnerability scores.""" - testdict = get_result_dict() - for key in ['auc', 'advantage']: - res = mia._get_vulnerabilities(testdict, key) - self.assertLen(res, 2) - self.assertEqual(res[f'test_{key}'], 1) - self.assertEqual(res['test_n_examples'], 1) - - res = mia._get_vulnerabilities(testdict, ['auc', 'advantage']) - self.assertLen(res, 3) - self.assertEqual(res['test_auc'], 1) - self.assertEqual(res['test_advantage'], 1) - self.assertEqual(res['test_n_examples'], 1) - - -class GetMaximumVulnerabilityTest(absltest.TestCase): - - def test_get_maximum_vulnerability(self): - """Test extraction 
of maximum vulnerability score.""" - testdict = get_result_dict() - for i in range(3): - key = f'{i}-metric' - res = mia._get_maximum_vulnerability(testdict, key) - self.assertLen(res, 1) - self.assertEqual(res[key]['value'], i + 1) - if i < 2: - self.assertEqual(res[key]['attacker'], f'all_{i}-metric') - else: - self.assertEqual(res[key]['attacker'], 'test_2-metric') - - res = mia._get_maximum_vulnerability(testdict, 'metric') - self.assertLen(res, 1) - self.assertEqual(res['metric']['value'], 3) - - res = mia._get_maximum_vulnerability(testdict, ['metric'], - filterby='all') - self.assertLen(res, 1) - self.assertEqual(res['all-metric']['value'], 2) - - res = mia._get_maximum_vulnerability(testdict, ['metric', 'score']) - self.assertLen(res, 2) - self.assertEqual(res['metric']['value'], 3) - self.assertEqual(res['score']['value'], 4) - self.assertEqual(res['score']['attacker'], 'test_score') - - -class ThresholdAttackLossTest(absltest.TestCase): - - def test_threshold_attack_loss(self): - """Test simple threshold attack on loss.""" - features = { - 'loss': np.zeros(10), - 'is_train': np.concatenate((np.zeros(5), np.ones(5))), - } - res = mia._run_threshold_loss_attack(features) - for k in res: - self.assertStartsWith(k, 'thresh_loss') - self.assertEqual(res['thresh_loss_auc'], 0.5) - self.assertEqual(res['thresh_loss_advantage'], 0.0) - - rng = np.random.RandomState(4) - n_train = 1000 - n_test = 500 - loss_train = rng.randn(n_train) - 0.4 - loss_test = rng.randn(n_test) + 0.4 - features = { - 'loss': np.concatenate((loss_train, loss_test)), - 'is_train': np.concatenate((np.ones(n_train), np.zeros(n_test))), - } - res = mia._run_threshold_loss_attack(features) - self.assertBetween(res['thresh_loss_auc'], 0.7, 0.75) - self.assertBetween(res['thresh_loss_advantage'], 0.3, 0.35) - - -class ThresholdAttackMaxlogitTest(absltest.TestCase): - - def test_threshold_attack_maxlogits(self): - """Test simple threshold attack on maximum logit.""" - features = { - 'logits': np.eye(10, 14), - 'is_train': np.concatenate((np.zeros(5), np.ones(5))), - } - res = mia._run_threshold_attack_maxlogit(features) - for k in res: - self.assertStartsWith(k, 'thresh_maxlogit') - self.assertEqual(res['thresh_maxlogit_auc'], 0.5) - self.assertEqual(res['thresh_maxlogit_advantage'], 0.0) - - rng = np.random.RandomState(4) - n_train = 1000 - n_test = 500 - logits_train = rng.randn(n_train, 12) + 0.2 - logits_test = rng.randn(n_test, 12) - 0.2 - features = { - 'logits': np.concatenate((logits_train, logits_test), axis=0), - 'is_train': np.concatenate((np.ones(n_train), np.zeros(n_test))), - } - res = mia._run_threshold_attack_maxlogit(features) - self.assertBetween(res['thresh_maxlogit_auc'], 0.7, 0.75) - self.assertBetween(res['thresh_maxlogit_advantage'], 0.3, 0.35) - - -class TrainedAttackTrivialTest(absltest.TestCase): - - def test_trained_attack(self): - """Test trained attacks.""" - # Trivially easy problem - x_train, x_test = np.ones((500, 3)), np.ones((20, 3)) - x_train[:200] *= -1 - x_test[:8] *= -1 - y_train, y_test = np.ones(500).astype(int), np.ones(20).astype(int) - y_train[:200] = 0 - y_test[:8] = 0 - data = (x_train, y_train), (x_test, y_test) - for clf in ['lr', 'rf', 'mlp', 'knn']: - res = mia._run_trained_attack(clf, data, attack_prefix='a-') - self.assertEqual(res['a-train_auc'], 1) - self.assertEqual(res['a-test_auc'], 1) - self.assertEqual(res['a-train_advantage'], 1) - self.assertEqual(res['a-test_advantage'], 1) - - -class TrainedAttackRandomFeaturesTest(absltest.TestCase): - - def 
test_trained_attack(self): - """Test trained attacks.""" - # Random labels and features - rng = np.random.RandomState(4) - x_train, x_test = rng.randn(500, 3), rng.randn(500, 3) - y_train = rng.binomial(1, 0.5, size=(500,)) - y_test = rng.binomial(1, 0.5, size=(500,)) - data = (x_train, y_train), (x_test, y_test) - for clf in ['lr', 'rf', 'mlp', 'knn']: - res = mia._run_trained_attack(clf, data, attack_prefix='a-') - self.assertBetween(res['a-train_auc'], 0.5, 1.) - self.assertBetween(res['a-test_auc'], 0.4, 0.6) - self.assertBetween(res['a-train_advantage'], 0., 1.0) - self.assertBetween(res['a-test_advantage'], 0., 0.2) - - -class AttackLossesTest(absltest.TestCase): - - def test_attack(self): - """Test individual attack function.""" - # losses only, both metrics - loss_train, loss_test, _, _, _, _ = get_test_inputs() - res = mia.run_attack(loss_train, loss_test, metric=('auc', 'advantage')) - self.assertBetween(res['thresh_loss_auc'], 0.7, 0.75) - self.assertBetween(res['thresh_loss_advantage'], 0.3, 0.35) - - -class AttackLossesLogitsTest(absltest.TestCase): - - def test_attack(self): - """Test individual attack function.""" - # losses and logits, two classifiers, single metric - loss_train, loss_test, logits_train, logits_test, _, _ = get_test_inputs() - res = mia.run_attack( - loss_train, - loss_test, - logits_train, - logits_test, - attack_classifiers=('rf', 'knn'), - metric='auc') - self.assertBetween(res['rf_logits_test_auc'], 0.7, 0.9) - self.assertBetween(res['knn_logits_test_auc'], 0.7, 0.9) - self.assertBetween(res['rf_logits_loss_test_auc'], 0.7, 0.9) - self.assertBetween(res['knn_logits_loss_test_auc'], 0.7, 0.9) - - -class AttackLossesLabelsByClassTest(absltest.TestCase): - - def test_attack(self): - # losses and labels, single metric, split by class - loss_train, loss_test, _, _, labels_train, labels_test = get_test_inputs() - n_train = loss_train.shape[0] - n_test = loss_test.shape[0] - res = mia.run_attack( - loss_train, - loss_test, - labels_train=labels_train, - labels_test=labels_test, - by_class=True, - metric='auc') - self.assertLen(res, 10) - for k in res: - self.assertStartsWith(k, 'class_') - if k.endswith('n_examples'): - self.assertEqual(int(res[k]), (n_train + n_test) // 5) - else: - self.assertBetween(res[k], 0.65, 0.75) - - -class AttackLossesLabelsSingleClassTest(absltest.TestCase): - - def test_attack(self): - # losses and labels, both metrics, single class - loss_train, loss_test, _, _, labels_train, labels_test = get_test_inputs() - n_train = loss_train.shape[0] - n_test = loss_test.shape[0] - res = mia.run_attack( - loss_train, - loss_test, - labels_train=labels_train, - labels_test=labels_test, - by_class=2, - metric=('auc', 'advantage')) - self.assertLen(res, 3) - for k in res: - self.assertStartsWith(k, 'class_2') - if k.endswith('n_examples'): - self.assertEqual(int(res[k]), (n_train + n_test) // 5) - elif k.endswith('advantage'): - self.assertBetween(res[k], 0.3, 0.5) - elif k.endswith('auc'): - self.assertBetween(res[k], 0.7, 0.75) - - -class AttackLogitsLabelsMisclassifiedTest(absltest.TestCase): - - def test_attack(self): - # logits and labels, single metric, single classifier, misclassified only - (_, _, logits_train, logits_test, - labels_train, labels_test) = get_test_inputs() - res = mia.run_attack( - logits_train=logits_train, - logits_test=logits_test, - labels_train=labels_train, - labels_test=labels_test, - only_misclassified=True, - attack_classifiers=('lr',), - metric='advantage') - 
self.assertBetween(res['misclassified_lr_logits_test_advantage'], 0.3, 0.8) - self.assertEqual(res['misclassified_n_examples'], 802) - - -class AttackLogitsByPrecentileTest(absltest.TestCase): - - def test_attack(self): - # only logits, single metric, no classifiers, split by deciles - _, _, logits_train, logits_test, _, _ = get_test_inputs() - res = mia.run_attack( - logits_train=logits_train, - logits_test=logits_test, - by_percentile=True, - metric='auc') - for k in res: - self.assertStartsWith(k, 'percentile') - self.assertBetween(res[k], 0.60, 0.75) - - -if __name__ == '__main__': - absltest.main() diff --git a/tensorflow_privacy/privacy/membership_inference_attack/trained_attack_models.py b/tensorflow_privacy/privacy/membership_inference_attack/trained_attack_models.py deleted file mode 100644 index f03c035..0000000 --- a/tensorflow_privacy/privacy/membership_inference_attack/trained_attack_models.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright 2020, The TensorFlow Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Lint as: python3 -r"""A collection of sklearn models for binary classification. - -This module contains some sklearn pipelines for finding models for binary -classification from a variable number of numerical input features. -These models are used to train binary classifiers for membership inference. 
-""" - -from typing import Text - -import numpy as np -from sklearn import ensemble -from sklearn import linear_model -from sklearn import model_selection -from sklearn import neighbors -from sklearn import neural_network - - -def choose_model(attack_classifier: Text): - """Choose a model based on a string classifier.""" - if attack_classifier == 'lr': - return logistic_regression() - elif attack_classifier == 'mlp': - return mlp() - elif attack_classifier == 'rf': - return random_forest() - elif attack_classifier == 'knn': - return knn() - else: - raise ValueError(f'Unknown attack classifier {attack_classifier}.') - - -def logistic_regression(verbose: int = 0, n_jobs: int = 1): - """Setup a logistic regression pipeline with cross-validation.""" - lr = linear_model.LogisticRegression(solver='lbfgs') - param_grid = { - 'C': np.logspace(-4, 2, 10), - } - pipe = model_selection.GridSearchCV( - lr, param_grid=param_grid, cv=3, n_jobs=n_jobs, iid=False, - verbose=verbose) - return pipe - - -def random_forest(verbose: int = 0, n_jobs: int = 1): - """Setup a random forest pipeline with cross-validation.""" - rf = ensemble.RandomForestClassifier() - - n_estimators = [100] - max_features = ['auto', 'sqrt'] - max_depth = [5, 10, 20] - max_depth.append(None) - min_samples_split = [2, 5, 10] - min_samples_leaf = [1, 2, 4] - random_grid = {'n_estimators': n_estimators, - 'max_features': max_features, - 'max_depth': max_depth, - 'min_samples_split': min_samples_split, - 'min_samples_leaf': min_samples_leaf} - - pipe = model_selection.RandomizedSearchCV( - rf, param_distributions=random_grid, n_iter=7, cv=3, n_jobs=n_jobs, - iid=False, verbose=verbose) - return pipe - - -def mlp(verbose: int = 0, n_jobs: int = 1): - """Setup a MLP pipeline with cross-validation.""" - mlpmodel = neural_network.MLPClassifier() - - param_grid = { - 'hidden_layer_sizes': [(64,), (32, 32)], - 'solver': ['adam'], - 'alpha': [0.0001, 0.001, 0.01], - } - pipe = model_selection.GridSearchCV( - mlpmodel, param_grid=param_grid, cv=3, n_jobs=n_jobs, iid=False, - verbose=verbose) - return pipe - - -def knn(verbose: int = 0, n_jobs: int = 1): - """Setup a k-nearest neighbors pipeline with cross-validation.""" - knnmodel = neighbors.KNeighborsClassifier() - - param_grid = { - 'n_neighbors': [3, 5, 7], - } - pipe = model_selection.GridSearchCV( - knnmodel, param_grid=param_grid, cv=3, n_jobs=n_jobs, iid=False, - verbose=verbose) - return pipe