Fixes comments and membership scores for thresholds attack.

PiperOrigin-RevId: 555579896
This commit is contained in:
Shuang Song 2023-08-10 11:29:52 -07:00 committed by A. Unique TensorFlower
parent fafa69b65c
commit 27069d347d
7 changed files with 17 additions and 17 deletions

View file

@ -94,15 +94,15 @@ which should give (something like) the following output
``` ```
Attack Ours (online) Attack Ours (online)
AUC 0.6675, Accuracy 0.6074, TPR@0.1%FPR of 0.0104 AUC 0.6676, Accuracy 0.6077, TPR@0.1%FPR of 0.0169
Attack Ours (online, fixed variance) Attack Ours (online, fixed variance)
AUC 0.6831, Accuracy 0.6140, TPR@0.1%FPR of 0.0541 AUC 0.6856, Accuracy 0.6137, TPR@0.1%FPR of 0.0593
Attack Ours (offline) Attack Ours (offline)
AUC 0.5465, Accuracy 0.5486, TPR@0.1%FPR of 0.0073 AUC 0.5488, Accuracy 0.5500, TPR@0.1%FPR of 0.0130
Attack Ours (offline, fixed variance) Attack Ours (offline, fixed variance)
AUC 0.5518, Accuracy 0.5485, TPR@0.1%FPR of 0.0259 AUC 0.5549, Accuracy 0.5537, TPR@0.1%FPR of 0.0299
Attack Global threshold Attack Global threshold
AUC 0.5900, Accuracy 0.6018, TPR@0.1%FPR of 0.0007 AUC 0.5921, Accuracy 0.6044, TPR@0.1%FPR of 0.0009
``` ```
where the global threshold attack is the baseline, and our online, where the global threshold attack is the baseline, and our online,

Binary file not shown.

Before

Width:  |  Height:  |  Size: 38 KiB

After

Width:  |  Height:  |  Size: 37 KiB

View file

@ -69,7 +69,7 @@ def main(argv):
for dy in range(0, 2*shift+1, stride): for dy in range(0, 2*shift+1, stride):
this_x = aug_pad[:, dx:dx+32, dy:dy+32, :].transpose((0,3,1,2)) this_x = aug_pad[:, dx:dx+32, dy:dy+32, :].transpose((0,3,1,2))
logits = model.model(this_x, training=False) logits = model.model(this_x, training=True)
outs.append(logits) outs.append(logits)
print(np.array(outs).shape) print(np.array(outs).shape)

View file

@ -88,13 +88,13 @@ which should give (something like) the following output
``` ```
Attack No poison (LiRA) Attack No poison (LiRA)
AUC 0.6992, Accuracy 0.6240, TPR@0.1%FPR of 0.0529 AUC 0.7025, Accuracy 0.6258, TPR@0.1%FPR of 0.0544
Attack No poison (Global threshold) Attack No poison (Global threshold)
AUC 0.6200, Accuracy 0.6167, TPR@0.1%FPR of 0.0011 AUC 0.6191, Accuracy 0.6173, TPR@0.1%FPR of 0.0012
Attack With poison (LiRA) Attack With poison (LiRA)
AUC 0.9904, Accuracy 0.9617, TPR@0.1%FPR of 0.3730 AUC 0.9943, Accuracy 0.9653, TPR@0.1%FPR of 0.4945
Attack With poison (Global threshold) Attack With poison (Global threshold)
AUC 0.9911, Accuracy 0.9580, TPR@0.1%FPR of 0.2130 AUC 0.9922, Accuracy 0.9603, TPR@0.1%FPR of 0.3930
``` ```
where the baselines are LiRA and a simple global threshold on the membership where the baselines are LiRA and a simple global threshold on the membership

Binary file not shown.

Before

Width:  |  Height:  |  Size: 32 KiB

After

Width:  |  Height:  |  Size: 32 KiB

View file

@ -109,8 +109,7 @@ def _run_trained_attack(
labels[train_indices], labels[train_indices],
sample_weight=sample_weights_train, sample_weight=sample_weights_train,
) )
predictions = attacker.predict(features[test_indices]) scores[test_indices] = attacker.predict(features[test_indices])
scores[test_indices] = predictions
except ValueError as ve: except ValueError as ve:
if 'cannot be greater than the number of members in each class.' in str(ve): if 'cannot be greater than the number of members in each class.' in str(ve):
logging.warning('kf.split in _run_trained_attack fails with: %s', str(ve)) logging.warning('kf.split in _run_trained_attack fails with: %s', str(ve))
@ -200,8 +199,9 @@ def _run_threshold_attack(attack_input: AttackInputData):
slice_spec=_get_slice_spec(attack_input), slice_spec=_get_slice_spec(attack_input),
data_size=DataSize(ntrain=ntrain, ntest=ntest), data_size=DataSize(ntrain=ntrain, ntest=ntest),
attack_type=AttackType.THRESHOLD_ATTACK, attack_type=AttackType.THRESHOLD_ATTACK,
membership_scores_train=attack_input.get_loss_train(), # Negate loss because training examples are expected to have lower loss.
membership_scores_test=attack_input.get_loss_test(), membership_scores_train=-attack_input.get_loss_train(),
membership_scores_test=-attack_input.get_loss_test(),
roc_curve=roc_curve, roc_curve=roc_curve,
epsilon_lower_bound_value=epsilon_lower_bound_value, epsilon_lower_bound_value=epsilon_lower_bound_value,
) )

View file

@ -185,19 +185,19 @@ class TrainedAttacker(object):
raise NotImplementedError() raise NotImplementedError()
def predict(self, input_features): def predict(self, input_features):
"""Predicts whether input_features belongs to train or test. """Predicts the probability that input_features belongs to train.
Args: Args:
input_features : A vector of features with the same semantics as x_train input_features : A vector of features with the same semantics as x_train
passed to train_model. passed to train_model.
Returns: Returns:
An array of probabilities denoting whether the example belongs to test. An array of probabilities that the examples belong to train.
""" """
if self.model is None: if self.model is None:
raise AssertionError( raise AssertionError(
'Model not trained yet. Please call train_model first.') 'Model not trained yet. Please call train_model first.')
return self.model.predict_proba(input_features)[:, 1] return self.model.predict_proba(input_features)[:, 1] # Train has label 1
class LogisticRegressionAttacker(TrainedAttacker): class LogisticRegressionAttacker(TrainedAttacker):