Fixes comments and membership scores for threshold attack.

PiperOrigin-RevId: 555579896
This commit is contained in:
Shuang Song 2023-08-10 11:29:52 -07:00 committed by A. Unique TensorFlower
parent fafa69b65c
commit 27069d347d
7 changed files with 17 additions and 17 deletions

View file

@ -94,15 +94,15 @@ which should give (something like) the following output
```
Attack Ours (online)
AUC 0.6675, Accuracy 0.6074, TPR@0.1%FPR of 0.0104
AUC 0.6676, Accuracy 0.6077, TPR@0.1%FPR of 0.0169
Attack Ours (online, fixed variance)
AUC 0.6831, Accuracy 0.6140, TPR@0.1%FPR of 0.0541
AUC 0.6856, Accuracy 0.6137, TPR@0.1%FPR of 0.0593
Attack Ours (offline)
AUC 0.5465, Accuracy 0.5486, TPR@0.1%FPR of 0.0073
AUC 0.5488, Accuracy 0.5500, TPR@0.1%FPR of 0.0130
Attack Ours (offline, fixed variance)
AUC 0.5518, Accuracy 0.5485, TPR@0.1%FPR of 0.0259
AUC 0.5549, Accuracy 0.5537, TPR@0.1%FPR of 0.0299
Attack Global threshold
AUC 0.5900, Accuracy 0.6018, TPR@0.1%FPR of 0.0007
AUC 0.5921, Accuracy 0.6044, TPR@0.1%FPR of 0.0009
```
where the global threshold attack is the baseline, and our online,

Binary file not shown.

Before

Width:  |  Height:  |  Size: 38 KiB

After

Width:  |  Height:  |  Size: 37 KiB

View file

@ -69,7 +69,7 @@ def main(argv):
for dy in range(0, 2*shift+1, stride):
this_x = aug_pad[:, dx:dx+32, dy:dy+32, :].transpose((0,3,1,2))
logits = model.model(this_x, training=False)
logits = model.model(this_x, training=True)
outs.append(logits)
print(np.array(outs).shape)

View file

@ -88,13 +88,13 @@ which should give (something like) the following output
```
Attack No poison (LiRA)
AUC 0.6992, Accuracy 0.6240, TPR@0.1%FPR of 0.0529
AUC 0.7025, Accuracy 0.6258, TPR@0.1%FPR of 0.0544
Attack No poison (Global threshold)
AUC 0.6200, Accuracy 0.6167, TPR@0.1%FPR of 0.0011
AUC 0.6191, Accuracy 0.6173, TPR@0.1%FPR of 0.0012
Attack With poison (LiRA)
AUC 0.9904, Accuracy 0.9617, TPR@0.1%FPR of 0.3730
AUC 0.9943, Accuracy 0.9653, TPR@0.1%FPR of 0.4945
Attack With poison (Global threshold)
AUC 0.9911, Accuracy 0.9580, TPR@0.1%FPR of 0.2130
AUC 0.9922, Accuracy 0.9603, TPR@0.1%FPR of 0.3930
```
where the baselines are LiRA and a simple global threshold on the membership

Binary file not shown.

Before

Width:  |  Height:  |  Size: 32 KiB

After

Width:  |  Height:  |  Size: 32 KiB

View file

@ -109,8 +109,7 @@ def _run_trained_attack(
labels[train_indices],
sample_weight=sample_weights_train,
)
predictions = attacker.predict(features[test_indices])
scores[test_indices] = predictions
scores[test_indices] = attacker.predict(features[test_indices])
except ValueError as ve:
if 'cannot be greater than the number of members in each class.' in str(ve):
logging.warning('kf.split in _run_trained_attack fails with: %s', str(ve))
@ -200,8 +199,9 @@ def _run_threshold_attack(attack_input: AttackInputData):
slice_spec=_get_slice_spec(attack_input),
data_size=DataSize(ntrain=ntrain, ntest=ntest),
attack_type=AttackType.THRESHOLD_ATTACK,
membership_scores_train=attack_input.get_loss_train(),
membership_scores_test=attack_input.get_loss_test(),
# Negate loss because training examples are expected to have lower loss.
membership_scores_train=-attack_input.get_loss_train(),
membership_scores_test=-attack_input.get_loss_test(),
roc_curve=roc_curve,
epsilon_lower_bound_value=epsilon_lower_bound_value,
)

View file

@ -185,19 +185,19 @@ class TrainedAttacker(object):
raise NotImplementedError()
def predict(self, input_features):
"""Predicts whether input_features belongs to train or test.
"""Predicts the probability that input_features belongs to train.
Args:
input_features : A vector of features with the same semantics as x_train
passed to train_model.
Returns:
An array of probabilities denoting whether the example belongs to test.
An array of probabilities that the examples belong to train.
"""
if self.model is None:
raise AssertionError(
'Model not trained yet. Please call train_model first.')
return self.model.predict_proba(input_features)[:, 1]
return self.model.predict_proba(input_features)[:, 1] # Train has label 1
class LogisticRegressionAttacker(TrainedAttacker):