Fixes comments and membership scores for the threshold attack.
PiperOrigin-RevId: 555579896
This commit is contained in:
parent
fafa69b65c
commit
27069d347d
7 changed files with 17 additions and 17 deletions
|
@ -94,15 +94,15 @@ which should give (something like) the following output
|
|||
|
||||
```
|
||||
Attack Ours (online)
|
||||
AUC 0.6675, Accuracy 0.6074, TPR@0.1%FPR of 0.0104
|
||||
AUC 0.6676, Accuracy 0.6077, TPR@0.1%FPR of 0.0169
|
||||
Attack Ours (online, fixed variance)
|
||||
AUC 0.6831, Accuracy 0.6140, TPR@0.1%FPR of 0.0541
|
||||
AUC 0.6856, Accuracy 0.6137, TPR@0.1%FPR of 0.0593
|
||||
Attack Ours (offline)
|
||||
AUC 0.5465, Accuracy 0.5486, TPR@0.1%FPR of 0.0073
|
||||
AUC 0.5488, Accuracy 0.5500, TPR@0.1%FPR of 0.0130
|
||||
Attack Ours (offline, fixed variance)
|
||||
AUC 0.5518, Accuracy 0.5485, TPR@0.1%FPR of 0.0259
|
||||
AUC 0.5549, Accuracy 0.5537, TPR@0.1%FPR of 0.0299
|
||||
Attack Global threshold
|
||||
AUC 0.5900, Accuracy 0.6018, TPR@0.1%FPR of 0.0007
|
||||
AUC 0.5921, Accuracy 0.6044, TPR@0.1%FPR of 0.0009
|
||||
```
|
||||
|
||||
where the global threshold attack is the baseline, and our online,
|
||||
|
|
Binary file not shown.
Before Width: | Height: | Size: 38 KiB After Width: | Height: | Size: 37 KiB |
|
@ -69,7 +69,7 @@ def main(argv):
|
|||
for dy in range(0, 2*shift+1, stride):
|
||||
this_x = aug_pad[:, dx:dx+32, dy:dy+32, :].transpose((0,3,1,2))
|
||||
|
||||
logits = model.model(this_x, training=False)
|
||||
logits = model.model(this_x, training=True)
|
||||
outs.append(logits)
|
||||
|
||||
print(np.array(outs).shape)
|
||||
|
|
|
@ -88,13 +88,13 @@ which should give (something like) the following output
|
|||
|
||||
```
|
||||
Attack No poison (LiRA)
|
||||
AUC 0.6992, Accuracy 0.6240, TPR@0.1%FPR of 0.0529
|
||||
AUC 0.7025, Accuracy 0.6258, TPR@0.1%FPR of 0.0544
|
||||
Attack No poison (Global threshold)
|
||||
AUC 0.6200, Accuracy 0.6167, TPR@0.1%FPR of 0.0011
|
||||
AUC 0.6191, Accuracy 0.6173, TPR@0.1%FPR of 0.0012
|
||||
Attack With poison (LiRA)
|
||||
AUC 0.9904, Accuracy 0.9617, TPR@0.1%FPR of 0.3730
|
||||
AUC 0.9943, Accuracy 0.9653, TPR@0.1%FPR of 0.4945
|
||||
Attack With poison (Global threshold)
|
||||
AUC 0.9911, Accuracy 0.9580, TPR@0.1%FPR of 0.2130
|
||||
AUC 0.9922, Accuracy 0.9603, TPR@0.1%FPR of 0.3930
|
||||
```
|
||||
|
||||
where the baselines are LiRA and a simple global threshold on the membership
|
||||
|
|
Binary file not shown.
Before Width: | Height: | Size: 32 KiB After Width: | Height: | Size: 32 KiB |
|
@ -109,8 +109,7 @@ def _run_trained_attack(
|
|||
labels[train_indices],
|
||||
sample_weight=sample_weights_train,
|
||||
)
|
||||
predictions = attacker.predict(features[test_indices])
|
||||
scores[test_indices] = predictions
|
||||
scores[test_indices] = attacker.predict(features[test_indices])
|
||||
except ValueError as ve:
|
||||
if 'cannot be greater than the number of members in each class.' in str(ve):
|
||||
logging.warning('kf.split in _run_trained_attack fails with: %s', str(ve))
|
||||
|
@ -200,8 +199,9 @@ def _run_threshold_attack(attack_input: AttackInputData):
|
|||
slice_spec=_get_slice_spec(attack_input),
|
||||
data_size=DataSize(ntrain=ntrain, ntest=ntest),
|
||||
attack_type=AttackType.THRESHOLD_ATTACK,
|
||||
membership_scores_train=attack_input.get_loss_train(),
|
||||
membership_scores_test=attack_input.get_loss_test(),
|
||||
# Negate loss because training examples are expected to have lower loss.
|
||||
membership_scores_train=-attack_input.get_loss_train(),
|
||||
membership_scores_test=-attack_input.get_loss_test(),
|
||||
roc_curve=roc_curve,
|
||||
epsilon_lower_bound_value=epsilon_lower_bound_value,
|
||||
)
|
||||
|
|
|
@ -185,19 +185,19 @@ class TrainedAttacker(object):
|
|||
raise NotImplementedError()
|
||||
|
||||
def predict(self, input_features):
|
||||
"""Predicts whether input_features belongs to train or test.
|
||||
"""Predicts the probability that input_features belongs to train.
|
||||
|
||||
Args:
|
||||
input_features : A vector of features with the same semantics as x_train
|
||||
passed to train_model.
|
||||
|
||||
Returns:
|
||||
An array of probabilities denoting whether the example belongs to test.
|
||||
An array of probabilities that the examples belong to train.
|
||||
"""
|
||||
if self.model is None:
|
||||
raise AssertionError(
|
||||
'Model not trained yet. Please call train_model first.')
|
||||
return self.model.predict_proba(input_features)[:, 1]
|
||||
return self.model.predict_proba(input_features)[:, 1] # Train has label 1
|
||||
|
||||
|
||||
class LogisticRegressionAttacker(TrainedAttacker):
|
||||
|
|
Loading…
Reference in a new issue