O1: get audit vectors
parent 4692502763
commit 5d6f7e2916
1 changed file with 17 additions and 12 deletions
@@ -49,11 +49,11 @@ def get_dataloaders(m=1000, train_batch_size=128, test_batch_size=10):

    # Original dataset
    x = np.stack([train_ds[i][0].numpy() for i in range(len(train_ds))])  # Applies transforms
    p = np.random.permutation(len(train_ds))

    # Choose m points to randomly exclude by chance
    S = np.full(len(train_ds), True)
    S[:m] = np.random.choice([True, False], size=m)  # Vector determining whether each point is in or out
    p = np.random.permutation(len(train_ds))

    # Store the m points which could have been included/excluded
    mask = np.full(len(train_ds), False)
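For intuition, here is a minimal standalone sketch of the selection step above on synthetic data. The mask over the m candidate points is built outside this hunk, so only S, p, and the S[p] indexing shown in the diff are assumed; everything else is illustrative.

import numpy as np

n, m = 10, 4                                      # toy dataset size and number of audit candidates
S = np.full(n, True)
S[:m] = np.random.choice([True, False], size=m)   # coin flip for each of the m candidates
p = np.random.permutation(n)                      # random reordering, applied via S[p] as in the diff

kept = np.arange(n)[S[p]]                         # analogue of x_in = x[S[p]]: indices that stay in training
print("inclusion vector S[p]:", S[p])
print("kept indices:         ", kept)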
@@ -62,6 +62,7 @@ def get_dataloaders(m=1000, train_batch_size=128, test_batch_size=10):

    x_m = x[mask]  # These are the points being guessed at
    y_m = np.array(train_ds.targets)[mask].astype(np.int64)
+   S_m = S[p][mask]  # Ground truth of inclusion/exclusion for x_m

    # Remove excluded points from dataset
    x_in = x[S[p]]
@@ -72,7 +73,7 @@ def get_dataloaders(m=1000, train_batch_size=128, test_batch_size=10):

    train_dl = DataLoader(td, batch_size=train_batch_size, shuffle=True, num_workers=4)
    test_dl = DataLoader(test_ds, batch_size=test_batch_size, shuffle=True, num_workers=4)

-   return train_dl, test_dl, x_in, x_m, y_m, S[p]
+   return train_dl, test_dl, x_in, x_m, y_m, S_m


def evaluate_on(model, dataloader):
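For orientation, a hypothetical call to the updated get_dataloaders, using the defaults shown in the hunk header; only the return names introduced by this diff are assumed.

# Hypothetical usage; m and batch size taken from the defaults in the hunk header.
train_dl, test_dl, x_in, x_m, y_m, S_m = get_dataloaders(m=1000, train_batch_size=128)

# Each audited point x_m[i] carries a label y_m[i] and a ground-truth inclusion bit S_m[i].
assert len(x_m) == len(y_m) == len(S_m)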
@@ -224,7 +225,9 @@ def main():

        "delta": 1e-5,
        "norm": args.norm,
        "batch_size": 4096,
-       "epochs": 20,
+       "epochs": 2,
+       "k+": 300,
+       "k-": 300,
    }

    hp['logfile'] = Path('WideResNet_{}_{}_{}_{}s_x{}_{}e_{}d_{}C.txt'.format(
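The new k+ and k- entries appear to set how many positive and negative membership guesses the audit below makes; a toy back-of-the-envelope, assuming target_points keeps the m=1000 default from get_dataloaders:

m, k_plus, k_minus = 1000, 300, 300   # m is an assumption; only k+ = k- = 300 appears in this diff
guesses = k_plus + k_minus            # 600 membership guesses in total
abstain = m - guesses                 # the remaining 400 audited points get no guess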
@@ -238,18 +241,13 @@ def main():

        hp['norm'],
    ))

-   train_dl, test_dl, x_in, x_m, y_m, S = get_dataloaders(hp['target_points'], hp['batch_size'])
+   train_dl, test_dl, x_in, x_m, y_m, S_m = get_dataloaders(hp['target_points'], hp['batch_size'])
    print(f"len train: {len(train_dl)}")
-   print(f"Got vector S: {S.shape}, sum={np.sum(S)}, S[:{hp['target_points']}] = {S[:8]}")
+   print(f"Got vector Sm: {S_m.shape}, sum={np.sum(S_m)}")
    print(f"Got x_in: {x_in.shape}")
    print(f"Got x_m: {x_m.shape}")
    print(f"Got y_m: {y_m.shape}")

    for x, y in train_dl:
        print(f"dl x shape: {x.shape}")
        print(f"dl y shape: {y.shape}")
        break

    model_init, model_trained = train(hp, train_dl, test_dl)

    # torch.save(model_init.state_dict(), "data/init_model.pt")
@@ -265,15 +263,22 @@ def main():

    for i in range(len(x_m)):
        x_point = x_m[i].unsqueeze(0)
        y_point = y_m[i].unsqueeze(0)
+       is_in = S_m[i]

        init_loss = criterion(model_init(x_point)[0], y_point)
        trained_loss = criterion(model_trained(x_point)[0], y_point)

-       scores.append(init_loss - trained_loss)
+       scores.append(((init_loss - trained_loss).item(), is_in))

+   scores = sorted(scores, key=lambda x: x[0])
+   scores = np.array([x[1] for x in scores])

    print(len(scores))
    print(scores[:10])

+   correct = np.sum(~scores[:hp['k-']]) + np.sum(scores[-hp['k+']:])
+   total = len(scores)
+   print(f"Audit total: {correct}/{total} = {round(correct/total*100, 2)}")

    correct, total = evaluate_on(model_init, train_dl)
    print(f"Init model accuracy: {correct}/{total} = {round(correct/total*100, 2)}")
    correct, total = evaluate_on(model_trained, test_dl)
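As a self-contained illustration of the audit step added above, the following sketch runs the same sort-and-guess rule on synthetic scores and membership bits; the data is fabricated for illustration, and the variable names mirror, but are not, the repository's code.

import numpy as np

rng = np.random.default_rng(0)
m, k_plus, k_minus = 1000, 300, 300

is_in = rng.random(m) < 0.5                 # synthetic stand-in for the ground-truth bits S_m
score = rng.normal(size=m) + is_in          # synthetic loss drops; included points tend to score higher

truth = is_in[np.argsort(score)]            # membership bits ordered by score, ascending

# Guess "out" for the k- lowest scores and "in" for the k+ highest, as in the diff.
correct = np.sum(~truth[:k_minus]) + np.sum(truth[-k_plus:])
total = len(truth)
print(f"Audit total: {correct}/{total} = {round(correct/total*100, 2)}")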