From a697d4687ced17e82903101c09c6e07eaa4ddcc0 Mon Sep 17 00:00:00 2001
From: Akemi Izuko
Date: Thu, 5 Dec 2024 00:13:50 -0700
Subject: [PATCH] O1: new data splitting

---
 one_run_audit/audit.py     | 246 +++++++++++++++++++++++++++++++------
 one_run_audit/equations.py |   3 +-
 2 files changed, 210 insertions(+), 39 deletions(-)

diff --git a/one_run_audit/audit.py b/one_run_audit/audit.py
index 48012f3..2f1c308 100644
--- a/one_run_audit/audit.py
+++ b/one_run_audit/audit.py
@@ -7,13 +7,14 @@ import torch
 import torch.nn as nn
 from torch import optim
 from torch.optim.lr_scheduler import MultiStepLR
-from torch.utils.data import DataLoader, Subset, TensorDataset
+from torch.utils.data import DataLoader, Subset, TensorDataset, ConcatDataset
 import torch.nn.functional as F
 from pathlib import Path
 from torchvision import transforms
 from torchvision.datasets import CIFAR10
 import pytorch_lightning as pl
 import opacus
+import random
 from opacus.validators import ModuleValidator
 from opacus.utils.batch_memory_manager import BatchMemoryManager
 from WideResNet import WideResNet
@@ -50,25 +51,33 @@ def get_dataloaders(m=1000, train_batch_size=128, test_batch_size=10):
 
     # Original dataset
     x = np.stack(train_ds[i][0].numpy() for i in range(len(train_ds))) # Applies transforms
+    y = np.array(train_ds.targets).astype(np.int64)
     p = np.random.permutation(len(train_ds))
 
     # Choose m points to randomly exclude at chance
     S = np.full(len(train_ds), True)
     S[:m] = np.random.choice([True, False], size=m) # Vector of determining if each point is in or out
+    S = S[p]
 
     # Store the m points which could have been included/excluded
     mask = np.full(len(train_ds), False)
     mask[:m] = True
     mask = mask[p]
 
     x_m = x[mask] # These are the points being guessed at
+    S_m = S[mask] # Ground truth of inclusion/exclusion for x_m
+    y_m = np.array(train_ds.targets)[mask].astype(np.int64)
-    S_m = S[p][mask] # Ground truth of inclusion/exclusion for x_m
+
+    # Mislabel inclusion/exclusion examples intentionally!
+    for i in range(len(y_m)):
+        possible_values = np.array([x for x in range(10) if x != y_m[i]])
+        y_m[i] = np.random.choice(possible_values)
 
     # Remove excluded points from dataset
-    x_in = x[S[p]]
+    x_in = x[S]
     y_in = np.array(train_ds.targets).astype(np.int64)
-    y_in = y_in[S[p]]
+    y_in = y_in[S]
 
     td = TensorDataset(torch.from_numpy(x_in), torch.from_numpy(y_in).long())
     train_dl = DataLoader(td, batch_size=train_batch_size, shuffle=True, num_workers=4)
@@ -77,6 +86,110 @@ def get_dataloaders(m=1000, train_batch_size=128, test_batch_size=10):
 
     return train_dl, test_dl, x_in, x_m, y_m, S_m
 
 
+def get_dataloaders2(m=1000, train_batch_size=128, test_batch_size=10):
+    seed = np.random.randint(0, 1e9)
+    seed ^= int(time.time())
+    pl.seed_everything(seed)
+
+    train_transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Lambda(lambda x: F.pad(x.unsqueeze(0),
+                          (4, 4, 4, 4), mode='reflect').squeeze()),
+        transforms.ToPILImage(),
+        transforms.RandomCrop(32),
+        transforms.RandomHorizontalFlip(),
+        transforms.ToTensor(),
+        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+    ])
+    test_transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+    ])
+    datadir = Path("./data")
+
+    train_ds = CIFAR10(root=datadir, train=True, download=True, transform=train_transform)
+    trainp_ds = CIFAR10(root=datadir, train=False, download=True, transform=test_transform)
+    test_ds = CIFAR10(root=datadir, train=False, download=True, transform=test_transform)
+
+    mask = random.sample(range(len(trainp_ds)), m)
+    S = np.random.choice([True, False], size=m)
+    S_mask = list(map(lambda x: x[1], filter(lambda x: S[x[0]], enumerate(mask))))
+
+    x_adv = Subset(trainp_ds, mask)
+    x_in_adv = Subset(trainp_ds, S_mask)
+
+    train_ds = ConcatDataset([train_ds, x_in_adv])
+
+    check_train_dl = DataLoader(train_ds, batch_size=1, shuffle=False, num_workers=1)
+    train_dl = DataLoader(train_ds, batch_size=train_batch_size, shuffle=True, num_workers=4)
+    x_adv_dl = DataLoader(x_adv, batch_size=1, shuffle=False, num_workers=1)
+    test_dl = DataLoader(test_ds, batch_size=test_batch_size, shuffle=True, num_workers=4)
+
+    return train_dl, test_dl, x_adv_dl, S, check_train_dl
+
+
+def get_dataloaders3(m=1000, train_batch_size=128, test_batch_size=10):
+    seed = np.random.randint(0, 1e9)
+    seed ^= int(time.time())
+    pl.seed_everything(seed)
+
+    train_transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Lambda(lambda x: F.pad(x.unsqueeze(0),
+                          (4, 4, 4, 4), mode='reflect').squeeze()),
+        transforms.ToPILImage(),
+        transforms.RandomCrop(32),
+        transforms.RandomHorizontalFlip(),
+        transforms.ToTensor(),
+        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+    ])
+    test_transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+    ])
+    datadir = Path("./data")
+    train_ds = CIFAR10(root=datadir, train=True, download=True, transform=train_transform)
+    test_ds = CIFAR10(root=datadir, train=False, download=True, transform=test_transform)
+
+    # Original dataset
+    x_train = np.stack(train_ds[i][0].numpy() for i in range(len(train_ds)))
+    y_train = np.array(train_ds.targets).astype(np.int64)
+
+    x = np.stack(test_ds[i][0].numpy() for i in range(len(test_ds))) # Applies transforms
+    y = np.array(test_ds.targets).astype(np.int64)
+
+    # Store the m points which could have been included/excluded
+    mask = np.full(len(test_ds), False)
+    mask[:m] = True
+    mask = mask[np.random.permutation(len(test_ds))]
+
+    adv_points = x[mask]
+    adv_labels = y[mask]
+
+    # Mislabel inclusion/exclusion examples intentionally!
+    for i in range(len(adv_labels)):
+        while True:
+            c = np.random.choice(range(10))
+            if adv_labels[i] != c:
+                adv_labels[i] = c
+                break
+
+    # Choose m points to randomly exclude at chance
+    S = np.random.choice([True, False], size=m) # Vector of determining if each point is in or out
+
+    assert len(adv_points) == m
+    inc_points = adv_points[S]
+    inc_labels = adv_labels[S]
+
+    td = TensorDataset(torch.from_numpy(inc_points).float(), torch.from_numpy(inc_labels).long())
+    td2 = TensorDataset(torch.from_numpy(x_train).float(), torch.from_numpy(y_train).long())
+    td = ConcatDataset([td, td2])
+    train_dl = DataLoader(td, batch_size=train_batch_size, shuffle=True, num_workers=4)
+    test_dl = DataLoader(test_ds, batch_size=test_batch_size, shuffle=True, num_workers=4)
+
+    return train_dl, test_dl, adv_points, adv_labels, S
+
+
 def evaluate_on(model, dataloader):
     correct = 0
     total = 0
@@ -126,6 +239,54 @@ def train_no_cap(model, hp, train_dl, test_dl, optimizer, criterion, scheduler):
 
     return best_test_set_accuracy
 
 
+def load(hp, model_path, train_dl):
+    init_model = model_path / "init_model.pt"
+    trained_model = model_path / "trained_model.pt"
+
+    model = WideResNet(
+        d=hp["wrn_depth"],
+        k=hp["wrn_width"],
+        n_classes=10,
+        input_features=3,
+        output_features=16,
+        strides=[1, 1, 2, 2],
+    )
+    model = ModuleValidator.fix(model)
+    ModuleValidator.validate(model, strict=True)
+    model_init = copy.deepcopy(model)
+
+    privacy_engine = opacus.PrivacyEngine()
+    optimizer = optim.SGD(
+        model.parameters(),
+        lr=0.1,
+        momentum=0.9,
+        nesterov=True,
+        weight_decay=5e-4
+    )
+    model, optimizer, train_loader = privacy_engine.make_private_with_epsilon(
+        module=model,
+        optimizer=optimizer,
+        data_loader=train_dl,
+        epochs=hp['epochs'],
+        target_epsilon=hp['epsilon'],
+        target_delta=hp['delta'],
+        max_grad_norm=hp['norm'],
+    )
+
+    model_init.load_state_dict(torch.load(init_model, weights_only=True))
+    model.load_state_dict(torch.load(trained_model, weights_only=True))
+
+    model_init = model_init.to(DEVICE)
+    model = model.to(DEVICE)
+
+    adv_points = np.load("data/adv_points.npy")
+    adv_labels = np.load("data/adv_labels.npy")
+    S = np.load("data/S.npy")
+
+    return model_init, model, adv_points, adv_labels, S
+
+
 def train(hp, train_dl, test_dl):
     model = WideResNet(
         d=hp["wrn_depth"],
@@ -209,6 +370,9 @@ def main():
     parser.add_argument('--cuda', type=int, help='gpu index', required=False)
     parser.add_argument('--epsilon', type=float, help='dp epsilon', required=False, default=None)
     parser.add_argument('--m', type=int, help='number of target points', required=True)
+    parser.add_argument('--k', type=int, help='number of symmetric guesses', required=True)
+    parser.add_argument('--epochs', type=int, help='number of epochs', required=True)
+    parser.add_argument('--load', type=Path, help='path to a saved model directory to load instead of training', required=False)
     args = parser.parse_args()
 
     if torch.cuda.is_available() and args.cuda:
@@ -226,9 +390,9 @@ def main():
         "delta": 1e-5,
         "norm": args.norm,
         "batch_size": 4096,
-        "epochs": 100,
-        "k+": 300,
-        "k-": 300,
+        "epochs": args.epochs,
+        "k+": args.k,
+        "k-": args.k,
         "p_value": 0.05,
     }
 
@@ -243,29 +407,31 @@ def main():
         hp['norm'],
     ))
 
-    train_dl, test_dl, x_in, x_m, y_m, S_m = get_dataloaders(hp['target_points'], hp['batch_size'])
-    print(f"len train: {len(train_dl)}")
-    print(f"Got vector Sm: {S_m.shape}, sum={np.sum(S_m)}")
-    print(f"Got x_in: {x_in.shape}")
-    print(f"Got x_m: {x_m.shape}")
-    print(f"Got y_m: {y_m.shape}")
+    if args.load:
+        train_dl, test_dl, _, __, ___ = get_dataloaders3(hp['target_points'], hp['batch_size'])
+        model_init, model_trained, adv_points, adv_labels, S = load(hp, args.load, train_dl)
+        test_dl = None
+    else:
+        train_dl, test_dl, adv_points, adv_labels, S = get_dataloaders3(hp['target_points'], hp['batch_size'])
+        model_init, model_trained = train(hp, train_dl, test_dl)
 
-    model_init, model_trained = train(hp, train_dl, test_dl)
-
-    # torch.save(model_init.state_dict(), "data/init_model.pt")
-    # torch.save(model_trained.state_dict(), "data/trained_model.pt")
+    np.save("data/adv_points", adv_points)
+    np.save("data/adv_labels", adv_labels)
+    np.save("data/S", S)
+    torch.save(model_init.state_dict(), "data/init_model.pt")
+    torch.save(model_trained.state_dict(), "data/trained_model.pt")
 
     scores = list()
     criterion = nn.CrossEntropyLoss()
     with torch.no_grad():
         model_init.eval()
-        x_m = torch.from_numpy(x_m).to(DEVICE)
-        y_m = torch.from_numpy(y_m).long().to(DEVICE)
+        x_m = torch.from_numpy(adv_points).to(DEVICE)
+        y_m = torch.from_numpy(adv_labels).long().to(DEVICE)
 
         for i in range(len(x_m)):
-            x_point = x_m[i].unsqueeze(0)
-            y_point = y_m[i].unsqueeze(0)
-            is_in = S_m[i]
+            x_point = x_m[i].unsqueeze(0).to(DEVICE)
+            y_point = y_m[i].unsqueeze(0).to(DEVICE)
+            is_in = S[i]
 
             init_loss = criterion(model_init(x_point)[0], y_point)
             trained_loss = criterion(model_trained(x_point)[0], y_point)
@@ -277,24 +443,30 @@ def main():
 
     print(scores[:10])
 
-    correct = np.sum(~scores[:hp['k-']]) + np.sum(scores[-hp['k+']:])
-    total = len(scores)
+    audits = (0, 0, 0, 0)
+    for k in [10, 20, 50, 100, 200, 300, 500, 800, 1000, 1200, 1400, 1600, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500]:
+        correct = np.sum(~scores[:k]) + np.sum(scores[-k:])
+        total = len(scores)
 
-    eps_lb = get_eps_audit(
-        hp['target_points'],
-        hp['k+'] + hp['k-'],
-        correct,
-        hp['delta'],
-        hp['p_value']
-    )
+        eps_lb = get_eps_audit(
+            hp['target_points'],
+            2*k,
+            correct,
+            hp['delta'],
+            hp['p_value']
+        )
 
-    print(f"Audit total: {correct}/{total} = {round(correct/total*100, 2)}")
-    print(f"p[ε < {eps_lb}] < {hp['p_value']}")
+        if eps_lb > audits[0]:
+            audits = (eps_lb, k, correct, total)
 
-    correct, total = evaluate_on(model_init, train_dl)
-    print(f"Init model accuracy: {correct}/{total} = {round(correct/total*100, 2)}")
-    correct, total = evaluate_on(model_trained, test_dl)
-    print(f"Done model accuracy: {correct}/{total} = {round(correct/total*100, 2)}")
+    print(f"Audit total: {audits[2]}/{2*audits[1]}/{audits[3]}")
+    print(f"p[ε < {audits[0]}] < {hp['p_value']} for true epsilon {hp['epsilon']}")
+
+    if test_dl is not None:
+        correct, total = evaluate_on(model_init, test_dl)
+        print(f"Init model accuracy: {correct}/{total} = {round(correct/total*100, 2)}")
+        correct, total = evaluate_on(model_trained, test_dl)
+        print(f"Done model accuracy: {correct}/{total} = {round(correct/total*100, 2)}")
 
 
 if __name__ == '__main__':
diff --git a/one_run_audit/equations.py b/one_run_audit/equations.py
index b66ede9..2e67e60 100644
--- a/one_run_audit/equations.py
+++ b/one_run_audit/equations.py
@@ -49,5 +49,4 @@ def get_eps_audit(m, r, v, delta, p):
 
 
 if __name__ == '__main__':
-    x = 100
-    print(f"For m=100 r=100 v=100 p=0.05: {get_eps_audit(x, x, x, 1e-5, 0.05)}")
+    print(get_eps_audit(1000, 600, 600, 1e-5, 0.05))