Init

2024-11-20 12:11:10 -07:00 · 2024-11-20 12:11:10 -07:00 · 18426c7552
commit 18426c7552
10 changed files with 4725 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1,17 @@
 ## Sources
 - [cifar10-fast-simple](https://github.com/99991/cifar10-fast-simple)
 ## Setup
 Get miniconda [here](https://docs.anaconda.com/miniconda/install/#quick-command-line-install)
 ```bash
 conda create --name mia_distilled python=3.11.2
 conda activate mia_distilled
 conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
 ```
 We've found that CUDA 12.2 will still run without issue on `pytorch-cuda=12.1`.
 There is also a `pytorch-cuda=12.4`. Check your system CUDA version with
 `nvidia-smi`.
--- a/cifar10-fast-simple/.gitignore
+++ b/cifar10-fast-simple/.gitignore
@ -0,0 +1,129 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 # C extensions
 *.so
 # Distribution / packaging
 .Python
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 pip-wheel-metadata/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
 # PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .nox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *.cover
 *.py,cover
 .hypothesis/
 .pytest_cache/
 # Translations
 *.mo
 *.pot
 # Django stuff:
 *.log
 local_settings.py
 db.sqlite3
 db.sqlite3-journal
 # Flask stuff:
 instance/
 .webassets-cache
 # Scrapy stuff:
 .scrapy
 # Sphinx documentation
 docs/_build/
 # PyBuilder
 target/
 # Jupyter Notebook
 .ipynb_checkpoints
 # IPython
 profile_default/
 ipython_config.py
 # pyenv
 .python-version
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 #   install all needed dependencies.
 #Pipfile.lock
 # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 __pypackages__/
 # Celery stuff
 celerybeat-schedule
 celerybeat.pid
 # SageMath parsed files
 *.sage.py
 # Environments
 .env
 .venv
 env/
 venv/
 ENV/
 env.bak/
 venv.bak/
 # Spyder project settings
 .spyderproject
 .spyproject
 # Rope project settings
 .ropeproject
 # mkdocs documentation
 /site
 # mypy
 .mypy_cache/
 .dmypy.json
 dmypy.json
 # Pyre type checker
 .pyre/
--- a/cifar10-fast-simple/LICENSE
+++ b/cifar10-fast-simple/LICENSE
@ -0,0 +1,21 @@
 MIT License
 Copyright (c) 2021 Thomas Germer
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
--- a/cifar10-fast-simple/README.md
+++ b/cifar10-fast-simple/README.md
@ -0,0 +1,99 @@
 # Description
 This project is a simplified version of David Page's amazing blog post [How to Train Your ResNet 8: Bag of Tricks](https://myrtle.ai/learn/how-to-train-your-resnet-8-bag-of-tricks/), where a modified ResNet is trained to reach 94% accuracy in 26 seconds on a V100 GPU.
 **Update:** Also check out https://github.com/tysam-code/hlb-CIFAR10 for even faster training!
 # Usage
 ```bash
 git clone https://github.com/99991/cifar10-fast-simple.git
 cd cifar10-fast-simple
 python3 train.py
 ```
 # Example output
 * Timing results on an A100 GPU. Only training time is measured; preprocessing and evaluation are excluded. The first run still includes some PyTorch/cuDNN initialization work and takes 15.49 sec.
 ```
 epoch    batch    train time [sec]    validation accuracy
    1       97                1.43                 0.1557
    2      194                2.86                 0.7767
    3      291                4.29                 0.8756
    4      388                5.73                 0.8975
    5      485                7.16                 0.9118
    6      582                8.59                 0.9204
    7      679               10.02                 0.9294
    8      776               11.45                 0.9373
    9      873               12.88                 0.9401
   10      970               14.32                 0.9427
 84 of 100 runs >= 94.0 % accuracy
 Min  accuracy: 0.9379000000000001
 Max  accuracy: 0.9438000000000001
 Mean accuracy: 0.9409949999999995 +- 0.0012262442660416419
 ```
 ### Epoch vs validation accuracy
 ![epoch vs validation accuracy](https://raw.githubusercontent.com/99991/cifar10-fast-simple/main/doc/a100_epoch_vs_validation_error.png)
 * Timing results using a P100 GPU.
 ```
 Preprocessing: 3.03 seconds
 epoch    batch    train time [sec]    validation accuracy
    1       97               10.07                 0.2460
    2      194               18.60                 0.7690
    3      291               27.13                 0.8754
    4      388               35.65                 0.8985
    5      485               44.18                 0.9107
    6      582               52.70                 0.9195
    7      679               61.23                 0.9272
    8      776               69.75                 0.9337
    9      873               78.28                 0.9397
   10      970               86.81                 0.9428
 ```
 Train time does not include preprocessing, evaluating validation accuracy, or importing the PyTorch library.
 The total time, i.e. what `time python3 train.py` would report, was 42.125 and 103.699 seconds respectively.
 * Timing results on a V100 GPU ([thanks to @ZipengFeng](https://github.com/99991/cifar10-fast-simple/issues/1#issuecomment-1057876448))
 ```
 Preprocessing: 4.78 seconds
 epoch    batch    train time [sec]    validation accuracy
    1       97                4.24                 0.2051
    2      194                7.09                 0.7661
    3      291                9.93                 0.8749
    4      388               12.78                 0.8982
    5      485               15.62                 0.9139
    6      582               18.48                 0.9237
    7      679               21.33                 0.9301
    8      776               24.18                 0.9348
    9      873               27.04                 0.9396
   10      970               29.90                 0.9422
 ```
 * Timing results on an RTX 3060 Laptop GPU (6 GB VRAM)
 ```
 Files already downloaded and verified
 Preprocessing: 4.67 seconds
 epoch    batch    train time [sec]    validation accuracy
    1       97               10.50                 0.2578
    2      194               19.47                 0.7549
    3      291               28.21                 0.8737
    4      388               36.97                 0.9013
    5      485               45.72                 0.9127
    6      582               54.62                 0.9213
    7      679               63.39                 0.9286
    8      776               72.17                 0.9348
    9      873               80.95                 0.9395
   10      970               89.74                 0.9412
 ```
--- a/cifar10-fast-simple/doc/a100_epoch_vs_validation_error.png
+++ b/cifar10-fast-simple/doc/a100_epoch_vs_validation_error.png
--- a/cifar10-fast-simple/doc/plot.py
+++ b/cifar10-fast-simple/doc/plot.py
@ -0,0 +1,28 @@
 import matplotlib.pyplot as plt
 # Training log of one run. The whitespace-separated columns are unpacked
 # below as epoch, batch, t and accuracy.
 result = """
    1       97                4.37                 0.2109
    2      194                7.77                 0.7620
    3      291               11.16                 0.8764
    4      388               14.54                 0.8979
    5      485               17.93                 0.9098
    6      582               21.32                 0.9177
    7      679               24.71                 0.9280
    8      776               28.09                 0.9332
    9      873               31.48                 0.9395
   10      970               34.86                 0.9430
 """
 # Turn every log line into a list of floats.
 rows = [
    [float(field) for field in line.split()]
    for line in result.strip().split("\n")
 ]
 # Transpose the table so that each column becomes its own list.
 epoch, batch, t, accuracy = (list(column) for column in zip(*rows))
 # Plot the validation error in percent (100 - 100 * accuracy) per epoch.
 validation_error = [100 - 100 * value for value in accuracy]
 plt.plot(epoch, validation_error)
 plt.xticks(epoch)
 plt.xlabel("Epoch")
 plt.ylabel("Validation error [%]")
 # Write the figure to disk before showing it interactively.
 plt.savefig("a100_epoch_vs_validation_error.png")
 plt.show()
--- a/cifar10-fast-simple/logs/A100.txt
+++ b/cifar10-fast-simple/logs/A100.txt
--- a/cifar10-fast-simple/logs/P100.txt
+++ b/cifar10-fast-simple/logs/P100.txt
--- a/cifar10-fast-simple/model.py
+++ b/cifar10-fast-simple/model.py
@ -0,0 +1,141 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 def label_smoothing_loss(inputs, targets, alpha):
    """Return the unreduced label-smoothed cross-entropy loss.

    The result blends two terms computed from the log-softmax of ``inputs``
    over dimension 1: the negative log-likelihood of the target class
    (weighted by ``1 - alpha``) and the negated mean log-probability over
    all classes (weighted by ``alpha``).

    Args:
        inputs: Logits; the softmax is taken over dimension 1.
        targets: Class indices, passed to ``nll_loss``.
        alpha: Weight of the smoothing term.

    Returns:
        The loss without reduction; callers reduce it themselves.
    """
    log_p = F.log_softmax(inputs, dim=1, _stacklevel=5)
    # Negated mean log-probability over the class dimension.
    smoothing_term = log_p.mean(dim=1).neg()
    # Ordinary cross-entropy against the target indices, one value per sample.
    target_term = F.nll_loss(log_p, targets, reduction="none")
    return (1 - alpha) * target_term + alpha * smoothing_term
 class GhostBatchNorm(nn.BatchNorm2d):
    """BatchNorm2d variant that normalizes ``num_splits`` sub-batches separately.

    In training mode the batch is viewed as ``num_splits`` groups folded into
    the channel dimension, so every group is normalized with its own batch
    statistics and updates its own slice of the running statistics. In
    evaluation mode the first ``num_features`` running statistics are used.
    """
    def __init__(self, num_features, num_splits, **kw):
        super().__init__(num_features, **kw)
        self.num_splits = num_splits
        # The scale is excluded from gradient updates.
        self.weight.requires_grad = False
        # Replace the buffers created by BatchNorm2d with versions that hold
        # one set of statistics per split.
        stats_size = num_features * num_splits
        self.register_buffer("running_mean", torch.zeros(stats_size))
        self.register_buffer("running_var", torch.ones(stats_size))
    def _averaged_over_splits(self, stats):
        # Average the per-split statistics and tile the result back so that
        # the buffer keeps its (num_splits * num_features) length.
        per_split = stats.view(self.num_splits, self.num_features)
        return torch.mean(per_split, dim=0).repeat(self.num_splits)
    def train(self, mode=True):
        """Switch mode; collate running statistics when leaving training mode."""
        leaving_training = (self.training is True) and (mode is False)
        if leaving_training:
            # lazily collate stats when we are going to use them
            self.running_mean = self._averaged_over_splits(self.running_mean)
            self.running_var = self._averaged_over_splits(self.running_var)
        return super().train(mode)
    def forward(self, input):
        n, c, h, w = input.shape
        use_batch_stats = self.training or not self.track_running_stats
        if not use_batch_stats:
            # Evaluation: plain batch norm with the first split's statistics.
            return F.batch_norm(
                input,
                self.running_mean[: self.num_features],
                self.running_var[: self.num_features],
                self.weight,
                self.bias,
                False,
                self.momentum,
                self.eps,
            )
        assert n % self.num_splits == 0, f"Batch size ({n}) must be divisible by num_splits ({self.num_splits}) of GhostBatchNorm"
        # Fold the splits into the channel dimension so that one batch_norm
        # call normalizes every split independently.
        folded = input.view(-1, c * self.num_splits, h, w)
        normalized = F.batch_norm(
            folded,
            self.running_mean,
            self.running_var,
            self.weight.repeat(self.num_splits),
            self.bias.repeat(self.num_splits),
            True,
            self.momentum,
            self.eps,
        )
        return normalized.view(n, c, h, w)
 def conv_bn_relu(c_in, c_out, kernel_size=(3, 3), padding=(1, 1)):
    """Build a bias-free convolution followed by GhostBatchNorm and CELU.

    Args:
        c_in: Number of input channels.
        c_out: Number of output channels.
        kernel_size: Kernel size of the convolution.
        padding: Padding of the convolution.

    Returns:
        An ``nn.Sequential`` of the three layers.
    """
    convolution = nn.Conv2d(c_in, c_out, kernel_size=kernel_size, padding=padding, bias=False)
    normalization = GhostBatchNorm(c_out, num_splits=16)
    activation = nn.CELU(alpha=0.3)
    return nn.Sequential(convolution, normalization, activation)
 def conv_pool_norm_act(c_in, c_out):
    """Build a 3x3 convolution, 2x2 max pooling, GhostBatchNorm and CELU.

    Args:
        c_in: Number of input channels.
        c_out: Number of output channels.

    Returns:
        An ``nn.Sequential`` of the four layers; the pooling step halves the
        spatial resolution.
    """
    convolution = nn.Conv2d(c_in, c_out, kernel_size=(3, 3), padding=(1, 1), bias=False)
    pooling = nn.MaxPool2d(kernel_size=2, stride=2)
    normalization = GhostBatchNorm(c_out, num_splits=16)
    activation = nn.CELU(alpha=0.3)
    return nn.Sequential(convolution, pooling, normalization, activation)
 def patch_whitening(data, patch_size=(3, 3)):
    """Compute whitening convolution filters from image patches.

    The weights are derived from ``data`` such that
    ``torch.std(F.conv2d(data, weights), dim=(2, 3))`` is close to 1.

    Args:
        data: Image batch; dimension 1 is the channel dimension and
            dimensions 2 and 3 are scanned for patches.
        patch_size: ``(height, width)`` of the patches / filters.

    Returns:
        A float32 tensor of shape ``(c * h * w, c, h, w)``, ordered from the
        largest eigenvalue of the patch covariance to the smallest.
    """
    patch_h, patch_w = patch_size
    channels = data.size(1)
    # Slide a window over both spatial dimensions and collect every patch.
    windows = data.unfold(2, patch_h, 1).unfold(3, patch_w, 1)
    patches = windows.transpose(1, 3).reshape(-1, channels, patch_h, patch_w).to(torch.float32)
    num_patches = patches.size(0)
    dim = channels * patch_h * patch_w
    # Scale the flattened patches so that flat^T @ flat is the covariance.
    flat = patches.reshape(num_patches, dim) / (num_patches - 1) ** 0.5
    covariance = flat.t() @ flat
    eigenvalues, eigenvectors = torch.linalg.eigh(covariance)
    # eigh yields ascending eigenvalues; reverse both to descending order.
    eigenvalues = torch.flip(eigenvalues, [0])
    filters = torch.flip(eigenvectors.t().reshape(dim, channels, patch_h, patch_w), [0])
    # The small constant keeps the division stable for tiny eigenvalues.
    scale = torch.sqrt(eigenvalues + 1e-2).view(-1, 1, 1, 1)
    return filters / scale
 class ResNetBagOfTricks(nn.Module):
    """Small residual network with a fixed, precomputed first layer.

    Args:
        first_layer_weights: Weights installed in the first convolution; they
            are not trained. Their first dimension sets its output channels.
        c_in: Number of input channels.
        c_out: Number of outputs of the final linear layer.
        scale_out: Constant factor applied to the final outputs.
    """
    def __init__(self, first_layer_weights, c_in, c_out, scale_out):
        super().__init__()
        first_channels = first_layer_weights.size(0)
        # First layer: frozen convolution carrying the supplied weights.
        self.conv1 = nn.Conv2d(c_in, first_channels, kernel_size=(3, 3), padding=(1, 1), bias=False)
        self.conv1.weight.data = first_layer_weights
        self.conv1.weight.requires_grad = False
        # 1x1 convolution reducing the first layer's channels to 64.
        self.conv2 = conv_bn_relu(first_channels, 64, kernel_size=(1, 1), padding=0)
        # Downsampling stage followed by a two-layer residual branch.
        self.conv3 = conv_pool_norm_act(64, 128)
        self.conv4 = conv_bn_relu(128, 128)
        self.conv5 = conv_bn_relu(128, 128)
        # Two downsampling stages followed by a second residual branch.
        self.conv6 = conv_pool_norm_act(128, 256)
        self.conv7 = conv_pool_norm_act(256, 512)
        self.conv8 = conv_bn_relu(512, 512)
        self.conv9 = conv_bn_relu(512, 512)
        # Classifier head.
        self.pool10 = nn.MaxPool2d(kernel_size=4, stride=4)
        self.linear11 = nn.Linear(512, c_out, bias=False)
        self.scale_out = scale_out
    def forward(self, x):
        x = self.conv3(self.conv2(self.conv1(x)))
        # First residual connection.
        x = x + self.conv5(self.conv4(x))
        x = self.conv7(self.conv6(x))
        # Second residual connection.
        x = x + self.conv9(self.conv8(x))
        pooled = self.pool10(x)
        # Drop the spatial dimensions left after pooling.
        features = pooled.reshape(pooled.size(0), pooled.size(1))
        return self.scale_out * self.linear11(features)
 # Alias under which the training script constructs the network.
 Model = ResNetBagOfTricks
--- a/cifar10-fast-simple/train.py
+++ b/cifar10-fast-simple/train.py
@ -0,0 +1,278 @@
 import time
 import copy
 import torch
 import torch.nn as nn
 import torchvision
 import model
 def train(seed=0):
    """Train the network once on CIFAR-10 and return its validation accuracy.

    The training model is updated with Nesterov momentum; a second copy of
    the model, maintained as an exponential moving average of the training
    model, is the one evaluated after every epoch (with horizontal-flip
    test-time augmentation).

    Args:
        seed: Seed passed to ``torch.manual_seed``. Hardware information is
            printed only when ``seed == 0``.

    Returns:
        The validation accuracy measured after the last epoch, as a float.
    """
    # Configurable parameters
    epochs = 10
    batch_size = 512
    momentum = 0.9
    weight_decay = 0.256
    weight_decay_bias = 0.004
    ema_update_freq = 5
    ema_rho = 0.99 ** ema_update_freq
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dtype = torch.float16 if device.type != "cpu" else torch.float32
    # First, the learning rate rises from 0 to 0.002 for the first 194 batches.
    # Next, the learning rate shrinks down to 0.0002 over the next 582 batches.
    # Batches beyond the end of the schedule reuse its last entry (see the
    # clamped lr_index below).
    lr_schedule = torch.cat([
        torch.linspace(0e+0, 2e-3, 194),
        torch.linspace(2e-3, 2e-4, 582),
    ])
    lr_schedule_bias = 64.0 * lr_schedule
    # Print information about hardware on first run
    if seed == 0:
        if device.type == "cuda":
            print("Device :", torch.cuda.get_device_name(device.index))
        print("Dtype  :", dtype)
        print()
    # Start measuring time
    start_time = time.perf_counter()
    # Set random seed to increase chance of reproducibility
    torch.manual_seed(seed)
    # Setting cudnn.benchmark to True hampers reproducibility, but is faster
    torch.backends.cudnn.benchmark = True
    # Load dataset
    train_data, train_targets, valid_data, valid_targets = load_cifar10(device, dtype)
    # Compute special weights for first layer
    whitening_weights = model.patch_whitening(train_data[:10000, :, 4:-4, 4:-4])
    # Construct the neural network
    train_model = model.Model(whitening_weights, c_in=3, c_out=10, scale_out=0.125)
    # Convert model weights to half precision
    train_model.to(dtype)
    # Convert BatchNorm back to single precision for better accuracy
    for module in train_model.modules():
        if isinstance(module, nn.BatchNorm2d):
            module.float()
    # Upload model to GPU
    train_model.to(device)
    # Collect weights and biases and create nesterov velocity values
    weights = [
        (w, torch.zeros_like(w))
        for w in train_model.parameters()
        if w.requires_grad and len(w.shape) > 1
    ]
    biases = [
        (w, torch.zeros_like(w))
        for w in train_model.parameters()
        if w.requires_grad and len(w.shape) <= 1
    ]
    # Copy the model for validation
    valid_model = copy.deepcopy(train_model)
    print(f"Preprocessing: {time.perf_counter() - start_time:.2f} seconds")
    # Train and validate
    print("\nepoch    batch    train time [sec]    validation accuracy")
    train_time = 0.0
    batch_count = 0
    for epoch in range(1, epochs + 1):
        # Flush CUDA pipeline for more accurate time measurement
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        start_time = time.perf_counter()
        # Randomly shuffle training data
        indices = torch.randperm(len(train_data), device=device)
        data = train_data[indices]
        targets = train_targets[indices]
        # Crop random 32x32 patches from 40x40 training data
        data = [
            random_crop(data[i : i + batch_size], crop_size=(32, 32))
            for i in range(0, len(data), batch_size)
        ]
        data = torch.cat(data)
        # Randomly flip half the training data (the data is already shuffled,
        # so flipping the first half is a random selection)
        data[: len(data) // 2] = torch.flip(data[: len(data) // 2], [-1])
        for i in range(0, len(data), batch_size):
            # discard partial batches
            if i + batch_size > len(data):
                break
            # Slice batch from data
            inputs = data[i : i + batch_size]
            target = targets[i : i + batch_size]
            batch_count += 1
            # Compute new gradients
            train_model.zero_grad()
            train_model.train(True)
            logits = train_model(inputs)
            loss = model.label_smoothing_loss(logits, target, alpha=0.2)
            loss.sum().backward()
            # Clamp the index so that batches past the schedule keep using
            # its final learning rate
            lr_index = min(batch_count, len(lr_schedule) - 1)
            lr = lr_schedule[lr_index]
            lr_bias = lr_schedule_bias[lr_index]
            # Update weights and biases of training model
            update_nesterov(weights, lr, weight_decay, momentum)
            update_nesterov(biases, lr_bias, weight_decay_bias, momentum)
            # Update validation model with exponential moving averages
            if (i // batch_size % ema_update_freq) == 0:
                update_ema(train_model, valid_model, ema_rho)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        # Add training time
        train_time += time.perf_counter() - start_time
        # Evaluate the averaged model. No gradients are needed here, so
        # autograd is switched off instead of recording a graph for every
        # forward pass and detaching the result afterwards.
        valid_model.train(False)
        valid_correct = []
        with torch.no_grad():
            for i in range(0, len(valid_data), batch_size):
                # Test time augmentation: Test model on regular and flipped data
                regular_inputs = valid_data[i : i + batch_size]
                flipped_inputs = torch.flip(regular_inputs, [-1])
                logits1 = valid_model(regular_inputs)
                logits2 = valid_model(flipped_inputs)
                # Final logits are average of augmented logits
                logits = torch.mean(torch.stack([logits1, logits2], dim=0), dim=0)
                # Compute correct predictions
                correct = logits.max(dim=1)[1] == valid_targets[i : i + batch_size]
                valid_correct.append(correct.type(torch.float64))
        # Accuracy is average number of correct predictions
        valid_acc = torch.mean(torch.cat(valid_correct)).item()
        print(f"{epoch:5} {batch_count:8d} {train_time:19.2f} {valid_acc:22.4f}")
    return valid_acc
 def preprocess_data(data, device, dtype):
    """Convert raw image data to a normalized tensor in NCHW layout.

    Args:
        data: Image data in NHWC layout with three channels last.
        device: Device on which the tensors are created.
        dtype: Floating-point type of the result.

    Returns:
        The per-channel normalized data, permuted from NHWC to NCHW.
    """
    def as_dtype(values):
        # Create the tensor on the target device, then cast it to dtype.
        return torch.tensor(values, device=device).to(dtype)
    pixels = as_dtype(data)
    # Per-channel statistics used for normalization.
    channel_mean = as_dtype([125.31, 122.95, 113.87])
    channel_std = as_dtype([62.99, 62.09, 66.70])
    normalized = (pixels - channel_mean) / channel_std
    # Permute data from NHWC to NCHW format
    return normalized.permute(0, 3, 1, 2)
 def load_cifar10(device, dtype, data_dir="~/data"):
    """Load CIFAR-10 and return preprocessed tensors on ``device``.

    The training split is downloaded into ``data_dir`` if necessary. Only the
    training images are padded (for later random cropping).

    Args:
        device: Device that receives all returned tensors.
        dtype: Floating-point type of the image tensors.
        data_dir: Root directory of the dataset.

    Returns:
        ``(train_data, train_targets, valid_data, valid_targets)``.
    """
    train_set = torchvision.datasets.CIFAR10(root=data_dir, download=True)
    valid_set = torchvision.datasets.CIFAR10(root=data_dir, train=False)
    splits = (train_set, valid_set)
    train_data, valid_data = (
        preprocess_data(split.data, device, dtype) for split in splits
    )
    train_targets, valid_targets = (
        torch.tensor(split.targets).to(device) for split in splits
    )
    # Pad 32x32 to 40x40
    reflect_pad = nn.ReflectionPad2d(4)
    train_data = reflect_pad(train_data)
    return train_data, train_targets, valid_data, valid_targets
 def update_ema(train_model, valid_model, rho):
    """Blend the training model's state into the validation model in place.

    The trained model is not used for validation directly. Instead, every
    float16/float32 entry of the validation model's state is replaced by
    ``rho * valid + (1 - rho) * train``. Entries of other dtypes are left
    untouched.

    Args:
        train_model: Model providing the new values.
        valid_model: Model whose state is updated in place.
        rho: Weight kept from the previous validation state.
    """
    averaged_dtypes = (torch.float16, torch.float32)
    pairs = zip(train_model.state_dict().values(), valid_model.state_dict().values())
    for source, averaged in pairs:
        if averaged.dtype not in averaged_dtypes:
            continue
        averaged.mul_(rho)
        averaged.add_((1 - rho) * source)
 def update_nesterov(weights, lr, weight_decay, momentum):
    """Apply one in-place Nesterov momentum step with weight decay.

    Args:
        weights: Iterable of ``(parameter, velocity)`` pairs. Parameters that
            do not require gradients are skipped.
        lr: Learning rate.
        weight_decay: Factor of the parameter value added to its gradient.
        momentum: Momentum coefficient.

    Note:
        The gradient tensor of every updated parameter is overwritten; it is
        used as scratch space for the step.
    """
    for param, velocity in weights:
        if not param.requires_grad:
            continue
        step = param.grad.data
        values = param.data
        # step <- -lr * (grad + weight_decay * values)
        step.add_(values, alpha=weight_decay)
        step.mul_(-lr)
        # velocity <- momentum * velocity + step
        velocity.mul_(momentum)
        velocity.add_(step)
        # values <- values + step + momentum * velocity
        step.add_(velocity, alpha=momentum)
        values.add_(step)
 def random_crop(data, crop_size):
    """Cut one randomly placed window out of a batch of images.

    A single offset is drawn per call and applied to every image in
    ``data``, so all images of the batch share the same crop position.

    Args:
        data: Tensor whose dimensions 2 and 3 are height and width.
        crop_size: ``(height, width)`` of the window to cut out.

    Returns:
        A view of ``data`` restricted to the selected window.
    """
    crop_h, crop_w = crop_size
    h = data.size(2)
    w = data.size(3)
    # The upper bound of torch.randint is exclusive. Adding 1 makes the last
    # valid offset (size - crop) reachable and keeps the range non-empty when
    # the crop is exactly as large as the image.
    x = torch.randint(w - crop_w + 1, size=(1,)).item()
    y = torch.randint(h - crop_h + 1, size=(1,)).item()
    return data[:, :, y : y + crop_h, x : x + crop_w]
 def sha256(path):
    """Return the hexadecimal SHA-256 digest of the file at ``path``."""
    import hashlib
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        # The whole file is read in one go.
        digest.update(f.read())
    return digest.hexdigest()
 def getrelpath(abspath):
    """Return ``abspath`` expressed relative to the current working directory."""
    from os import getcwd
    from os.path import relpath
    working_directory = getcwd()
    return relpath(abspath, working_directory)
 def print_info():
    """Print file hashes and the PyTorch version of this run.

    Knowing this information might improve the chance of reproducing a run:
    the relative path and SHA-256 digest of this script and of the model
    module are printed, followed by the installed PyTorch version.
    """
    sources = (
        ("File   :", __file__),
        ("Model  :", model.__file__),
    )
    for label, path in sources:
        print(label, getrelpath(path), sha256(path))
    print("PyTorch:", torch.__version__)
 def main():
    """Run 100 training runs and print accumulated accuracy statistics.

    Every run uses its run index as seed. After each run the number of runs
    reaching the accuracy threshold is printed together with the minimum,
    maximum, mean and (population) standard deviation of all accuracies
    collected so far.
    """
    print_info()
    threshold = 0.94
    threshold_percent = threshold * 100.0
    accuracies = []
    for run in range(100):
        accuracies.append(train(seed=run))
        num_runs = len(accuracies)
        # Print accumulated results
        within_threshold = sum(1 for value in accuracies if value >= threshold)
        print()
        print(f"{within_threshold} of {num_runs} runs >= {threshold_percent} % accuracy")
        mean = sum(accuracies) / num_runs
        squared_deviations = [(value - mean)**2 for value in accuracies]
        std = (sum(squared_deviations) / num_runs)**0.5
        print(f"Min  accuracy: {min(accuracies)}")
        print(f"Max  accuracy: {max(accuracies)}")
        print(f"Mean accuracy: {mean} +- {std}")
        print()
 # Start the benchmark only when this file is executed as a script, not when
 # it is imported as a module.
 if __name__ == "__main__":
    main()