First commit

2017-08-17 11:43:17 -07:00 · 2017-08-17 11:43:17 -07:00 · b7e1e0fa0f
commit b7e1e0fa0f
98 changed files with 42749 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,5 @@
+*.pyc
+__pycache__/
+.eggs/
+*.egg-info/
+.cache
--- a/pytorch/.gitignore
+++ b/pytorch/.gitignore
@ -0,0 +1,6 @@
+*.pyc
+__pycache__/
+.eggs/
+*.egg-info/
+.cache
+data/
--- a/pytorch/CIFAR10/benchmark/init.py
+++ b/pytorch/CIFAR10/benchmark/init.py
--- a/pytorch/CIFAR10/benchmark/infer.py
+++ b/pytorch/CIFAR10/benchmark/infer.py
@ -0,0 +1,140 @@
+import os
+import timeit
+from glob import glob
+from collections import OrderedDict
+
+import click
+import torch
+import numpy as np
+from torch.autograd import Variable
+from torchvision import transforms
+from torchvision import datasets
+
+from benchmark.train import load, MEAN, STD, save_result, MODELS
+
+
+class PyTorchEngine:
+    def __init__(self, filename, use_cuda=False, name=None):
+        self.filename = filename
+        self.use_cuda = use_cuda
+        self.name = name
+        model, epoch, accuracy = load(self.filename)
+
+        if self.use_cuda:
+            self.model = model.cuda()
+        else:
+            self.model = model.cpu()
+        self.epoch = epoch
+        self.accuracy = accuracy
+
+    def pred(self, inputs):
+        inputs = Variable(inputs, requires_grad=False, volatile=True)
+
+        if self.use_cuda:
+            inputs = inputs.cuda()
+            return self.model(inputs).data.cpu().numpy()
+        else:
+            return self.model(inputs).data.numpy()
+
+
+def time_batch_size(dataset, batch_size, pred, use_cuda, repeat=100, bestof=3):
+    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
+                                         shuffle=False, pin_memory=use_cuda)
+    inputs, targets = loader.__iter__().next()
+    assert inputs.size(0) == batch_size
+
+    times = timeit.repeat('pred(inputs)', globals=locals(),
+                          repeat=repeat, number=1)
+
+    return times
+
+
+def infer_cifar10(dataset, engine, start=1, end=128, repeat=100, log2=True,
+                  output=None):
+    if log2:
+        start = int(np.floor(np.log2(start)))
+        end = int(np.ceil(np.log2(end)))
+        assert start >= 0
+        assert end >= start
+        batch_sizes = map(lambda x: 2**x, range(start, end + 1))
+    else:
+        batch_sizes = range(start, end + 1)
+    results = []
+    for batch_size in batch_sizes:
+        times = time_batch_size(dataset, batch_size, engine.pred,
+                                engine.use_cuda, repeat=repeat)
+
+        result = OrderedDict()
+        result['nodename'] = os.uname().nodename
+        result['model'] = engine.name
+        result['use_cuda'] = engine.use_cuda
+        result['batch_size'] = batch_size
+        result['mean'] = np.mean(times)
+        result['std'] = np.std(times)
+        result['throughput'] = batch_size / np.mean(times)
+        result['filename'] = engine.filename
+        if output is not None:
+            save_result(result, output)
+
+        print('batch_size: {batch_size:4d}'
+              ' - mean: {mean:.4f}'
+              ' - std: {std:.4f}'
+              ' - throughput: {throughput:.4f}'.format(**result))
+        results.append(result)
+
+    return results
+
+
+@click.command()
+@click.option('--dataset-dir', default='./data/cifar10')
+@click.option('--run-dir', default='./run/')
+@click.option('--output-file', default='inference.csv')
+@click.option('--start', '-s', default=1)
+@click.option('--end', '-e', default=128)
+@click.option('--repeat', '-r', default=100)
+@click.option('--log2/--no-log2', default=True)
+@click.option('--cpu/--no-cpu', default=True)
+@click.option('--gpu/--no-gpu', default=True)
+@click.option('--append', is_flag=True)
+@click.option('--models', '-m', type=click.Choice(MODELS.keys()),
+              multiple=True)
+def infer(dataset_dir, run_dir, output_file, start, end, repeat, log2,
+          cpu, gpu, append, models):
+
+    transform_test = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize(MEAN, STD)
+    ])
+
+    testset = datasets.CIFAR10(root=dataset_dir, train=False, download=True,
+                               transform=transform_test)
+    models = models or os.listdir(run_dir)
+    output_path = os.path.join(run_dir, output_file)
+    assert not os.path.exists(output_path) or append
+    for model in models:
+        model_dir = os.path.join(run_dir, model)
+        paths = glob(f"{model_dir}/*/checkpoint_best_model.t7")
+        assert len(paths) > 0
+        path = os.path.abspath(paths[0])
+
+        print(f'Model: {model}')
+        print(f'Path: {path}')
+
+        if cpu:
+            print('With CPU:')
+            engine = PyTorchEngine(path, use_cuda=False, name=model)
+            infer_cifar10(testset, engine, start=start, end=end, log2=log2,
+                          repeat=repeat, output=output_path)
+
+        if gpu and torch.cuda.is_available():
+            print('With GPU:')
+            engine = PyTorchEngine(path, use_cuda=True, name=model)
+            # Warmup
+            time_batch_size(testset, 1, engine.pred, engine.use_cuda, repeat=1)
+
+            infer_cifar10(testset, engine, start=start, end=end, log2=log2,
+                          repeat=repeat, output=output_path)
+
+
+if __name__ == '__main__':
+    infer()
--- a/pytorch/CIFAR10/benchmark/models/densenet.py
+++ b/pytorch/CIFAR10/benchmark/models/densenet.py
@ -0,0 +1,108 @@
+'''DenseNet in PyTorch.'''
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Bottleneck(nn.Module):
+    def __init__(self, in_planes, growth_rate):
+        super(Bottleneck, self).__init__()
+        self.bn1 = nn.BatchNorm2d(in_planes)
+        self.conv1 = nn.Conv2d(in_planes, 4 * growth_rate, kernel_size=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(4 * growth_rate)
+        self.conv2 = nn.Conv2d(4 * growth_rate, growth_rate, kernel_size=3, padding=1, bias=False)
+
+    def forward(self, x):
+        out = self.conv1(F.relu(self.bn1(x)))
+        out = self.conv2(F.relu(self.bn2(out)))
+        out = torch.cat([out, x], 1)
+        return out
+
+
+class Transition(nn.Module):
+    def __init__(self, in_planes, out_planes, last=False, pool_size=2):
+        super(Transition, self).__init__()
+        self.last = last
+        self.pool_size = pool_size
+        self.bn = nn.BatchNorm2d(in_planes)
+        if not self.last:
+            self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False)
+
+    def forward(self, x):
+        out = F.relu(self.bn(x))
+        if not self.last:
+            out = self.conv(out)
+        out = F.avg_pool2d(out, self.pool_size)
+        return out
+
+
+class DenseNet(nn.Module):
+    def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10):
+        super(DenseNet, self).__init__()
+        # TODO: Add drop for CIFAR10 without data augmentation
+        self.growth_rate = growth_rate
+
+        num_planes = 2 * growth_rate
+        self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, padding=1, bias=False)
+
+        self.dense1 = self._make_dense_layers(block, num_planes, nblocks[0])
+        num_planes += nblocks[0] * growth_rate
+        out_planes = int(math.floor(num_planes*reduction))
+        self.trans1 = Transition(num_planes, out_planes)
+        num_planes = out_planes
+
+        self.dense2 = self._make_dense_layers(block, num_planes, nblocks[1])
+        num_planes += nblocks[1] * growth_rate
+        out_planes = int(math.floor(num_planes*reduction))
+        self.trans2 = Transition(num_planes, out_planes)
+        num_planes = out_planes
+
+        self.dense3 = self._make_dense_layers(block, num_planes, nblocks[2])
+        num_planes += nblocks[2] * growth_rate
+        self.trans3 = Transition(num_planes, num_planes, last=True, pool_size=8)
+
+        self.linear = nn.Linear(num_planes, num_classes)
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+            elif isinstance(m, nn.BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+    def _make_dense_layers(self, block, in_planes, nblock):
+        layers = []
+        for i in range(nblock):
+            layers.append(block(in_planes, self.growth_rate))
+            in_planes += self.growth_rate
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        out = self.conv1(x)
+        out = self.trans1(self.dense1(out))
+        out = self.trans2(self.dense2(out))
+        out = self.trans3(self.dense3(out))
+        out = out.view(out.size(0), -1)
+        out = self.linear(out)
+        return out
+
+
+def DenseNetBC(L, k):
+    assert (L - 4) % 6 == 0
+    num_blocks = int((L - 4) / 6)
+    return DenseNet(Bottleneck, [num_blocks] * 3, growth_rate=k, reduction=0.5)
+
+
+def DenseNetBC100():
+    """Return ``DenseNetBC(100, 12)``."""
+    return DenseNetBC(100, 12)
+
+
+def DenseNetBC250():
+    """Return ``DenseNetBC(250, 24)``."""
+    return DenseNetBC(250, 24)
+
+
+def DenseNetBC190():
+    """Return ``DenseNetBC(190, 40)``."""
+    return DenseNetBC(190, 40)
--- a/pytorch/CIFAR10/benchmark/models/resnet.py
+++ b/pytorch/CIFAR10/benchmark/models/resnet.py
@ -0,0 +1,372 @@
+import math
+from functools import partial
+
+from torch import nn
+from torch.nn import functional as F
+
+
+class BasicBlock(nn.Module):
+    expansion = 1
+
+    def __init__(self, inplanes, planes, stride=1):
+        super().__init__()
+        self.conv1 = nn.Conv2d(inplanes, planes, 3, stride=stride, padding=1,
+                               bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+
+        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+
+        if stride != 1 or inplanes != (planes * self.expansion):
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(inplanes, planes * self.expansion, 1, stride=stride,
+                          bias=False),
+                nn.BatchNorm2d(planes * self.expansion)
+            )
+        else:
+            self.shortcut = nn.Sequential()
+
+    def forward(self, inputs):
+        H = self.conv1(inputs)
+        H = self.bn1(H)
+        H = F.relu(H)
+
+        H = self.conv2(H)
+        H = self.bn2(H)
+
+        H += self.shortcut(inputs)
+        outputs = F.relu(H)
+
+        return outputs
+
+
+class PreActBlock(nn.Module):
+    expansion = 1
+
+    def __init__(self, inplanes, planes, stride=1):
+        super().__init__()
+        self.bn1 = nn.BatchNorm2d(inplanes)
+        self.conv1 = nn.Conv2d(inplanes, planes, 3, stride=stride, padding=1,
+                               bias=False)
+
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
+
+        self.increasing = stride != 1 or inplanes != (planes * self.expansion)
+        if self.increasing:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(inplanes, planes * self.expansion, 1, stride=stride,
+                          bias=False)
+            )
+        else:
+            self.shortcut = nn.Sequential()
+
+    def forward(self, inputs):
+        H = self.bn1(inputs)
+        H = F.relu(H)
+        if self.increasing:
+            inputs = H
+        H = self.conv1(H)
+
+        H = self.bn2(H)
+        H = F.relu(H)
+        H = self.conv2(H)
+
+        H += self.shortcut(inputs)
+        return H
+
+
+class Bottleneck(nn.Module):
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1):
+        super().__init__()
+        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+
+        self.conv2 = nn.Conv2d(planes, planes, 3, stride=stride,
+                               padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+
+        self.conv3 = nn.Conv2d(planes, planes * 4, 1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes * 4)
+
+        if stride != 1 or inplanes != (planes * self.expansion):
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(inplanes, planes * self.expansion, 1, stride=stride,
+                          bias=False),
+                nn.BatchNorm2d(planes * self.expansion)
+            )
+        else:
+            self.shortcut = nn.Sequential()
+
+    def forward(self, inputs):
+        H = self.conv1(inputs)
+        H = self.bn1(H)
+        H = F.relu(H)
+
+        H = self.conv2(H)
+        H = self.bn2(H)
+        H = F.relu(H)
+
+        H = self.conv3(H)
+        H = self.bn3(H)
+
+        H += self.shortcut(inputs)
+        outputs = F.relu(H)
+
+        return outputs
+
+
+class ResNeXtBottleneck(nn.Module):
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1, cardinality=32,
+                 base_width=4):
+        super().__init__()
+
+        width = math.floor(planes * (base_width / 64.0))
+
+        self.conv1 = nn.Conv2d(inplanes, width * cardinality, 1, bias=False)
+        self.bn1 = nn.BatchNorm2d(width * cardinality)
+
+        self.conv2 = nn.Conv2d(width * cardinality, width * cardinality, 3,
+                               groups=cardinality, padding=1, stride=stride,
+                               bias=False)
+        self.bn2 = nn.BatchNorm2d(width * cardinality)
+
+        self.conv3 = nn.Conv2d(width * cardinality, planes * 4, 1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes * 4)
+
+        if stride != 1 or inplanes != (planes * self.expansion):
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(inplanes, planes * self.expansion, 1, stride=stride,
+                          bias=False),
+                nn.BatchNorm2d(planes * self.expansion)
+            )
+        else:
+            self.shortcut = nn.Sequential()
+
+    def forward(self, inputs):
+        H = self.conv1(inputs)
+        H = self.bn1(H)
+        H = F.relu(H)
+
+        H = self.conv2(H)
+        H = self.bn2(H)
+        H = F.relu(H)
+
+        H = self.conv3(H)
+        H = self.bn3(H)
+
+        H += self.shortcut(inputs)
+        outputs = F.relu(H)
+
+        return outputs
+
+
+class PreActBottleneck(nn.Module):
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1):
+        super().__init__()
+        self.bn1 = nn.BatchNorm2d(inplanes)
+        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
+
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, stride=stride,
+                               bias=False)
+
+        self.bn3 = nn.BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(planes, planes * 4, 1, bias=False)
+
+        self.increasing = stride != 1 or inplanes != (planes * self.expansion)
+        if self.increasing:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(inplanes, planes * self.expansion, 1, stride=stride,
+                          bias=False)
+            )
+        else:
+            self.shortcut = nn.Sequential()
+
+    def forward(self, inputs):
+        H = self.bn1(inputs)
+        H = F.relu(H)
+        if self.increasing:
+            inputs = H
+        H = self.conv1(H)
+
+        H = self.bn2(H)
+        H = F.relu(H)
+        H = self.conv2(H)
+
+        H = self.bn3(H)
+        H = F.relu(H)
+        H = self.conv3(H)
+
+        H += self.shortcut(inputs)
+        return H
+
+
+class ResNet(nn.Module):
+
+    def __init__(self, Block, layers, filters, num_classes=10, inplanes=None):
+        self.inplanes = inplanes or filters[0]
+        super().__init__()
+
+        self.pre_act = 'Pre' in Block.__name__
+
+        self.conv1 = nn.Conv2d(3, self.inplanes, 3, padding=1, bias=False)
+        if not self.pre_act:
+            self.bn1 = nn.BatchNorm2d(self.inplanes)
+
+        self.num_sections = len(layers)
+        for section_index, (size, planes) in enumerate(zip(layers, filters)):
+            section = []
+            for layer_index in range(size):
+                if section_index != 0 and layer_index == 0:
+                    stride = 2
+                else:
+                    stride = 1
+                section.append(Block(self.inplanes, planes, stride=stride))
+                self.inplanes = planes * Block.expansion
+            section = nn.Sequential(*section)
+            setattr(self, f'section_{section_index}', section)
+
+        if self.pre_act:
+            self.bn1 = nn.BatchNorm2d(self.inplanes)
+
+        self.fc = nn.Linear(filters[-1] * Block.expansion, num_classes)
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+            elif isinstance(m, nn.BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+    def forward(self, inputs):
+        H = self.conv1(inputs)
+
+        if not self.pre_act:
+            H = self.bn1(H)
+            H = F.relu(H)
+
+        for section_index in range(self.num_sections):
+            H = getattr(self, f'section_{section_index}')(H)
+
+        if self.pre_act:
+            H = self.bn1(H)
+            H = F.relu(H)
+
+        H = F.avg_pool2d(H, H.size()[2:])
+        H = H.view(H.size(0), -1)
+        outputs = self.fc(H)
+
+        return outputs
+
+
+# From "Deep Residual Learning for Image Recognition"
+def ResNet20():
+    """ResNet of BasicBlocks: 3 sections of 3 blocks, 16/32/64 filters."""
+    return ResNet(BasicBlock, layers=[3] * 3, filters=[16, 32, 64])
+
+
+def ResNet32():
+    """ResNet of BasicBlocks: 3 sections of 5 blocks, 16/32/64 filters."""
+    return ResNet(BasicBlock, layers=[5] * 3, filters=[16, 32, 64])
+
+
+def ResNet44():
+    """ResNet of BasicBlocks: 3 sections of 7 blocks, 16/32/64 filters."""
+    return ResNet(BasicBlock, layers=[7] * 3, filters=[16, 32, 64])
+
+
+def ResNet56():
+    """ResNet of BasicBlocks: 3 sections of 9 blocks, 16/32/64 filters."""
+    return ResNet(BasicBlock, layers=[9] * 3, filters=[16, 32, 64])
+
+
+def ResNet110():
+    """ResNet of BasicBlocks: 3 sections of 18 blocks, 16/32/64 filters."""
+    return ResNet(BasicBlock, layers=[18] * 3, filters=[16, 32, 64])
+
+
+def ResNet1202():
+    """ResNet of BasicBlocks: 3 sections of 200 blocks, 16/32/64 filters."""
+    return ResNet(BasicBlock, layers=[200] * 3, filters=[16, 32, 64])
+
+
+# Based on, but not taken from, "Identity Mappings in Deep Residual Networks"
+def PreActResNet20():
+    """ResNet of PreActBlocks: 3 sections of 3 blocks, 16/32/64 filters."""
+    return ResNet(PreActBlock, layers=[3] * 3, filters=[16, 32, 64])
+
+
+def PreActResNet56():
+    """ResNet of PreActBlocks: 3 sections of 9 blocks, 16/32/64 filters."""
+    return ResNet(PreActBlock, layers=[9] * 3, filters=[16, 32, 64])
+
+
+def PreActResNet164Basic():
+    """ResNet of PreActBlocks: 3 sections of 27 blocks, 16/32/64 filters."""
+    return ResNet(PreActBlock, layers=[27] * 3, filters=[16, 32, 64])
+
+
+# From "Identity Mappings in Deep Residual Networks"
+def PreActResNet110():
+    """ResNet of PreActBlocks: 3 sections of 18 blocks, 16/32/64 filters."""
+    return ResNet(PreActBlock, layers=[18] * 3, filters=[16, 32, 64])
+
+
+def PreActResNet164():
+    """ResNet of PreActBottlenecks: 3 sections of 18 blocks, 16/32/64 filters."""
+    return ResNet(PreActBottleneck, layers=[18] * 3, filters=[16, 32, 64])
+
+
+def PreActResNet1001():
+    """ResNet of PreActBottlenecks: 3 sections of 111 blocks, 16/32/64 filters."""
+    return ResNet(PreActBottleneck, layers=[111] * 3, filters=[16, 32, 64])
+
+
+# From "Wide Residual Networks"
+def WRN(n, k):
+    assert (n - 4) % 6 == 0
+    base_filters = [16, 32, 64]
+    filters = [num_filters * k for num_filters in base_filters]
+    d = (n - 4) / 2  # l = 2
+    return ResNet(PreActBlock, layers=[int(d / 3)] * 3, filters=filters,
+                  inplanes=16)
+
+
+def WRN_40_4():
+    """Return ``WRN(40, 4)``."""
+    return WRN(40, 4)
+
+
+def WRN_16_8():
+    """Return ``WRN(16, 8)``."""
+    return WRN(16, 8)
+
+
+def WRN_28_10():
+    """Return ``WRN(28, 10)``."""
+    return WRN(28, 10)
+
+
+# From "Aggregated Residual Transformations for Deep Neural Networks"
+def ResNeXt29(cardinality, base_width):
+    Block = partial(ResNeXtBottleneck, cardinality=cardinality,
+                    base_width=base_width)
+    Block.__name__ = ResNeXtBottleneck.__name__
+    Block.expansion = ResNeXtBottleneck.expansion
+    return ResNet(Block, layers=[3, 3, 3], filters=[64, 128, 256])
+
+
+# From kuangliu/pytorch-cifar
+def ResNet18():
+    """ResNet of BasicBlocks: sections of 2/2/2/2 blocks, 64-512 filters."""
+    return ResNet(BasicBlock, layers=[2, 2, 2, 2], filters=[64, 128, 256, 512])
+
+
+def ResNet34():
+    """ResNet of BasicBlocks: sections of 3/4/6/3 blocks, 64-512 filters."""
+    return ResNet(BasicBlock, layers=[3, 4, 6, 3], filters=[64, 128, 256, 512])
+
+
+def ResNet50():
+    """ResNet of Bottlenecks: sections of 3/4/6/3 blocks, 64-512 filters."""
+    return ResNet(Bottleneck, layers=[3, 4, 6, 3], filters=[64, 128, 256, 512])
+
+
+def ResNet101():
+    """ResNet of Bottlenecks: sections of 3/4/23/3 blocks, 64-512 filters."""
+    return ResNet(Bottleneck,
+                  layers=[3, 4, 23, 3], filters=[64, 128, 256, 512])
+
+
+def ResNet152():
+    """ResNet of Bottlenecks: sections of 3/8/36/3 blocks, 64-512 filters."""
+    return ResNet(Bottleneck,
+                  layers=[3, 8, 36, 3], filters=[64, 128, 256, 512])
--- a/pytorch/CIFAR10/benchmark/train.py
+++ b/pytorch/CIFAR10/benchmark/train.py
@ -0,0 +1,336 @@
+import os
+import re
+import json
+from functools import reduce
+from datetime import datetime
+from collections import OrderedDict
+
+import click
+import torch
+import progressbar
+from torch import nn, optim
+from torch.autograd import Variable
+from torchvision import transforms
+from torchvision import datasets as dset
+
+from benchmark.models import resnet, densenet
+
+# Per-channel mean and standard deviation handed to transforms.Normalize
+# (presumably computed on the CIFAR-10 training set -- TODO confirm).
+MEAN = (0.4914, 0.4822, 0.4465)
+STD = (0.2023, 0.1994, 0.2010)
+
+# Maps the names accepted on the command line to zero-argument callables
+# that build the corresponding network.
+MODELS = {
+        # "Deep Residual Learning for Image Recognition"
+        'resnet20': resnet.ResNet20,
+        'resnet32': resnet.ResNet32,
+        'resnet44': resnet.ResNet44,
+        'resnet56': resnet.ResNet56,
+        'resnet110': resnet.ResNet110,
+        'resnet1202': resnet.ResNet1202,
+
+        # "Wide Residual Networks"
+        'wrn-40-4': resnet.WRN_40_4,
+        'wrn-16-8': resnet.WRN_16_8,
+        'wrn-28-10': resnet.WRN_28_10,
+
+        # Based on "Identity Mappings in Deep Residual Networks"
+        'preact20': resnet.PreActResNet20,
+        'preact56': resnet.PreActResNet56,
+        'preact164-basic': resnet.PreActResNet164Basic,
+
+        # "Identity Mappings in Deep Residual Networks"
+        'preact110': resnet.PreActResNet110,
+        'preact164': resnet.PreActResNet164,
+        'preact1001': resnet.PreActResNet1001,
+
+        # "Aggregated Residual Transformations for Deep Neural Networks"
+        # The lambdas bind (cardinality, base_width); the ignored optional
+        # argument lets them be called with zero or one argument.
+        'resnext29-8-64': lambda _=None: resnet.ResNeXt29(8, 64),
+        'resnext29-16-64': lambda _=None: resnet.ResNeXt29(16, 64),
+
+        # "Densely Connected Convolutional Networks"
+        'densenetbc100': densenet.DenseNetBC100,
+        'densenetbc250': densenet.DenseNetBC250,
+        'densenetbc190': densenet.DenseNetBC190,
+
+        # Kuangliu/pytorch-cifar
+        'resnet18': resnet.ResNet18,
+        'resnet50': resnet.ResNet50,
+        'resnet101': resnet.ResNet101,
+        'resnet152': resnet.ResNet152,
+}
+
+
+def count_parameters(model):
+    c = map(lambda p: reduce(lambda x, y: x * y, p.size()), model.parameters())
+    return sum(c)
+
+
+def correct(outputs, targets, top=(1, )):
+    _, predictions = outputs.topk(max(top), dim=1, largest=True, sorted=True)
+    targets = targets.view(-1, 1).expand_as(predictions)
+    corrects = predictions.eq(targets).cpu().cumsum(1).sum(0)
+    tops = list(map(lambda k: corrects.data[0][k - 1], top))
+    return tops
+
+
+def save_result(result, path):
+    write_heading = not os.path.exists(path)
+    with open(path, mode='a') as out:
+        if write_heading:
+            out.write(",".join([str(k) for k, v in result.items()]) + '\n')
+        out.write(",".join([str(v) for k, v in result.items()]) + '\n')
+
+
+def run(epoch, model, loader, criterion=None, optimizer=None, top=(1, 5),
+        use_cuda=False, tracking=None, max_value=None, train=True):
+
+    assert criterion is not None or not train, 'Need criterion to train model'
+    assert optimizer is not None or not train, 'Need optimizer to train model'
+    max_value = max_value or progressbar.UnknownLength
+    bar = progressbar.ProgressBar(max_value=max_value)
+    total = 0
+    correct_counts = {}
+    if train:
+        model.train()
+    else:
+        model.eval()
+
+    start = datetime.now()
+    for batch_index, (inputs, targets) in enumerate(loader):
+        inputs = Variable(inputs, requires_grad=False, volatile=not train)
+        targets = Variable(targets, requires_grad=False, volatile=not train)
+
+        if use_cuda:
+            inputs = inputs.cuda()
+            targets = targets.cuda()
+
+        outputs = model(inputs)
+
+        if train:
+            loss = criterion(outputs, targets)
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+        _, predictions = torch.max(outputs.data, 1)
+        batch_size = targets.size(0)
+        top_correct = correct(outputs, targets, top=top)
+        total += batch_size
+        for k, count in zip(top, top_correct):
+            correct_counts[k] = correct_counts.get(k, 0) + count
+
+        end = datetime.now()
+        if tracking is not None:
+            result = OrderedDict()
+            result['timestamp'] = datetime.now()
+            result['batch_duration'] = end - start
+            result['epoch'] = epoch
+            result['batch'] = batch_index
+            result['batch_size'] = batch_size
+            for i, k in enumerate(top):
+                result['top{}_correct'.format(k)] = top_correct[i]
+            if train:
+                result['loss'] = loss.data[0]
+            save_result(result, tracking)
+
+        bar.update(batch_index + 1)
+        start = datetime.now()
+
+    print()
+    if train:
+        message = 'Training accuracy of'
+    else:
+        message = 'Test accuracy of'
+    for k in top:
+        accuracy = correct_counts[k] / total
+        message += ' top-{}: {}'.format(k, accuracy)
+    print(message)
+    return (1. * correct_counts[top[0]]) / total, batch_index + 1
+
+
+def save(model, directory, epoch, accuracy, use_cuda=False, filename=None):
+    state = {
+        'model': model.module if use_cuda else model,
+        'epoch': epoch,
+        'accuracy': accuracy
+    }
+
+    filename = filename or 'checkpoint_{}.t7'.format(epoch)
+    torch.save(state, os.path.join(directory, filename))
+
+
+def save_config(config, run_dir):
+    path = os.path.join(run_dir, "config_{}.json".format(config['timestamp']))
+    with open(path, 'w') as config_file:
+        json.dump(config, config_file)
+        config_file.write('\n')
+
+
+def load(path):
+    assert os.path.exists(path)
+    state = torch.load(path)
+    model = state['model']
+    epoch = state['epoch']
+    accuracy = state['accuracy']
+    return model, epoch, accuracy
+
+
+def latest_file(model):
+    restore = f'./run/{model}'
+    timestamps = sorted(os.listdir(restore))
+    assert len(timestamps) > 0
+    run_dir = os.path.join(restore, timestamps[-1])
+    files = os.listdir(run_dir)
+    max_checkpoint = -1
+    for filename in files:
+        if re.search('checkpoint_\d+.t7', filename):
+            num = int(re.search('\d+', filename).group())
+
+            if num > max_checkpoint:
+                max_checkpoint = num
+                max_checkpoint_file = filename
+
+    assert max_checkpoint != -1
+    return os.path.join(run_dir, max_checkpoint_file)
+
+
+@click.command()
+@click.option('--dataset-dir', default='./data/cifar10')
+@click.option('--checkpoint', '-c', type=click.Choice(['best', 'all', 'last']),
+              default='last')
+@click.option('--restore', '-r')
+@click.option('--tracking/--no-tracking', default=True)
+@click.option('--cuda/--no-cuda', default=True)
+@click.option('--epochs', '-e', default=200)
+@click.option('--batch-size', '-b', default=32)
+@click.option('--learning-rate', '-l', default=1e-3)
+@click.option('--sgd', 'optimizer', flag_value='sgd')
+@click.option('--adam', 'optimizer', flag_value='adam', default=True)
+@click.option('--augmentation/--no-augmentation', default=True)
+@click.option('--num-workers', type=int)
+@click.option('--weight-decay', default=5e-4)
+@click.option('--model', '-m', type=click.Choice(MODELS.keys()),
+              default='resnet20')
+def main(dataset_dir, checkpoint, restore, tracking, cuda, epochs,
+         batch_size, learning_rate, optimizer, augmentation, num_workers,
+         weight_decay, model):
+    timestamp = "{:.0f}".format(datetime.utcnow().timestamp())
+    config = {k: v for k, v in locals().items()}
+
+    use_cuda = cuda and torch.cuda.is_available()
+    if use_cuda:
+        num_workers = num_workers or torch.cuda.device_count()
+    else:
+        num_workers = num_workers or 1
+
+    print(f"using {num_workers} workers for data loading")
+
+    print("Preparing data:")
+
+    if augmentation:
+        transform_train = [
+                transforms.RandomCrop(32, padding=4),
+                transforms.RandomHorizontalFlip()
+        ]
+    else:
+        transform_train = []
+
+    transform_train = transforms.Compose(transform_train + [
+        transforms.ToTensor(),
+        transforms.Normalize(MEAN, STD),
+    ])
+
+    trainset = dset.CIFAR10(root=dataset_dir, train=True, download=True,
+                            transform=transform_train)
+    train_loader = torch.utils.data.DataLoader(
+        trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers,
+        pin_memory=use_cuda)
+
+    transform_test = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize(MEAN, STD),
+    ])
+
+    testset = dset.CIFAR10(root=dataset_dir, train=False, download=True,
+                           transform=transform_test)
+    test_loader = torch.utils.data.DataLoader(
+        testset, batch_size=batch_size, shuffle=False, num_workers=num_workers,
+        pin_memory=use_cuda)
+
+    if restore is not None:
+        if restore == 'latest':
+            restore = latest_file(model)
+        print(f'Restoring model from {restore}')
+        model, start_epoch, best_accuracy = load(restore)
+        start_epoch += 1
+        print('Starting accuracy is {}'.format(best_accuracy))
+        run_dir = os.path.split(restore)[0]
+    else:
+        print(f'Building {model} model')
+        best_accuracy = -1
+        start_epoch = 1
+        run_dir = f"./run/{model}/{timestamp}"
+        model = MODELS[model]()
+
+    if not os.path.exists(run_dir):
+        os.makedirs(run_dir)
+    save_config(config, run_dir)
+
+    print(model)
+    print("{} parameters".format(count_parameters(model)))
+    print(f"Run directory set to {run_dir}")
+
+    # Save model text description
+    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
+        file.write(str(model))
+
+    if tracking:
+        train_results_file = os.path.join(run_dir, 'train_results.csv')
+        test_results_file = os.path.join(run_dir, 'test_results.csv')
+    else:
+        train_results_file = None
+        test_results_file = None
+
+    if use_cuda:
+        print('Copying model to GPU')
+        model.cuda()
+        model = torch.nn.DataParallel(
+            model, device_ids=range(torch.cuda.device_count()))
+    criterion = nn.CrossEntropyLoss()
+
+    # Other parameters?
+    if optimizer == 'adam':
+        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
+    elif optimizer == 'sgd':
+        optimizer = optim.SGD(model.parameters(), lr=learning_rate,
+                              momentum=0.9,
+                              weight_decay=weight_decay)
+    else:
+        raise NotImplementedError("Unknown optimizer: {}".format(optimizer))
+
+    train_max_value = None
+    test_max_value = None
+    end_epoch = start_epoch + epochs
+    for epoch in range(start_epoch, end_epoch):
+        print('Epoch {} of {}'.format(epoch, end_epoch - 1))
+        train_acc, train_max_value = run(epoch, model, train_loader, criterion,
+                                         optimizer, use_cuda=use_cuda,
+                                         tracking=train_results_file,
+                                         max_value=train_max_value, train=True)
+
+        test_acc, test_max_value = run(epoch, model, test_loader,
+                                       use_cuda=use_cuda,
+                                       tracking=test_results_file, train=False)
+
+        if test_acc > best_accuracy:
+            print('New best model!')
+            save(model, run_dir, epoch, test_acc, use_cuda=use_cuda,
+                 filename='checkpoint_best_model.t7')
+            best_accuracy = test_acc
+
+        last_epoch = epoch == (end_epoch - 1)
+        if checkpoint == 'all' or (checkpoint == 'last' and last_epoch):
+            save(model, run_dir, epoch, test_acc, use_cuda=use_cuda)
+
+
+if __name__ == '__main__':
+    main()
--- a/pytorch/CIFAR10/setup.py
+++ b/pytorch/CIFAR10/setup.py
@ -0,0 +1,20 @@
+from setuptools import setup
+
+# Packaging metadata for the PyTorch CIFAR10 benchmark: installs the
+# `benchmark` package together with a `bench` console command.
+setup(
+    name='benchmark',
+    version='0.0.0',
+    url='http://www.codycoleman.com',
+    author='Cody Austun Coleman',
+    author_email='cody.coleman@cs.stanford.edu',
+    packages=['benchmark'],
+    entry_points={
+        # Running `bench` on the command line calls benchmark.train:main.
+        'console_scripts': [
+            'bench = benchmark.train:main'
+        ]
+    },
+    # NOTE(review): torch and numpy are imported by benchmark/infer.py but are
+    # not listed here; presumably they are expected to be installed separately
+    # or pulled in through torchvision -- confirm.
+    install_requires=[
+        'torchvision',
+        'click',
+        'progressbar2'
+    ]
+)
--- a/tensorflow/CIFAR10/README.md
+++ b/tensorflow/CIFAR10/README.md
@ -0,0 +1,18 @@
+# ResNets on TensorFlow
+
+To train a ResNet, run,
+
+```bash
+python3 resnet/resnet_main.py --train_data_path=cifar10/data_batch* --log_root=data/resnet20/log_root \
+                              --train_dir=data/resnet20/log_root/train --dataset='cifar10' --model=resnet20 \
+                              --num_gpus=1 --checkpoint_dir=data/resnet20/checkpoints --data_format=NCHW
+```
+
+To evaluate resulting checkpoints, run,
+
+```bash
+python3 eval_checkpoints.py -i data/resnet20/checkpoints \
+                            -c "python3 resnet/resnet_main.py --mode=eval --eval_data_path=cifar10/test_batch.bin --eval_dir=data/resnet20/log_root/eval --dataset='cifar10' --model=resnet20 --num_gpus=1 --eval_batch_count=100 --eval_once=True --data_format=NCHW"
+```
+
+Before running either command, follow the instructions in `resnet/README.md` to install the prerequisites and download the CIFAR data.
--- a/tensorflow/CIFAR10/eval_checkpoints.py
+++ b/tensorflow/CIFAR10/eval_checkpoints.py
@ -0,0 +1,59 @@
+import argparse
+import os
+import subprocess
+import sys
+
+def main(checkpoints_path, command, start_cnt):
+  cnt = start_cnt
+
+  times = {}
+  cum_time = 0.0
+  with open(os.path.join(checkpoints_path, "times.log"), 'r') as f:
+    output = f.read().strip()
+    output_lines = output.split('\n')
+    for output_line in output_lines:
+        [step, time] = output_line.split('\t')
+        step = int(step.split(': ')[1])
+        time = float(time.split(': ')[1])
+        cum_time += time
+        times[step] = cum_time
+
+  print("Time (in secs)\tNumber of minibatches\tTop 1 accuracy\tTop 5 accuracy")
+  while True:
+    ckpt_path = ("%5d" % cnt).replace(' ', '0')
+    full_ckpt_path = os.path.join(checkpoints_path, ckpt_path)
+    if not os.path.exists(full_ckpt_path):
+      break
+    if len(os.listdir(full_ckpt_path)) <= 2:
+      cnt += 1
+      continue
+    full_command = command + " --log_root=%s 2>/dev/null" % full_ckpt_path
+    output = subprocess.check_output(full_command, shell=True)
+    output = output.decode('utf8').strip()
+    for line in output.split('\n'):
+      if "Precision" in line and "Recall" in line:
+        tokens = line.split(", ")  # TODO: Nasty hack, make more robust.
+        precision_at_1 = float(tokens[0].split()[-1])
+        recall_at_5 = float(tokens[1].split()[-1])
+        step = int(tokens[2].split()[3])
+        stats = [times[step], step, precision_at_1, recall_at_5]
+        print("\t".join([str(stat) for stat in stats]))
+        sys.stdout.flush()
+    cnt += 1
+
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser(
+    description=("Backup model checkpoints periodically")
+  )
+  parser.add_argument('-i', "--checkpoints_path", type=str, required=True,
+                      help="Path to dumped model checkpoints")
+  parser.add_argument('-c', "--command", type=str, required=True,
+                      help="Command to evaluate each individual checkpoint")
+  parser.add_argument('-s', "--start_cnt", type=int, default=1,
+                      help="Count to start evaluating checkpoints from")
+
+  cmdline_args = parser.parse_args()
+  opt_dict = vars(cmdline_args)
+
+  main(opt_dict["checkpoints_path"], opt_dict["command"], opt_dict["start_cnt"])
--- a/tensorflow/CIFAR10/logs/16vCPUs_gc/resnet164_b_train.log
+++ b/tensorflow/CIFAR10/logs/16vCPUs_gc/resnet164_b_train.log
--- a/tensorflow/CIFAR10/logs/16vCPUs_gc/resnet164_nb_train.log
+++ b/tensorflow/CIFAR10/logs/16vCPUs_gc/resnet164_nb_train.log
--- a/tensorflow/CIFAR10/logs/16vCPUs_gc/resnet20_train.log
+++ b/tensorflow/CIFAR10/logs/16vCPUs_gc/resnet20_train.log
--- a/tensorflow/CIFAR10/logs/16vCPUs_gc/resnet56_train.log
+++ b/tensorflow/CIFAR10/logs/16vCPUs_gc/resnet56_train.log
--- a/tensorflow/CIFAR10/logs/1k80_ec2/resnet164_b_train.log
+++ b/tensorflow/CIFAR10/logs/1k80_ec2/resnet164_b_train.log
--- a/tensorflow/CIFAR10/logs/1k80_ec2/resnet164_nb_train.log
+++ b/tensorflow/CIFAR10/logs/1k80_ec2/resnet164_nb_train.log
--- a/tensorflow/CIFAR10/logs/1k80_ec2/resnet20_train.log
+++ b/tensorflow/CIFAR10/logs/1k80_ec2/resnet20_train.log
--- a/tensorflow/CIFAR10/logs/1k80_ec2/resnet56_train.log
+++ b/tensorflow/CIFAR10/logs/1k80_ec2/resnet56_train.log
--- a/tensorflow/CIFAR10/logs/1k80_gc/resnet164_b_train.log
+++ b/tensorflow/CIFAR10/logs/1k80_gc/resnet164_b_train.log
--- a/tensorflow/CIFAR10/logs/1k80_gc/resnet164_nb_train.log
+++ b/tensorflow/CIFAR10/logs/1k80_gc/resnet164_nb_train.log
--- a/tensorflow/CIFAR10/logs/1k80_gc/resnet20_train.log
+++ b/tensorflow/CIFAR10/logs/1k80_gc/resnet20_train.log
--- a/tensorflow/CIFAR10/logs/1k80_gc/resnet56_train.log
+++ b/tensorflow/CIFAR10/logs/1k80_gc/resnet56_train.log
--- a/tensorflow/CIFAR10/logs/1p100_dawn/resnet164_b_train.log
+++ b/tensorflow/CIFAR10/logs/1p100_dawn/resnet164_b_train.log
--- a/tensorflow/CIFAR10/logs/1p100_dawn/resnet164_nb_train.log
+++ b/tensorflow/CIFAR10/logs/1p100_dawn/resnet164_nb_train.log
--- a/tensorflow/CIFAR10/logs/1p100_dawn/resnet20_train.log
+++ b/tensorflow/CIFAR10/logs/1p100_dawn/resnet20_train.log
--- a/tensorflow/CIFAR10/logs/1p100_dawn/resnet56_train.log
+++ b/tensorflow/CIFAR10/logs/1p100_dawn/resnet56_train.log
--- a/tensorflow/CIFAR10/resnet/README.md
+++ b/tensorflow/CIFAR10/resnet/README.md
@ -0,0 +1,88 @@
+# ResNet on CIFAR10 and CIFAR100
+
+(Borrowed from the tensorflow/models repository)
+
+## Dataset
+
+https://www.cs.toronto.edu/~kriz/cifar.html
+
+## Related papers
+
+- [Identity Mappings in Deep Residual Networks](https://arxiv.org/pdf/1603.05027v2.pdf)
+- [Deep Residual Learning for Image Recognition](https://arxiv.org/pdf/1512.03385v1.pdf)
+- [Wide Residual Networks](https://arxiv.org/pdf/1605.07146v1.pdf)
+
+## Setting
+
+* Pad to 36x36 and random crop. Horizontal flip. Per-image whitening.
+* Momentum optimizer (momentum = 0.9).
+* Learning rate schedule: 0.01 (1 epoch), 0.1 (90 epochs), 0.01 (45 epochs), 0.001 (45 epochs).
+* L2 weight decay: 0.005.
+* Batch size: 128. (28-10 wide and 1001 layer bottleneck use 64)
+
+## Results
+
+CIFAR-10 Model|Best Precision|Steps
+--------------|--------------|------
+32 layer|92.5%|~80k
+110 layer|93.6%|~80k
+164 layer bottleneck|94.5%|~80k
+1001 layer bottleneck|94.9%|~80k
+28-10 wide|95%|~90k
+
+CIFAR-100 Model|Best Precision|Steps
+---------------|--------------|-----
+32 layer|68.1%|~45k
+110 layer|71.3%|~60k
+164 layer bottleneck|75.7%|~50k
+1001 layer bottleneck|78.2%|~70k
+28-10 wide|78.3%|~70k
+
+## Prerequisites
+
+1. Install TensorFlow 1.2 (preferably from source for higher performance) and Python 3.6.2.
+
+2. Download CIFAR-10/CIFAR-100 dataset.
+
+```shell
+curl -o cifar-10-binary.tar.gz https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz
+curl -o cifar-100-binary.tar.gz https://www.cs.toronto.edu/~kriz/cifar-100-binary.tar.gz
+```
+
+## How to run
+
+```shell
+# cd to the models repository and run with bash. Expected command output shown.
+# The directory should contain an empty WORKSPACE file, the resnet code, and the cifar10 dataset.
+# Note: The user can split 5k from train set for eval set.
+$ ls -R
+.:
+cifar10  resnet  WORKSPACE
+
+./cifar10:
+data_batch_1.bin  data_batch_2.bin  data_batch_3.bin  data_batch_4.bin
+data_batch_5.bin  test_batch.bin
+
+./resnet:
+cifar_input.py  README.md  resnet_main.py  resnet_model.py
+
+# Train the model.
+$ python3 resnet/resnet_main.py --train_data_path=cifar10/data_batch* \
+                                --log_root=/tmp/resnet_model \
+                                --train_dir=/tmp/resnet_model/train \
+                                --dataset='cifar10' \
+                                --model=resnet20 \
+                                --num_gpus=1
+
+# While the model is training, you can also check on its progress using tensorboard:
+$ tensorboard --logdir=/tmp/resnet_model
+
+# Evaluate the model.
+# Avoid running on the same GPU as the training job at the same time,
+# otherwise, you might run out of memory.
+$ python3 resnet/resnet_main.py --eval_data_path=cifar10/test_batch.bin \
+                                --log_root=/tmp/resnet_model \
+                                --eval_dir=/tmp/resnet_model/test \
+                                --mode=eval \
+                                --dataset='cifar10' \
+                                --model=resnet20 \
+                                --num_gpus=0
+```
--- a/tensorflow/CIFAR10/resnet/cifar_input.py
+++ b/tensorflow/CIFAR10/resnet/cifar_input.py
@ -0,0 +1,121 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""CIFAR dataset input module.
+"""
+
+import tensorflow as tf
+
+def build_input(dataset, data_path, batch_size, mode, data_format):
+  """Build CIFAR image and labels.
+
+  Args:
+    dataset: Either 'cifar10' or 'cifar100'.
+    data_path: Filename for data.
+    batch_size: Input batch size.
+    mode: Either 'train' or 'eval'.
+    data_format: Either 'NCHW' or 'NHWC'.
+  Returns:
+    images: Batches of images. [batch_size, image_size, image_size, 3]
+    labels: Batches of labels. [batch_size, num_classes]
+  Raises:
+    ValueError: when the specified dataset is not supported.
+  """
+  with tf.device('/cpu:0'):
+    image_size = 32
+    if dataset == 'cifar10':
+      label_bytes = 1
+      label_offset = 0
+      num_classes = 10
+    elif dataset == 'cifar100':
+      label_bytes = 1
+      label_offset = 1
+      num_classes = 100
+    else:
+      raise ValueError('Not supported dataset %s', dataset)
+
+    depth = 3
+    image_bytes = image_size * image_size * depth
+    record_bytes = label_bytes + label_offset + image_bytes
+
+    data_files = tf.gfile.Glob(data_path)
+    file_queue = tf.train.string_input_producer(data_files, shuffle=True)
+    # Read examples from files in the filename queue.
+    reader = tf.FixedLengthRecordReader(record_bytes=record_bytes)
+    _, value = reader.read(file_queue)
+
+    # Convert these examples to dense labels and processed images.
+    record = tf.reshape(tf.decode_raw(value, tf.uint8), [record_bytes])
+    label = tf.cast(tf.slice(record, [label_offset], [label_bytes]), tf.int32)
+    # Convert from string to [depth * height * width] to [depth, height, width].
+    depth_major = tf.reshape(tf.slice(record, [label_bytes], [image_bytes]),
+                             [depth, image_size, image_size])
+    # Convert from [depth, height, width] to [height, width, depth].
+    image = tf.cast(tf.transpose(depth_major, [1, 2, 0]), tf.float32)
+
+    if mode == 'train':
+      image = tf.image.resize_image_with_crop_or_pad(
+          image, image_size+4, image_size+4)
+      image = tf.random_crop(image, [image_size, image_size, 3])
+      image = tf.image.random_flip_left_right(image)
+      # Brightness/saturation/constrast provides small gains .2%~.5% on cifar.
+      # image = tf.image.random_brightness(image, max_delta=63. / 255.)
+      # image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
+      # image = tf.image.random_contrast(image, lower=0.2, upper=1.8)
+      image = tf.image.per_image_standardization(image)
+
+      example_queue = tf.RandomShuffleQueue(
+          capacity=16 * batch_size,
+          min_after_dequeue=8 * batch_size,
+          dtypes=[tf.float32, tf.int32],
+          shapes=[[image_size, image_size, depth], [1]])
+      num_threads = 16
+    else:
+      image = tf.image.resize_image_with_crop_or_pad(
+          image, image_size, image_size)
+      image = tf.image.per_image_standardization(image)
+
+      example_queue = tf.FIFOQueue(
+          3 * batch_size,
+          dtypes=[tf.float32, tf.int32],
+          shapes=[[image_size, image_size, depth], [1]])
+      num_threads = 1
+
+    example_enqueue_op = example_queue.enqueue([image, label])
+    tf.train.add_queue_runner(tf.train.queue_runner.QueueRunner(
+        example_queue, [example_enqueue_op] * num_threads))
+
+    # Read 'batch' labels + images from the example queue.
+    images, labels = example_queue.dequeue_many(batch_size)
+    labels = tf.reshape(labels, [batch_size, 1])
+    indices = tf.reshape(tf.range(0, batch_size, 1), [batch_size, 1])
+    labels = tf.sparse_to_dense(
+        tf.concat(values=[indices, labels], axis=1),
+        [batch_size, num_classes], 1.0, 0.0)
+
+    if data_format == 'NCHW':
+      images = tf.transpose(images, [0, 3, 1, 2])
+
+    assert len(images.get_shape()) == 4
+    assert images.get_shape()[0] == batch_size
+    if data_format == 'NCHW':
+      assert images.get_shape()[1] == 3
+    else:
+      assert images.get_shape()[-1] == 3
+    assert len(labels.get_shape()) == 2
+    assert labels.get_shape()[0] == batch_size
+    assert labels.get_shape()[1] == num_classes
+
+  return images, labels
--- a/tensorflow/CIFAR10/resnet/resnet_main.py
+++ b/tensorflow/CIFAR10/resnet/resnet_main.py
@ -0,0 +1,302 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""ResNet Train/Eval module.
+"""
+import os
+import six
+import subprocess
+import sys
+import time
+
+import cifar_input
+import numpy as np
+import resnet_model
+import tensorflow as tf
+
+# Command-line flags for the train/eval driver. All values are read through
+# the module-level FLAGS object.
+FLAGS = tf.app.flags.FLAGS
+tf.app.flags.DEFINE_string('dataset', 'cifar10', 'cifar10 or cifar100.')
+tf.app.flags.DEFINE_string('mode', 'train', 'train or eval.')
+# Required: main() raises when --model is left empty and only accepts
+# resnet20, resnet56 and resnet164.
+tf.app.flags.DEFINE_string('model', '', 'model to train.')
+tf.app.flags.DEFINE_string('data_format', 'NHWC',
+                           """Data layout to use: NHWC (TF native)
+                              or NCHW (cuDNN native).""")
+tf.app.flags.DEFINE_string('train_data_path', '',
+                           'Filepattern for training data.')
+tf.app.flags.DEFINE_string('eval_data_path', '',
+                           'Filepattern for eval data')
+# NOTE(review): image_size is not read by train(), evaluate() or main() in this
+# file -- confirm whether it is still needed.
+tf.app.flags.DEFINE_integer('image_size', 32, 'Image side length.')
+tf.app.flags.DEFINE_string('train_dir', '',
+                           'Directory to keep training outputs.')
+tf.app.flags.DEFINE_string('eval_dir', '',
+                           'Directory to keep eval outputs.')
+tf.app.flags.DEFINE_integer('eval_batch_count', 50,
+                            'Number of batches to eval.')
+tf.app.flags.DEFINE_bool('eval_once', False,
+                         'Whether evaluate the model only once.')
+tf.app.flags.DEFINE_string('log_root', '',
+                           'Should be a parent directory of FLAGS.train_dir/eval_dir.')
+# WARNING: train() deletes this directory (rm -rf) before recreating it.
+tf.app.flags.DEFINE_string('checkpoint_dir', '',
+                           'Directory to store the checkpoints')
+tf.app.flags.DEFINE_integer('num_gpus', 0,
+                            'Number of gpus used for training. (0 or 1)')
+tf.app.flags.DEFINE_bool('use_bottleneck', False,
+                         'Use bottleneck module or not.')
+# When set, evaluate() only reports wall-clock time and skips accuracy.
+tf.app.flags.DEFINE_bool('time_inference', False,
+                         'Time inference.')
+# -1 makes main() pick a default: 128 for train mode, 100 for eval mode.
+tf.app.flags.DEFINE_integer('batch_size', -1,
+                            'Batch size to use.')
+
+
+def train(hps):
+  """Training loop.
+
+  Builds the input pipeline and the ResNet graph, prints parameter/FLOP
+  statistics, then runs a fixed number of training steps inside a
+  MonitoredTrainingSession. Hooks take care of logging, summaries, the
+  step-based learning-rate schedule and per-epoch checkpointing.
+
+  Args:
+    hps: resnet_model.HParams with the model hyperparameters.
+  """
+  images, labels = cifar_input.build_input(
+      FLAGS.dataset, FLAGS.train_data_path, hps.batch_size, FLAGS.mode, hps.data_format)
+  model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
+  model.build_graph()
+
+  # Print the number of trainable parameters of the graph.
+  param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
+      tf.get_default_graph(),
+      tfprof_options=tf.contrib.tfprof.model_analyzer.
+          TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
+  sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
+
+  # Print the floating point operation statistics of the graph.
+  tf.contrib.tfprof.model_analyzer.print_model_analysis(
+      tf.get_default_graph(),
+      tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)
+
+  # Fraction of the batch whose arg-max prediction matches the one-hot label.
+  truth = tf.argmax(model.labels, axis=1)
+  predictions = tf.argmax(model.predictions, axis=1)
+  precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))
+
+  summary_hook = tf.train.SummarySaverHook(
+      save_steps=100,
+      output_dir=FLAGS.train_dir,
+      summary_op=tf.summary.merge([model.summaries,
+                                   tf.summary.scalar('Precision', precision)]))
+
+  # NOTE(review): presumably ceil(50000 / 128) for the default training batch
+  # size; it is not adjusted when --batch_size is overridden -- confirm.
+  num_steps_per_epoch = 391  # TODO: Don't hardcode this.
+
+  logging_hook = tf.train.LoggingTensorHook(
+      tensors={'step': model.global_step,
+               'loss': model.cost,
+               'precision': precision},
+      every_n_iter=100)
+
+  class _LearningRateSetterHook(tf.train.SessionRunHook):
+    """Sets learning_rate based on global step."""
+
+    def begin(self):
+      self._lrn_rate = 0.01
+
+    def before_run(self, run_context):
+      return tf.train.SessionRunArgs(
+          model.global_step,  # Asks for global step value.
+          feed_dict={model.lrn_rate: self._lrn_rate})  # Sets learning rate
+
+    def after_run(self, run_context, run_values):
+      # Piecewise-constant schedule: 0.01 for the first epoch, 0.1 up to
+      # epoch 91, 0.01 up to epoch 136, 0.001 up to epoch 181, then 0.0001.
+      # The value chosen here is fed on the *next* run call.
+      train_step = run_values.results
+      if train_step < num_steps_per_epoch:
+        self._lrn_rate = 0.01
+      elif train_step < (91 * num_steps_per_epoch):
+        self._lrn_rate = 0.1
+      elif train_step < (136 * num_steps_per_epoch):
+        self._lrn_rate = 0.01
+      elif train_step < (181 * num_steps_per_epoch):
+        self._lrn_rate = 0.001
+      else:
+        self._lrn_rate = 0.0001
+
+  class _SaverHook(tf.train.SessionRunHook):
+    """Saves a checkpoint and logs the elapsed time once per epoch."""
+
+    def begin(self):
+      self.saver = tf.train.Saver(max_to_keep=10000)
+      # Start from an empty checkpoint directory: any existing contents of
+      # FLAGS.checkpoint_dir are deleted.
+      subprocess.call("rm -rf %s; mkdir -p %s" % (FLAGS.checkpoint_dir,
+                                                  FLAGS.checkpoint_dir), shell=True)
+      self.f = open(os.path.join(FLAGS.checkpoint_dir, "times.log"), 'w')
+
+    def after_create_session(self, sess, coord):
+      self.sess = sess
+      self.start_time = time.time()
+
+    def before_run(self, run_context):
+      return tf.train.SessionRunArgs(
+          model.global_step  # Asks for global step value.
+      )
+
+    def after_run(self, run_context, run_values):
+      train_step = run_values.results
+      # True division; the %d-style formats below render it as an integer.
+      epoch = train_step / num_steps_per_epoch
+      if train_step % num_steps_per_epoch == 0:
+        end_time = time.time()
+        # Checkpoints go into a zero-padded, 5-character epoch directory.
+        directory = os.path.join(FLAGS.checkpoint_dir, ("%5d" % epoch).replace(' ', '0'))
+        subprocess.call("mkdir -p %s" % directory, shell=True)
+        ckpt_name = 'model.ckpt'
+        self.saver.save(self.sess, os.path.join(directory, ckpt_name),
+                        global_step=train_step)
+        # Time is the interval since the previous checkpoint (or since session
+        # creation), measured before the save so that saving is excluded.
+        self.f.write("Step: %d\tTime: %s\n" % (train_step, end_time - self.start_time))
+        print("Saved checkpoint after %d epoch(s) to %s..." % (epoch, directory))
+        sys.stdout.flush()
+        self.start_time = time.time()
+
+    def end(self, sess):
+      self.f.close()
+
+  with tf.train.MonitoredTrainingSession(
+      checkpoint_dir=FLAGS.log_root,
+      hooks=[logging_hook, _LearningRateSetterHook()],
+      chief_only_hooks=[summary_hook, _SaverHook()],
+      save_checkpoint_secs=None,
+      # Since we provide a SummarySaverHook, we need to disable the default
+      # SummarySaverHook. To do that we pass None for both save_summaries_steps
+      # and save_summaries_secs.
+      save_summaries_steps=None,
+      save_summaries_secs=None,
+      config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
+    # Fixed training budget: 181 epochs' worth of steps.
+    for i in range(num_steps_per_epoch * 181):
+      mon_sess.run(model.train_op)
+
+def evaluate(hps):
+  """Eval loop."""
+  images, labels = cifar_input.build_input(
+      FLAGS.dataset, FLAGS.eval_data_path, hps.batch_size, FLAGS.mode, hps.data_format)
+  model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
+  model.build_graph()
+  saver = tf.train.Saver()
+  summary_writer = tf.summary.FileWriter(FLAGS.eval_dir)
+
+  sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
+  tf.train.start_queue_runners(sess)
+
+  best_precision = 0.0
+  while True:
+    try:
+      ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
+    except tf.errors.OutOfRangeError as e:
+      tf.logging.error('Cannot restore checkpoint: %s', e)
+      continue
+    if not (ckpt_state and ckpt_state.model_checkpoint_path):
+      tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
+      break
+    tf.logging.info('Loading checkpoint %s', ckpt_state.model_checkpoint_path)
+    saver.restore(sess, ckpt_state.model_checkpoint_path)
+
+    global_step = ckpt_state.model_checkpoint_path.split('/')[-1].split('-')[-1]
+    if not global_step.isdigit():
+      global_step = 0
+    else:
+      global_step = int(global_step)
+
+    total_prediction, correct_prediction, correct_prediction_top5 = 0, 0, 0
+    start_time = time.time()
+    for _ in six.moves.range(FLAGS.eval_batch_count):
+      (summaries, loss, predictions, truth, train_step) = sess.run(
+          [model.summaries, model.cost, model.predictions,
+           model.labels, model.global_step])
+
+      if not FLAGS.time_inference:
+        for (indiv_truth, indiv_prediction) in zip(truth, predictions):
+          indiv_truth = np.argmax(indiv_truth)
+          top5_prediction = np.argsort(indiv_prediction)[-5:]
+          top1_prediction = np.argsort(indiv_prediction)[-1]
+          correct_prediction += (indiv_truth == top1_prediction)
+          if indiv_truth in top5_prediction:
+            correct_prediction_top5 += 1
+          total_prediction += 1
+
+    if FLAGS.time_inference:
+      print("Time for inference: %.4f" % (time.time() - start_time))
+    else:
+      precision = 1.0 * correct_prediction / total_prediction
+      precision_top5 = 1.0 * correct_prediction_top5 / total_prediction
+      best_precision = max(precision, best_precision)
+
+      precision_summ = tf.Summary()
+      precision_summ.value.add(
+          tag='Precision', simple_value=precision)
+      summary_writer.add_summary(precision_summ, train_step)
+      best_precision_summ = tf.Summary()
+      best_precision_summ.value.add(
+          tag='Best Precision', simple_value=best_precision)
+      summary_writer.add_summary(best_precision_summ, train_step)
+      summary_writer.add_summary(summaries, train_step)
+      print('Precision @ 1 = %.4f, Recall @ 5 = %.4f, Global step = %d' %
+            (precision, precision_top5, global_step))
+      summary_writer.flush()
+
+    if FLAGS.eval_once:
+      break
+
+    time.sleep(60)
+
+
+def main(_):
+  if FLAGS.model == '':
+    raise Exception('--model must be specified.')
+
+  if FLAGS.num_gpus == 0:
+    dev = '/cpu:0'
+  elif FLAGS.num_gpus == 1:
+    dev = '/gpu:0'
+  else:
+    raise ValueError('Only support 0 or 1 gpu.')
+
+  if FLAGS.batch_size == -1:
+    if FLAGS.mode == 'train':
+      batch_size = 128
+    elif FLAGS.mode == 'eval':
+      batch_size = 100
+  else:
+    batch_size = FLAGS.batch_size
+
+  if FLAGS.dataset == 'cifar10':
+    num_classes = 10
+  elif FLAGS.dataset == 'cifar100':
+    num_classes = 100
+
+  if FLAGS.model == 'resnet20':
+    num_residual_units = 3
+  elif FLAGS.model == 'resnet56':
+    num_residual_units = 9
+  elif FLAGS.model == 'resnet164' and FLAGS.use_bottleneck:
+    num_residual_units = 18
+  elif FLAGS.model == 'resnet164' and not FLAGS.use_bottleneck:
+    num_residual_units = 27
+  else:
+    raise Exception("Invalid model -- only resnet20, resnet56 and resnet164 supported")
+
+  data_format = FLAGS.data_format
+
+  hps = resnet_model.HParams(batch_size=batch_size,
+                             num_classes=num_classes,
+                             min_lrn_rate=0.0001,
+                             lrn_rate=0.1,
+                             num_residual_units=num_residual_units,
+                             use_bottleneck=FLAGS.use_bottleneck,
+                             weight_decay_rate=0.0005,
+                             relu_leakiness=0.1,
+                             optimizer='mom',
+                             data_format=data_format)
+
+  with tf.device(dev):
+    if FLAGS.mode == 'train':
+      train(hps)
+    elif FLAGS.mode == 'eval':
+      evaluate(hps)
+
+
+# Script entry point: enable INFO-level logging, then let tf.app.run() parse
+# the flags and call main().
+if __name__ == '__main__':
+  tf.logging.set_verbosity(tf.logging.INFO)
+  tf.app.run()
--- a/tensorflow/CIFAR10/resnet/resnet_model.py
+++ b/tensorflow/CIFAR10/resnet/resnet_model.py
@ -0,0 +1,281 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""ResNet model.
+
+Related papers:
+https://arxiv.org/pdf/1603.05027v2.pdf
+https://arxiv.org/pdf/1512.03385v1.pdf
+https://arxiv.org/pdf/1605.07146v1.pdf
+"""
+from collections import namedtuple
+
+import numpy as np
+import tensorflow as tf
+import six
+
+from tensorflow.python.training import moving_averages
+
+
+# Immutable bundle of model hyperparameters handed to ResNet. data_format is
+# expected to be 'NHWC' or 'NCHW' (anything else makes _stride_arr raise);
+# optimizer is read by _build_train_op, which handles 'sgd' and 'mom'.
+HParams = namedtuple('HParams',
+                     'batch_size, num_classes, min_lrn_rate, lrn_rate, '
+                     'num_residual_units, use_bottleneck, weight_decay_rate, '
+                     'relu_leakiness, optimizer, data_format')
+
+
+class ResNet(object):
+  """ResNet model."""
+
+  def __init__(self, hps, images, labels, mode):
+    """ResNet constructor.
+
+    Args:
+      hps: Hyperparameters.
+      images: Batches of images. [batch_size, image_size, image_size, 3]
+      labels: Batches of labels. [batch_size, num_classes]
+      mode: One of 'train' and 'eval'.
+    """
+    self.hps = hps
+    self._images = images
+    self.labels = labels
+    self.mode = mode
+
+    self._extra_train_ops = []
+
+  def build_graph(self):
+    """Build a whole graph for the model.
+
+    Creates (or reuses) the global step, builds the forward model and cost,
+    adds the training op when in 'train' mode, and finally merges every
+    summary registered so far into self.summaries.
+    """
+    self.global_step = tf.contrib.framework.get_or_create_global_step()
+    self._build_model()
+    if self.mode == 'train':
+      self._build_train_op()
+    # Must come last so that summaries added by the steps above are included.
+    self.summaries = tf.summary.merge_all()
+
+  def _stride_arr(self, stride):
+    """Map a stride scalar to the stride array for tf.nn.conv2d."""
+    if self.hps.data_format == 'NHWC':
+      return [1, stride, stride, 1]
+    elif self.hps.data_format == 'NCHW':
+      return [1, 1, stride, stride]
+    else:
+      raise Exception("Invalid data_format")
+
+  def _build_model(self):
+    """Build the core model within the graph."""
+    with tf.variable_scope('init'):
+      x = self._images
+      x = self._conv('init_conv', x, 3, 3, 16, self._stride_arr(1))
+
+    strides = [1, 2, 2]
+    activate_before_residual = [True, False, False]
+    if self.hps.use_bottleneck:
+      res_func = self._bottleneck_residual
+      filters = [16, 64, 128, 256]
+    else:
+      res_func = self._residual
+      filters = [16, 16, 32, 64]
+      # Uncomment the following codes to use w28-10 wide residual network.
+      # It is more memory efficient than very deep residual network and has
+      # comparably good performance.
+      # https://arxiv.org/pdf/1605.07146v1.pdf
+      # filters = [16, 160, 320, 640]
+      # Update hps.num_residual_units to 4
+
+    with tf.variable_scope('unit_1_0'):
+      x = res_func(x, filters[0], filters[1], self._stride_arr(strides[0]),
+                   activate_before_residual[0])
+    for i in six.moves.range(1, self.hps.num_residual_units):
+      with tf.variable_scope('unit_1_%d' % i):
+        x = res_func(x, filters[1], filters[1], self._stride_arr(1), False)
+
+    with tf.variable_scope('unit_2_0'):
+      x = res_func(x, filters[1], filters[2], self._stride_arr(strides[1]),
+                   activate_before_residual[1])
+    for i in six.moves.range(1, self.hps.num_residual_units):
+      with tf.variable_scope('unit_2_%d' % i):
+        x = res_func(x, filters[2], filters[2], self._stride_arr(1), False)
+
+    with tf.variable_scope('unit_3_0'):
+      x = res_func(x, filters[2], filters[3], self._stride_arr(strides[2]),
+                   activate_before_residual[2])
+    for i in six.moves.range(1, self.hps.num_residual_units):
+      with tf.variable_scope('unit_3_%d' % i):
+        x = res_func(x, filters[3], filters[3], self._stride_arr(1), False)
+
+    with tf.variable_scope('unit_last'):
+      x = self._batch_norm('final_bn', x)
+      x = self._relu(x, self.hps.relu_leakiness)
+      x = self._global_avg_pool(x)
+
+    with tf.variable_scope('logit'):
+      logits = self._fully_connected(x, self.hps.num_classes)
+      self.predictions = tf.nn.softmax(logits)
+
+    with tf.variable_scope('costs'):
+      xent = tf.nn.softmax_cross_entropy_with_logits(
+          logits=logits, labels=self.labels)
+      self.cost = tf.reduce_mean(xent, name='xent')
+      self.cost += self._decay()
+
+      tf.summary.scalar('cost', self.cost)
+
+  def _build_train_op(self):
+    """Build training specific ops for the graph."""
+    self.lrn_rate = tf.constant(self.hps.lrn_rate, tf.float32)
+    tf.summary.scalar('learning_rate', self.lrn_rate)
+
+    trainable_variables = tf.trainable_variables()
+    grads = tf.gradients(self.cost, trainable_variables)
+
+    if self.hps.optimizer == 'sgd':
+      optimizer = tf.train.GradientDescentOptimizer(self.lrn_rate)
+    elif self.hps.optimizer == 'mom':
+      optimizer = tf.train.MomentumOptimizer(self.lrn_rate, 0.9)
+
+    apply_op = optimizer.apply_gradients(
+        zip(grads, trainable_variables),
+        global_step=self.global_step, name='train_step')
+
+    train_ops = [apply_op] + self._extra_train_ops
+    self.train_op = tf.group(*train_ops)
+
+  # TODO(xpan): Consider batch_norm in contrib/layers/python/layers/layers.py
+  def _batch_norm(self, name, x):
+    """Apply fused batch normalization to `x` inside variable scope `name`.
+
+    Uses decay 0.9 and epsilon 0.001, follows `self.hps.data_format`, and
+    runs in training mode only when `self.mode == 'train'`. Passing
+    `updates_collections=None` asks the layer to apply its moving-average
+    updates in place rather than collecting them.
+    """
+    training = (self.mode == 'train')
+    with tf.variable_scope(name) as bn_scope:
+      normalized = tf.contrib.layers.batch_norm(
+          x,
+          decay=0.9,
+          epsilon=0.001,
+          data_format=self.hps.data_format,
+          scope=bn_scope,
+          is_training=training,
+          fused=True,
+          updates_collections=None)
+    return normalized
+
+  def _residual(self, x, in_filter, out_filter, stride,
+                activate_before_residual=False):
+    """Residual unit with 2 sub layers (two 3x3 convolutions).
+
+    Args:
+      x: input tensor.
+      in_filter: number of input channels.
+      out_filter: number of output channels.
+      stride: strides passed to the first convolution and, when the channel
+        count changes, to the average pool on the shortcut.
+      activate_before_residual: if true, the shortcut branches off after the
+        initial batch-norm + relu; otherwise it uses the raw input.
+
+    Returns:
+      The sum of the convolution branch and the (possibly pooled and
+      channel-padded) shortcut.
+    """
+    if not activate_before_residual:
+      with tf.variable_scope('residual_only_activation'):
+        # Shortcut keeps the un-activated input.
+        orig_x = x
+        x = self._batch_norm('init_bn', x)
+        x = self._relu(x, self.hps.relu_leakiness)
+    else:
+      with tf.variable_scope('shared_activation'):
+        x = self._batch_norm('init_bn', x)
+        x = self._relu(x, self.hps.relu_leakiness)
+        # Shortcut shares the activated input.
+        orig_x = x
+
+    with tf.variable_scope('sub1'):
+      x = self._conv('conv1', x, 3, in_filter, out_filter, stride)
+
+    with tf.variable_scope('sub2'):
+      x = self._batch_norm('bn2', x)
+      x = self._relu(x, self.hps.relu_leakiness)
+      unit_stride = [1, 1, 1, 1]
+      x = self._conv('conv2', x, 3, out_filter, out_filter, unit_stride)
+
+    with tf.variable_scope('sub_add'):
+      if in_filter != out_filter:
+        layout = self.hps.data_format
+        orig_x = tf.nn.avg_pool(orig_x, stride, stride, 'VALID',
+                                data_format=layout)
+        # The same amount of padding goes on each side of the channel axis.
+        channel_pad = (out_filter - in_filter) // 2
+        if layout == 'NHWC':
+          paddings = [[0, 0], [0, 0], [0, 0], [channel_pad, channel_pad]]
+          orig_x = tf.pad(orig_x, paddings)
+        elif layout == 'NCHW':
+          paddings = [[0, 0], [channel_pad, channel_pad], [0, 0], [0, 0]]
+          orig_x = tf.pad(orig_x, paddings)
+      x += orig_x
+
+    tf.logging.debug('image after unit %s', x.get_shape())
+    return x
+
+  def _bottleneck_residual(self, x, in_filter, out_filter, stride,
+                           activate_before_residual=False):
+    """Bottleneck residual unit with 3 sub layers.
+
+    The convolution branch is 1x1 (down to `out_filter // 4` channels),
+    3x3, then 1x1 (back up to `out_filter` channels). When `in_filter` and
+    `out_filter` differ, the shortcut is projected with a strided 1x1
+    convolution.
+
+    Args:
+      x: input tensor.
+      in_filter: number of input channels.
+      out_filter: number of output channels.
+      stride: strides for the first convolution and the shortcut projection.
+      activate_before_residual: if true, the shortcut branches off after the
+        initial batch-norm + relu; otherwise it uses the raw input.
+
+    Returns:
+      The sum of the convolution branch and the shortcut.
+    """
+    # Integer division: under Python 3 `out_filter / 4` is a float, which is
+    # not a valid dimension for the kernel variable's shape.
+    bottleneck_filter = out_filter // 4
+
+    if activate_before_residual:
+      with tf.variable_scope('common_bn_relu'):
+        x = self._batch_norm('init_bn', x)
+        x = self._relu(x, self.hps.relu_leakiness)
+        orig_x = x
+    else:
+      with tf.variable_scope('residual_bn_relu'):
+        orig_x = x
+        x = self._batch_norm('init_bn', x)
+        x = self._relu(x, self.hps.relu_leakiness)
+
+    with tf.variable_scope('sub1'):
+      x = self._conv('conv1', x, 1, in_filter, bottleneck_filter, stride)
+
+    with tf.variable_scope('sub2'):
+      x = self._batch_norm('bn2', x)
+      x = self._relu(x, self.hps.relu_leakiness)
+      x = self._conv('conv2', x, 3, bottleneck_filter, bottleneck_filter,
+                     [1, 1, 1, 1])
+
+    with tf.variable_scope('sub3'):
+      x = self._batch_norm('bn3', x)
+      x = self._relu(x, self.hps.relu_leakiness)
+      x = self._conv('conv3', x, 1, bottleneck_filter, out_filter,
+                     [1, 1, 1, 1])
+
+    with tf.variable_scope('sub_add'):
+      if in_filter != out_filter:
+        orig_x = self._conv('project', orig_x, 1, in_filter, out_filter, stride)
+      x += orig_x
+
+    tf.logging.info('image after unit %s', x.get_shape())
+    return x
+
+  def _decay(self):
+    """L2 weight decay loss.
+
+    Sums `tf.nn.l2_loss` over every trainable variable whose op name
+    contains 'DW' at an index greater than zero, then scales the sum by
+    `self.hps.weight_decay_rate`.
+    """
+    l2_terms = [
+        tf.nn.l2_loss(var)
+        for var in tf.trainable_variables()
+        if var.op.name.find(r'DW') > 0
+    ]
+    return tf.multiply(self.hps.weight_decay_rate, tf.add_n(l2_terms))
+
+  def _conv(self, name, x, filter_size, in_filters, out_filters, strides):
+    """Convolution with 'SAME' padding and no bias.
+
+    The kernel variable is named 'DW' and is drawn from a normal
+    distribution with stddev sqrt(2 / (filter_size^2 * out_filters)).
+    """
+    with tf.variable_scope(name):
+      fan = filter_size * filter_size * out_filters
+      kernel_shape = [filter_size, filter_size, in_filters, out_filters]
+      kernel_init = tf.random_normal_initializer(stddev=np.sqrt(2.0/fan))
+      kernel = tf.get_variable(
+          'DW', kernel_shape, tf.float32, initializer=kernel_init)
+      return tf.nn.conv2d(x, kernel, strides, padding='SAME',
+                          data_format=self.hps.data_format)
+
+  def _relu(self, x, leakiness=0.0):
+    """Relu, with optional leaky support.
+
+    Elements of `x` below zero are replaced by `leakiness * x`; all other
+    elements pass through unchanged.
+    """
+    is_negative = tf.less(x, 0.0)
+    leaked = leakiness * x
+    return tf.where(is_negative, leaked, x, name='leaky_relu')
+
+  def _fully_connected(self, x, out_dim):
+    """FullyConnected layer for final output.
+
+    Flattens `x` to `[self.hps.batch_size, -1]` and returns `x @ DW + biases`
+    with `out_dim` output units.
+    """
+    flat = tf.reshape(x, [self.hps.batch_size, -1])
+    in_dim = flat.get_shape()[1]
+    weights = tf.get_variable(
+        'DW', [in_dim, out_dim],
+        initializer=tf.uniform_unit_scaling_initializer(factor=1.0))
+    bias = tf.get_variable(
+        'biases', [out_dim], initializer=tf.constant_initializer())
+    return tf.nn.xw_plus_b(flat, weights, bias)
+
+  def _global_avg_pool(self, x):
+    """Average a rank-4 tensor over its two spatial axes.
+
+    The axes are chosen from `self.hps.data_format`: [1, 2] for 'NHWC' and
+    [2, 3] for 'NCHW'. Any other format yields None.
+    """
+    assert x.get_shape().ndims == 4
+    layout = self.hps.data_format
+    if layout == 'NHWC':
+      spatial_axes = [1, 2]
+    elif layout == 'NCHW':
+      spatial_axes = [2, 3]
+    else:
+      return None
+    return tf.reduce_mean(x, spatial_axes)
--- a/tensorflow/CIFAR10/time_inference.py
+++ b/tensorflow/CIFAR10/time_inference.py
@ -0,0 +1,51 @@
+import argparse
+import os
+import subprocess
+import sys
+
+def main(checkpoint_path, model, use_bottleneck):
+  """Time evaluation over a sweep of batch sizes and print a TSV table.
+
+  For each batch size, runs `resnet/resnet_main.py` in eval mode through the
+  shell, looks for lines containing "Time for inference" in its stdout, and
+  prints `batch_size<TAB>time / num_trials`. If a run fails, the row is
+  printed with an empty time and the sweep continues.
+
+  Args:
+    checkpoint_path: value passed to the child's `--log_root` flag.
+    model: value passed to `--model` and used in the `--eval_dir` path.
+    use_bottleneck: truthy to pass `--use_bottleneck=True`, else `False`.
+  """
+  print("Number of images\tInference time")
+  num_trials = 10
+  for batch_size in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]:
+    # NOTE(review): --eval_batch_count is passed twice (num_trials, then 1).
+    # If the child honours the last occurrence, dividing the reported time
+    # by num_trials below is off by that factor -- confirm against
+    # resnet_main.py.
+    command = ("python3 resnet/resnet_main.py --mode=eval --eval_data_path=cifar10/test_batch.bin "
+               "--eval_dir=data/%(model)s/log_root/eval --dataset='cifar10' --model=%(model)s "
+               "--use_bottleneck=%(use_bottleneck)s --eval_batch_count=%(num_trials)d --eval_once=True --num_gpus=1 "
+               "--data_format=NHWC --time_inference=True --eval_batch_count=1 --batch_size=%(batch_size)d" %
+               {"model": model, "use_bottleneck": "True" if use_bottleneck else "False", "batch_size": batch_size,
+                "num_trials": num_trials})
+    full_command = command + " --log_root=%s 2>/dev/null" % checkpoint_path
+    try:
+      output = subprocess.check_output(full_command, shell=True)
+      output = output.decode('utf8').strip()
+      for line in output.split('\n'):
+        if "Time for inference" in line:
+          line = line.strip()
+          inference_time = float(line.split(": ")[1]) / num_trials
+          stats = [batch_size, inference_time]
+          print("\t".join([str(stat) for stat in stats]))
+          sys.stdout.flush()
+    except Exception:
+      # Best-effort sweep: a failed run (non-zero exit, unparsable output)
+      # becomes an empty cell. Catching Exception rather than using a bare
+      # `except:` lets Ctrl-C (KeyboardInterrupt) still abort the sweep.
+      stats = [batch_size, ""]
+      print("\t".join([str(stat) for stat in stats]))
+      sys.stdout.flush()
+
+
+if __name__ == '__main__':
+  def _str2bool(value):
+    """Parse a command-line boolean such as 'True', 'false', 'yes' or '0'.
+
+    `type=bool` would treat every non-empty string -- including 'False' --
+    as True, because it only tests the truthiness of the raw string.
+    """
+    lowered = value.strip().lower()
+    if lowered in ('true', 't', 'yes', 'y', '1'):
+      return True
+    if lowered in ('false', 'f', 'no', 'n', '0', ''):
+      return False
+    raise argparse.ArgumentTypeError("Expected a boolean, got %r" % (value,))
+
+  parser = argparse.ArgumentParser(
+    description=("Time model inference for a range of batch sizes")
+  )
+  parser.add_argument('-i', "--checkpoint_path", type=str, required=True,
+                      help="Path to dumped model checkpoints")
+  parser.add_argument('-m', "--model", type=str, required=True,
+                      help="Model name")
+  parser.add_argument('-b', "--use_bottleneck", type=_str2bool, default=False,
+                      help="Use bottleneck")
+
+  cmdline_args = parser.parse_args()
+  opt_dict = vars(cmdline_args)
+
+  checkpoint_path = opt_dict["checkpoint_path"]
+  model = opt_dict["model"]
+  use_bottleneck = opt_dict["use_bottleneck"]
+
+  main(checkpoint_path, model, use_bottleneck)
--- a/tensorflow/SQuAD/.gitignore
+++ b/tensorflow/SQuAD/.gitignore
@ -0,0 +1,3 @@
+out/
+data/
+*/__pycache__/
--- a/tensorflow/SQuAD/README.md
+++ b/tensorflow/SQuAD/README.md
@ -0,0 +1,165 @@
+# Bi-directional Attention Flow for Machine Comprehension
+ 
+- This is the original implementation of [Bi-directional Attention Flow for Machine Comprehension][paper] (Seo et al., 2016).
+- This is the TensorFlow v1.1.0 compatible version. It is not compatible with previously trained models, 
+so if you want to use them, go to [v0.2.1][v0.2.1]. 
+- The CodaLab worksheet for the [SQuAD Leaderboard][squad] submission is available [here][worksheet].
+- Please contact [Minjoon Seo][minjoon] ([@seominjoon][minjoon-github]) for questions and suggestions.
+
+## 0. Requirements
+#### General
+- Python (developed on 3.5.2. Issues have been reported with Python 2!)
+- unzip
+
+#### Python Packages
+- tensorflow (deep learning library, verified on 1.1.0)
+- nltk (NLP tools, verified on 3.2.1)
+- tqdm (progress bar, verified on 4.7.4)
+- jinja2 (for visualization; not needed if you only train and test)
+
+## 1. Pre-processing
+First, prepare the data. Download the SQuAD data, GloVe vectors, and nltk corpus
+(~850 MB, this will download files to `$HOME/data`):
+```
+chmod +x download.sh; ./download.sh
+```
+
+Second, preprocess the Stanford QA dataset (along with the GloVe vectors) and save the results in `$PWD/data/squad` (~5 minutes):
+```
+python -m squad.prepro
+```
+
+## 2. Training
+The model was trained with NVidia Titan X (Pascal Architecture, 2016).
+The model requires at least 12GB of GPU RAM.
+If your GPU RAM is smaller than 12GB, you can either decrease batch size (performance might degrade),
+or you can use multi GPU (see below).
+The training converges at ~18k steps, and it took ~4s per step (i.e. ~20 hours).
+
+Before training, it is recommended to first try the following code to verify everything is okay and memory is sufficient:
+```
+python -m basic.cli --mode train --noload --debug
+```
+
+Then to fully train, run:
+```
+python -m basic.cli --mode train --noload
+```
+
+You can speed up the training process with optimization flags:
+```
+python -m basic.cli --mode train --noload --len_opt --cluster
+```
+You can still omit them, but training will be much slower.
+
+
+## 3. Test
+To test, run:
+```
+python -m basic.cli
+```
+
+Similarly to training, you can give the optimization flags to speed up test (5 minutes on dev data):
+```
+python -m basic.cli --len_opt --cluster
+```
+
+This command loads the most recently saved model during training and begins testing on the test data.
+After the process ends, it prints F1 and EM scores, and also outputs a json file (`$PWD/out/basic/00/answer/test-####.json`,
+where `####` is the step # that the model was saved).
+Note that the printed scores are not official (our scoring scheme is a bit harsher).
+To obtain the official number, use the official evaluator (copied in `squad` folder) and the output json file:
+
+```
+python squad/evaluate-v1.1.py $HOME/data/squad/dev-v1.1.json out/basic/00/answer/test-####.json
+```
+
+### 3.1 Loading from pre-trained weights
+NOTE: this version is not compatible with the following trained models. 
+For compatibility, use [v0.2.1][v0.2.1]. 
+
+Instead of training the model yourself, you can choose to use pre-trained weights that were used for [SQuAD Leaderboard][squad] submission.
+Refer to [this worksheet][worksheet] in CodaLab to reproduce the results.
+If you are unfamiliar with CodaLab, follow these simple steps (given that you met all prereqs above):
+
+1. Download `save.zip` from the [worksheet][worksheet] and unzip it in the current directory.
+2. Copy `glove.6B.100d.txt` from your glove data folder (`$HOME/data/glove/`) to the current directory.
+3. To reproduce single model:
+  
+  ```
+  basic/run_single.sh $HOME/data/squad/dev-v1.1.json single.json
+  ```
+  
+  This writes the answers to `single.json` in the current directory. You can then use the official evaluator to obtain EM and F1 scores. If you want to run on GPU (~5 mins), change the value of batch_size flag in the shell file to a higher number (60 for 12GB GPU RAM). 
+4. Similarly, to reproduce ensemble method:
+  
+  ```
+  basic/run_ensemble.sh $HOME/data/squad/dev-v1.1.json ensemble.json 
+  ```
+  If you want to run on GPU, you should run the script sequentially by removing the '&' in the for loop; otherwise you will need to specify a different GPU for each iteration of the loop.
+
+## Results
+
+### Dev Data
+
+|          | EM (%) | F1 (%) |
+| -------- |:------:|:------:|
+| single   | 67.8   | 77.4   |
+
+### Dev Data (old)
+NOTE: These numbers are from [v0.2.1][v0.2.1]. 
+
+|          | EM (%) | F1 (%) |
+| -------- |:------:|:------:|
+| single   | 67.7   | 77.3   |
+| ensemble | 72.6   | 80.7   |
+
+
+### Test Data (old)
+NOTE: These numbers are from [v0.2.1][v0.2.1]. 
+
+|          | EM (%) | F1 (%) |
+| -------- |:------:|:------:|
+| single   | 68.0   | 77.3   |
+| ensemble | 73.3   | 81.1   |
+
+Refer to [our paper][paper] for more details.
+See [SQuAD Leaderboard][squad] to compare with other models.
+
+
+<!--
+## Using Pre-trained Model
+
+If you would like to use pre-trained model, it's very easy! 
+You can download the model weights [here][save] (make sure that its commit id matches the source code's).
+Extract them and put them in `$PWD/out/basic/00/save` directory, with names unchanged.
+Then do the testing again, but you need to specify the step # that you are loading from:
+```
+python -m basic.cli --mode test --batch_size 8 --eval_num_batches 0 --load_step ####
+```
+-->
+
+
+## Multi-GPU Training & Testing
+Our model supports multi-GPU training.
+We follow the parallelization paradigm described in [TensorFlow Tutorial][multi-gpu].
+In short, if you want to use a batch size of 60 (the default) but you have 3 GPUs with 4GB of RAM each,
+then you initialize each GPU with batch size of 20, and combine the gradients on CPU.
+This can be easily done by running:
+```
+python -m basic.cli --mode train --noload --num_gpus 3 --batch_size 20
+```
+
+Similarly, you can speed up your testing by:
+```
+python -m basic.cli --num_gpus 3 --batch_size 20 
+```
+ 
+
+[multi-gpu]: https://www.tensorflow.org/versions/r0.11/tutorials/deep_cnn/index.html#training-a-model-using-multiple-gpu-cards
+[squad]: http://stanford-qa.com
+[paper]: https://arxiv.org/abs/1611.01603
+[worksheet]: https://worksheets.codalab.org/worksheets/0x37a9b8c44f6845c28866267ef941c89d/
+[minjoon]: https://seominjoon.github.io
+[minjoon-github]: https://github.com/seominjoon
+[v0.2.1]: https://github.com/allenai/bi-att-flow/tree/v0.2.1
--- a/tensorflow/SQuAD/basic/init.py
+++ b/tensorflow/SQuAD/basic/init.py
--- a/tensorflow/SQuAD/basic/cli.py
+++ b/tensorflow/SQuAD/basic/cli.py
@ -0,0 +1,112 @@
+import os
+
+import tensorflow as tf
+
+from basic.main import main as m
+
+# Command-line flag definitions. The bracketed value at the end of each help
+# string mirrors the flag's default and must be kept in sync with it.
+flags = tf.app.flags
+
+# Names and directories
+flags.DEFINE_string("model_name", "basic", "Model name [basic]")
+flags.DEFINE_string("data_dir", "data/squad", "Data dir [data/squad]")
+flags.DEFINE_string("run_id", "0", "Run ID [0]")
+flags.DEFINE_string("out_base_dir", "out", "out base dir [out]")
+flags.DEFINE_string("forward_name", "single", "Forward name [single]")
+flags.DEFINE_string("answer_path", "", "Answer path []")
+flags.DEFINE_string("eval_path", "", "Eval path []")
+flags.DEFINE_string("load_path", "", "Load path []")
+flags.DEFINE_string("shared_path", "", "Shared path []")
+
+# Device placement
+flags.DEFINE_string("device", "/cpu:0", "default device for summing gradients. [/cpu:0]")
+flags.DEFINE_string("device_type", "gpu", "device for computing gradients (parallelization). cpu | gpu [gpu]")
+flags.DEFINE_integer("num_gpus", 1, "num of gpus or cpus for computing gradients [1]")
+
+# Essential training and test options
+flags.DEFINE_string("mode", "test", "train | test | forward [test]")
+flags.DEFINE_boolean("load", True, "load saved data? [True]")
+flags.DEFINE_bool("single", False, "supervise only the answer sentence? [False]")
+flags.DEFINE_boolean("debug", False, "Debugging mode? [False]")
+flags.DEFINE_bool('load_ema', True, "load exponential average of variables when testing?  [True]")
+flags.DEFINE_bool("eval", True, "eval? [True]")
+flags.DEFINE_bool("wy", False, "Use wy for loss / eval? [False]")
+flags.DEFINE_bool("na", False, "Enable no answer strategy and learn bias? [False]")
+flags.DEFINE_float("th", 0.5, "Threshold [0.5]")
+
+# Training / test parameters
+flags.DEFINE_integer("batch_size", 60, "Batch size [60]")
+flags.DEFINE_integer("val_num_batches", 100, "validation num batches [100]")
+flags.DEFINE_integer("test_num_batches", 0, "test num batches [0]")
+flags.DEFINE_integer("num_epochs", 12, "Total number of epochs for training [12]")
+flags.DEFINE_integer("num_steps", 20000, "Number of steps [20000]")
+flags.DEFINE_integer("load_step", 0, "load step [0]")
+flags.DEFINE_float("init_lr", 0.001, "Initial learning rate [0.001]")
+flags.DEFINE_float("input_keep_prob", 0.8, "Input keep prob for the dropout of LSTM weights [0.8]")
+flags.DEFINE_float("keep_prob", 0.8, "Keep prob for the dropout of Char-CNN weights [0.8]")
+flags.DEFINE_float("wd", 0.0, "L2 weight decay for regularization [0.0]")
+flags.DEFINE_integer("hidden_size", 100, "Hidden size [100]")
+flags.DEFINE_integer("char_out_size", 100, "char-level word embedding size [100]")
+flags.DEFINE_integer("char_emb_size", 8, "Char emb size [8]")
+flags.DEFINE_string("out_channel_dims", "100", "Out channel dims of Char-CNN, separated by commas [100]")
+flags.DEFINE_string("filter_heights", "5", "Filter heights of Char-CNN, separated by commas [5]")
+flags.DEFINE_bool("finetune", False, "Finetune word embeddings? [False]")
+flags.DEFINE_bool("highway", True, "Use highway? [True]")
+flags.DEFINE_integer("highway_num_layers", 2, "highway num layers [2]")
+flags.DEFINE_bool("share_cnn_weights", True, "Share Char-CNN weights [True]")
+flags.DEFINE_bool("share_lstm_weights", True, "Share pre-processing (phrase-level) LSTM weights [True]")
+flags.DEFINE_float("var_decay", 0.999, "Exponential moving average decay for variables [0.999]")
+
+# Optimizations
+flags.DEFINE_bool("cluster", False, "Cluster data for faster training [False]")
+flags.DEFINE_bool("len_opt", False, "Length optimization? [False]")
+flags.DEFINE_bool("cpu_opt", False, "CPU optimization? GPU computation can be slower [False]")
+
+# Logging and saving options
+flags.DEFINE_boolean("progress", True, "Show progress? [True]")
+flags.DEFINE_integer("log_period", 100, "Log period [100]")
+flags.DEFINE_integer("eval_period", 1000, "Eval period [1000]")
+flags.DEFINE_integer("save_period", 1000, "Save Period [1000]")
+flags.DEFINE_integer("max_to_keep", 20, "Max recent saves to keep [20]")
+flags.DEFINE_bool("dump_eval", True, "dump eval? [True]")
+flags.DEFINE_bool("dump_answer", True, "dump answer? [True]")
+flags.DEFINE_bool("vis", False, "output visualization numbers? [False]")
+flags.DEFINE_bool("dump_pickle", True, "Dump pickle instead of json? [True]")
+flags.DEFINE_float("decay", 0.9, "Exponential moving average decay for logging values [0.9]")
+
+# Thresholds for speed and less memory usage
+flags.DEFINE_integer("word_count_th", 10, "word count th [10]")
+flags.DEFINE_integer("char_count_th", 50, "char count th [50]")
+flags.DEFINE_integer("sent_size_th", 400, "sent size th [400]")
+flags.DEFINE_integer("num_sents_th", 8, "num sents th [8]")
+flags.DEFINE_integer("ques_size_th", 30, "ques size th [30]")
+flags.DEFINE_integer("word_size_th", 16, "word size th [16]")
+flags.DEFINE_integer("para_size_th", 256, "para size th [256]")
+
+# Advanced training options
+flags.DEFINE_bool("lower_word", True, "lower word [True]")
+flags.DEFINE_bool("squash", False, "squash the sentences into one? [False]")
+flags.DEFINE_bool("swap_memory", True, "swap memory? [True]")
+flags.DEFINE_string("data_filter", "max", "max | valid | semi [max]")
+flags.DEFINE_bool("use_glove_for_unk", True, "use glove for unk [True]")
+flags.DEFINE_bool("known_if_glove", True, "consider as known if present in glove [True]")
+flags.DEFINE_string("logit_func", "tri_linear", "logit func [tri_linear]")
+flags.DEFINE_string("answer_func", "linear", "answer logit func [linear]")
+flags.DEFINE_string("sh_logit_func", "tri_linear", "sh logit func [tri_linear]")
+
+# Ablation options
+flags.DEFINE_bool("use_char_emb", True, "use char emb? [True]")
+flags.DEFINE_bool("use_word_emb", True, "use word embedding? [True]")
+flags.DEFINE_bool("q2c_att", True, "question-to-context attention? [True]")
+flags.DEFINE_bool("c2q_att", True, "context-to-question attention? [True]")
+flags.DEFINE_bool("dynamic_att", False, "Dynamic attention [False]")
+
+
+def main(_):
+    """Entry point for `tf.app.run`: set the output directory, then run the model.
+
+    The output directory is `<out_base_dir>/<model_name>/<run_id>`, with the
+    run id zero-padded to at least two characters. The positional argument
+    supplied by `tf.app.run` is ignored.
+    """
+    config = flags.FLAGS
+    run_dir_name = str(config.run_id).zfill(2)
+    config.out_dir = os.path.join(
+        config.out_base_dir, config.model_name, run_dir_name)
+    m(config)
+
+# Run through tf.app.run when this module is executed as a script.
+if __name__ == "__main__":
+    tf.app.run()
--- a/tensorflow/SQuAD/basic/ensemble.py
+++ b/tensorflow/SQuAD/basic/ensemble.py
@ -0,0 +1,116 @@
+import argparse
+import functools
+import gzip
+import json
+import pickle
+from collections import defaultdict
+from operator import mul
+
+from tqdm import tqdm
+from squad.utils import get_phrase, get_best_span, get_span_score_pairs
+
+
+def get_args():
+    """Parse the command line for the ensembling script.
+
+    Accepts one or more positional `paths`, plus `-o/--out`,
+    `--data_path` and `--shared_path` options with the defaults below.
+    """
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument('paths', nargs='+')
+    arg_parser.add_argument('-o', '--out', default='ensemble.json')
+    arg_parser.add_argument("--data_path", default="data/squad/data_test.json")
+    arg_parser.add_argument("--shared_path", default="data/squad/shared_test.json")
+    return arg_parser.parse_args()
+
+
+def ensemble(args):
+    """Combine per-model predictions into one answer per question id.
+
+    Loads a gzip-compressed pickle from every path in `args.paths`, reads
+    the data and shared JSON files, runs `ensemble4` on the per-model 'yp'
+    and 'yp2' entries for each example, and writes the `{id: answer}`
+    mapping to `args.out` as JSON.
+
+    NOTE(review): `pickle.load` executes arbitrary code from its input; only
+    feed this script dumps produced by a trusted run.
+    """
+    evaluations = []
+    for pickle_path in tqdm(args.paths):
+        with gzip.open(pickle_path, 'r') as fh:
+            last_eval = pickle.load(fh)
+            evaluations.append(last_eval)
+
+    with open(args.data_path, 'r') as fh:
+        data = json.load(fh)
+
+    with open(args.shared_path, 'r') as fh:
+        shared = json.load(fh)
+
+    answers = {}
+    indexed_examples = enumerate(zip(data['ids'], data['*x']))
+    # The example count comes from the last dump that was loaded.
+    num_examples = len(last_eval['yp'])
+    for idx, (id_, rx) in tqdm(indexed_examples, total=num_examples):
+        if idx >= num_examples:
+            # for debugging purpose
+            break
+        first, second = rx[0], rx[1]
+        context = shared['p'][first][second]
+        wordss = shared['x'][first][second]
+        yp_list = [evaluation['yp'][idx] for evaluation in evaluations]
+        yp2_list = [evaluation['yp2'][idx] for evaluation in evaluations]
+        answers[id_] = ensemble4(context, wordss, yp_list, yp2_list)
+
+    with open(args.out, 'w') as fh:
+        json.dump(answers, fh)
+
+
+def ensemble1(context, wordss, y1_list, y2_list):
+    """Ensemble by combining the models' probabilities before picking a span.
+
+    The start and stop distributions are each merged with
+    `combine_y_list` (whose default operator is an element-wise product),
+    and the best span of the merged distributions is returned as a phrase.
+
+    :param context: Original context
+    :param wordss: tokenized words (nested 2D list)
+    :param y1_list: list of start index probs (one element per model)
+    :param y2_list: list of stop index probs (one element per model)
+    :return: the phrase returned by `get_phrase` for the best span
+    """
+    combined_start = combine_y_list(y1_list)
+    combined_stop = combine_y_list(y2_list)
+    best = get_best_span(combined_start, combined_stop)
+    span, score = best
+    return get_phrase(context, wordss, span)
+
+
+def ensemble2(context, wordss, y1_list, y2_list):
+    """Ensemble by voting on start and stop positions independently.
+
+    Each model's best span contributes its start probability to the chosen
+    start position and its stop probability to the chosen stop position.
+    The highest-scoring start and stop are then joined into the final span.
+    """
+    start_votes = defaultdict(float)
+    stop_votes = defaultdict(float)
+    for y1, y2 in zip(y1_list, y2_list):
+        span, score = get_best_span(y1, y2)
+        begin, end = span[0], span[1]
+        start_votes[begin] += y1[begin[0]][begin[1]]
+        stop_votes[end] += y2[end[0]][end[1]]
+    best_start = max(start_votes, key=start_votes.get)
+    best_stop = max(stop_votes, key=stop_votes.get)
+    return get_phrase(context, wordss, (best_start, best_stop))
+
+
+def ensemble3(context, wordss, y1_list, y2_list):
+    """Ensemble by voting on answer phrases.
+
+    Each model's best span is turned into a phrase, the span score is added
+    to that phrase's total, and the phrase with the largest total wins.
+    """
+    phrase_scores = defaultdict(float)
+    for y1, y2 in zip(y1_list, y2_list):
+        span, score = get_best_span(y1, y2)
+        candidate = get_phrase(context, wordss, span)
+        phrase_scores[candidate] += score
+    return max(phrase_scores, key=phrase_scores.get)
+
+
+def ensemble4(context, wordss, y1_list, y2_list):
+    """Ensemble by summing span scores across models.
+
+    Every (span, score) pair returned by `get_span_score_pairs` for every
+    model is accumulated per span; the phrase for the span with the largest
+    total is returned.
+    """
+    span_scores = defaultdict(float)
+    for y1, y2 in zip(y1_list, y2_list):
+        for span, score in get_span_score_pairs(y1, y2):
+            span_scores[span] += score
+    best_span = max(span_scores, key=span_scores.get)
+    return get_phrase(context, wordss, best_span)
+
+
+def combine_y_list(y_list, op='*'):
+    """Merge several nested 2D lists element-wise.
+
+    :param y_list: list of 2D lists (one per model) with matching layout
+    :param op: '+' to sum, '*' to multiply, or any callable that reduces a
+        tuple of per-model values to a single value
+    :return: a 2D list holding the combined value for every position
+    """
+    if op == '+':
+        reducer = sum
+    elif op == '*':
+        reducer = functools.partial(functools.reduce, mul)
+    else:
+        reducer = op
+
+    combined = []
+    for rows in zip(*y_list):
+        combined_row = []
+        for values in zip(*rows):
+            combined_row.append(reducer(values))
+        combined.append(combined_row)
+    return combined
+
+
+def main():
+    """Parse the command line and run the ensembling."""
+    ensemble(get_args())
+
+# Only run the ensembling when this module is executed as a script.
+if __name__ == "__main__":
+    main()
+
+
--- a/tensorflow/SQuAD/basic/ensemble_fast.py
+++ b/tensorflow/SQuAD/basic/ensemble_fast.py
@ -0,0 +1,39 @@
+import sys
+import json
+from collections import Counter, defaultdict
+import re
+
+def key_func(pair):
+    """Return the second element of `pair` (for use as a sort/max key)."""
+    second = pair[1]
+    return second
+
+
+def get_func(vals, probs):
+    """Pick the value with the largest total probability.
+
+    Probabilities of identical values are summed. The empty string is
+    always given a total of 0, so it can only win when no other value has a
+    positive total. Ties go to the value seen first.
+
+    :param vals: candidate values (hashable), one per model
+    :param probs: the probability each model assigned to its value
+    :return: the value whose summed probability is highest
+    """
+    totals = defaultdict(float)
+    for val, prob in zip(vals, probs):
+        totals[val] += prob
+    # Never let an empty answer outvote a real one.
+    totals[''] = 0
+    return max(totals.items(), key=lambda pair: pair[1])[0]
+
+# Script body. Usage: ensemble_fast.py OUT_PATH IN_PATH [IN_PATH ...]
+# Each input is a JSON object mapping keys to answers, plus a 'scores'
+# entry mapping the same keys to that model's confidence.
+third_path = sys.argv[1]
+other_paths = sys.argv[2:]
+
+others = []
+for path in other_paths:
+    # Close each input promptly instead of leaking the file handle.
+    with open(path, 'r') as fh:
+        others.append(json.load(fh))
+
+
+c = {}
+
+# All inputs must contain the same number of entries.
+assert min(map(len, others)) == max(map(len, others)), list(map(len, others))
+
+for key in others[0].keys():
+    if key == 'scores':
+        continue
+    probs = [other['scores'][key] for other in others]
+    vals = [other[key] for other in others]
+    largest_val = get_func(vals, probs)
+    c[key] = largest_val
+
+# The context manager guarantees the output is flushed and closed.
+with open(third_path, 'w') as fh:
+    json.dump(c, fh)
--- a/tensorflow/SQuAD/basic/evaluator.py
+++ b/tensorflow/SQuAD/basic/evaluator.py
@ -0,0 +1,453 @@
+import numpy as np
+import tensorflow as tf
+
+from basic.read_data import DataSet
+from my.nltk_utils import span_f1
+from my.tensorflow import padded_reshape
+from my.utils import argmax
+from squad.utils import get_phrase, get_best_span, get_best_span_wy
+
+
+class Evaluation(object):
+    """Predictions collected for a group of examples at one global step.
+
+    Instances can be concatenated with ``+`` and therefore with ``sum()``:
+    adding ``0`` returns the instance unchanged, which is what ``sum`` needs
+    for its integer start value.
+
+    Attributes:
+        data_type, global_step, idxs, yp: stored as given.
+        num_examples: ``len(yp)``.
+        tensor_dict: extra values converted to plain lists, or None.
+        dict: flat dictionary of everything above, used when dumping.
+        summaries: None here; subclasses may fill it.
+    """
+    def __init__(self, data_type, global_step, idxs, yp, tensor_dict=None):
+        self.data_type = data_type
+        self.global_step = global_step
+        self.idxs = idxs
+        self.yp = yp
+        self.num_examples = len(yp)
+        self.tensor_dict = None
+        self.dict = {'data_type': data_type,
+                     'global_step': global_step,
+                     'yp': yp,
+                     'idxs': idxs,
+                     'num_examples': self.num_examples}
+        if tensor_dict is not None:
+            # Values are either array-like objects offering ``tolist`` or
+            # lists that were already converted (``__add__`` hands those back
+            # in); only the former need converting.
+            self.tensor_dict = {key: val.tolist() if hasattr(val, 'tolist') else val
+                                for key, val in tensor_dict.items()}
+            self.dict.update(self.tensor_dict)
+        self.summaries = None
+
+    def __repr__(self):
+        return "{} step {}".format(self.data_type, self.global_step)
+
+    def __add__(self, other):
+        """Return a new Evaluation holding the examples of both operands."""
+        if other == 0:
+            return self
+        assert self.data_type == other.data_type
+        assert self.global_step == other.global_step
+        new_yp = self.yp + other.yp
+        new_idxs = self.idxs + other.idxs
+        new_tensor_dict = None
+        if self.tensor_dict is not None:
+            # Both sides hold plain lists here, so ``+`` concatenates them.
+            new_tensor_dict = {key: val + other.tensor_dict[key] for key, val in self.tensor_dict.items()}
+        return Evaluation(self.data_type, self.global_step, new_idxs, new_yp, tensor_dict=new_tensor_dict)
+
+    def __radd__(self, other):
+        # Lets ``sum()`` start from its default integer 0.
+        return self.__add__(other)
+
+
+class LabeledEvaluation(Evaluation):
+    """Evaluation that also carries the labels ``y`` of its examples."""
+    def __init__(self, data_type, global_step, idxs, yp, y, tensor_dict=None):
+        super(LabeledEvaluation, self).__init__(data_type, global_step, idxs, yp, tensor_dict=tensor_dict)
+        self.y = y
+        self.dict['y'] = y
+
+    def __add__(self, other):
+        """Return a new LabeledEvaluation holding the examples of both operands."""
+        if other == 0:
+            return self
+        assert self.data_type == other.data_type
+        assert self.global_step == other.global_step
+        new_yp = self.yp + other.yp
+        new_y = self.y + other.y
+        new_idxs = self.idxs + other.idxs
+        # Must be bound even when no tensors are carried; it is passed to the
+        # constructor unconditionally below.
+        new_tensor_dict = None
+        if self.tensor_dict is not None:
+            new_tensor_dict = {key: np.concatenate((val, other.tensor_dict[key]), axis=0) for key, val in self.tensor_dict.items()}
+        return LabeledEvaluation(self.data_type, self.global_step, new_idxs, new_yp, new_y, tensor_dict=new_tensor_dict)
+
+
+class AccuracyEvaluation(LabeledEvaluation):
+    """Labeled evaluation with per-example correctness, accuracy and loss.
+
+    ``acc`` is the mean of ``correct``. ``summaries`` holds one loss and one
+    accuracy ``tf.Summary``, tagged ``<data_type>/loss`` and ``<data_type>/acc``.
+    """
+    def __init__(self, data_type, global_step, idxs, yp, y, correct, loss, tensor_dict=None):
+        super(AccuracyEvaluation, self).__init__(data_type, global_step, idxs, yp, y, tensor_dict=tensor_dict)
+        self.loss = loss
+        self.correct = correct
+        self.acc = sum(correct) / len(correct)
+        self.dict['loss'] = loss
+        self.dict['correct'] = correct
+        self.dict['acc'] = self.acc
+        loss_summary = tf.Summary(value=[tf.Summary.Value(tag='{}/loss'.format(data_type), simple_value=self.loss)])
+        acc_summary = tf.Summary(value=[tf.Summary.Value(tag='{}/acc'.format(data_type), simple_value=self.acc)])
+        self.summaries = [loss_summary, acc_summary]
+
+    def __repr__(self):
+        return "{} step {}: accuracy={}, loss={}".format(self.data_type, self.global_step, self.acc, self.loss)
+
+    def __add__(self, other):
+        """Return a new AccuracyEvaluation covering both operands.
+
+        The combined loss is the example-weighted mean of the two losses.
+        """
+        if other == 0:
+            return self
+        assert self.data_type == other.data_type
+        assert self.global_step == other.global_step
+        new_idxs = self.idxs + other.idxs
+        new_yp = self.yp + other.yp
+        new_y = self.y + other.y
+        new_correct = self.correct + other.correct
+        new_loss = (self.loss * self.num_examples + other.loss * other.num_examples) / len(new_correct)
+        # Must be bound even when no tensors are carried; it is passed to the
+        # constructor unconditionally below.
+        new_tensor_dict = None
+        if self.tensor_dict is not None:
+            new_tensor_dict = {key: np.concatenate((val, other.tensor_dict[key]), axis=0) for key, val in self.tensor_dict.items()}
+        return AccuracyEvaluation(self.data_type, self.global_step, new_idxs, new_yp, new_y, new_correct, new_loss, tensor_dict=new_tensor_dict)
+
+
+class Evaluator(object):
+    """Runs a model on batches and wraps its ``yp`` output in Evaluation objects."""
+    def __init__(self, config, model, tensor_dict=None):
+        self.config = config
+        self.model = model
+        self.global_step = model.global_step
+        self.yp = model.yp
+        # Extra named tensors to fetch alongside the predictions.
+        self.tensor_dict = tensor_dict if tensor_dict is not None else {}
+
+    def get_evaluation(self, sess, batch):
+        """Evaluate one ``(idxs, data_set)`` batch and return an Evaluation."""
+        idxs, data_set = batch
+        feed_dict = self.model.get_feed_dict(data_set, False, supervised=False)
+        fetches = [self.global_step, self.yp, list(self.tensor_dict.values())]
+        global_step, yp, vals = sess.run(fetches, feed_dict=feed_dict)
+        # Keep one row per example; presumably the remainder is batch padding.
+        yp = yp[:data_set.num_examples]
+        tensor_dict = dict(zip(self.tensor_dict.keys(), vals))
+        return Evaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), tensor_dict=tensor_dict)
+
+    def get_evaluation_from_batches(self, sess, batches):
+        """Evaluate every batch and add the per-batch results together."""
+        return sum(self.get_evaluation(sess, batch) for batch in batches)
+
+
+class LabeledEvaluator(Evaluator):
+    """Evaluator that also reports the labels fed to the model."""
+    def __init__(self, config, model, tensor_dict=None):
+        super(LabeledEvaluator, self).__init__(config, model, tensor_dict=tensor_dict)
+        self.y = model.y
+
+    def get_evaluation(self, sess, batch):
+        """Evaluate one ``(idxs, data_set)`` batch and return a LabeledEvaluation."""
+        idxs, data_set = batch
+        feed_dict = self.model.get_feed_dict(data_set, False, supervised=False)
+        fetches = [self.global_step, self.yp, list(self.tensor_dict.values())]
+        global_step, yp, vals = sess.run(fetches, feed_dict=feed_dict)
+        yp = yp[:data_set.num_examples]
+        # The labels are read back from the feed dict, keyed by the model's y tensor.
+        y = feed_dict[self.y]
+        tensor_dict = dict(zip(self.tensor_dict.keys(), vals))
+        return LabeledEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), y.tolist(),
+                                 tensor_dict=tensor_dict)
+
+
+class AccuracyEvaluator(LabeledEvaluator):
+    """Evaluator that scores each prediction as correct or not and reports the loss."""
+    def __init__(self, config, model, tensor_dict=None):
+        super(AccuracyEvaluator, self).__init__(config, model, tensor_dict=tensor_dict)
+        self.loss = model.loss
+
+    def get_evaluation(self, sess, batch):
+        """Evaluate one ``(idxs, data_set)`` batch and return an AccuracyEvaluation."""
+        idxs, data_set = batch
+        assert isinstance(data_set, DataSet)
+        feed_dict = self.model.get_feed_dict(data_set, False)
+        fetches = [self.global_step, self.yp, self.loss, list(self.tensor_dict.values())]
+        global_step, yp, loss, vals = sess.run(fetches, feed_dict=feed_dict)
+        y = data_set.data['y']
+        yp = yp[:data_set.num_examples]
+        # Go through the class so subclasses can swap in their own ``compare``.
+        judge = self.__class__.compare
+        correct = [judge(yi, ypi) for yi, ypi in zip(y, yp)]
+        tensor_dict = dict(zip(self.tensor_dict.keys(), vals))
+        return AccuracyEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), y, correct,
+                                  float(loss), tensor_dict=tensor_dict)
+
+    @staticmethod
+    def compare(yi, ypi):
+        """Return True if any gold start in ``yi`` equals the argmax of ``ypi``."""
+        return any(start == int(np.argmax(ypi)) for start, _ in yi)
+
+
+class AccuracyEvaluator2(AccuracyEvaluator):
+    """AccuracyEvaluator whose predictions are indexed by a pair of positions."""
+    @staticmethod
+    def compare(yi, ypi):
+        """Return True if any gold start in ``yi`` matches the predicted start.
+
+        The predicted start is the row of ``ypi`` holding the largest value,
+        paired with the argmax inside that row.
+        """
+        def _predicted_start():
+            row = int(np.argmax(np.max(ypi, 1)))
+            return row, int(np.argmax(ypi[row]))
+
+        return any(_predicted_start() == tuple(start) for start, _ in yi)
+
+
+class ForwardEvaluation(Evaluation):
+    """Evaluation of a forward (label-free) run: predictions, loss and answers.
+
+    ``id2answer_dict`` maps example ids to answers and additionally holds a
+    nested ``'scores'`` dict and, optionally, a nested ``'na'`` dict.
+    """
+    def __init__(self, data_type, global_step, idxs, yp, yp2, loss, id2answer_dict, tensor_dict=None):
+        super(ForwardEvaluation, self).__init__(data_type, global_step, idxs, yp, tensor_dict=tensor_dict)
+        self.yp2 = yp2
+        self.loss = loss
+        self.dict['loss'] = loss
+        self.dict['yp2'] = yp2
+        self.id2answer_dict = id2answer_dict
+
+    def __add__(self, other):
+        """Return a new ForwardEvaluation covering both operands.
+
+        The combined loss is the example-weighted mean of the two losses.
+        """
+        if other == 0:
+            return self
+        assert self.data_type == other.data_type
+        assert self.global_step == other.global_step
+        new_idxs = self.idxs + other.idxs
+        new_yp = self.yp + other.yp
+        new_yp2 = self.yp2 + other.yp2
+        new_loss = (self.loss * self.num_examples + other.loss * other.num_examples) / len(new_yp)
+        new_id2answer_dict = dict(list(self.id2answer_dict.items()) + list(other.id2answer_dict.items()))
+        # The flat merge above lets ``other`` overwrite the nested dicts, so
+        # each nested dict is merged on its own.
+        new_id2score_dict = dict(list(self.id2answer_dict['scores'].items()) + list(other.id2answer_dict['scores'].items()))
+        new_id2answer_dict['scores'] = new_id2score_dict
+        if 'na' in self.id2answer_dict:
+            new_id2na_dict = dict(list(self.id2answer_dict['na'].items()) + list(other.id2answer_dict['na'].items()))
+            new_id2answer_dict['na'] = new_id2na_dict
+        # Must be bound even when no tensors are carried; it is passed to the
+        # constructor unconditionally below.
+        new_tensor_dict = None
+        if self.tensor_dict is not None:
+            new_tensor_dict = {key: np.concatenate((val, other.tensor_dict[key]), axis=0) for key, val in self.tensor_dict.items()}
+        return ForwardEvaluation(self.data_type, self.global_step, new_idxs, new_yp, new_yp2, new_loss, new_id2answer_dict, tensor_dict=new_tensor_dict)
+
+    def __repr__(self):
+        return "{} step {}: loss={:.4f}".format(self.data_type, self.global_step, self.loss)
+
+
+class F1Evaluation(AccuracyEvaluation):
+    """Accuracy evaluation extended with per-example F1 scores and answers.
+
+    ``f1`` is the mean of ``f1s``; an extra ``<data_type>/f1`` summary is
+    appended to the summaries created by the parent class.
+    """
+    def __init__(self, data_type, global_step, idxs, yp, yp2, y, correct, loss, f1s, id2answer_dict, tensor_dict=None):
+        super(F1Evaluation, self).__init__(data_type, global_step, idxs, yp, y, correct, loss, tensor_dict=tensor_dict)
+        self.yp2 = yp2
+        self.f1s = f1s
+        self.f1 = float(np.mean(f1s))
+        self.id2answer_dict = id2answer_dict
+        self.dict.update({'yp2': yp2, 'f1s': f1s, 'f1': self.f1})
+        f1_value = tf.Summary.Value(tag='{}/f1'.format(data_type), simple_value=self.f1)
+        self.summaries.append(tf.Summary(value=[f1_value]))
+
+    def __add__(self, other):
+        """Return a new F1Evaluation covering both operands.
+
+        The result is built without a tensor_dict. The combined loss is the
+        example-weighted mean of the two losses.
+        """
+        if other == 0:
+            return self
+        assert self.data_type == other.data_type
+        assert self.global_step == other.global_step
+        merged_correct = self.correct + other.correct
+        merged_loss = (self.loss * self.num_examples + other.loss * other.num_examples) / len(merged_correct)
+
+        # Flat merge first, then rebuild the nested dicts that the flat merge
+        # would otherwise take from ``other`` only.
+        merged_answers = dict(self.id2answer_dict)
+        merged_answers.update(other.id2answer_dict)
+        merged_scores = dict(self.id2answer_dict['scores'])
+        merged_scores.update(other.id2answer_dict['scores'])
+        merged_answers['scores'] = merged_scores
+        if 'na' in self.id2answer_dict:
+            merged_na = dict(self.id2answer_dict['na'])
+            merged_na.update(other.id2answer_dict['na'])
+            merged_answers['na'] = merged_na
+
+        result = F1Evaluation(self.data_type, self.global_step,
+                              self.idxs + other.idxs,
+                              self.yp + other.yp,
+                              self.yp2 + other.yp2,
+                              self.y + other.y,
+                              merged_correct, merged_loss,
+                              self.f1s + other.f1s,
+                              merged_answers)
+        if 'wyp' in self.dict:
+            result.dict['wyp'] = self.dict['wyp'] + other.dict['wyp']
+        return result
+
+    def __repr__(self):
+        return "{} step {}: accuracy={:.4f}, f1={:.4f}, loss={:.4f}".format(
+            self.data_type, self.global_step, self.acc, self.f1, self.loss)
+
+
+class F1Evaluator(LabeledEvaluator):
+    """Evaluator that turns model outputs into answer spans, exact-match flags and F1 scores."""
+    def __init__(self, config, model, tensor_dict=None):
+        super(F1Evaluator, self).__init__(config, model, tensor_dict=tensor_dict)
+        self.yp2 = model.yp2
+        self.wyp = model.wyp
+        self.loss = model.loss
+        if config.na:
+            self.na = model.na_prob
+
+    def get_evaluation(self, sess, batch):
+        """Run the model on ``batch`` and return an F1Evaluation.
+
+        ``batch`` is interpreted through ``_split_batch`` and
+        ``_get_feed_dict`` so subclasses can accept other batch layouts.
+        """
+        idxs, data_set = self._split_batch(batch)
+        assert isinstance(data_set, DataSet)
+        feed_dict = self._get_feed_dict(batch)
+        # One fetch list for both configurations; ``na`` is fetched only when enabled.
+        fetches = [self.global_step, self.yp, self.yp2, self.wyp, self.loss]
+        if self.config.na:
+            fetches.append(self.na)
+        fetches.append(list(self.tensor_dict.values()))
+        fetched = sess.run(fetches, feed_dict=feed_dict)
+        global_step, yp, yp2, wyp, loss = fetched[:5]
+        na = fetched[5] if self.config.na else None
+        vals = fetched[-1]
+
+        y = data_set.data['y']
+        if self.config.squash:
+            # Re-express each (sentence, word) position as an offset inside
+            # sentence 0, adding the lengths of all preceding sentences.
+            new_y = []
+            for xi, yi in zip(data_set.data['x'], y):
+                new_yi = []
+                for start, stop in yi:
+                    start_offset = sum(map(len, xi[:start[0]]))
+                    stop_offset = sum(map(len, xi[:stop[0]]))
+                    new_start = 0, start_offset + start[1]
+                    new_stop = 0, stop_offset + stop[1]
+                    new_yi.append((new_start, new_stop))
+                new_y.append(new_yi)
+            y = new_y
+        if self.config.single:
+            # Force the sentence index of every label to 0, keeping the word index.
+            y = [[((0, start[1]), (0, stop[1])) for start, stop in yi] for yi in y]
+
+        yp, yp2, wyp = yp[:data_set.num_examples], yp2[:data_set.num_examples], wyp[:data_set.num_examples]
+        if self.config.wy:
+            spans, scores = zip(*[get_best_span_wy(wypi, self.config.th) for wypi in wyp])
+        else:
+            spans, scores = zip(*[get_best_span(ypi, yp2i) for ypi, yp2i in zip(yp, yp2)])
+
+        def _get2(context, xi, span):
+            # Spans pointing outside the tokenized context give an empty answer.
+            # NOTE(review): ``<=`` also rejects a stop index equal to the
+            # sentence length - confirm whether the span stop is exclusive.
+            if len(xi) <= span[0][0]:
+                return ""
+            if len(xi[span[0][0]]) <= span[1][1]:
+                return ""
+            return get_phrase(context, xi, span)
+
+        id2answer_dict = {id_: _get2(context, xi, span)
+                          for id_, xi, span, context in zip(data_set.data['ids'], data_set.data['x'], spans, data_set.data['p'])}
+        id2score_dict = {id_: score for id_, score in zip(data_set.data['ids'], scores)}
+        id2answer_dict['scores'] = id2score_dict
+        if self.config.na:
+            id2na_dict = {id_: float(each) for id_, each in zip(data_set.data['ids'], na)}
+            id2answer_dict['na'] = id2na_dict
+        correct = [self.__class__.compare2(yi, span) for yi, span in zip(y, spans)]
+        f1s = [self.__class__.span_f1(yi, span) for yi, span in zip(y, spans)]
+        tensor_dict = dict(zip(self.tensor_dict.keys(), vals))
+        e = F1Evaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), yp2.tolist(), y,
+                         correct, float(loss), f1s, id2answer_dict, tensor_dict=tensor_dict)
+        if self.config.wy:
+            e.dict['wyp'] = wyp.tolist()
+        return e
+
+    def _split_batch(self, batch):
+        """Return ``(idxs, data_set)`` for ``batch``; here the batch already has that form."""
+        return batch
+
+    def _get_feed_dict(self, batch):
+        """Build the feed dict from the data set stored at ``batch[1]``."""
+        return self.model.get_feed_dict(batch[1], False)
+
+    @staticmethod
+    def compare(yi, ypi, yp2i):
+        """Return True if the argmax start and the masked argmax stop match a gold span.
+
+        The stop is searched only in the predicted start's row, from the
+        start column onwards.
+        """
+        for start, stop in yi:
+            aypi = argmax(ypi)
+            mask = np.zeros(yp2i.shape)
+            mask[aypi[0], aypi[1]:] = np.ones([yp2i.shape[1] - aypi[1]])
+            if tuple(start) == aypi and (stop[0], stop[1]-1) == argmax(yp2i * mask):
+                return True
+        return False
+
+    @staticmethod
+    def compare2(yi, span):
+        """Return True if ``span`` equals one of the gold ``(start, stop)`` pairs in ``yi``."""
+        for start, stop in yi:
+            if tuple(start) == span[0] and tuple(stop) == span[1]:
+                return True
+        return False
+
+    @staticmethod
+    def span_f1(yi, span):
+        """Return the best F1 of ``span`` against gold spans starting in the same sentence.
+
+        Gold spans whose start lies in another sentence are ignored; 0 is
+        returned when none qualifies.
+        """
+        max_f1 = 0
+        for start, stop in yi:
+            if start[0] == span[0][0]:
+                true_span = start[1], stop[1]
+                pred_span = span[0][1], span[1][1]
+                # The module-level ``span_f1`` helper, not this method.
+                f1 = span_f1(true_span, pred_span)
+                max_f1 = max(f1, max_f1)
+        return max_f1
+
+
+class MultiGPUF1Evaluator(F1Evaluator):
+    """F1Evaluator that evaluates several model replicas in a single run.
+
+    The replicas' ``yp``, ``yp2`` and ``wy`` tensors are padded to
+    ``[batch_size, max_num_sents, max_sent_size]`` and concatenated along
+    axis 0; the loss is the mean of the replica losses.
+    """
+    def __init__(self, config, models, tensor_dict=None):
+        super(MultiGPUF1Evaluator, self).__init__(config, models[0], tensor_dict=tensor_dict)
+        self.models = models
+        with tf.name_scope("eval_concat"):
+            target_shape = [config.batch_size, config.max_num_sents, config.max_sent_size]
+
+            def _stack(tensors):
+                return tf.concat(axis=0, values=[padded_reshape(t, target_shape) for t in tensors])
+
+            self.yp = _stack([model.yp for model in models])
+            self.yp2 = _stack([model.yp2 for model in models])
+            # NOTE(review): the inherited get_evaluation fetches ``self.wyp``,
+            # which is not rebuilt here - confirm that ``wy`` is the intended attribute.
+            self.wy = _stack([model.wy for model in models])
+            self.loss = tf.add_n([model.loss for model in models])/len(models)
+
+    def _split_batch(self, batches):
+        """Merge per-replica ``(idxs, data_set)`` pairs into one pair."""
+        idxs_list, data_sets = zip(*batches)
+        all_idxs = sum(idxs_list, ())
+        merged_set = sum(data_sets, data_sets[0].get_empty())
+        return all_idxs, merged_set
+
+    def _get_feed_dict(self, batches):
+        """Build one feed dict holding every replica's inputs."""
+        combined = {}
+        for model, (_, data_set) in zip(self.models, batches):
+            combined.update(model.get_feed_dict(data_set, False))
+        return combined
+
+
+class ForwardEvaluator(Evaluator):
+    """Evaluator that produces answers and scores without scoring them against labels."""
+    def __init__(self, config, model, tensor_dict=None):
+        super(ForwardEvaluator, self).__init__(config, model, tensor_dict=tensor_dict)
+        self.yp2 = model.yp2
+        self.loss = model.loss
+        if config.na:
+            self.na = model.na_prob
+
+    def get_evaluation(self, sess, batch):
+        """Run the model on one ``(idxs, data_set)`` batch and return a ForwardEvaluation."""
+        idxs, data_set = batch
+        assert isinstance(data_set, DataSet)
+        feed_dict = self.model.get_feed_dict(data_set, False)
+        # One fetch list for both configurations; ``na`` is fetched only when enabled.
+        fetches = [self.global_step, self.yp, self.yp2, self.loss]
+        if self.config.na:
+            fetches.append(self.na)
+        fetches.append(list(self.tensor_dict.values()))
+        fetched = sess.run(fetches, feed_dict=feed_dict)
+        global_step, yp, yp2, loss = fetched[:4]
+        na = fetched[4] if self.config.na else None
+        vals = fetched[-1]
+
+        yp, yp2 = yp[:data_set.num_examples], yp2[:data_set.num_examples]
+        spans, scores = zip(*[get_best_span(ypi, yp2i) for ypi, yp2i in zip(yp, yp2)])
+
+        def _get2(context, xi, span):
+            # Spans pointing outside the tokenized context give an empty answer.
+            # NOTE(review): ``<=`` also rejects a stop index equal to the
+            # sentence length - confirm whether the span stop is exclusive.
+            if len(xi) <= span[0][0]:
+                return ""
+            if len(xi[span[0][0]]) <= span[1][1]:
+                return ""
+            return get_phrase(context, xi, span)
+
+        id2answer_dict = {id_: _get2(context, xi, span)
+                          for id_, xi, span, context in zip(data_set.data['ids'], data_set.data['x'], spans, data_set.data['p'])}
+        id2score_dict = {id_: score for id_, score in zip(data_set.data['ids'], scores)}
+        id2answer_dict['scores'] = id2score_dict
+        if self.config.na:
+            id2na_dict = {id_: float(each) for id_, each in zip(data_set.data['ids'], na)}
+            id2answer_dict['na'] = id2na_dict
+        tensor_dict = dict(zip(self.tensor_dict.keys(), vals))
+        e = ForwardEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), yp2.tolist(), float(loss), id2answer_dict, tensor_dict=tensor_dict)
+        # TODO : wy support
+        return e
+
+    @staticmethod
+    def compare(yi, ypi, yp2i):
+        """Return True if the argmax start and the masked argmax stop match a gold span.
+
+        The stop is searched only in the predicted start's row, from the
+        start column onwards.
+        """
+        for start, stop in yi:
+            aypi = argmax(ypi)
+            mask = np.zeros(yp2i.shape)
+            mask[aypi[0], aypi[1]:] = np.ones([yp2i.shape[1] - aypi[1]])
+            if tuple(start) == aypi and (stop[0], stop[1]-1) == argmax(yp2i * mask):
+                return True
+        return False
+
+    @staticmethod
+    def compare2(yi, span):
+        """Return True if ``span`` equals one of the gold ``(start, stop)`` pairs in ``yi``."""
+        for start, stop in yi:
+            if tuple(start) == span[0] and tuple(stop) == span[1]:
+                return True
+        return False
+
+    @staticmethod
+    def span_f1(yi, span):
+        """Return the best F1 of ``span`` against gold spans starting in the same sentence.
+
+        Gold spans whose start lies in another sentence are ignored; 0 is
+        returned when none qualifies.
+        """
+        max_f1 = 0
+        for start, stop in yi:
+            if start[0] == span[0][0]:
+                true_span = start[1], stop[1]
+                pred_span = span[0][1], span[1][1]
+                # The module-level ``span_f1`` helper, not this method.
+                f1 = span_f1(true_span, pred_span)
+                max_f1 = max(f1, max_f1)
+        return max_f1
+
+
--- a/tensorflow/SQuAD/basic/get_pr.py
+++ b/tensorflow/SQuAD/basic/get_pr.py
@ -0,0 +1,35 @@
+import json
+import argparse
+
+
+def get_args():
+    """Parse the command line: a positional ``path`` and an optional threshold.
+
+    Returns:
+        The argparse namespace with ``path`` (str) and ``th`` (float, 0.5 by default).
+    """
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument("path")
+    arg_parser.add_argument("-t", "--th", type=float, default=0.5)
+    # TODO : put more args here
+    parsed = arg_parser.parse_args()
+    return parsed
+
+
+def get_pr(args):
+    """Print precision and recall computed from the 'na' scores of an answer file.
+
+    The JSON file at ``args.path`` must hold an ``'na'`` dict mapping ids to
+    scores. An id counts as a negative when it starts with ``"neg"``; an
+    entry counts as predicted positive when its score is below ``args.th``.
+    A ratio whose denominator is zero is reported as ``nan`` instead of
+    raising ZeroDivisionError.
+
+    Args:
+        args: object with ``path`` (str) and ``th`` (float) attributes.
+    """
+    with open(args.path, 'r') as fh:
+        answers = json.load(fh)
+
+    na = answers['na']
+
+    # Single pass; true negatives are not needed for precision/recall.
+    tp = fp = fn = 0
+    for id_, score in na.items():
+        is_negative = id_.startswith("neg")
+        if score < args.th:
+            if is_negative:
+                fp += 1
+            else:
+                tp += 1
+        elif not is_negative:
+            fn += 1
+
+    p = tp / (tp + fp) if tp + fp else float('nan')
+    r = tp / (tp + fn) if tp + fn else float('nan')
+    print("p={:.3f}, r={:.3f}".format(p, r))
+
+
+def main():
+    """Parse the command-line arguments and print precision/recall for them."""
+    get_pr(get_args())
+
+# Run only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
+
--- a/tensorflow/SQuAD/basic/graph_handler.py
+++ b/tensorflow/SQuAD/basic/graph_handler.py
@ -0,0 +1,79 @@
+import gzip
+import json
+from json import encoder
+import os
+
+import tensorflow as tf
+
+from basic.evaluator import Evaluation, F1Evaluation
+from my.utils import short_floats
+
+import pickle
+
+
+class GraphHandler(object):
+    """Handles variable initialization, checkpoints, summaries and result dumps.
+
+    The checkpoint saver is created once in ``__init__``, so the model's
+    variables should exist before the handler is constructed.
+    """
+    def __init__(self, config, model):
+        self.config = config
+        self.model = model
+        self.saver = tf.train.Saver(max_to_keep=config.max_to_keep)
+        self.writer = None
+        self.save_path = os.path.join(config.save_dir, config.model_name)
+
+    def initialize(self, sess):
+        """Initialize all variables, optionally restore a checkpoint, and open the summary writer when training."""
+        sess.run(tf.global_variables_initializer())
+        if self.config.load:
+            self._load(sess)
+
+        if self.config.mode == 'train':
+            self.writer = tf.summary.FileWriter(self.config.log_dir, graph=tf.get_default_graph())
+
+    def save(self, sess, global_step=None):
+        """Write a checkpoint to ``self.save_path``.
+
+        The saver built in ``__init__`` is reused: a fresh Saver per call
+        would add new save ops to the graph every time and would start with
+        an empty checkpoint history, so ``max_to_keep`` could never prune
+        older checkpoints.
+        """
+        self.saver.save(sess, self.save_path, global_step=global_step)
+
+    def _load(self, sess):
+        """Restore variables from the checkpoint selected by the config.
+
+        Precedence: ``config.load_path``, then ``config.load_step`` (> 0),
+        then the latest checkpoint recorded in ``config.save_dir``.
+        """
+        config = self.config
+        vars_ = {var.name.split(":")[0]: var for var in tf.global_variables()}
+        if config.load_ema:
+            # Restore each trainable variable from the checkpoint entry stored
+            # under its moving-average name instead of its own name.
+            ema = self.model.var_ema
+            for var in tf.trainable_variables():
+                del vars_[var.name.split(":")[0]]
+                vars_[ema.average_name(var)] = var
+        saver = tf.train.Saver(vars_, max_to_keep=config.max_to_keep)
+
+        if config.load_path:
+            save_path = config.load_path
+        elif config.load_step > 0:
+            save_path = os.path.join(config.save_dir, "{}-{}".format(config.model_name, config.load_step))
+        else:
+            save_dir = config.save_dir
+            checkpoint = tf.train.get_checkpoint_state(save_dir)
+            assert checkpoint is not None, "cannot load checkpoint at {}".format(save_dir)
+            save_path = checkpoint.model_checkpoint_path
+        print("Loading saved model from {}".format(save_path))
+        saver.restore(sess, save_path)
+
+    def add_summary(self, summary, global_step):
+        """Write one summary; the writer exists only after ``initialize`` in 'train' mode."""
+        self.writer.add_summary(summary, global_step)
+
+    def add_summaries(self, summaries, global_step):
+        """Write every summary in ``summaries`` for ``global_step``."""
+        for summary in summaries:
+            self.add_summary(summary, global_step)
+
+    def dump_eval(self, e, precision=2, path=None):
+        """Dump ``e.dict`` as gzipped pickle or as JSON, depending on ``config.dump_pickle``.
+
+        ``precision`` is handed to ``short_floats`` and only affects the JSON
+        output. When ``path`` is not given, a file named after the data type
+        and zero-padded global step is written into ``config.eval_dir``.
+        """
+        assert isinstance(e, Evaluation)
+        if self.config.dump_pickle:
+            path = path or os.path.join(self.config.eval_dir, "{}-{}.pklz".format(e.data_type, str(e.global_step).zfill(6)))
+            with gzip.open(path, 'wb', compresslevel=3) as fh:
+                pickle.dump(e.dict, fh)
+        else:
+            path = path or os.path.join(self.config.eval_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6)))
+            with open(path, 'w') as fh:
+                json.dump(short_floats(e.dict, precision), fh)
+
+    def dump_answer(self, e, path=None):
+        """Dump ``e.id2answer_dict`` as JSON, by default into ``config.answer_dir``."""
+        assert isinstance(e, Evaluation)
+        path = path or os.path.join(self.config.answer_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6)))
+        with open(path, 'w') as fh:
+            json.dump(e.id2answer_dict, fh)
+
--- a/tensorflow/SQuAD/basic/main.py
+++ b/tensorflow/SQuAD/basic/main.py
@ -0,0 +1,233 @@
+import argparse
+import json
+import math
+import os
+import shutil
+from pprint import pprint
+
+import tensorflow as tf
+from tqdm import tqdm
+import numpy as np
+
+from basic.evaluator import ForwardEvaluator, MultiGPUF1Evaluator
+from basic.graph_handler import GraphHandler
+from basic.model import get_multi_gpu_models
+from basic.trainer import MultiGPUTrainer
+from basic.read_data import read_data, get_squad_data_filter, update_config
+from my.tensorflow import get_num_params
+
+
+def main(config):
+    """Prepare the output directories and run the routine selected by ``config.mode``.
+
+    Raises:
+        ValueError: if ``config.mode`` is not 'train', 'test' or 'forward'.
+    """
+    set_dirs(config)
+    runners = (('train', _train), ('test', _test), ('forward', _forward))
+    with tf.device(config.device):
+        for mode_name, runner in runners:
+            if config.mode == mode_name:
+                runner(config)
+                break
+        else:
+            raise ValueError("invalid value for 'mode': {}".format(config.mode))
+
+
+def set_dirs(config):
+    """Create the output directory tree and record its paths on ``config``.
+
+    Sets ``save_dir``, ``log_dir``, ``eval_dir`` and ``answer_dir`` below
+    ``config.out_dir``. When ``config.load`` is false, an existing
+    ``out_dir`` is deleted first.
+    """
+    # create directories
+    assert config.load or config.mode == 'train', "config.load must be True if not training"
+    if not config.load:
+        if os.path.exists(config.out_dir):
+            shutil.rmtree(config.out_dir)
+
+    for sub in ("save", "log", "eval", "answer"):
+        setattr(config, sub + "_dir", os.path.join(config.out_dir, sub))
+
+    # (path, creator) pairs: the root may need intermediate directories,
+    # its children never do.
+    targets = ((config.out_dir, os.makedirs),
+               (config.save_dir, os.mkdir),
+               (config.log_dir, os.mkdir),
+               (config.answer_dir, os.mkdir),
+               (config.eval_dir, os.mkdir))
+    for directory, create in targets:
+        if not os.path.exists(directory):
+            create(directory)
+
+
+def _config_debug(config):
+    """Shrink step counts and periods on ``config`` when ``config.debug`` is set."""
+    if not config.debug:
+        return
+    debug_overrides = (
+        ('num_steps', 2),
+        ('eval_period', 1),
+        ('log_period', 1),
+        ('save_period', 1),
+        ('val_num_batches', 2),
+        ('test_num_batches', 2),
+    )
+    for attr, value in debug_overrides:
+        setattr(config, attr, value)
+
+
+def _train(config):
+    """Train the model, with periodic summaries, checkpoints and evaluation.
+
+    Reads the 'train' and 'dev' data, builds the embedding matrix and the
+    per-GPU models, then runs the training loop. A final checkpoint is
+    written if the last step did not fall on a save period.
+    """
+    data_filter = get_squad_data_filter(config)
+    train_data = read_data(config, 'train', config.load, data_filter=data_filter)
+    dev_data = read_data(config, 'dev', True, data_filter=data_filter)
+    update_config(config, [train_data, dev_data])
+
+    _config_debug(config)
+
+    word2vec_dict = train_data.shared['lower_word2vec'] if config.lower_word else train_data.shared['word2vec']
+    word2idx_dict = train_data.shared['word2idx']
+    idx2vec_dict = {word2idx_dict[word]: vec for word, vec in word2vec_dict.items() if word in word2idx_dict}
+    # Indices without a known vector get a random standard-normal embedding.
+    emb_mat = np.array([idx2vec_dict[idx] if idx in idx2vec_dict
+                        else np.random.multivariate_normal(np.zeros(config.word_emb_size), np.eye(config.word_emb_size))
+                        for idx in range(config.word_vocab_size)])
+    config.emb_mat = emb_mat
+
+    # construct model graph and variables (using default graph)
+    # Flag containers expose `__flags`; a plain attribute bag such as the
+    # Config class in this file does not, so fall back to its attributes
+    # (minus large arrays) instead of raising AttributeError.
+    flags = getattr(config, '__flags', None)
+    if flags is None:
+        flags = {key: val for key, val in vars(config).items() if not isinstance(val, np.ndarray)}
+    pprint(flags, indent=2)
+    models = get_multi_gpu_models(config)
+    model = models[0]
+    print("num params: {}".format(get_num_params()))
+    trainer = MultiGPUTrainer(config, models)
+    evaluator = MultiGPUF1Evaluator(config, models, tensor_dict=model.tensor_dict if config.vis else None)
+    graph_handler = GraphHandler(config, model)  # controls all tensors and variables in the graph, including loading /saving
+
+    # Variables
+    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
+    graph_handler.initialize(sess)
+
+    # Begin training
+    num_steps = config.num_steps or int(math.ceil(train_data.num_examples / (config.batch_size * config.num_gpus))) * config.num_epochs
+    global_step = 0
+    for batches in tqdm(train_data.get_multi_batches(config.batch_size, config.num_gpus,
+                                                     num_steps=num_steps, shuffle=True, cluster=config.cluster), total=num_steps):
+        global_step = sess.run(model.global_step) + 1  # +1 because all calculations are done after step
+        get_summary = global_step % config.log_period == 0
+        loss, summary, train_op = trainer.step(sess, batches, get_summary=get_summary)
+        if get_summary:
+            graph_handler.add_summary(summary, global_step)
+
+        # occasional saving
+        if global_step % config.save_period == 0:
+            graph_handler.save(sess, global_step=global_step)
+
+        if not config.eval:
+            continue
+        # Occasional evaluation
+        if global_step % config.eval_period == 0:
+            # Separate name so the training step count above is not shadowed.
+            eval_steps = math.ceil(dev_data.num_examples / (config.batch_size * config.num_gpus))
+            if 0 < config.val_num_batches < eval_steps:
+                eval_steps = config.val_num_batches
+            e_train = evaluator.get_evaluation_from_batches(
+                sess, tqdm(train_data.get_multi_batches(config.batch_size, config.num_gpus, num_steps=eval_steps), total=eval_steps)
+            )
+            graph_handler.add_summaries(e_train.summaries, global_step)
+            e_dev = evaluator.get_evaluation_from_batches(
+                sess, tqdm(dev_data.get_multi_batches(config.batch_size, config.num_gpus, num_steps=eval_steps), total=eval_steps))
+            graph_handler.add_summaries(e_dev.summaries, global_step)
+
+            if config.dump_eval:
+                graph_handler.dump_eval(e_dev)
+            if config.dump_answer:
+                graph_handler.dump_answer(e_dev)
+    # Make sure the final state is checkpointed.
+    if global_step % config.save_period != 0:
+        graph_handler.save(sess, global_step=global_step)
+
+
+def _test(config):
+    """Evaluate a loaded model on the 'test' data and optionally dump the results.
+
+    With ``config.vis`` set, each multi-batch evaluation is also dumped to
+    its own file below ``config.eval_dir``.
+    """
+    test_data = read_data(config, 'test', True)
+    update_config(config, [test_data])
+
+    _config_debug(config)
+
+    if config.use_glove_for_unk:
+        word2vec_dict = test_data.shared['lower_word2vec'] if config.lower_word else test_data.shared['word2vec']
+        new_word2idx_dict = test_data.shared['new_word2idx']
+        idx2vec_dict = {idx: word2vec_dict[word] for word, idx in new_word2idx_dict.items()}
+        new_emb_mat = np.array([idx2vec_dict[idx] for idx in range(len(idx2vec_dict))], dtype='float32')
+        config.new_emb_mat = new_emb_mat
+
+    # Flag containers expose `__flags`; a plain attribute bag such as the
+    # Config class in this file does not, so fall back to its attributes
+    # (minus large arrays) instead of raising AttributeError.
+    flags = getattr(config, '__flags', None)
+    if flags is None:
+        flags = {key: val for key, val in vars(config).items() if not isinstance(val, np.ndarray)}
+    pprint(flags, indent=2)
+    models = get_multi_gpu_models(config)
+    model = models[0]
+    evaluator = MultiGPUF1Evaluator(config, models, tensor_dict=models[0].tensor_dict if config.vis else None)
+    graph_handler = GraphHandler(config, model)
+
+    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
+    graph_handler.initialize(sess)
+    num_steps = math.ceil(test_data.num_examples / (config.batch_size * config.num_gpus))
+    if 0 < config.test_num_batches < num_steps:
+        num_steps = config.test_num_batches
+
+    e = None
+    for multi_batch in tqdm(test_data.get_multi_batches(config.batch_size, config.num_gpus, num_steps=num_steps, cluster=config.cluster), total=num_steps):
+        ei = evaluator.get_evaluation(sess, multi_batch)
+        e = ei if e is None else e + ei
+        if config.vis:
+            # One dump per multi-batch, named after the first example index.
+            eval_subdir = os.path.join(config.eval_dir, "{}-{}".format(ei.data_type, str(ei.global_step).zfill(6)))
+            if not os.path.exists(eval_subdir):
+                os.mkdir(eval_subdir)
+            path = os.path.join(eval_subdir, str(ei.idxs[0]).zfill(8))
+            graph_handler.dump_eval(ei, path=path)
+
+    print(e)
+    if config.dump_answer:
+        print("dumping answer ...")
+        graph_handler.dump_answer(e)
+    if config.dump_eval:
+        print("dumping eval ...")
+        graph_handler.dump_eval(e)
+
+
+def _forward(config):
+    """Run a loaded model over the ``config.forward_name`` data and dump the results.
+
+    Only the first of the built models is evaluated, through ForwardEvaluator.
+    """
+    assert config.load
+    test_data = read_data(config, config.forward_name, True)
+    update_config(config, [test_data])
+
+    _config_debug(config)
+
+    if config.use_glove_for_unk:
+        word2vec_dict = test_data.shared['lower_word2vec'] if config.lower_word else test_data.shared['word2vec']
+        new_word2idx_dict = test_data.shared['new_word2idx']
+        idx2vec_dict = {idx: word2vec_dict[word] for word, idx in new_word2idx_dict.items()}
+        new_emb_mat = np.array([idx2vec_dict[idx] for idx in range(len(idx2vec_dict))], dtype='float32')
+        config.new_emb_mat = new_emb_mat
+
+    # Flag containers expose `__flags`; a plain attribute bag such as the
+    # Config class in this file does not, so fall back to its attributes
+    # (minus large arrays) instead of raising AttributeError.
+    flags = getattr(config, '__flags', None)
+    if flags is None:
+        flags = {key: val for key, val in vars(config).items() if not isinstance(val, np.ndarray)}
+    pprint(flags, indent=2)
+    models = get_multi_gpu_models(config)
+    model = models[0]
+    print("num params: {}".format(get_num_params()))
+    evaluator = ForwardEvaluator(config, model)
+    graph_handler = GraphHandler(config, model)  # controls all tensors and variables in the graph, including loading /saving
+
+    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
+    graph_handler.initialize(sess)
+
+    num_batches = math.ceil(test_data.num_examples / config.batch_size)
+    if 0 < config.test_num_batches < num_batches:
+        num_batches = config.test_num_batches
+    e = evaluator.get_evaluation_from_batches(sess, tqdm(test_data.get_batches(config.batch_size, num_batches=num_batches), total=num_batches))
+    print(e)
+    if config.dump_answer:
+        print("dumping answer ...")
+        graph_handler.dump_answer(e, path=config.answer_path)
+    if config.dump_eval:
+        print("dumping eval ...")
+        graph_handler.dump_eval(e, path=config.eval_path)
+
+
+def _get_args():
+    """Parse the command line, which takes a single positional ``config_path``."""
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument("config_path")
+    parsed = arg_parser.parse_args()
+    return parsed
+
+
+class Config(object):
+    """Attribute bag: every keyword argument becomes an instance attribute."""
+    def __init__(self, **entries):
+        vars(self).update(entries)
+
+
+def _run():
+    """Load the JSON config named on the command line and run ``main`` with it."""
+    args = _get_args()
+    # Parse the config completely first, so the file is closed before the
+    # (potentially long) run starts instead of staying open throughout it.
+    with open(args.config_path, 'r') as fh:
+        config = Config(**json.load(fh))
+    main(config)
+
+
+if __name__ == "__main__":
+    _run()
--- a/tensorflow/SQuAD/basic/model.py
+++ b/tensorflow/SQuAD/basic/model.py
@ -0,0 +1,535 @@
+import random
+
+import itertools
+import numpy as np
+import tensorflow as tf
+from tensorflow.contrib.rnn import BasicLSTMCell
+
+from basic.read_data import DataSet
+from my.tensorflow import get_initializer
+from my.tensorflow.nn import softsel, get_logits, highway_network, multi_conv1d
+from my.tensorflow.rnn import bidirectional_dynamic_rnn
+from my.tensorflow.rnn_cell import SwitchableDropoutWrapper, AttentionCell
+
+
+def get_multi_gpu_models(config):
+    models = []
+    with tf.variable_scope(tf.get_variable_scope()):
+        for gpu_idx in range(config.num_gpus):
+            with tf.name_scope("model_{}".format(gpu_idx)) as scope, tf.device("/{}:{}".format(config.device_type, gpu_idx)):
+                if gpu_idx > 0:
+                    tf.get_variable_scope().reuse_variables()
+                model = Model(config, scope, rep=gpu_idx == 0)
+                models.append(model)
+
+    # update the summary in a different scope to avoid reuse issue
+    with tf.variable_scope('loss_summary', reuse=False):
+        for gpu_idx in range(config.num_gpus):
+            with tf.name_scope("model_{}".format(gpu_idx)) as scope, tf.device("/{}:{}".format(config.device_type, gpu_idx)):
+                model = models[gpu_idx]
+                rep = gpu_idx == 0
+                if rep:
+                    model._build_var_ema()
+                if config.mode == 'train':
+                    model._build_ema();
+                model.summary = tf.summary.merge_all()
+                model.summary = tf.summary.merge(tf.get_collection("summaries", scope=model.scope))
+
+    return models
+
+
+class Model(object):
+    def __init__(self, config, scope, rep=True):
+        # Create the input placeholders, then build the forward graph and the loss.
+        # :param config: configuration object; sizes and switches are read from it
+        # :param scope: stored as self.scope and used to filter graph collections by scope
+        # :param rep: accepted but not read anywhere in this constructor
+        self.scope = scope
+        self.config = config
+        self.global_step = tf.get_variable('global_step', shape=[], dtype='int32',
+                                           initializer=tf.constant_initializer(0), trainable=False)
+
+        # Define forward inputs here
+        # Only N and W are used for the placeholder shapes below; the other
+        # dimensions are left as None so they can vary per batch.
+        N, M, JX, JQ, VW, VC, W = \
+            config.batch_size, config.max_num_sents, config.max_sent_size, \
+            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.max_word_size
+        self.x = tf.placeholder('int32', [N, None, None], name='x')
+        self.cx = tf.placeholder('int32', [N, None, None, W], name='cx')
+        self.x_mask = tf.placeholder('bool', [N, None, None], name='x_mask')
+        self.q = tf.placeholder('int32', [N, None], name='q')
+        self.cq = tf.placeholder('int32', [N, None, W], name='cq')
+        self.q_mask = tf.placeholder('bool', [N, None], name='q_mask')
+        self.y = tf.placeholder('bool', [N, None, None], name='y')
+        self.y2 = tf.placeholder('bool', [N, None, None], name='y2')
+        self.wy = tf.placeholder('bool', [N, None, None], name='wy')
+        self.is_train = tf.placeholder('bool', [], name='is_train')
+        self.new_emb_mat = tf.placeholder('float', [None, config.word_emb_size], name='new_emb_mat')
+        self.na = tf.placeholder('bool', [N], name='na')
+
+        # Define misc
+        # Named intermediate tensors, filled in while the forward graph is built.
+        self.tensor_dict = {}
+
+        # Forward outputs / loss inputs
+        self.logits = None
+        self.yp = None
+        self.var_list = None
+        self.na_prob = None
+
+        # Loss outputs
+        self.loss = None
+
+        self._build_forward()
+        self._build_loss()
+        self.var_ema = None
+        # The moving-average and summary construction below is disabled here;
+        # the same calls are made from get_multi_gpu_models after all models exist.
+        # if rep:
+        #     self._build_var_ema()
+        # if config.mode == 'train':
+        #     self._build_ema()
+
+        # self.summary = tf.summary.merge_all()
+        # self.summary = tf.summary.merge(tf.get_collection("summaries", scope=self.scope))
+
+    def _build_forward(self):
+        # Build the forward graph: character/word embeddings, optional highway
+        # network, bidirectional LSTM encoders, attention, two further
+        # bidirectional LSTM layers and the two logit heads. Results are stored
+        # on self (logits, logits2, yp, yp2, wyp; concat_logits, concat_logits2
+        # and na_prob when config.na is set) and in self.tensor_dict.
+        config = self.config
+        N, M, JX, JQ, VW, VC, d, W = \
+            config.batch_size, config.max_num_sents, config.max_sent_size, \
+            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
+            config.max_word_size
+        # Replace the configured maxima with the dynamic sizes of the fed inputs.
+        JX = tf.shape(self.x)[2]
+        JQ = tf.shape(self.q)[1]
+        M = tf.shape(self.x)[1]
+        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size
+
+        with tf.variable_scope("emb"):
+            if config.use_char_emb:
+                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
+                    char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
+
+                with tf.variable_scope("char"):
+                    Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
+                    Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
+                    Acx = tf.reshape(Acx, [-1, JX, W, dc])
+                    Acq = tf.reshape(Acq, [-1, JQ, W, dc])
+
+                    # Both settings are comma-separated integer lists; the output
+                    # channel counts must add up to the configured char_out_size.
+                    filter_sizes = list(map(int, config.out_channel_dims.split(',')))
+                    heights = list(map(int, config.filter_heights.split(',')))
+                    assert sum(filter_sizes) == dco, (filter_sizes, dco)
+                    with tf.variable_scope("conv"):
+                        xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",  self.is_train, config.keep_prob, scope="xx")
+                        if config.share_cnn_weights:
+                            # Reuse the "xx" scope so context and question share the convolution variables.
+                            tf.get_variable_scope().reuse_variables()
+                            qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
+                        else:
+                            qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq")
+                        xx = tf.reshape(xx, [-1, M, JX, dco])
+                        qq = tf.reshape(qq, [-1, JQ, dco])
+
+            if config.use_word_emb:
+                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
+                    # Only the training graph attaches an initializer built from config.emb_mat.
+                    if config.mode == 'train':
+                        word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat))
+                    else:
+                        word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
+                    if config.use_glove_for_unk:
+                        # Append the fed extra embedding rows after the VW variable rows.
+                        word_emb_mat = tf.concat(axis=0, values=[word_emb_mat, self.new_emb_mat])
+
+                with tf.name_scope("word"):
+                    Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
+                    Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
+                    self.tensor_dict['x'] = Ax
+                    self.tensor_dict['q'] = Aq
+                if config.use_char_emb:
+                    xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
+                    qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
+                else:
+                    xx = Ax
+                    qq = Aq
+
+        # highway network
+        if config.highway:
+            with tf.variable_scope("highway"):
+                xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
+                # The question goes through the same highway variables as the context.
+                tf.get_variable_scope().reuse_variables()
+                qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
+
+        self.tensor_dict['xx'] = xx
+        self.tensor_dict['qq'] = qq
+
+        # Four forward/backward LSTM cell pairs, each also available wrapped
+        # in SwitchableDropoutWrapper (the d_* names).
+        cell_fw = BasicLSTMCell(d, state_is_tuple=True)
+        cell_bw = BasicLSTMCell(d, state_is_tuple=True)
+        d_cell_fw = SwitchableDropoutWrapper(cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
+        d_cell_bw = SwitchableDropoutWrapper(cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
+        cell2_fw = BasicLSTMCell(d, state_is_tuple=True)
+        cell2_bw = BasicLSTMCell(d, state_is_tuple=True)
+        d_cell2_fw = SwitchableDropoutWrapper(cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
+        d_cell2_bw = SwitchableDropoutWrapper(cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
+        cell3_fw = BasicLSTMCell(d, state_is_tuple=True)
+        cell3_bw = BasicLSTMCell(d, state_is_tuple=True)
+        d_cell3_fw = SwitchableDropoutWrapper(cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
+        d_cell3_bw = SwitchableDropoutWrapper(cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)
+        cell4_fw = BasicLSTMCell(d, state_is_tuple=True)
+        cell4_bw = BasicLSTMCell(d, state_is_tuple=True)
+        d_cell4_fw = SwitchableDropoutWrapper(cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
+        d_cell4_bw = SwitchableDropoutWrapper(cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob)
+        # Sequence lengths are the number of True entries in the masks.
+        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
+        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]
+
+        with tf.variable_scope("prepro"):
+            (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
+            u = tf.concat(axis=2, values=[fw_u, bw_u])
+            # NOTE(review): the question encoder above uses the dropout-wrapped
+            # cells while both context branches below use the bare cells - confirm
+            # this asymmetry is intended.
+            if config.share_lstm_weights:
+                # Same scope name 'u1' with reuse on: context shares the question encoder's variables.
+                tf.get_variable_scope().reuse_variables()
+                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
+                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
+            else:
+                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
+                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
+            self.tensor_dict['u'] = u
+            self.tensor_dict['h'] = h
+
+        with tf.variable_scope("main"):
+            if config.dynamic_att:
+                # Attention is computed inside the recurrent cells: the question
+                # encoding and its mask are repeated M times and handed to AttentionCell.
+                p0 = h
+                u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d])
+                q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ])
+                first_cell_fw = AttentionCell(cell2_fw, u, mask=q_mask, mapper='sim',
+                                              input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
+                first_cell_bw = AttentionCell(cell2_bw, u, mask=q_mask, mapper='sim',
+                                              input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
+                second_cell_fw = AttentionCell(cell3_fw, u, mask=q_mask, mapper='sim',
+                                            input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
+                second_cell_bw = AttentionCell(cell3_bw, u, mask=q_mask, mapper='sim',
+                                               input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
+            else:
+                # Attention is computed once up front by attention_layer; the
+                # following layers use the plain dropout-wrapped cells.
+                p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict)
+                first_cell_fw = d_cell2_fw
+                second_cell_fw = d_cell3_fw
+                first_cell_bw = d_cell2_bw
+                second_cell_bw = d_cell3_bw
+
+            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(first_cell_fw, first_cell_bw, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
+            g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
+            (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(second_cell_fw, second_cell_bw, g0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
+            g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])
+
+            # First logit head, computed from g1 and p0.
+            logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob,
+                                mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1')
+            # Summarise g1 with the first head's logits and repeat the summary at
+            # every context position so it can be concatenated into the next layer's input.
+            a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX]))
+            a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1])
+
+            (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(d_cell4_fw, d_cell4_bw, tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]),
+                                                          x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
+            g2 = tf.concat(axis=3, values=[fw_g2, bw_g2])
+            # Second logit head, computed from g2 and p0.
+            logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob,
+                                 mask=self.x_mask,
+                                 is_train=self.is_train, func=config.answer_func, scope='logits2')
+
+            # Flatten (sentence, word) into one axis so softmax runs over the whole context.
+            flat_logits = tf.reshape(logits, [-1, M * JX])
+            flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
+            flat_logits2 = tf.reshape(logits2, [-1, M * JX])
+            flat_yp2 = tf.nn.softmax(flat_logits2)
+
+            if config.na:
+                # Prepend one scalar variable as an extra logit in column 0 of both
+                # heads; column 0 of each softmax is sliced off as that head's
+                # na probability and the remaining columns replace flat_yp / flat_yp2.
+                na_bias = tf.get_variable("na_bias", shape=[], dtype='float')
+                na_bias_tiled = tf.tile(tf.reshape(na_bias, [1, 1]), [N, 1])  # [N, 1]
+                concat_flat_logits = tf.concat(axis=1, values=[na_bias_tiled, flat_logits])
+                concat_flat_yp = tf.nn.softmax(concat_flat_logits)
+                na_prob = tf.squeeze(tf.slice(concat_flat_yp, [0, 0], [-1, 1]), [1])
+                flat_yp = tf.slice(concat_flat_yp, [0, 1], [-1, -1])
+
+                concat_flat_logits2 = tf.concat(axis=1, values=[na_bias_tiled, flat_logits2])
+                concat_flat_yp2 = tf.nn.softmax(concat_flat_logits2)
+                na_prob2 = tf.squeeze(tf.slice(concat_flat_yp2, [0, 0], [-1, 1]), [1])  # [N]
+                flat_yp2 = tf.slice(concat_flat_yp2, [0, 1], [-1, -1])
+
+                self.concat_logits = concat_flat_logits
+                self.concat_logits2 = concat_flat_logits2
+                # Combined na probability is the product of the two heads' values.
+                self.na_prob = na_prob * na_prob2
+
+            yp = tf.reshape(flat_yp, [-1, M, JX])
+            yp2 = tf.reshape(flat_yp2, [-1, M, JX])
+            # Independent per-position sigmoid over the second head's logits.
+            wyp = tf.nn.sigmoid(logits2)
+
+            self.tensor_dict['g1'] = g1
+            self.tensor_dict['g2'] = g2
+
+            self.logits = flat_logits
+            self.logits2 = flat_logits2
+            self.yp = yp
+            self.yp2 = yp2
+            self.wyp = wyp
+
+    def _build_loss(self):
+        # Build the training loss from the logits produced by _build_forward and
+        # store it in self.loss. Individual terms are added to the 'losses'
+        # collection and then summed over the entries matching self.scope.
+        config = self.config
+        JX = tf.shape(self.x)[2]
+        M = tf.shape(self.x)[1]
+        JQ = tf.shape(self.q)[1]
+
+        # 1.0 for examples whose question mask has at least one True entry, else 0.0.
+        loss_mask = tf.reduce_max(tf.cast(self.q_mask, 'float'), 1)
+        if config.wy:
+            # Per-position sigmoid cross entropy of the second head against wy.
+            losses = tf.nn.sigmoid_cross_entropy_with_logits(
+                logits=tf.reshape(self.logits2, [-1, M, JX]), labels=tf.cast(self.wy, 'float'))  # [N, M, JX]
+            num_pos = tf.reduce_sum(tf.cast(self.wy, 'float'))
+            num_neg = tf.reduce_sum(tf.cast(self.x_mask, 'float')) - num_pos
+            # NOTE(review): num_neg is zero when every unmasked position is
+            # positive, which makes this ratio non-finite - confirm that cannot occur.
+            damp_ratio = num_pos / num_neg
+            # Weight: damp_ratio on unmasked negative positions, 1 on positive positions.
+            dampened_losses = losses * (
+                (tf.cast(self.x_mask, 'float') - tf.cast(self.wy, 'float')) * damp_ratio + tf.cast(self.wy, 'float'))
+            new_losses = tf.reduce_sum(dampened_losses, [1, 2])
+            ce_loss = tf.reduce_mean(loss_mask * new_losses)
+            # The string literal below is disabled code kept as a no-op expression.
+            """
+            if config.na:
+                na = tf.reshape(self.na, [-1, 1])
+                concat_y = tf.concat(1, [na, tf.reshape(self.wy, [-1, M * JX])])
+                losses = tf.nn.softmax_cross_entropy_with_logits(
+                    self.concat_logits, tf.cast(concat_y, 'float') / tf.reduce_sum(tf.cast(self.wy, 'float')))
+            else:
+                losses = tf.nn.softmax_cross_entropy_with_logits(
+                    self.logits2, tf.cast(tf.reshape(self.wy, [-1, M * JX]), 'float') / tf.reduce_sum(tf.cast(self.wy, 'float')))
+            ce_loss = tf.reduce_mean(loss_mask * losses)
+            """
+            tf.add_to_collection('losses', ce_loss)
+
+        else:
+            if config.na:
+                # The na flag becomes label column 0, matching the extra logit
+                # that _build_forward prepends in column 0.
+                na = tf.reshape(self.na, [-1, 1])
+                concat_y = tf.concat(axis=1, values=[na, tf.reshape(self.y, [-1, M * JX])])
+                losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.concat_logits, labels=tf.cast(concat_y, 'float'))
+                concat_y2 = tf.concat(axis=1, values=[na, tf.reshape(self.y2, [-1, M * JX])])
+                losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=self.concat_logits2, labels=tf.cast(concat_y2, 'float'))
+            else:
+                losses = tf.nn.softmax_cross_entropy_with_logits(
+                    logits=self.logits, labels=tf.cast(tf.reshape(self.y, [-1, M * JX]), 'float'))
+                losses2 = tf.nn.softmax_cross_entropy_with_logits(
+                    logits=self.logits2, labels=tf.cast(tf.reshape(self.y2, [-1, M * JX]), 'float'))
+            # One cross-entropy term per head (y with logits, y2 with logits2).
+            ce_loss = tf.reduce_mean(loss_mask * losses)
+            ce_loss2 = tf.reduce_mean(loss_mask * losses2)
+            tf.add_to_collection('losses', ce_loss)
+            tf.add_to_collection("losses", ce_loss2)
+
+        # Sum every 'losses' entry under this model's scope (not only the terms added above).
+        self.loss = tf.add_n(tf.get_collection('losses', scope=self.scope), name='loss')
+        tf.summary.scalar(self.loss.op.name, self.loss)
+        # Registered so that _build_ema tracks a moving average of the loss.
+        tf.add_to_collection('ema/scalar', self.loss)
+
+    def _build_ema(self):
+        self.ema = tf.train.ExponentialMovingAverage(self.config.decay)
+        ema = self.ema
+        tensors = tf.get_collection("ema/scalar", scope=self.scope) + tf.get_collection("ema/vector", scope=self.scope)
+        ema_op = ema.apply(tensors)
+        for var in tf.get_collection("ema/scalar", scope=self.scope):
+            ema_var = ema.average(var)
+            tf.summary.scalar(ema_var.op.name, ema_var)
+        for var in tf.get_collection("ema/vector", scope=self.scope):
+            ema_var = ema.average(var)
+            tf.summary.histogram(ema_var.op.name, ema_var)
+
+        with tf.control_dependencies([ema_op]):
+            self.loss = tf.identity(self.loss)
+
+    def _build_var_ema(self):
+        self.var_ema = tf.train.ExponentialMovingAverage(self.config.var_decay)
+        ema = self.var_ema
+        ema_op = ema.apply(tf.trainable_variables())
+        with tf.control_dependencies([ema_op]):
+            self.loss = tf.identity(self.loss)
+
+    def get_loss(self):
+        # Return the current loss tensor (self.loss).
+        return self.loss
+
+    def get_global_step(self):
+        # Return the non-trainable int32 'global_step' variable created in __init__.
+        return self.global_step
+
+    def get_var_list(self):
+        # Return self.var_list. NOTE(review): __init__ sets it to None and no
+        # other method of this class assigns it - confirm callers accept None.
+        return self.var_list
+
+    def get_feed_dict(self, batch, is_train, supervised=True):
+        # Convert one batch into the feed dict for this model's placeholders.
+        # :param batch: DataSet; reads batch.data ('x', 'cx', 'q', 'cq' and, when
+        #     supervised, 'y' and 'na') and batch.shared (vocabulary lookups)
+        # :param is_train: value fed to the is_train placeholder
+        # :param supervised: when True the label arrays y, y2, wy and na are fed too
+        # :return dict: placeholder -> numpy array (or value)
+        assert isinstance(batch, DataSet)
+        config = self.config
+        N, M, JX, JQ, VW, VC, d, W = \
+            config.batch_size, config.max_num_sents, config.max_sent_size, \
+            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, config.max_word_size
+        feed_dict = {}
+
+        if config.len_opt:
+            """
+            Note that this optimization results in variable GPU RAM usage (i.e. can cause OOM in the middle of training.)
+            First test without len_opt and make sure no OOM, and use len_opt
+            """
+            # Shrink JX / JQ to the longest sentence / question in this batch
+            # (1 when all are empty), never above the configured maximum.
+            if sum(len(sent) for para in batch.data['x'] for sent in para) == 0:
+                new_JX = 1
+            else:
+                new_JX = max(len(sent) for para in batch.data['x'] for sent in para)
+            JX = min(JX, new_JX)
+
+            if sum(len(ques) for ques in batch.data['q']) == 0:
+                new_JQ = 1
+            else:
+                new_JQ = max(len(ques) for ques in batch.data['q'])
+            JQ = min(JQ, new_JQ)
+
+        if config.cpu_opt:
+            # Likewise shrink M to the largest sentence count in this batch.
+            if sum(len(para) for para in batch.data['x']) == 0:
+                new_M = 1
+            else:
+                new_M = max(len(para) for para in batch.data['x'])
+            M = min(M, new_M)
+
+        x = np.zeros([N, M, JX], dtype='int32')
+        cx = np.zeros([N, M, JX, W], dtype='int32')
+        x_mask = np.zeros([N, M, JX], dtype='bool')
+        q = np.zeros([N, JQ], dtype='int32')
+        cq = np.zeros([N, JQ, W], dtype='int32')
+        q_mask = np.zeros([N, JQ], dtype='bool')
+
+        # The arrays are registered now and filled in place further down; the
+        # feed dict holds references to the same objects.
+        feed_dict[self.x] = x
+        feed_dict[self.x_mask] = x_mask
+        feed_dict[self.cx] = cx
+        feed_dict[self.q] = q
+        feed_dict[self.cq] = cq
+        feed_dict[self.q_mask] = q_mask
+        feed_dict[self.is_train] = is_train
+        if config.use_glove_for_unk:
+            feed_dict[self.new_emb_mat] = batch.shared['new_emb_mat']
+
+        X = batch.data['x']
+        CX = batch.data['cx']
+
+        if supervised:
+            y = np.zeros([N, M, JX], dtype='bool')
+            y2 = np.zeros([N, M, JX], dtype='bool')
+            wy = np.zeros([N, M, JX], dtype='bool')
+            na = np.zeros([N], dtype='bool')
+            feed_dict[self.y] = y
+            feed_dict[self.y2] = y2
+            feed_dict[self.wy] = wy
+            feed_dict[self.na] = na
+
+            for i, (xi, cxi, yi, nai) in enumerate(zip(X, CX, batch.data['y'], batch.data['na'])):
+                # Examples flagged na only set the flag; their label rows stay all False.
+                if nai:
+                    na[i] = nai
+                    continue
+                # yi holds (start, stop) pairs of (sentence, word) indices; one is picked at random.
+                start_idx, stop_idx = random.choice(yi)
+                j, k = start_idx
+                j2, k2 = stop_idx
+                if config.single:
+                    # Keep only the sentence containing the start index. This assigns
+                    # into X and CX, i.e. into the lists held by batch.data.
+                    X[i] = [xi[j]]
+                    CX[i] = [cxi[j]]
+                    j, j2 = 0, 0
+                if config.squash:
+                    # Convert (sentence, word) into a word offset within the flattened context.
+                    offset = sum(map(len, xi[:j]))
+                    j, k = 0, k + offset
+                    offset = sum(map(len, xi[:j2]))
+                    j2, k2 = 0, k2 + offset
+                # y marks the start word; y2 marks word k2-1, so the stop index is exclusive.
+                y[i, j, k] = True
+                y2[i, j2, k2-1] = True
+                # wy marks every word of the span; when start and stop fall in different
+                # sentences only the tail of sentence j and the head of sentence j2 are marked.
+                if j == j2:
+                    wy[i, j, k:k2] = True
+                else:
+                    wy[i, j, k:len(batch.data['x'][i][j])] = True
+                    wy[i, j2, :k2] = True
+
+        def _get_word(word):
+            # Look the word up as given, then lower-cased, capitalised and upper-cased.
+            d = batch.shared['word2idx']
+            for each in (word, word.lower(), word.capitalize(), word.upper()):
+                if each in d:
+                    return d[each]
+            if config.use_glove_for_unk:
+                # Indices from the extra vocabulary are shifted past the main one.
+                d2 = batch.shared['new_word2idx']
+                for each in (word, word.lower(), word.capitalize(), word.upper()):
+                    if each in d2:
+                        return d2[each] + len(d)
+            # Fallback index for words found in neither vocabulary
+            # (presumably the unknown-word entry - TODO confirm).
+            return 1
+
+        def _get_char(char):
+            d = batch.shared['char2idx']
+            if char in d:
+                return d[char]
+            # Fallback index for unseen characters (presumably the unknown-char entry - TODO confirm).
+            return 1
+
+        # Fill context word ids and mask, truncating at the configured maxima.
+        for i, xi in enumerate(X):
+            if self.config.squash:
+                # Flatten all sentences of the paragraph into a single sentence.
+                xi = [list(itertools.chain(*xi))]
+            for j, xij in enumerate(xi):
+                if j == config.max_num_sents:
+                    break
+                for k, xijk in enumerate(xij):
+                    if k == config.max_sent_size:
+                        break
+                    each = _get_word(xijk)
+                    assert isinstance(each, int), each
+                    x[i, j, k] = each
+                    x_mask[i, j, k] = True
+
+        # Fill context character ids with the same truncation, plus max_word_size per word.
+        for i, cxi in enumerate(CX):
+            if self.config.squash:
+                cxi = [list(itertools.chain(*cxi))]
+            for j, cxij in enumerate(cxi):
+                if j == config.max_num_sents:
+                    break
+                for k, cxijk in enumerate(cxij):
+                    if k == config.max_sent_size:
+                        break
+                    for l, cxijkl in enumerate(cxijk):
+                        if l == config.max_word_size:
+                            break
+                        cx[i, j, k, l] = _get_char(cxijkl)
+
+        # Fill question word ids and mask.
+        # NOTE(review): unlike the context loops there is no length guard on j here.
+        for i, qi in enumerate(batch.data['q']):
+            for j, qij in enumerate(qi):
+                q[i, j] = _get_word(qij)
+                q_mask[i, j] = True
+
+        # Fill question character ids; each word is cut off after max_word_size characters.
+        for i, cqi in enumerate(batch.data['cq']):
+            for j, cqij in enumerate(cqi):
+                for k, cqijk in enumerate(cqij):
+                    cq[i, j, k] = _get_char(cqijk)
+                    if k + 1 == config.max_word_size:
+                        break
+
+        if supervised:
+            # Every position marked in wy must also be an unmasked context position.
+            assert np.sum(~(x_mask | ~wy)) == 0
+
+        return feed_dict
+
+
+def bi_attention(config, is_train, h, u, h_mask=None, u_mask=None, scope=None, tensor_dict=None):
+    # Compute attention between h and u in both directions.
+    # Assumes h is [N, M, JX, d] and u is [N, JQ, d], as the shape comments
+    # below suggest - TODO confirm against callers.
+    # :param config: wd and logit_func are read from it
+    # :param h_mask, u_mask: optional boolean masks; the combined mask is only
+    #     built when h_mask is given (u_mask is then required as well)
+    # :param tensor_dict: when given, attention weights and this scope's
+    #     variables are recorded in it
+    # :return: (u_a, h_a)
+    with tf.variable_scope(scope or "bi_attention"):
+        JX = tf.shape(h)[2]
+        M = tf.shape(h)[1]
+        JQ = tf.shape(u)[1]
+        # Tile both inputs so every (h position, u position) pair lines up.
+        h_aug = tf.tile(tf.expand_dims(h, 3), [1, 1, 1, JQ, 1])
+        u_aug = tf.tile(tf.expand_dims(tf.expand_dims(u, 1), 1), [1, M, JX, 1, 1])
+        if h_mask is None:
+            hu_mask = None
+        else:
+            # A pair is valid only when both of its positions are unmasked.
+            h_mask_aug = tf.tile(tf.expand_dims(h_mask, 3), [1, 1, 1, JQ])
+            u_mask_aug = tf.tile(tf.expand_dims(tf.expand_dims(u_mask, 1), 1), [1, M, JX, 1])
+            hu_mask = h_mask_aug & u_mask_aug
+
+        u_logits = get_logits([h_aug, u_aug], None, True, wd=config.wd, mask=hu_mask,
+                              is_train=is_train, func=config.logit_func, scope='u_logits')  # [N, M, JX, JQ]
+        u_a = softsel(u_aug, u_logits)  # [N, M, JX, d]
+        # For the other direction each h position is scored by its maximum logit over u.
+        h_a = softsel(h, tf.reduce_max(u_logits, 3))  # [N, M, d]
+        h_a = tf.tile(tf.expand_dims(h_a, 2), [1, 1, JX, 1])
+
+        if tensor_dict is not None:
+            # Expose the attention weights and the variables of this scope by name.
+            a_u = tf.nn.softmax(u_logits)  # [N, M, JX, JQ]
+            a_h = tf.nn.softmax(tf.reduce_max(u_logits, 3))
+            tensor_dict['a_u'] = a_u
+            tensor_dict['a_h'] = a_h
+            variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name)
+            for var in variables:
+                tensor_dict[var.name] = var
+
+        return u_a, h_a
+
+
+def attention_layer(config, is_train, h, u, h_mask=None, u_mask=None, scope=None, tensor_dict=None):
+    # Combine h with attention over u into one tensor, concatenated on axis 3.
+    # :param config: q2c_att and c2q_att select which attention terms are used
+    # :return: concat of [h, u_a, h * u_a] plus h * h_a when config.q2c_att is set
+    with tf.variable_scope(scope or "attention_layer"):
+        JX = tf.shape(h)[2]
+        M = tf.shape(h)[1]
+        JQ = tf.shape(u)[1]
+        if config.q2c_att or config.c2q_att:
+            u_a, h_a = bi_attention(config, is_train, h, u, h_mask=h_mask, u_mask=u_mask, tensor_dict=tensor_dict)
+        if not config.c2q_att:
+            # Without c2q attention, use the mean of u over axis 1, repeated at every h position.
+            u_a = tf.tile(tf.expand_dims(tf.expand_dims(tf.reduce_mean(u, 1), 1), 1), [1, M, JX, 1])
+        if config.q2c_att:
+            p0 = tf.concat(axis=3, values=[h, u_a, h * u_a, h * h_a])
+        else:
+            p0 = tf.concat(axis=3, values=[h, u_a, h * u_a])
+        return p0
--- a/tensorflow/SQuAD/basic/read_data.py
+++ b/tensorflow/SQuAD/basic/read_data.py
@ -0,0 +1,316 @@
+import json
+import os
+import random
+import itertools
+import math
+from collections import defaultdict
+
+import numpy as np
+
+from my.tensorflow import grouper
+from my.utils import index
+
+
+class Data(object):
+    def get_size(self):
+        raise NotImplementedError()
+
+    def get_by_idxs(self, idxs):
+        """
+        Efficient way to obtain a batch of items from filesystem
+        :param idxs:
+        :return dict: {'X': [,], 'Y', }
+        """
+        data = defaultdict(list)
+        for idx in idxs:
+            each_data = self.get_one(idx)
+            for key, val in each_data.items():
+                data[key].append(val)
+        return data
+
+    def get_one(self, idx):
+        raise NotImplementedError()
+
+    def get_empty(self):
+        raise NotImplementedError()
+
+    def __add__(self, other):
+        raise NotImplementedError()
+
+
+class DataSet(object):
+    def __init__(self, data, data_type, shared=None, valid_idxs=None):
+        self.data = data  # e.g. {'X': [0, 1, 2], 'Y': [2, 3, 4]}
+        self.data_type = data_type
+        self.shared = shared
+        total_num_examples = self.get_data_size()
+        self.valid_idxs = range(total_num_examples) if valid_idxs is None else valid_idxs
+        self.num_examples = len(self.valid_idxs)
+
+    def _sort_key(self, idx):
+        rx = self.data['*x'][idx]
+        x = self.shared['x'][rx[0]][rx[1]]
+        return max(map(len, x))
+
+    def get_data_size(self):
+        if isinstance(self.data, dict):
+            return len(next(iter(self.data.values())))
+        elif isinstance(self.data, Data):
+            return self.data.get_size()
+        raise Exception()
+
+    def get_by_idxs(self, idxs):
+        if isinstance(self.data, dict):
+            out = defaultdict(list)
+            for key, val in self.data.items():
+                out[key].extend(val[idx] for idx in idxs)
+            return out
+        elif isinstance(self.data, Data):
+            return self.data.get_by_idxs(idxs)
+        raise Exception()
+
+    def get_batches(self, batch_size, num_batches=None, shuffle=False, cluster=False):
+        """
+
+        :param batch_size:
+        :param num_batches:
+        :param shuffle:
+        :param cluster: cluster examples by their lengths; this might give performance boost (i.e. faster training).
+        :return:
+        """
+        num_batches_per_epoch = int(math.ceil(self.num_examples / batch_size))
+        if num_batches is None:
+            num_batches = num_batches_per_epoch
+        num_epochs = int(math.ceil(num_batches / num_batches_per_epoch))
+
+        if shuffle:
+            random_idxs = random.sample(self.valid_idxs, len(self.valid_idxs))
+            if cluster:
+                sorted_idxs = sorted(random_idxs, key=self._sort_key)
+                sorted_grouped = lambda: list(grouper(sorted_idxs, batch_size))
+                grouped = lambda: random.sample(sorted_grouped(), num_batches_per_epoch)
+            else:
+                random_grouped = lambda: list(grouper(random_idxs, batch_size))
+                grouped = random_grouped
+        else:
+            raw_grouped = lambda: list(grouper(self.valid_idxs, batch_size))
+            grouped = raw_grouped
+
+        batch_idx_tuples = itertools.chain.from_iterable(grouped() for _ in range(num_epochs))
+        for _ in range(num_batches):
+            batch_idxs = tuple(i for i in next(batch_idx_tuples) if i is not None)
+            batch_data = self.get_by_idxs(batch_idxs)
+            shared_batch_data = {}
+            for key, val in batch_data.items():
+                if key.startswith('*'):
+                    assert self.shared is not None
+                    shared_key = key[1:]
+                    shared_batch_data[shared_key] = [index(self.shared[shared_key], each) for each in val]
+            batch_data.update(shared_batch_data)
+
+            batch_ds = DataSet(batch_data, self.data_type, shared=self.shared)
+            yield batch_idxs, batch_ds
+
+    def get_multi_batches(self, batch_size, num_batches_per_step, num_steps=None, shuffle=False, cluster=False):
+        batch_size_per_step = batch_size * num_batches_per_step
+        batches = self.get_batches(batch_size_per_step, num_batches=num_steps, shuffle=shuffle, cluster=cluster)
+        multi_batches = (tuple(zip(grouper(idxs, batch_size, shorten=True, num_groups=num_batches_per_step),
+                         data_set.divide(num_batches_per_step))) for idxs, data_set in batches)
+        return multi_batches
+
+    def get_empty(self):
+        if isinstance(self.data, dict):
+            data = {key: [] for key in self.data}
+        elif isinstance(self.data, Data):
+            data = self.data.get_empty()
+        else:
+            raise Exception()
+        return DataSet(data, self.data_type, shared=self.shared)
+
+    def __add__(self, other):
+        if isinstance(self.data, dict):
+            data = {key: val + other.data[key] for key, val in self.data.items()}
+        elif isinstance(self.data, Data):
+            data = self.data + other.data
+        else:
+            raise Exception()
+
+        valid_idxs = list(self.valid_idxs) + [valid_idx + self.num_examples for valid_idx in other.valid_idxs]
+        return DataSet(data, self.data_type, shared=self.shared, valid_idxs=valid_idxs)
+
+    def divide(self, integer):
+        batch_size = int(math.ceil(self.num_examples / integer))
+        idxs_gen = grouper(self.valid_idxs, batch_size, shorten=True, num_groups=integer)
+        data_gen = (self.get_by_idxs(idxs) for idxs in idxs_gen)
+        ds_tuple = tuple(DataSet(data, self.data_type, shared=self.shared) for data in data_gen)
+        return ds_tuple
+
+
+def load_metadata(config, data_type):
+    metadata_path = os.path.join(config.data_dir, "metadata_{}.json".format(data_type))
+    with open(metadata_path, 'r') as fh:
+        metadata = json.load(fh)
+        for key, val in metadata.items():
+            config.__setattr__(key, val)
+        return metadata
+
+
+def read_data(config, data_type, ref, data_filter=None):
+    data_path = os.path.join(config.data_dir, "data_{}.json".format(data_type))
+    shared_path = os.path.join(config.data_dir, "shared_{}.json".format(data_type))
+    with open(data_path, 'r') as fh:
+        data = json.load(fh)
+    with open(shared_path, 'r') as fh:
+        shared = json.load(fh)
+
+    num_examples = len(next(iter(data.values())))
+    if data_filter is None:
+        valid_idxs = range(num_examples)
+    else:
+        mask = []
+        keys = data.keys()
+        values = data.values()
+        for vals in zip(*values):
+            each = {key: val for key, val in zip(keys, vals)}
+            mask.append(data_filter(each, shared))
+        valid_idxs = [idx for idx in range(len(mask)) if mask[idx]]
+
+    print("Loaded {}/{} examples from {}".format(len(valid_idxs), num_examples, data_type))
+
+    shared_path = config.shared_path or os.path.join(config.out_dir, "shared.json")
+    if not ref:
+        word2vec_dict = shared['lower_word2vec'] if config.lower_word else shared['word2vec']
+        word_counter = shared['lower_word_counter'] if config.lower_word else shared['word_counter']
+        char_counter = shared['char_counter']
+        if config.finetune:
+            shared['word2idx'] = {word: idx + 2 for idx, word in
+                                  enumerate(word for word, count in word_counter.items()
+                                            if count > config.word_count_th or (config.known_if_glove and word in word2vec_dict))}
+        else:
+            assert config.known_if_glove
+            assert config.use_glove_for_unk
+            shared['word2idx'] = {word: idx + 2 for idx, word in
+                                  enumerate(word for word, count in word_counter.items()
+                                            if count > config.word_count_th and word not in word2vec_dict)}
+        shared['char2idx'] = {char: idx + 2 for idx, char in
+                              enumerate(char for char, count in char_counter.items()
+                                        if count > config.char_count_th)}
+        NULL = "-NULL-"
+        UNK = "-UNK-"
+        shared['word2idx'][NULL] = 0
+        shared['word2idx'][UNK] = 1
+        shared['char2idx'][NULL] = 0
+        shared['char2idx'][UNK] = 1
+        json.dump({'word2idx': shared['word2idx'], 'char2idx': shared['char2idx']}, open(shared_path, 'w'))
+    else:
+        new_shared = json.load(open(shared_path, 'r'))
+        for key, val in new_shared.items():
+            shared[key] = val
+
+    if config.use_glove_for_unk:
+        # create new word2idx and word2vec
+        word2vec_dict = shared['lower_word2vec'] if config.lower_word else shared['word2vec']
+        new_word2idx_dict = {word: idx for idx, word in enumerate(word for word in word2vec_dict.keys() if word not in shared['word2idx'])}
+        shared['new_word2idx'] = new_word2idx_dict
+        offset = len(shared['word2idx'])
+        word2vec_dict = shared['lower_word2vec'] if config.lower_word else shared['word2vec']
+        new_word2idx_dict = shared['new_word2idx']
+        idx2vec_dict = {idx: word2vec_dict[word] for word, idx in new_word2idx_dict.items()}
+        # print("{}/{} unique words have corresponding glove vectors.".format(len(idx2vec_dict), len(word2idx_dict)))
+        new_emb_mat = np.array([idx2vec_dict[idx] for idx in range(len(idx2vec_dict))], dtype='float32')
+        shared['new_emb_mat'] = new_emb_mat
+
+    data_set = DataSet(data, data_type, shared=shared, valid_idxs=valid_idxs)
+    return data_set
+
+
+def get_squad_data_filter(config):
+    def data_filter(data_point, shared):
+        assert shared is not None
+        rx, rcx, q, cq, y = (data_point[key] for key in ('*x', '*cx', 'q', 'cq', 'y'))
+        x, cx = shared['x'], shared['cx']
+        if len(q) > config.ques_size_th:
+            return False
+
+        # x filter
+        xi = x[rx[0]][rx[1]]
+        if config.squash:
+            for start, stop in y:
+                stop_offset = sum(map(len, xi[:stop[0]]))
+                if stop_offset + stop[1] > config.para_size_th:
+                    return False
+            return True
+
+        if config.single:
+            for start, stop in y:
+                if start[0] != stop[0]:
+                    return False
+
+        if config.data_filter == 'max':
+            for start, stop in y:
+                    if stop[0] >= config.num_sents_th:
+                        return False
+                    if start[0] != stop[0]:
+                        return False
+                    if stop[1] >= config.sent_size_th:
+                        return False
+        elif config.data_filter == 'valid':
+            if len(xi) > config.num_sents_th:
+                return False
+            if any(len(xij) > config.sent_size_th for xij in xi):
+                return False
+        elif config.data_filter == 'semi':
+            """
+            Only answer sentence needs to be valid.
+            """
+            for start, stop in y:
+                if stop[0] >= config.num_sents_th:
+                    return False
+                if start[0] != start[0]:
+                    return False
+                if len(xi[start[0]]) > config.sent_size_th:
+                    return False
+        else:
+            raise Exception()
+
+        return True
+    return data_filter
+
+
+def update_config(config, data_sets):
+    config.max_num_sents = 0
+    config.max_sent_size = 0
+    config.max_ques_size = 0
+    config.max_word_size = 0
+    config.max_para_size = 0
+    for data_set in data_sets:
+        data = data_set.data
+        shared = data_set.shared
+        for idx in data_set.valid_idxs:
+            rx = data['*x'][idx]
+            q = data['q'][idx]
+            sents = shared['x'][rx[0]][rx[1]]
+            config.max_para_size = max(config.max_para_size, sum(map(len, sents)))
+            config.max_num_sents = max(config.max_num_sents, len(sents))
+            config.max_sent_size = max(config.max_sent_size, max(map(len, sents)))
+            config.max_word_size = max(config.max_word_size, max(len(word) for sent in sents for word in sent))
+            if len(q) > 0:
+                config.max_ques_size = max(config.max_ques_size, len(q))
+                config.max_word_size = max(config.max_word_size, max(len(word) for word in q))
+
+    if config.mode == 'train':
+        config.max_num_sents = min(config.max_num_sents, config.num_sents_th)
+        config.max_sent_size = min(config.max_sent_size, config.sent_size_th)
+        config.max_para_size = min(config.max_para_size, config.para_size_th)
+
+    config.max_word_size = min(config.max_word_size, config.word_size_th)
+
+    config.char_vocab_size = len(data_sets[0].shared['char2idx'])
+    config.word_emb_size = len(next(iter(data_sets[0].shared['word2vec'].values())))
+    config.word_vocab_size = len(data_sets[0].shared['word2idx'])
+
+    if config.single:
+        config.max_num_sents = 1
+    if config.squash:
+        config.max_sent_size = config.max_para_size
+        config.max_num_sents = 1
--- a/tensorflow/SQuAD/basic/run_ensemble.sh
+++ b/tensorflow/SQuAD/basic/run_ensemble.sh
@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+source_path=$1
+target_path=$2
+inter_dir="inter_ensemble"
+root_dir="save"
+
+parg=""
+marg=""
+if [ "$3" = "debug" ]
+then
+    parg="-d"
+    marg="--debug"
+fi
+
+# Preprocess data
+python3 -m squad.prepro --mode single --single_path $source_path $parg --target_dir $inter_dir --glove_dir .
+
+eargs=""
+for num in 31 33 34 35 36 37 40 41 43 44 45 46; do
+    load_path="$root_dir/$num/save"
+    shared_path="$root_dir/$num/shared.json"
+    eval_path="$inter_dir/eval-$num.pklz"
+    eargs="$eargs $eval_path"
+    python3 -m basic.cli --data_dir $inter_dir --eval_path $eval_path --nodump_answer --load_path $load_path --shared_path $shared_path $marg --eval_num_batches 0 --mode forward --batch_size 1 --len_opt --cluster --cpu_opt --load_ema &
+done
+wait
+
+# Ensemble
+python3 -m basic.ensemble --data_path $inter_dir/data_single.json --shared_path $inter_dir/shared_single.json -o $target_path $eargs
--- a/tensorflow/SQuAD/basic/run_single.sh
+++ b/tensorflow/SQuAD/basic/run_single.sh
@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+source_path=$1
+target_path=$2
+inter_dir="inter_single"
+root_dir="save"
+
+parg=""
+marg=""
+if [ "$3" = "debug" ]
+then
+    parg="-d"
+    marg="--debug"
+fi
+
+# Preprocess data
+python3 -m squad.prepro --mode single --single_path $source_path $parg --target_dir $inter_dir --glove_dir .
+
+num=37
+load_path="$root_dir/$num/save"
+shared_path="$root_dir/$num/shared.json"
+eval_path="$inter_dir/eval.pklz"
+python3 -m basic.cli --data_dir $inter_dir --eval_path $eval_path --nodump_answer --load_path $load_path --shared_path $shared_path $marg --eval_num_batches 0 --mode forward --batch_size 1 --len_opt --cluster --cpu_opt --load_ema
+
+# Ensemble (for single run, just one input)
+python3 -m basic.ensemble --data_path $inter_dir/data_single.json --shared_path $inter_dir/shared_single.json -o $target_path $eval_path
+
+
--- a/tensorflow/SQuAD/basic/templates/visualizer.html
+++ b/tensorflow/SQuAD/basic/templates/visualizer.html
@ -0,0 +1,76 @@
+<!DOCTYPE html>
+{# Jinja2 template: one table row per example in `rows`, with per-word cells
+   coloured by the value stored in their `color` attribute. #}
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>{{ title }}</title>
+    <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.0/jquery.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/chroma-js/1.1.1/chroma.min.js"></script>
+    <script>
+        // On load, turn each .att cell's numeric `color` attribute into a
+        // background colour on a white-to-red scale.
+        $(document).ready(function(){
+            $(".att").each(function() {
+                // var val = parseFloat($(this).text());
+                var val = parseFloat($(this).attr("color"));
+                var scale = chroma.scale(['white', 'red']);
+                var color = scale(val).hex();
+                $(this).attr("bgcolor", color);
+            });
+        })
+    </script>
+</head>
+<style>
+    table, th, td {border: 1px solid black}
+</style>
+<body>
+    <h2>{{ title }}</h2>
+    <table>
+        <tr>
+            <th>ID</th>
+            <th>Question</th>
+            <th>Answers</th>
+            <th>Predicted</th>
+            <th>Score</th>
+            <th>Paragraph</th>
+        </tr>
+        {% for row in rows %}
+            <tr>
+                <td>{{ row.id }}</td>
+                <td>
+                    {% for qj in row.ques %}
+                        {{ qj }}
+                    {% endfor %}
+                </td>
+                <td>
+                    {% for aa in row.a %}
+                        <li>{{ aa }}</li>
+                    {% endfor %}
+                </td>
+                <td>{{ row.ap }}</td>
+                <td>{{ row.score }}</td>
+                <td>
+                    {# Nested table: per sentence, one row of words coloured by
+                       `yp` and one row of "-" cells coloured by `yp2`. #}
+                    <table>
+                    {% for xj, ypj, yp2j in zip(row.para, row.yp, row.yp2) %}
+                        <tr>
+                        {# Keep the sentence loop reachable inside the word loop. #}
+                        {% set rowloop = loop %}
+                        {% for xjk, ypjk in zip(xj, ypj) %}
+                            <td class="att" color="{{ ypjk }}">
+                            {# Words matching the condition on row.y are shown in bold. #}
+                            {% if row.y[0][0] == rowloop.index0 and row.y[0][1] <= loop.index0 <= row.y[1][1] %}
+                                <b>{{ xjk }}</b>
+                            {% else %}
+                                {{ xjk }}
+                            {% endif %}
+                            </td>
+                        {% endfor %}
+                        </tr>
+                        <tr>
+                        {% for xjk, yp2jk in zip(xj, yp2j) %}
+                            <td class="att" color="{{ yp2jk }}">-</td>
+                        {% endfor %}
+                        </tr>
+                    {% endfor %}
+                    </table>
+                </td>
+            </tr>
+        {% endfor %}
+    </table>
+</body>
+</html>
--- a/tensorflow/SQuAD/basic/trainer.py
+++ b/tensorflow/SQuAD/basic/trainer.py
@ -0,0 +1,73 @@
+import tensorflow as tf
+
+from basic.model import Model
+from my.tensorflow import average_gradients
+
+
+class Trainer(object):
+    def __init__(self, config, model):
+        assert isinstance(model, Model)
+        self.config = config
+        self.model = model
+        self.opt = tf.train.AdamOptimizer(config.init_lr)
+        self.loss = model.get_loss()
+        self.var_list = model.get_var_list()
+        self.global_step = model.get_global_step()
+        self.summary = model.summary
+        self.grads = self.opt.compute_gradients(self.loss, var_list=self.var_list)
+        self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
+
+    def get_train_op(self):
+        return self.train_op
+
+    def step(self, sess, batch, get_summary=False):
+        assert isinstance(sess, tf.Session)
+        _, ds = batch
+        feed_dict = self.model.get_feed_dict(ds, True)
+        if get_summary:
+            loss, summary, train_op = \
+                sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict)
+        else:
+            loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
+            summary = None
+        return loss, summary, train_op
+
+
+class MultiGPUTrainer(object):
+    def __init__(self, config, models):
+        model = models[0]
+        assert isinstance(model, Model)
+        self.config = config
+        self.model = model
+        self.opt = tf.train.AdamOptimizer(config.init_lr)
+        self.var_list = model.get_var_list()
+        self.global_step = model.get_global_step()
+        self.summary = model.summary
+        self.models = models
+        losses = []
+        grads_list = []
+        for gpu_idx, model in enumerate(models):
+            with tf.name_scope("grads_{}".format(gpu_idx)), tf.device("/{}:{}".format(config.device_type, gpu_idx)):
+                loss = model.get_loss()
+                grads = self.opt.compute_gradients(loss, var_list=self.var_list)
+                losses.append(loss)
+                grads_list.append(grads)
+
+        self.loss = tf.add_n(losses)/len(losses)
+        self.grads = average_gradients(grads_list)
+        self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
+
+    def step(self, sess, batches, get_summary=False):
+        assert isinstance(sess, tf.Session)
+        feed_dict = {}
+        for batch, model in zip(batches, self.models):
+            _, ds = batch
+            feed_dict.update(model.get_feed_dict(ds, True))
+
+        if get_summary:
+            loss, summary, train_op = \
+                sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict)
+        else:
+            loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
+            summary = None
+        return loss, summary, train_op
--- a/tensorflow/SQuAD/basic/visualizer.py
+++ b/tensorflow/SQuAD/basic/visualizer.py
@ -0,0 +1,140 @@
+import shutil
+from collections import OrderedDict
+import http.server
+import socketserver
+import argparse
+import json
+import os
+import numpy as np
+from tqdm import tqdm
+import pickle
+import gzip
+
+from jinja2 import Environment, FileSystemLoader
+
+from squad.utils import get_best_span, get_best_span_wy
+
+
+def bool_(string):
+    if string == 'True':
+        return True
+    elif string == 'False':
+        return False
+    else:
+        raise Exception()
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_name", type=str, default='basic')
+    parser.add_argument("--data_type", type=str, default='dev')
+    parser.add_argument("--step", type=int, default=5000)
+    parser.add_argument("--template_name", type=str, default="visualizer.html")
+    parser.add_argument("--num_per_page", type=int, default=100)
+    parser.add_argument("--data_dir", type=str, default="data/squad")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--host", type=str, default="0.0.0.0")
+    parser.add_argument("--open", type=str, default='False')
+    parser.add_argument("--run_id", type=str, default="0")
+    parser.add_argument("-w", "--wy", action='store_true')
+
+    args = parser.parse_args()
+    return args
+
+
+def _decode(decoder, sent):
+    return " ".join(decoder[idx] for idx in sent)
+
+
+def accuracy2_visualizer(args):
+    model_name = args.model_name
+    data_type = args.data_type
+    num_per_page = args.num_per_page
+    data_dir = args.data_dir
+    run_id = args.run_id.zfill(2)
+    step = args.step
+
+    eval_path =os.path.join("out", model_name, run_id, "eval", "{}-{}.pklz".format(data_type, str(step).zfill(6)))
+    print("loading {}".format(eval_path))
+    eval_ = pickle.load(gzip.open(eval_path, 'r'))
+
+    _id = 0
+    html_dir = "/tmp/list_results%d" % _id
+    while os.path.exists(html_dir):
+        _id += 1
+        html_dir = "/tmp/list_results%d" % _id
+
+    if os.path.exists(html_dir):
+        shutil.rmtree(html_dir)
+    os.mkdir(html_dir)
+
+    cur_dir = os.path.dirname(os.path.realpath(__file__))
+    templates_dir = os.path.join(cur_dir, 'templates')
+    env = Environment(loader=FileSystemLoader(templates_dir))
+    env.globals.update(zip=zip, reversed=reversed)
+    template = env.get_template(args.template_name)
+
+    data_path = os.path.join(data_dir, "data_{}.json".format(data_type))
+    shared_path = os.path.join(data_dir, "shared_{}.json".format(data_type))
+    print("loading {}".format(data_path))
+    data = json.load(open(data_path, 'r'))
+    print("loading {}".format(shared_path))
+    shared = json.load(open(shared_path, 'r'))
+
+    rows = []
+    for i, (idx, yi, ypi, yp2i, wypi) in tqdm(enumerate(zip(*[eval_[key] for key in ('idxs', 'y', 'yp', 'yp2', 'wyp')])), total=len(eval_['idxs'])):
+        id_, q, rx, answers = (data[key][idx] for key in ('ids', 'q', '*x', 'answerss'))
+        x = shared['x'][rx[0]][rx[1]]
+        ques = [" ".join(q)]
+        para = [[word for word in sent] for sent in x]
+        span, score = get_best_span_wy(wypi, 0.5) if args.wy else get_best_span(ypi, yp2i)
+        ap = get_segment(para, span)
+        # score = "{:.3f}".format(ypi[span[0][0]][span[0][1]] * yp2i[span[1][0]][span[1][1]-1])
+
+        row = {
+            'id': id_,
+            'title': "Hello world!",
+            'ques': ques,
+            'para': para,
+            'y': yi[0][0],
+            'y2': yi[0][1],
+            'yp': wypi if args.wy else ypi,
+            'yp2': wypi if args.wy else yp2i,
+            'a': answers,
+            'ap': ap,
+            'score': score
+               }
+        rows.append(row)
+
+        if i % num_per_page == 0:
+            html_path = os.path.join(html_dir, "%s.html" % str(i).zfill(8))
+
+        if (i + 1) % num_per_page == 0 or (i + 1) == len(eval_['y']):
+            var_dict = {'title': "Accuracy Visualization",
+                        'rows': rows
+                        }
+            with open(html_path, "wb") as f:
+                f.write(template.render(**var_dict).encode('UTF-8'))
+            rows = []
+
+    os.chdir(html_dir)
+    port = args.port
+    host = args.host
+    # Overriding to suppress log message
+    class MyHandler(http.server.SimpleHTTPRequestHandler):
+        def log_message(self, format, *args):
+            pass
+    handler = MyHandler
+    httpd = socketserver.TCPServer((host, port), handler)
+    if args.open == 'True':
+        os.system("open http://%s:%d" % (args.host, args.port))
+    print("serving at %s:%d" % (host, port))
+    httpd.serve_forever()
+
+
+def get_segment(para, span):
+    return " ".join(para[span[0][0]][span[0][1]:span[1][1]])
+
+
+# Script entry point: parse the command line, then build and serve the pages.
+if __name__ == "__main__":
+    ARGS = get_args()
+    accuracy2_visualizer(ARGS)
--- a/tensorflow/SQuAD/basic_cnn/init.py
+++ b/tensorflow/SQuAD/basic_cnn/init.py
--- a/tensorflow/SQuAD/basic_cnn/cli.py
+++ b/tensorflow/SQuAD/basic_cnn/cli.py
@ -0,0 +1,103 @@
+import os
+
+import tensorflow as tf
+
+from basic_cnn.main import main as m
+
+flags = tf.app.flags
+
+flags.DEFINE_string("model_name", "basic_cnn", "Model name [basic]")
+flags.DEFINE_string("data_dir", "data/cnn", "Data dir [data/cnn]")
+flags.DEFINE_string("root_dir", "/Users/minjoons/data/cnn/questions", "root dir [~/data/cnn/questions]")
+flags.DEFINE_string("run_id", "0", "Run ID [0]")
+flags.DEFINE_string("out_base_dir", "out", "out base dir [out]")
+
+flags.DEFINE_integer("batch_size", 60, "Batch size [60]")
+flags.DEFINE_float("init_lr", 0.5, "Initial learning rate [0.5]")
+flags.DEFINE_integer("num_epochs", 50, "Total number of epochs for training [50]")
+flags.DEFINE_integer("num_steps", 20000, "Number of steps [20000]")
+flags.DEFINE_integer("eval_num_batches", 100, "eval num batches [100]")
+flags.DEFINE_integer("load_step", 0, "load step [0]")
+flags.DEFINE_integer("early_stop", 4, "early stop [4]")
+
+flags.DEFINE_string("mode", "test", "train | dev | test | forward [test]")
+flags.DEFINE_boolean("load", True, "load saved data? [True]")
+flags.DEFINE_boolean("progress", True, "Show progress? [True]")
+flags.DEFINE_integer("log_period", 100, "Log period [100]")
+flags.DEFINE_integer("eval_period", 1000, "Eval period [1000]")
+flags.DEFINE_integer("save_period", 1000, "Save Period [1000]")
+flags.DEFINE_float("decay", 0.9, "Exponential moving average decay [0.9]")
+
+flags.DEFINE_boolean("draft", False, "Draft for quick testing? [False]")
+
+flags.DEFINE_integer("hidden_size", 100, "Hidden size [100]")
+flags.DEFINE_integer("char_out_size", 100, "Char out size [100]")
+flags.DEFINE_float("input_keep_prob", 0.8, "Input keep prob [0.8]")
+flags.DEFINE_integer("char_emb_size", 8, "Char emb size [8]")
+flags.DEFINE_integer("char_filter_height", 5, "Char filter height [5]")
+flags.DEFINE_float("wd", 0.0, "Weight decay [0.0]")
+flags.DEFINE_bool("lower_word", True, "lower word [True]")
+flags.DEFINE_bool("dump_eval", False, "dump eval? [True]")
+flags.DEFINE_bool("dump_answer", True, "dump answer? [True]")
+flags.DEFINE_string("model", "2", "config 1 |2 [2]")
+flags.DEFINE_bool("squash", False, "squash the sentences into one? [False]")
+flags.DEFINE_bool("single", False, "supervise only the answer sentence? [False]")
+
+flags.DEFINE_integer("word_count_th", 10, "word count th [100]")
+flags.DEFINE_integer("char_count_th", 50, "char count th [500]")
+flags.DEFINE_integer("sent_size_th", 60, "sent size th [64]")
+flags.DEFINE_integer("num_sents_th", 200, "num sents th [8]")
+flags.DEFINE_integer("ques_size_th", 30, "ques size th [32]")
+flags.DEFINE_integer("word_size_th", 16, "word size th [16]")
+flags.DEFINE_integer("para_size_th", 256, "para size th [256]")
+
+flags.DEFINE_bool("swap_memory", True, "swap memory? [True]")
+flags.DEFINE_string("data_filter", "max", "max | valid | semi [max]")
+flags.DEFINE_bool("finetune", False, "finetune? [False]")
+flags.DEFINE_bool("feed_gt", False, "feed gt prev token during training [False]")
+flags.DEFINE_bool("feed_hard", False, "feed hard argmax prev token during testing [False]")
+flags.DEFINE_bool("use_glove_for_unk", True, "use glove for unk [False]")
+flags.DEFINE_bool("known_if_glove", True, "consider as known if present in glove [False]")
+flags.DEFINE_bool("eval", True, "eval? [True]")
+flags.DEFINE_integer("highway_num_layers", 2, "highway num layers [2]")
+flags.DEFINE_bool("use_word_emb", True, "use word embedding? [True]")
+
+flags.DEFINE_string("forward_name", "single", "Forward name [single]")
+flags.DEFINE_string("answer_path", "", "Answer path []")
+flags.DEFINE_string("load_path", "", "Load path []")
+flags.DEFINE_string("shared_path", "", "Shared path []")
+flags.DEFINE_string("device", "/cpu:0", "default device [/cpu:0]")
+flags.DEFINE_integer("num_gpus", 1, "num of gpus [1]")
+
+flags.DEFINE_string("out_channel_dims", "100", "Out channel dims, separated by commas [100]")
+flags.DEFINE_string("filter_heights", "5", "Filter heights, separated by commas [5]")
+
+flags.DEFINE_bool("share_cnn_weights", True, "Share CNN weights [False]")
+flags.DEFINE_bool("share_lstm_weights", True, "Share LSTM weights [True]")
+flags.DEFINE_bool("two_prepro_layers", False, "Use two layers for preprocessing? [False]")
+flags.DEFINE_bool("aug_att", False, "Augment attention layers with more features? [False]")
+flags.DEFINE_integer("max_to_keep", 20, "Max recent saves to keep [20]")
+flags.DEFINE_bool("vis", False, "output visualization numbers? [False]")
+flags.DEFINE_bool("dump_pickle", True, "Dump pickle instead of json? [True]")
+flags.DEFINE_float("keep_prob", 1.0, "keep prob [1.0]")
+flags.DEFINE_string("prev_mode", "a", "prev mode gy | y | a [a]")
+flags.DEFINE_string("logit_func", "tri_linear", "logit func [tri_linear]")
+flags.DEFINE_bool("sh", False, "use superhighway [False]")
+flags.DEFINE_string("answer_func", "linear", "answer logit func [linear]")
+flags.DEFINE_bool("cluster", False, "Cluster data for faster training [False]")
+flags.DEFINE_bool("len_opt", False, "Length optimization? [False]")
+flags.DEFINE_string("sh_logit_func", "tri_linear", "sh logit func [tri_linear]")
+flags.DEFINE_float("filter_ratio", 1.0, "filter ratio [1.0]")
+flags.DEFINE_bool("bi", False, "bi-directional attention? [False]")
+flags.DEFINE_integer("width", 5, "width around entity [5]")
+
+
+def main(_):
+    config = flags.FLAGS
+
+    config.out_dir = os.path.join(config.out_base_dir, config.model_name, str(config.run_id).zfill(2))
+
+    m(config)
+
+# Script entry point: tf.app.run() parses the flags and then calls main().
+if __name__ == "__main__":
+    tf.app.run()
--- a/tensorflow/SQuAD/basic_cnn/evaluator.py
+++ b/tensorflow/SQuAD/basic_cnn/evaluator.py
@ -0,0 +1,494 @@
+import itertools
+from collections import defaultdict
+
+import numpy as np
+import tensorflow as tf
+import os
+
+from basic_cnn.read_data import DataSet
+from my.nltk_utils import span_f1
+from my.tensorflow import padded_reshape
+from my.utils import argmax
+
+
+class Evaluation(object):
+    def __init__(self, data_type, global_step, idxs, yp, tensor_dict=None):
+        self.data_type = data_type
+        self.global_step = global_step
+        self.idxs = idxs
+        self.yp = yp
+        self.num_examples = len(yp)
+        self.tensor_dict = None
+        self.dict = {'data_type': data_type,
+                     'global_step': global_step,
+                     'yp': yp,
+                     'idxs': idxs,
+                     'num_examples': self.num_examples}
+        if tensor_dict is not None:
+            self.tensor_dict = {key: val.tolist() for key, val in tensor_dict.items()}
+            for key, val in self.tensor_dict.items():
+                self.dict[key] = val
+        self.summaries = None
+
+    def __repr__(self):
+        return "{} step {}".format(self.data_type, self.global_step)
+
+    def __add__(self, other):
+        if other == 0:
+            return self
+        assert self.data_type == other.data_type
+        assert self.global_step == other.global_step
+        new_yp = self.yp + other.yp
+        new_idxs = self.idxs + other.idxs
+        new_tensor_dict = None
+        if self.tensor_dict is not None:
+            new_tensor_dict = {key: val + other.tensor_dict[key] for key, val in self.tensor_dict.items()}
+        return Evaluation(self.data_type, self.global_step, new_idxs, new_yp, tensor_dict=new_tensor_dict)
+
+    def __radd__(self, other):
+        return self.__add__(other)
+
+
+class LabeledEvaluation(Evaluation):
+    def __init__(self, data_type, global_step, idxs, yp, y, id2answer_dict, tensor_dict=None):
+        super(LabeledEvaluation, self).__init__(data_type, global_step, idxs, yp, tensor_dict=tensor_dict)
+        self.y = y
+        self.dict['y'] = y
+        self.id2answer_dict = id2answer_dict
+
+    def __add__(self, other):
+        if other == 0:
+            return self
+        assert self.data_type == other.data_type
+        assert self.global_step == other.global_step
+        new_yp = self.yp + other.yp
+        new_y = self.y + other.y
+        new_idxs = self.idxs + other.idxs
+        new_id2answer_dict = dict(list(self.id2answer_dict.items()) + list(other.id2answer_dict.items()))
+        new_id2score_dict = dict(list(self.id2answer_dict['scores'].items()) + list(other.id2answer_dict['scores'].items()))
+        new_id2answer_dict['scores'] = new_id2score_dict
+        if self.tensor_dict is not None:
+            new_tensor_dict = {key: np.concatenate((val, other.tensor_dict[key]), axis=0) for key, val in self.tensor_dict.items()}
+        return LabeledEvaluation(self.data_type, self.global_step, new_idxs, new_yp, new_y, new_id2answer_dict, tensor_dict=new_tensor_dict)
+
+
+class AccuracyEvaluation(LabeledEvaluation):
+    def __init__(self, data_type, global_step, idxs, yp, y, id2answer_dict, correct, loss, tensor_dict=None):
+        super(AccuracyEvaluation, self).__init__(data_type, global_step, idxs, yp, y, id2answer_dict, tensor_dict=tensor_dict)
+        self.loss = loss
+        self.correct = correct
+        self.id2answer_dict = id2answer_dict
+        self.acc = sum(correct) / len(correct)
+        self.dict['loss'] = loss
+        self.dict['correct'] = correct
+        self.dict['acc'] = self.acc
+        loss_summary = tf.Summary(value=[tf.Summary.Value(tag='{}/loss'.format(data_type), simple_value=self.loss)])
+        acc_summary = tf.Summary(value=[tf.Summary.Value(tag='{}/acc'.format(data_type), simple_value=self.acc)])
+        self.summaries = [loss_summary, acc_summary]
+
+    def __repr__(self):
+        return "{} step {}: accuracy={}={}/{}, loss={}".format(self.data_type, self.global_step, self.acc,
+                                                               sum(self.correct), self.num_examples, self.loss)
+
+    def __add__(self, other):
+        if other == 0:
+            return self
+        assert self.data_type == other.data_type
+        assert self.global_step == other.global_step
+        new_idxs = self.idxs + other.idxs
+        new_yp = self.yp + other.yp
+        new_y = self.y + other.y
+        new_correct = self.correct + other.correct
+        new_loss = (self.loss * self.num_examples + other.loss * other.num_examples) / len(new_correct)
+        new_id2answer_dict = dict(list(self.id2answer_dict.items()) + list(other.id2answer_dict.items()))
+        new_id2score_dict = dict(list(self.id2answer_dict['scores'].items()) + list(other.id2answer_dict['scores'].items()))
+        new_id2answer_dict['scores'] = new_id2score_dict
+        new_tensor_dict = None
+        if self.tensor_dict is not None:
+            new_tensor_dict = {key: np.concatenate((val, other.tensor_dict[key]), axis=0) for key, val in self.tensor_dict.items()}
+        return AccuracyEvaluation(self.data_type, self.global_step, new_idxs, new_yp, new_y, new_id2answer_dict, new_correct, new_loss, tensor_dict=new_tensor_dict)
+
+
+class Evaluator(object):
+    def __init__(self, config, model, tensor_dict=None):
+        self.config = config
+        self.model = model
+        self.global_step = model.global_step
+        self.yp = model.yp
+        self.tensor_dict = {} if tensor_dict is None else tensor_dict
+
+    def get_evaluation(self, sess, batch):
+        idxs, data_set = batch
+        feed_dict = self.model.get_feed_dict(data_set, False, supervised=False)
+        global_step, yp, vals = sess.run([self.global_step, self.yp, list(self.tensor_dict.values())], feed_dict=feed_dict)
+        yp = yp[:data_set.num_examples]
+        tensor_dict = dict(zip(self.tensor_dict.keys(), vals))
+        e = Evaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), tensor_dict=tensor_dict)
+        return e
+
+    def get_evaluation_from_batches(self, sess, batches):
+        e = sum(self.get_evaluation(sess, batch) for batch in batches)
+        return e
+
+
+class LabeledEvaluator(Evaluator):
+    def __init__(self, config, model, tensor_dict=None):
+        super(LabeledEvaluator, self).__init__(config, model, tensor_dict=tensor_dict)
+        self.y = model.y
+
+    def get_evaluation(self, sess, batch):
+        idxs, data_set = batch
+        feed_dict = self.model.get_feed_dict(data_set, False, supervised=False)
+        global_step, yp, vals = sess.run([self.global_step, self.yp, list(self.tensor_dict.values())], feed_dict=feed_dict)
+        yp = yp[:data_set.num_examples]
+        y = feed_dict[self.y]
+        tensor_dict = dict(zip(self.tensor_dict.keys(), vals))
+        e = LabeledEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), y.tolist(), tensor_dict=tensor_dict)
+        return e
+
+
+class AccuracyEvaluator(LabeledEvaluator):
+    def __init__(self, config, model, tensor_dict=None):
+        super(AccuracyEvaluator, self).__init__(config, model, tensor_dict=tensor_dict)
+        self.loss = model.loss
+
+    def get_evaluation(self, sess, batch):
+        idxs, data_set = self._split_batch(batch)
+        assert isinstance(data_set, DataSet)
+        feed_dict = self._get_feed_dict(batch)
+        y = data_set.data['y']
+        global_step, yp, loss, vals = sess.run([self.global_step, self.yp, self.loss, list(self.tensor_dict.values())], feed_dict=feed_dict)
+        yp = yp[:data_set.num_examples]
+        correct, probs, preds = zip(*[self.__class__.compare(data_set.get_one(idx), ypi) for idx, ypi in zip(data_set.valid_idxs, yp)])
+        tensor_dict = dict(zip(self.tensor_dict.keys(), vals))
+        ids = data_set.data['ids']
+        id2score_dict = {id_: prob for id_, prob in zip(ids, probs)}
+        id2answer_dict = {id_: pred for id_, pred in zip(ids, preds)}
+        id2answer_dict['scores'] = id2score_dict
+        e = AccuracyEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), y, id2answer_dict, correct, float(loss), tensor_dict=tensor_dict)
+        return e
+
+    @staticmethod
+    def compare(data, ypi):
+        prob = float(np.max(ypi))
+        yi = data['y']
+        for start, stop in yi:
+            if start == int(np.argmax(ypi)):
+                return True, prob, " "
+        return False, prob, " "
+
+    def _split_batch(self, batch):
+        return batch
+
+    def _get_feed_dict(self, batch):
+        return self.model.get_feed_dict(batch[1], False)
+
+
+class CNNAccuracyEvaluator(AccuracyEvaluator):
+    @staticmethod
+    def compare(data, ypi):
+        # ypi: [N, M, JX] numbers
+        yi = data['y'][0]  # entity
+        xi = data['x'][0]  # [N, M, JX] words
+        dist = defaultdict(int)
+        for ypij, xij in zip(ypi, xi):
+            for ypijk, xijk in zip(ypij, xij):
+                if xijk.startswith("@"):
+                    dist[xijk] += ypijk
+        pred, prob = max(dist.items(), key=lambda item: item[1])
+        assert pred.startswith("@")
+        assert yi.startswith("@")
+        return pred == yi, prob, pred
+
+
+class AccuracyEvaluator2(AccuracyEvaluator):
+    @staticmethod
+    def compare(yi, ypi):
+        for start, stop in yi:
+            para_start = int(np.argmax(np.max(ypi, 1)))
+            sent_start = int(np.argmax(ypi[para_start]))
+            if tuple(start) == (para_start, sent_start):
+                return True
+        return False
+
+
+class ForwardEvaluation(Evaluation):
+    def __init__(self, data_type, global_step, idxs, yp, yp2, loss, id2answer_dict, tensor_dict=None):
+        super(ForwardEvaluation, self).__init__(data_type, global_step, idxs, yp, tensor_dict=tensor_dict)
+        self.yp2 = yp2
+        self.loss = loss
+        self.dict['loss'] = loss
+        self.dict['yp2'] = yp2
+        self.id2answer_dict = id2answer_dict
+
+    def __add__(self, other):
+        if other == 0:
+            return self
+        assert self.data_type == other.data_type
+        assert self.global_step == other.global_step
+        new_idxs = self.idxs + other.idxs
+        new_yp = self.yp + other.yp
+        new_yp2 = self.yp2 + other.yp2
+        new_loss = (self.loss * self.num_examples + other.loss * other.num_examples) / len(new_yp)
+        new_id2answer_dict = dict(list(self.id2answer_dict.items()) + list(other.id2answer_dict.items()))
+        if self.tensor_dict is not None:
+            new_tensor_dict = {key: np.concatenate((val, other.tensor_dict[key]), axis=0) for key, val in self.tensor_dict.items()}
+        return ForwardEvaluation(self.data_type, self.global_step, new_idxs, new_yp, new_yp2, new_loss, new_id2answer_dict, tensor_dict=new_tensor_dict)
+
+    def __repr__(self):
+        return "{} step {}: loss={:.4f}".format(self.data_type, self.global_step, self.loss)
+
+
+class F1Evaluation(AccuracyEvaluation):
+    def __init__(self, data_type, global_step, idxs, yp, yp2, y, correct, loss, f1s, id2answer_dict, tensor_dict=None):
+        super(F1Evaluation, self).__init__(data_type, global_step, idxs, yp, y, correct, loss, tensor_dict=tensor_dict)
+        self.yp2 = yp2
+        self.f1s = f1s
+        self.f1 = float(np.mean(f1s))
+        self.dict['yp2'] = yp2
+        self.dict['f1s'] = f1s
+        self.dict['f1'] = self.f1
+        self.id2answer_dict = id2answer_dict
+        f1_summary = tf.Summary(value=[tf.Summary.Value(tag='{}/f1'.format(data_type), simple_value=self.f1)])
+        self.summaries.append(f1_summary)
+
+    def __add__(self, other):
+        if other == 0:
+            return self
+        assert self.data_type == other.data_type
+        assert self.global_step == other.global_step
+        new_idxs = self.idxs + other.idxs
+        new_yp = self.yp + other.yp
+        new_yp2 = self.yp2 + other.yp2
+        new_y = self.y + other.y
+        new_correct = self.correct + other.correct
+        new_f1s = self.f1s + other.f1s
+        new_loss = (self.loss * self.num_examples + other.loss * other.num_examples) / len(new_correct)
+        new_id2answer_dict = dict(list(self.id2answer_dict.items()) + list(other.id2answer_dict.items()))
+        return F1Evaluation(self.data_type, self.global_step, new_idxs, new_yp, new_yp2, new_y, new_correct, new_loss, new_f1s, new_id2answer_dict)
+
+    def __repr__(self):
+        return "{} step {}: accuracy={:.4f}, f1={:.4f}, loss={:.4f}".format(self.data_type, self.global_step, self.acc, self.f1, self.loss)
+
+
+class F1Evaluator(LabeledEvaluator):
+    def __init__(self, config, model, tensor_dict=None):
+        super(F1Evaluator, self).__init__(config, model, tensor_dict=tensor_dict)
+        self.yp2 = model.yp2
+        self.loss = model.loss
+
+    def get_evaluation(self, sess, batch):
+        idxs, data_set = self._split_batch(batch)
+        assert isinstance(data_set, DataSet)
+        feed_dict = self._get_feed_dict(batch)
+        global_step, yp, yp2, loss, vals = sess.run([self.global_step, self.yp, self.yp2, self.loss, list(self.tensor_dict.values())], feed_dict=feed_dict)
+        y = data_set.data['y']
+        if self.config.squash:
+            new_y = []
+            for xi, yi in zip(data_set.data['x'], y):
+                new_yi = []
+                for start, stop in yi:
+                    start_offset = sum(map(len, xi[:start[0]]))
+                    stop_offset = sum(map(len, xi[:stop[0]]))
+                    new_start = 0, start_offset + start[1]
+                    new_stop = 0, stop_offset + stop[1]
+                    new_yi.append((new_start, new_stop))
+                new_y.append(new_yi)
+            y = new_y
+        if self.config.single:
+            new_y = []
+            for yi in y:
+                new_yi = []
+                for start, stop in yi:
+                    new_start = 0, start[1]
+                    new_stop = 0, stop[1]
+                    new_yi.append((new_start, new_stop))
+                new_y.append(new_yi)
+            y = new_y
+
+        yp, yp2 = yp[:data_set.num_examples], yp2[:data_set.num_examples]
+        spans = [get_best_span(ypi, yp2i) for ypi, yp2i in zip(yp, yp2)]
+
+        def _get(xi, span):
+            if len(xi) <= span[0][0]:
+                return [""]
+            if len(xi[span[0][0]]) <= span[1][1]:
+                return [""]
+            return xi[span[0][0]][span[0][1]:span[1][1]]
+
+        id2answer_dict = {id_: " ".join(_get(xi, span))
+                          for id_, xi, span in zip(data_set.data['ids'], data_set.data['x'], spans)}
+        correct = [self.__class__.compare2(yi, span) for yi, span in zip(y, spans)]
+        f1s = [self.__class__.span_f1(yi, span) for yi, span in zip(y, spans)]
+        tensor_dict = dict(zip(self.tensor_dict.keys(), vals))
+        e = F1Evaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), yp2.tolist(), y,
+                         correct, float(loss), f1s, id2answer_dict, tensor_dict=tensor_dict)
+        return e
+
+    def _split_batch(self, batch):
+        return batch
+
+    def _get_feed_dict(self, batch):
+        return self.model.get_feed_dict(batch[1], False)
+
+    @staticmethod
+    def compare(yi, ypi, yp2i):
+        for start, stop in yi:
+            aypi = argmax(ypi)
+            mask = np.zeros(yp2i.shape)
+            mask[aypi[0], aypi[1]:] = np.ones([yp2i.shape[1] - aypi[1]])
+            if tuple(start) == aypi and (stop[0], stop[1]-1) == argmax(yp2i * mask):
+                return True
+        return False
+
+    @staticmethod
+    def compare2(yi, span):
+        for start, stop in yi:
+            if tuple(start) == span[0] and tuple(stop) == span[1]:
+                return True
+        return False
+
+    @staticmethod
+    def span_f1(yi, span):
+        max_f1 = 0
+        for start, stop in yi:
+            if start[0] == span[0][0]:
+                true_span = start[1], stop[1]
+                pred_span = span[0][1], span[1][1]
+                f1 = span_f1(true_span, pred_span)
+                max_f1 = max(f1, max_f1)
+        return max_f1
+
+
+class MultiGPUF1Evaluator(F1Evaluator):
+    def __init__(self, config, models, tensor_dict=None):
+        super(MultiGPUF1Evaluator, self).__init__(config, models[0], tensor_dict=tensor_dict)
+        self.models = models
+        with tf.name_scope("eval_concat"):
+            N, M, JX = config.batch_size, config.max_num_sents, config.max_sent_size
+            self.yp = tf.concat(axis=0, values=[padded_reshape(model.yp, [N, M, JX]) for model in models])
+            self.yp2 = tf.concat(axis=0, values=[padded_reshape(model.yp2, [N, M, JX]) for model in models])
+            self.loss = tf.add_n([model.loss for model in models])/len(models)
+
+    def _split_batch(self, batches):
+        idxs_list, data_sets = zip(*batches)
+        idxs = sum(idxs_list, ())
+        data_set = sum(data_sets, data_sets[0].get_empty())
+        return idxs, data_set
+
+    def _get_feed_dict(self, batches):
+        feed_dict = {}
+        for model, (_, data_set) in zip(self.models, batches):
+            feed_dict.update(model.get_feed_dict(data_set, False))
+        return feed_dict
+
+
+class MultiGPUCNNAccuracyEvaluator(CNNAccuracyEvaluator):
+    def __init__(self, config, models, tensor_dict=None):
+        super(MultiGPUCNNAccuracyEvaluator, self).__init__(config, models[0], tensor_dict=tensor_dict)
+        self.models = models
+        with tf.name_scope("eval_concat"):
+            N, M, JX = config.batch_size, config.max_num_sents, config.max_sent_size
+            self.yp = tf.concat(axis=0, values=[padded_reshape(model.yp, [N, M, JX]) for model in models])
+            self.loss = tf.add_n([model.loss for model in models])/len(models)
+
+    def _split_batch(self, batches):
+        idxs_list, data_sets = zip(*batches)
+        idxs = sum(idxs_list, ())
+        data_set = sum(data_sets, data_sets[0].get_empty())
+        return idxs, data_set
+
+    def _get_feed_dict(self, batches):
+        feed_dict = {}
+        for model, (_, data_set) in zip(self.models, batches):
+            feed_dict.update(model.get_feed_dict(data_set, False))
+        return feed_dict
+
+
+class ForwardEvaluator(Evaluator):
+    def __init__(self, config, model, tensor_dict=None):
+        super(ForwardEvaluator, self).__init__(config, model, tensor_dict=tensor_dict)
+        self.yp2 = model.yp2
+        self.loss = model.loss
+
+    def get_evaluation(self, sess, batch):
+        idxs, data_set = batch
+        assert isinstance(data_set, DataSet)
+        feed_dict = self.model.get_feed_dict(data_set, False)
+        global_step, yp, yp2, loss, vals = sess.run([self.global_step, self.yp, self.yp2, self.loss, list(self.tensor_dict.values())], feed_dict=feed_dict)
+
+        yp, yp2 = yp[:data_set.num_examples], yp2[:data_set.num_examples]
+        spans = [get_best_span(ypi, yp2i) for ypi, yp2i in zip(yp, yp2)]
+
+        def _get(xi, span):
+            if len(xi) <= span[0][0]:
+                return [""]
+            if len(xi[span[0][0]]) <= span[1][1]:
+                return [""]
+            return xi[span[0][0]][span[0][1]:span[1][1]]
+
+        id2answer_dict = {id_: " ".join(_get(xi, span))
+                          for id_, xi, span in zip(data_set.data['ids'], data_set.data['x'], spans)}
+        tensor_dict = dict(zip(self.tensor_dict.keys(), vals))
+        e = ForwardEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), yp2.tolist(), float(loss), id2answer_dict, tensor_dict=tensor_dict)
+        return e
+
+    @staticmethod
+    def compare(yi, ypi, yp2i):
+        for start, stop in yi:
+            aypi = argmax(ypi)
+            mask = np.zeros(yp2i.shape)
+            mask[aypi[0], aypi[1]:] = np.ones([yp2i.shape[1] - aypi[1]])
+            if tuple(start) == aypi and (stop[0], stop[1]-1) == argmax(yp2i * mask):
+                return True
+        return False
+
+    @staticmethod
+    def compare2(yi, span):
+        for start, stop in yi:
+            if tuple(start) == span[0] and tuple(stop) == span[1]:
+                return True
+        return False
+
+    @staticmethod
+    def span_f1(yi, span):
+        max_f1 = 0
+        for start, stop in yi:
+            if start[0] == span[0][0]:
+                true_span = start[1], stop[1]
+                pred_span = span[0][1], span[1][1]
+                f1 = span_f1(true_span, pred_span)
+                max_f1 = max(f1, max_f1)
+        return max_f1
+
+
+def get_best_span(ypi, yp2i):
+
+    max_val = 0
+    best_word_span = (0, 1)
+    best_sent_idx = 0
+    for f, (ypif, yp2if) in enumerate(zip(ypi, yp2i)):
+        argmax_j1 = 0
+        for j in range(len(ypif)):
+            val1 = ypif[argmax_j1]
+            if val1 < ypif[j]:
+                val1 = ypif[j]
+                argmax_j1 = j
+
+            val2 = yp2if[j]
+            if val1 * val2 > max_val:
+                best_word_span = (argmax_j1, j)
+                best_sent_idx = f
+                max_val = val1 * val2
+    return (best_sent_idx, best_word_span[0]), (best_sent_idx, best_word_span[1] + 1)
+
+
+def get_span_score_pairs(ypi, yp2i):
+    span_score_pairs = []
+    for f, (ypif, yp2if) in enumerate(zip(ypi, yp2i)):
+        for j in range(len(ypif)):
+            for k in range(j, len(yp2if)):
+                span = ((f, j), (f, k+1))
+                score = ypif[j] * yp2if[k]
+                span_score_pairs.append((span, score))
+    return span_score_pairs
--- a/tensorflow/SQuAD/basic_cnn/graph_handler.py
+++ b/tensorflow/SQuAD/basic_cnn/graph_handler.py
@ -0,0 +1,70 @@
+import gzip
+import json
+from json import encoder
+import os
+
+import tensorflow as tf
+
+from basic_cnn.evaluator import Evaluation, F1Evaluation
+from my.utils import short_floats
+
+import pickle
+
+
+class GraphHandler(object):
+    def __init__(self, config):
+        self.config = config
+        self.saver = tf.train.Saver(max_to_keep=config.max_to_keep)
+        self.writer = None
+        self.save_path = os.path.join(config.save_dir, config.model_name)
+
+    def initialize(self, sess):
+        if self.config.load:
+            self._load(sess)
+        else:
+            sess.run(tf.global_variables_initializer())
+
+        if self.config.mode == 'train':
+            self.writer = tf.summary.FileWriter(self.config.log_dir, graph=tf.get_default_graph())
+
+    def save(self, sess, global_step=None):
+        self.saver.save(sess, self.save_path, global_step=global_step)
+
+    def _load(self, sess):
+        config = self.config
+        if config.load_path:
+            save_path = config.load_path
+        elif config.load_step > 0:
+            save_path = os.path.join(config.save_dir, "{}-{}".format(config.model_name, config.load_step))
+        else:
+            save_dir = config.save_dir
+            checkpoint = tf.train.get_checkpoint_state(save_dir)
+            assert checkpoint is not None, "cannot load checkpoint at {}".format(save_dir)
+            save_path = checkpoint.model_checkpoint_path
+        print("Loading saved model from {}".format(save_path))
+        self.saver.restore(sess, save_path)
+
+    def add_summary(self, summary, global_step):
+        self.writer.add_summary(summary, global_step)
+
+    def add_summaries(self, summaries, global_step):
+        for summary in summaries:
+            self.add_summary(summary, global_step)
+
+    def dump_eval(self, e, precision=2, path=None):
+        assert isinstance(e, Evaluation)
+        if self.config.dump_pickle:
+            path = path or os.path.join(self.config.eval_dir, "{}-{}.pklz".format(e.data_type, str(e.global_step).zfill(6)))
+            with gzip.open(path, 'wb', compresslevel=3) as fh:
+                pickle.dump(e.dict, fh)
+        else:
+            path = path or os.path.join(self.config.eval_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6)))
+            with open(path, 'w') as fh:
+                json.dump(short_floats(e.dict, precision), fh)
+
+    def dump_answer(self, e, path=None):
+        assert isinstance(e, Evaluation)
+        path = path or os.path.join(self.config.answer_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6)))
+        with open(path, 'w') as fh:
+            json.dump(e.id2answer_dict, fh)
+
--- a/tensorflow/SQuAD/basic_cnn/main.py
+++ b/tensorflow/SQuAD/basic_cnn/main.py
@ -0,0 +1,238 @@
+import argparse
+import json
+import math
+import os
+import shutil
+from pprint import pprint
+
+import tensorflow as tf
+from tqdm import tqdm
+import numpy as np
+
+from basic_cnn.evaluator import F1Evaluator, Evaluator, ForwardEvaluator, MultiGPUF1Evaluator, CNNAccuracyEvaluator, \
+    MultiGPUCNNAccuracyEvaluator
+from basic_cnn.graph_handler import GraphHandler
+from basic_cnn.model import Model, get_multi_gpu_models
+from basic_cnn.trainer import Trainer, MultiGPUTrainer
+
+from basic_cnn.read_data import read_data, get_cnn_data_filter, update_config
+
+
+def main(config):
+    set_dirs(config)
+    with tf.device(config.device):
+        if config.mode == 'train':
+            _train(config)
+        elif config.mode == 'test' or config.mode == 'dev':
+            _test(config)
+        elif config.mode == 'forward':
+            _forward(config)
+        else:
+            raise ValueError("invalid value for 'mode': {}".format(config.mode))
+
+
+def _config_draft(config):
+    if config.draft:
+        config.num_steps = 2
+        config.eval_period = 1
+        config.log_period = 1
+        config.save_period = 1
+        config.eval_num_batches = 1
+
+
+def _train(config):
+    """Train the model, with periodic summary logging, checkpointing and evaluation.
+
+    Reads the train and dev data, builds the embedding matrix and the
+    per-GPU models, then runs the training loop.
+    """
+    # load_metadata(config, 'train')  # this updates the config file according to metadata file
+
+    data_filter = get_cnn_data_filter(config)
+    train_data = read_data(config, 'train', config.load, data_filter=data_filter)
+    dev_data = read_data(config, 'dev', True, data_filter=data_filter)
+    # test_data = read_data(config, 'test', True, data_filter=data_filter)
+    update_config(config, [train_data, dev_data])
+
+    _config_draft(config)
+
+    # Map vocabulary indices to their pretrained vectors (words without an index are dropped).
+    word2vec_dict = train_data.shared['lower_word2vec'] if config.lower_word else train_data.shared['word2vec']
+    word2idx_dict = train_data.shared['word2idx']
+    idx2vec_dict = {word2idx_dict[word]: vec for word, vec in word2vec_dict.items() if word in word2idx_dict}
+    print("{}/{} unique words have corresponding glove vectors.".format(len(idx2vec_dict), len(word2idx_dict)))
+    # Indices without a pretrained vector get a sample from a standard normal of size word_emb_size.
+    emb_mat = np.array([idx2vec_dict[idx] if idx in idx2vec_dict
+                        else np.random.multivariate_normal(np.zeros(config.word_emb_size), np.eye(config.word_emb_size))
+                        for idx in range(config.word_vocab_size)])
+    config.emb_mat = emb_mat
+
+    # construct model graph and variables (using default graph)
+    pprint(config.__flags, indent=2)
+    # model = Model(config)
+    models = get_multi_gpu_models(config)
+    model = models[0]
+    trainer = MultiGPUTrainer(config, models)
+    # Extra tensors are only fetched during evaluation when config.vis is set.
+    evaluator = MultiGPUCNNAccuracyEvaluator(config, models, tensor_dict=model.tensor_dict if config.vis else None)
+    graph_handler = GraphHandler(config)  # controls all tensors and variables in the graph, including loading /saving
+
+    # Variables
+    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
+    graph_handler.initialize(sess)
+
+    # begin training
+    print(train_data.num_examples)
+    # An explicit config.num_steps wins; otherwise run num_epochs passes over the training data.
+    num_steps = config.num_steps or int(math.ceil(train_data.num_examples / (config.batch_size * config.num_gpus))) * config.num_epochs
+    global_step = 0
+    for batches in tqdm(train_data.get_multi_batches(config.batch_size, config.num_gpus,
+                                                     num_steps=num_steps, shuffle=True, cluster=config.cluster), total=num_steps):
+        global_step = sess.run(model.global_step) + 1  # +1 because all calculations are done after step
+        get_summary = global_step % config.log_period == 0
+        loss, summary, train_op = trainer.step(sess, batches, get_summary=get_summary)
+        if get_summary:
+            graph_handler.add_summary(summary, global_step)
+
+        # occasional saving
+        if global_step % config.save_period == 0:
+            graph_handler.save(sess, global_step=global_step)
+
+        if not config.eval:
+            continue
+        # Occasional evaluation
+        if global_step % config.eval_period == 0:
+            # NOTE(review): this rebinds the outer `num_steps`. The training batch
+            # generator and the tqdm total above were already created with the
+            # earlier value, so it looks harmless, but a separate name would be clearer.
+            num_steps = math.ceil(dev_data.num_examples / (config.batch_size * config.num_gpus))
+            if 0 < config.eval_num_batches < num_steps:
+                num_steps = config.eval_num_batches
+            # The train split is evaluated for the same number of steps as the dev split.
+            e_train = evaluator.get_evaluation_from_batches(
+                sess, tqdm(train_data.get_multi_batches(config.batch_size, config.num_gpus, num_steps=num_steps), total=num_steps)
+            )
+            graph_handler.add_summaries(e_train.summaries, global_step)
+            e_dev = evaluator.get_evaluation_from_batches(
+                sess, tqdm(dev_data.get_multi_batches(config.batch_size, config.num_gpus, num_steps=num_steps), total=num_steps))
+            graph_handler.add_summaries(e_dev.summaries, global_step)
+
+            if config.dump_eval:
+                graph_handler.dump_eval(e_dev)
+            if config.dump_answer:
+                graph_handler.dump_answer(e_dev)
+    # Save the final state unless the last step already fell on a save boundary.
+    if global_step % config.save_period != 0:
+        graph_handler.save(sess, global_step=global_step)
+
+
+def _test(config):
+    """Evaluate a saved model on the split named by `config.mode` and dump the results.
+
+    Requires `config.load` to be set.
+    """
+    assert config.load
+    test_data = read_data(config, config.mode, True)
+    update_config(config, [test_data])
+
+    _config_draft(config)
+
+    if config.use_glove_for_unk:
+        # Build an additional embedding matrix for the words listed in 'new_word2idx'.
+        word2vec_dict = test_data.shared['lower_word2vec'] if config.lower_word else test_data.shared['word2vec']
+        new_word2idx_dict = test_data.shared['new_word2idx']
+        idx2vec_dict = {idx: word2vec_dict[word] for word, idx in new_word2idx_dict.items()}
+        # print("{}/{} unique words have corresponding glove vectors.".format(len(idx2vec_dict), len(word2idx_dict)))
+        new_emb_mat = np.array([idx2vec_dict[idx] for idx in range(len(idx2vec_dict))], dtype='float32')
+        config.new_emb_mat = new_emb_mat
+
+    pprint(config.__flags, indent=2)
+    models = get_multi_gpu_models(config)
+    evaluator = MultiGPUCNNAccuracyEvaluator(config, models, tensor_dict=models[0].tensor_dict if config.vis else None)
+    graph_handler = GraphHandler(config)  # controls all tensors and variables in the graph, including loading /saving
+
+    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
+    graph_handler.initialize(sess)
+    # One step consumes batch_size examples on each GPU; eval_num_batches can cap the step count.
+    num_steps = math.ceil(test_data.num_examples / (config.batch_size * config.num_gpus))
+    if 0 < config.eval_num_batches < num_steps:
+        num_steps = config.eval_num_batches
+
+    # Accumulate the per-step evaluations into a single one.
+    e = None
+    for multi_batch in tqdm(test_data.get_multi_batches(config.batch_size, config.num_gpus, num_steps=num_steps, cluster=config.cluster), total=num_steps):
+        ei = evaluator.get_evaluation(sess, multi_batch)
+        e = ei if e is None else e + ei
+        if config.vis:
+            # Also dump each step's evaluation to its own file, named after its first example index.
+            eval_subdir = os.path.join(config.eval_dir, "{}-{}".format(ei.data_type, str(ei.global_step).zfill(6)))
+            if not os.path.exists(eval_subdir):
+                os.mkdir(eval_subdir)
+            path = os.path.join(eval_subdir, str(ei.idxs[0]).zfill(8))
+            graph_handler.dump_eval(ei, path=path)
+
+    print(e)
+    if config.dump_answer:
+        print("dumping answer ...")
+        graph_handler.dump_answer(e)
+    if config.dump_eval:
+        print("dumping eval ...")
+        graph_handler.dump_eval(e)
+
+
+def _forward(config):
+    """Run a saved model forward over the `config.forward_name` data and dump the answers.
+
+    Requires `config.load` to be set. Only the first of the built models is used.
+    """
+    assert config.load
+    test_data = read_data(config, config.forward_name, True)
+    update_config(config, [test_data])
+
+    _config_draft(config)
+
+    if config.use_glove_for_unk:
+        # Build an additional embedding matrix for the words listed in 'new_word2idx'.
+        word2vec_dict = test_data.shared['lower_word2vec'] if config.lower_word else test_data.shared['word2vec']
+        new_word2idx_dict = test_data.shared['new_word2idx']
+        idx2vec_dict = {idx: word2vec_dict[word] for word, idx in new_word2idx_dict.items()}
+        # print("{}/{} unique words have corresponding glove vectors.".format(len(idx2vec_dict), len(word2idx_dict)))
+        new_emb_mat = np.array([idx2vec_dict[idx] for idx in range(len(idx2vec_dict))], dtype='float32')
+        config.new_emb_mat = new_emb_mat
+
+    pprint(config.__flags, indent=2)
+    models = get_multi_gpu_models(config)
+    model = models[0]
+    evaluator = ForwardEvaluator(config, model)
+    graph_handler = GraphHandler(config)  # controls all tensors and variables in the graph, including loading /saving
+
+    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
+    graph_handler.initialize(sess)
+
+    # Single-model batching here, so the batch count does not involve num_gpus.
+    num_batches = math.ceil(test_data.num_examples / config.batch_size)
+    if 0 < config.eval_num_batches < num_batches:
+        num_batches = config.eval_num_batches
+    e = evaluator.get_evaluation_from_batches(sess, tqdm(test_data.get_batches(config.batch_size, num_batches=num_batches), total=num_batches))
+    print(e)
+    if config.dump_answer:
+        print("dumping answer ...")
+        # The answers go to the explicitly configured answer_path.
+        graph_handler.dump_answer(e, path=config.answer_path)
+    if config.dump_eval:
+        print("dumping eval ...")
+        graph_handler.dump_eval(e)
+
+
+def set_dirs(config):
+    # create directories
+    if not config.load and os.path.exists(config.out_dir):
+        shutil.rmtree(config.out_dir)
+
+    config.save_dir = os.path.join(config.out_dir, "save")
+    config.log_dir = os.path.join(config.out_dir, "log")
+    config.eval_dir = os.path.join(config.out_dir, "eval")
+    config.answer_dir = os.path.join(config.out_dir, "answer")
+    if not os.path.exists(config.out_dir):
+        os.makedirs(config.out_dir)
+    if not os.path.exists(config.save_dir):
+        os.mkdir(config.save_dir)
+    if not os.path.exists(config.log_dir):
+        os.mkdir(config.log_dir)
+    if not os.path.exists(config.answer_dir):
+        os.mkdir(config.answer_dir)
+    if not os.path.exists(config.eval_dir):
+        os.mkdir(config.eval_dir)
+
+
+def _get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("config_path")
+    return parser.parse_args()
+
+
+class Config(object):
+    def __init__(self, **entries):
+        self.__dict__.update(entries)
+
+
+def _run():
+    args = _get_args()
+    with open(args.config_path, 'r') as fh:
+        config = Config(**json.load(fh))
+        main(config)
+
+
+if __name__ == "__main__":
+    _run()
--- a/tensorflow/SQuAD/basic_cnn/model.py
+++ b/tensorflow/SQuAD/basic_cnn/model.py
@ -0,0 +1,375 @@
+import random
+
+import itertools
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.ops.rnn_cell import BasicLSTMCell, GRUCell
+
+from basic_cnn.read_data import DataSet
+from basic_cnn.superhighway import SHCell
+from my.tensorflow import exp_mask, get_initializer, VERY_SMALL_NUMBER
+from my.tensorflow.nn import linear, double_linear_logits, linear_logits, softsel, dropout, get_logits, softmax, \
+    highway_network, multi_conv1d
+from my.tensorflow.rnn import bidirectional_dynamic_rnn, dynamic_rnn
+from my.tensorflow.rnn_cell import SwitchableDropoutWrapper, AttentionCell
+
+
+def bi_attention(config, is_train, h, u, h_mask=None, u_mask=None, scope=None, tensor_dict=None):
+    """
+    h_a:
+    all u attending on h
+    choosing an element of h that max-matches u
+    First creates confusion matrix between h and u
+    Then take max of the attention weights over u row
+    Finally softmax over
+
+    u_a:
+    each h attending on u
+
+    :param h: [N, M, JX, d]
+    :param u: [N, JQ, d]
+    :param h_mask:  [N, M, JX]
+    :param u_mask:  [N, B]
+    :param scope:
+    :return: [N, M, d], [N, M, JX, d]
+    """
+    # NOTE(review): the docstring above lags the code. The function returns the pair
+    # (u_a, h_a), in that order, and h_a is None unless config.bi is truthy. The `[N, B]`
+    # listed for u_mask presumably means [N, JQ], since u_mask is tiled against h_mask
+    # over the JQ axis below. TODO confirm and update the docstring.
+    with tf.variable_scope(scope or "bi_attention"):
+        N, M, JX, JQ, d = config.batch_size, config.max_num_sents, config.max_sent_size, config.max_ques_size, config.hidden_size
+        # The static JX read from config is immediately replaced by the runtime size of axis 2 of h.
+        JX = tf.shape(h)[2]
+        # Insert a new axis into h and two into u, then tile, so both end up with a JQ axis
+        # and (M, JX) axes and can be compared position by position.
+        h_aug = tf.tile(tf.expand_dims(h, 3), [1, 1, 1, JQ, 1])
+        u_aug = tf.tile(tf.expand_dims(tf.expand_dims(u, 1), 1), [1, M, JX, 1, 1])
+        if h_mask is None:
+            and_mask = None
+        else:
+            # u_mask is used unconditionally here, so it must be supplied whenever h_mask is.
+            h_mask_aug = tf.tile(tf.expand_dims(h_mask, 3), [1, 1, 1, JQ])
+            u_mask_aug = tf.tile(tf.expand_dims(tf.expand_dims(u_mask, 1), 1), [1, M, JX, 1])
+            and_mask = h_mask_aug & u_mask_aug
+
+        u_logits = get_logits([h_aug, u_aug], None, True, wd=config.wd, mask=and_mask,
+                              is_train=is_train, func=config.logit_func, scope='u_logits')  # [N, M, JX, JQ]
+        u_a = softsel(u_aug, u_logits)  # [N, M, JX, d]
+        if tensor_dict is not None:
+            # Expose the softmax of the logits to the caller under the key 'a_u'.
+            # a_h = tf.nn.softmax(h_logits)  # [N, M, JX]
+            a_u = tf.nn.softmax(u_logits)  # [N, M, JX, JQ]
+            # tensor_dict['a_h'] = a_h
+            tensor_dict['a_u'] = a_u
+        if config.bi:
+            # Reduce the logits over the last axis with max, use them to select from h,
+            # then repeat the result along a new axis 2 of size JX.
+            h_a = softsel(h, tf.reduce_max(u_logits, 3))  # [N, M, d]
+            h_a = tf.tile(tf.expand_dims(h_a, 2), [1, 1, JX, 1])
+        else:
+            h_a = None
+        return u_a, h_a
+
+
+def attention_layer(config, is_train, h, u, h_mask=None, u_mask=None, scope=None, tensor_dict=None):
+    """
+    Run ``bi_attention`` and concatenate its outputs with ``h`` along axis 3.
+
+    The result concatenates ``h``, ``u_a`` and ``h * u_a``; when ``config.bi`` is truthy
+    ``h * h_a`` is appended as a fourth part.
+
+    :param scope: variable scope name, defaults to "attention_layer"
+    :return: the concatenated tensor
+    """
+    with tf.variable_scope(scope or "attention_layer"):
+        u_a, h_a = bi_attention(config, is_train, h, u, h_mask=h_mask, u_mask=u_mask, tensor_dict=tensor_dict)
+        parts = [h, u_a, h * u_a]
+        if config.bi:
+            parts.append(h * h_a)
+        return tf.concat(axis=3, values=parts)
+
+
+class Model(object):
+    """
+    TensorFlow graph for one model replica: input placeholders, forward pass, loss and
+    (in train mode) exponential-moving-average summaries.
+
+    Shape letters used in comments follow the config attributes they are read from:
+    N = batch_size, M = max_num_sents, JX = max_sent_size, JQ = max_ques_size,
+    W = max_word_size, VW = word_vocab_size, VC = char_vocab_size, d = hidden_size.
+    """
+    def __init__(self, config, scope):
+        """
+        Create the placeholders and build the forward graph, the loss and, when
+        ``config.mode == 'train'``, the EMA ops.
+
+        :param config: hyper-parameter object whose attributes are read throughout the class
+        :param scope: name scope used to filter the 'summaries', 'losses' and 'ema/*' collections
+        """
+        self.scope = scope
+        self.config = config
+        self.global_step = tf.get_variable('global_step', shape=[], dtype='int32',
+                                           initializer=tf.constant_initializer(0), trainable=False)
+
+        # Define forward inputs here
+        N, M, JX, JQ, VW, VC, W = \
+            config.batch_size, config.max_num_sents, config.max_sent_size, \
+            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.max_word_size
+        # The third axis of x / cx / x_mask is left dynamic (None); y uses the static JX.
+        self.x = tf.placeholder('int32', [N, M, None], name='x')
+        self.cx = tf.placeholder('int32', [N, M, None, W], name='cx')
+        self.x_mask = tf.placeholder('bool', [N, M, None], name='x_mask')
+        self.q = tf.placeholder('int32', [N, JQ], name='q')
+        self.cq = tf.placeholder('int32', [N, JQ, W], name='cq')
+        self.q_mask = tf.placeholder('bool', [N, JQ], name='q_mask')
+        self.y = tf.placeholder('bool', [N, M, JX], name='y')
+        self.is_train = tf.placeholder('bool', [], name='is_train')
+        self.new_emb_mat = tf.placeholder('float', [None, config.word_emb_size], name='new_emb_mat')
+
+        # Define misc
+        self.tensor_dict = {}
+
+        # Forward outputs / loss inputs
+        self.logits = None
+        self.yp = None
+        self.var_list = None
+
+        # Loss outputs
+        self.loss = None
+
+        self._build_forward()
+        self._build_loss()
+        if config.mode == 'train':
+            self._build_ema()
+
+        # Merge only the summaries registered under this model's scope.
+        self.summary = tf.summary.merge(tf.get_collection("summaries", scope=self.scope))
+
+    def _build_forward(self):
+        """Build the forward graph and set ``self.logits`` (flattened) and ``self.yp``."""
+        config = self.config
+        N, M, JX, JQ, VW, VC, d, W = \
+            config.batch_size, config.max_num_sents, config.max_sent_size, \
+            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
+            config.max_word_size
+        # Use the runtime size of the dynamic axis of x instead of the static config value.
+        JX = tf.shape(self.x)[2]
+        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size
+
+        with tf.variable_scope("emb"):
+            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
+                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
+
+            with tf.variable_scope("char"):
+                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
+                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
+                Acx = tf.reshape(Acx, [-1, JX, W, dc])
+                Acq = tf.reshape(Acq, [-1, JQ, W, dc])
+
+                # Both settings are comma-separated integer lists; the output channels must add up to dco.
+                filter_sizes = list(map(int, config.out_channel_dims.split(',')))
+                heights = list(map(int, config.filter_heights.split(',')))
+                assert sum(filter_sizes) == dco
+                with tf.variable_scope("conv"):
+                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",  self.is_train, config.keep_prob, scope="xx")
+                    if config.share_cnn_weights:
+                        # Reuse the "xx" convolution variables for the question.
+                        tf.get_variable_scope().reuse_variables()
+                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
+                    else:
+                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq")
+                    xx = tf.reshape(xx, [-1, M, JX, dco])
+                    qq = tf.reshape(qq, [-1, JQ, dco])
+
+            if config.use_word_emb:
+                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
+                    if config.mode == 'train':
+                        word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat))
+                    else:
+                        word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
+                    if config.use_glove_for_unk:
+                        # Rows fed through new_emb_mat are appended after the VW trained rows.
+                        word_emb_mat = tf.concat(axis=0, values=[word_emb_mat, self.new_emb_mat])
+
+                with tf.name_scope("word"):
+                    Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
+                    Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
+                    self.tensor_dict['x'] = Ax
+                    self.tensor_dict['q'] = Aq
+                xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
+                qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
+
+        # highway network (the same variables are reused for context and question)
+        with tf.variable_scope("highway"):
+            xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
+            tf.get_variable_scope().reuse_variables()
+            qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
+            self.tensor_dict['xx'] = xx
+            self.tensor_dict['qq'] = qq
+
+        cell = BasicLSTMCell(d, state_is_tuple=True)
+        d_cell = SwitchableDropoutWrapper(cell, self.is_train, input_keep_prob=config.input_keep_prob)
+        # Sequence lengths are the number of True entries in the masks.
+        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
+        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]
+
+        with tf.variable_scope("prepro"):
+            (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(d_cell, d_cell, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
+            u = tf.concat(axis=2, values=[fw_u, bw_u])
+            if config.two_prepro_layers:
+                (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(d_cell, d_cell, u, q_len, dtype='float', scope='u2')  # [N, J, d], [N, d]
+                u = tf.concat(axis=2, values=[fw_u, bw_u])
+            if config.share_lstm_weights:
+                # The context encoder reuses the question encoder's scopes ('u1' / 'u2').
+                tf.get_variable_scope().reuse_variables()
+                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
+                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
+                if config.two_prepro_layers:
+                    (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, h, x_len, dtype='float', scope='u2')  # [N, M, JX, 2d]
+                    h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
+
+            else:
+                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
+                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
+                if config.two_prepro_layers:
+                    (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, h, x_len, dtype='float', scope='h2')  # [N, M, JX, 2d]
+                    h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
+            self.tensor_dict['u'] = u
+            self.tensor_dict['h'] = h
+
+        with tf.variable_scope("main"):
+            p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict)
+            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(d_cell, d_cell, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
+            g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
+            # p1 = attention_layer(config, self.is_train, g0, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p1")
+            (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(d_cell, d_cell, g0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
+            g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])
+            # logits = u_logits(config, self.is_train, g1, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="logits")
+            # [N, M, JX]
+            logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1')
+            a1i = softsel(tf.reshape(g1, [N, M*JX, 2*d]), tf.reshape(logits, [N, M*JX]))
+
+            if config.feed_gt:
+                # While training, replace the predicted logits by the log of the ground truth.
+                logy = tf.log(tf.cast(self.y, 'float') + VERY_SMALL_NUMBER)
+                logits = tf.cond(self.is_train, lambda: logy, lambda: logits)
+            if config.feed_hard:
+                # Outside training, replace the logits by a one-hot vector at their argmax.
+                hard_yp = tf.argmax(tf.reshape(logits, [N, M*JX]), 1)
+                hard_logits = tf.reshape(tf.one_hot(hard_yp, M*JX), [N, M, JX])  # [N, M, JX]
+                logits = tf.cond(self.is_train, lambda: logits, lambda: hard_logits)
+
+            # Softmax is taken jointly over all M * JX positions of an example.
+            flat_logits = tf.reshape(logits, [-1, M * JX])
+            flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
+            yp = tf.reshape(flat_yp, [-1, M, JX])
+
+            self.tensor_dict['g1'] = g1
+
+            self.logits = flat_logits
+            self.yp = yp
+
+    def _build_loss(self):
+        """Add the cross-entropy term to the 'losses' collection and set ``self.loss`` to the scoped sum."""
+        config = self.config
+        N, M, JX, JQ, VW, VC = \
+            config.batch_size, config.max_num_sents, config.max_sent_size, \
+            config.max_ques_size, config.word_vocab_size, config.char_vocab_size
+        JX = tf.shape(self.x)[2]
+        # 1.0 for examples with at least one True entry in q_mask, 0.0 otherwise.
+        loss_mask = tf.reduce_max(tf.cast(self.q_mask, 'float'), 1)
+        # Negative log of the probability mass that yp puts on the positions marked in y.
+        losses = -tf.log(tf.reduce_sum(self.yp * tf.cast(self.y, 'float'), [1, 2]) + VERY_SMALL_NUMBER)
+        ce_loss = tf.reduce_mean(loss_mask * losses)
+        tf.add_to_collection('losses', ce_loss)
+
+        self.loss = tf.add_n(tf.get_collection('losses', scope=self.scope), name='loss')
+        tf.summary.scalar(self.loss.op.name, self.loss)
+        tf.add_to_collection('ema/scalar', self.loss)
+
+    def _build_ema(self):
+        """Track the 'ema/scalar' and 'ema/histogram' collections with an EMA and summarise the averages."""
+        ema = tf.train.ExponentialMovingAverage(self.config.decay)
+        ema_op = ema.apply(tf.get_collection("ema/scalar", scope=self.scope) + tf.get_collection("ema/histogram", scope=self.scope))
+        for var in tf.get_collection("ema/scalar", scope=self.scope):
+            ema_var = ema.average(var)
+            tf.summary.scalar(ema_var.op.name, ema_var)
+        for var in tf.get_collection("ema/histogram", scope=self.scope):
+            ema_var = ema.average(var)
+            tf.summary.histogram(ema_var.op.name, ema_var)
+
+        # Evaluating the loss now also runs the EMA update.
+        with tf.control_dependencies([ema_op]):
+            self.loss = tf.identity(self.loss)
+
+    def get_loss(self):
+        """Return the loss tensor."""
+        return self.loss
+
+    def get_global_step(self):
+        """Return the global step variable."""
+        return self.global_step
+
+    def get_var_list(self):
+        """Return ``self.var_list`` (this class only ever sets it to ``None``)."""
+        return self.var_list
+
+    def get_feed_dict(self, batch, is_train, supervised=True):
+        """
+        Convert a batch into the feed dict for this model's placeholders.
+
+        :param batch: ``DataSet`` whose ``data`` holds 'x', 'cx', 'q', 'cq' (and 'y' when supervised)
+                      and whose ``shared`` holds 'word2idx' and 'char2idx'
+        :param is_train: value fed to the ``is_train`` placeholder
+        :param supervised: when True, also build and feed the answer mask ``y``
+        :return: dict mapping placeholders to numpy arrays / values
+        """
+        assert isinstance(batch, DataSet)
+        config = self.config
+        N, M, JX, JQ, VW, VC, d, W = \
+            config.batch_size, config.max_num_sents, config.max_sent_size, \
+            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, config.max_word_size
+        feed_dict = {}
+
+        if config.len_opt:
+            # Note that this optimization results in variable GPU RAM usage (i.e. can cause OOM in the
+            # middle of training.) First test without len_opt and make sure no OOM, and use len_opt.
+            # NOTE(review): len(para) counts the items of each batch.data['x'] entry (axis j below),
+            # while JX sizes the innermost axis k of x[i, j, k] - confirm this is the intended length.
+            if sum(len(para) for para in batch.data['x']) == 0:
+                new_JX = 1
+            else:
+                new_JX = max(len(para) for para in batch.data['x'])
+            JX = min(JX, new_JX)
+        # print(JX)
+
+        # Zero-initialised buffers: index 0 and mask False mean padding.
+        x = np.zeros([N, M, JX], dtype='int32')
+        cx = np.zeros([N, M, JX, W], dtype='int32')
+        x_mask = np.zeros([N, M, JX], dtype='bool')
+        q = np.zeros([N, JQ], dtype='int32')
+        cq = np.zeros([N, JQ, W], dtype='int32')
+        q_mask = np.zeros([N, JQ], dtype='bool')
+
+        feed_dict[self.x] = x
+        feed_dict[self.x_mask] = x_mask
+        feed_dict[self.cx] = cx
+        feed_dict[self.q] = q
+        feed_dict[self.cq] = cq
+        feed_dict[self.q_mask] = q_mask
+        feed_dict[self.is_train] = is_train
+        if config.use_glove_for_unk:
+            feed_dict[self.new_emb_mat] = batch.shared['new_emb_mat']
+
+        X = batch.data['x']
+        CX = batch.data['cx']
+
+        def _get_word(word):
+            # Words starting with "@" all map to index 2.
+            if word.startswith("@"):
+                return 2
+            d = batch.shared['word2idx']
+            # Try the word as-is, then a few case variants.
+            for each in (word, word.lower(), word.capitalize(), word.upper()):
+                if each in d:
+                    return d[each]
+            if config.use_glove_for_unk:
+                # Indices of the extra vocabulary are offset by the size of the main one.
+                d2 = batch.shared['new_word2idx']
+                for each in (word, word.lower(), word.capitalize(), word.upper()):
+                    if each in d2:
+                        return d2[each] + len(d)
+            return 1
+
+        def _get_char(char):
+            d = batch.shared['char2idx']
+            if char in d:
+                return d[char]
+            return 1
+
+        if supervised:
+            # bool buffer, matching the dtype of the y placeholder.
+            y = np.zeros([N, M, JX], dtype='bool')
+            feed_dict[self.y] = y
+
+            for i, (xi, yi) in enumerate(zip(batch.data['x'], batch.data['y'])):
+                count = 0
+                # Mark every position whose token equals the answer string.
+                for j, xij in enumerate(xi):
+                    for k, xijk in enumerate(xij):
+                        if xijk == yi:
+                            y[i, j, k] = True
+                            count += 1
+                assert count > 0
+
+        for i, xi in enumerate(X):
+            for j, xij in enumerate(xi):
+                for k, xijk in enumerate(xij):
+                    each = _get_word(xijk)
+                    x[i, j, k] = each
+                    x_mask[i, j, k] = True
+
+        for i, cxi in enumerate(CX):
+            for j, cxij in enumerate(cxi):
+                for k, cxijk in enumerate(cxij):
+                    for l, cxijkl in enumerate(cxijk):
+                        cx[i, j, k, l] = _get_char(cxijkl)
+                        # Characters beyond max_word_size are dropped.
+                        if l + 1 == config.max_word_size:
+                            break
+
+        for i, qi in enumerate(batch.data['q']):
+            for j, qij in enumerate(qi):
+                q[i, j] = _get_word(qij)
+                q_mask[i, j] = True
+
+        for i, cqi in enumerate(batch.data['cq']):
+            for j, cqij in enumerate(cqi):
+                for k, cqijk in enumerate(cqij):
+                    cq[i, j, k] = _get_char(cqijk)
+                    if k + 1 == config.max_word_size:
+                        break
+
+        return feed_dict
+
+
+def get_multi_gpu_models(config):
+    """
+    Build one ``Model`` per GPU index in ``range(config.num_gpus)``.
+
+    Each model is created under the name scope "model_<idx>" on device "/gpu:<idx>";
+    variable reuse is switched on after every model is built.
+
+    :return: list of the created models, in GPU-index order
+    """
+    towers = []
+    for device_id in range(config.num_gpus):
+        scope_name = "model_{}".format(device_id)
+        device_name = "/gpu:{}".format(device_id)
+        with tf.name_scope(scope_name) as scope:
+            with tf.device(device_name):
+                tower = Model(config, scope)
+                tf.get_variable_scope().reuse_variables()
+                towers.append(tower)
+    return towers
--- a/tensorflow/SQuAD/basic_cnn/read_data.py
+++ b/tensorflow/SQuAD/basic_cnn/read_data.py
@ -0,0 +1,294 @@
+import json
+import os
+import random
+import itertools
+import math
+from collections import defaultdict
+
+import numpy as np
+
+from cnn_dm.prepro import para2sents
+from my.tensorflow import grouper
+from my.utils import index
+
+
+class Data(object):
+    """Abstract item store; subclasses supply ``get_size``, ``get_one``, ``get_empty`` and ``__add__``."""
+
+    def get_size(self):
+        raise NotImplementedError()
+
+    def get_by_idxs(self, idxs):
+        """
+        Fetch several items and group their fields column-wise.
+
+        :param idxs: iterable of indices, each passed to ``get_one``
+        :return: ``defaultdict(list)`` mapping every field name to the list of
+                 that field's values, in the order of ``idxs``
+        """
+        columns = defaultdict(list)
+        for item in map(self.get_one, idxs):
+            for field, value in item.items():
+                columns[field].append(value)
+        return columns
+
+    def get_one(self, idx):
+        raise NotImplementedError()
+
+    def get_empty(self):
+        raise NotImplementedError()
+
+    def __add__(self, other):
+        raise NotImplementedError()
+
+class MyData(Data):
+    """``Data`` backed by one text file per example, located under ``root_dir``."""
+
+    def __init__(self, config, root_dir, file_names):
+        self.config = config
+        self.root_dir = root_dir
+        self.file_names = file_names
+
+    def get_one(self, idx):
+        """
+        Read and parse the example file at position ``idx``.
+
+        Lines 1, 3, 5 and 7 of the file are content lines (the first is read but unused,
+        followed by paragraph, question and answer); the line after each of them is
+        skipped. Every remaining line is a candidate whose text before the first ":" is kept.
+
+        :return: dict with keys 'x', 'cx', 'q', 'cq', 'y', 'c' and 'ids'
+        """
+        file_name = self.file_names[idx]
+        path = os.path.join(self.root_dir, file_name)
+        with open(path, 'r') as fh:
+            fh.readline()  # first content line, not used
+            fh.readline()
+            para = fh.readline().strip()
+            fh.readline()
+            ques = fh.readline().strip()
+            fh.readline()
+            answer = fh.readline().strip()
+            fh.readline()
+            cand_ents = [line.strip().split(":")[0] for line in fh]
+            wordss = para2sents(para, self.config.width)
+            ques_words = ques.split(" ")
+
+            return {
+                'x': wordss,
+                'cx': [[list(word) for word in words] for words in wordss],
+                'q': ques_words,
+                'cq': [list(word) for word in ques_words],
+                'y': answer,
+                'c': cand_ents,
+                'ids': file_name,
+            }
+
+    def get_empty(self):
+        """Return a ``MyData`` with the same config and root directory but no files."""
+        return MyData(self.config, self.root_dir, [])
+
+    def __add__(self, other):
+        """Concatenate the file lists; config and root directory are taken from ``self``."""
+        combined = self.file_names + other.file_names
+        return MyData(self.config, self.root_dir, combined)
+
+    def get_size(self):
+        return len(self.file_names)
+
+
+class DataSet(object):
+    """
+    A collection of examples plus optional shared resources, with batching helpers.
+
+    ``data`` is either a dict mapping field names to equally long lists, or a ``Data``
+    instance; any other type is rejected with ``TypeError``.
+    """
+
+    def __init__(self, data, data_type, shared=None, valid_idxs=None):
+        """
+        :param data: dict of columns or ``Data`` instance
+        :param data_type: label stored as-is and passed on to derived data sets
+        :param shared: optional dict of resources shared between examples
+        :param valid_idxs: indices to iterate over; defaults to all examples
+        """
+        self.data = data  # e.g. {'X': [0, 1, 2], 'Y': [2, 3, 4]}
+        self.data_type = data_type
+        self.shared = shared
+        total_num_examples = self.get_data_size()
+        self.valid_idxs = range(total_num_examples) if valid_idxs is None else valid_idxs
+        self.num_examples = total_num_examples
+
+    def _unsupported_data_error(self):
+        """Build the error raised when ``self.data`` is neither a dict nor a ``Data``."""
+        return TypeError("DataSet.data must be a dict or a Data instance, got {}".format(type(self.data).__name__))
+
+    def _sort_key(self, idx):
+        """Clustering key for ``get_batches``: length of the longest element of the referenced shared 'x' entry."""
+        rx = self.data['*x'][idx]
+        x = self.shared['x'][rx[0]][rx[1]]
+        return max(map(len, x))
+
+    def get_data_size(self):
+        """Return the number of examples (length of the first column for dict data)."""
+        if isinstance(self.data, dict):
+            return len(next(iter(self.data.values())))
+        elif isinstance(self.data, Data):
+            return self.data.get_size()
+        raise self._unsupported_data_error()
+
+    def get_by_idxs(self, idxs):
+        """Return a mapping from each field name to the list of its values at ``idxs``."""
+        if isinstance(self.data, dict):
+            out = defaultdict(list)
+            for key, val in self.data.items():
+                out[key].extend(val[idx] for idx in idxs)
+            return out
+        elif isinstance(self.data, Data):
+            return self.data.get_by_idxs(idxs)
+        raise self._unsupported_data_error()
+
+    def get_one(self, idx):
+        """
+        Return the example at ``idx``.
+
+        For dict data every value is wrapped in a one-element list; for ``Data`` the
+        result of ``Data.get_one`` is returned unchanged.
+        """
+        if isinstance(self.data, dict):
+            out = {key: [val[idx]] for key, val in self.data.items()}
+            return out
+        elif isinstance(self.data, Data):
+            return self.data.get_one(idx)
+        # Fail loudly, like the sibling accessors, instead of returning None implicitly.
+        raise self._unsupported_data_error()
+
+    def get_batches(self, batch_size, num_batches=None, shuffle=False, cluster=False):
+        """
+        Generate ``(batch_idxs, batch_data_set)`` pairs.
+
+        :param batch_size: number of examples per batch
+        :param num_batches: total number of batches to yield; defaults to one epoch
+        :param shuffle: draw the examples in random order
+        :param cluster: cluster examples by their lengths; this might give performance boost (i.e. faster training).
+                        Only takes effect together with ``shuffle``.
+        :return: generator of ``(tuple of indices, DataSet)``
+        """
+        num_batches_per_epoch = int(math.ceil(self.num_examples / batch_size))
+        if num_batches is None:
+            num_batches = num_batches_per_epoch
+        num_epochs = int(math.ceil(num_batches / num_batches_per_epoch))
+
+        # `grouped` is a zero-argument callable producing the list of index groups for one epoch.
+        if shuffle:
+            random_idxs = random.sample(self.valid_idxs, len(self.valid_idxs))
+            if cluster:
+                # Sort by _sort_key, cut into batches, then shuffle the order of the batches.
+                sorted_idxs = sorted(random_idxs, key=self._sort_key)
+                sorted_grouped = lambda: list(grouper(sorted_idxs, batch_size))
+                grouped = lambda: random.sample(sorted_grouped(), num_batches_per_epoch)
+            else:
+                random_grouped = lambda: list(grouper(random_idxs, batch_size))
+                grouped = random_grouped
+        else:
+            raw_grouped = lambda: list(grouper(self.valid_idxs, batch_size))
+            grouped = raw_grouped
+
+        batch_idx_tuples = itertools.chain.from_iterable(grouped() for _ in range(num_epochs))
+        for _ in range(num_batches):
+            # None entries in a group are dropped.
+            batch_idxs = tuple(i for i in next(batch_idx_tuples) if i is not None)
+            batch_data = self.get_by_idxs(batch_idxs)
+            shared_batch_data = {}
+            for key, val in batch_data.items():
+                # A key starting with '*' holds references into self.shared; the looked-up
+                # values are added under the key without the '*'.
+                if key.startswith('*'):
+                    assert self.shared is not None
+                    shared_key = key[1:]
+                    shared_batch_data[shared_key] = [index(self.shared[shared_key], each) for each in val]
+            batch_data.update(shared_batch_data)
+
+            batch_ds = DataSet(batch_data, self.data_type, shared=self.shared)
+            yield batch_idxs, batch_ds
+
+    def get_multi_batches(self, batch_size, num_batches_per_step, num_steps=None, shuffle=False, cluster=False):
+        """
+        Like ``get_batches``, but every step yields ``num_batches_per_step`` sub-batches.
+
+        :return: generator of tuples of ``(idxs, DataSet)`` pairs, one tuple per step
+        """
+        batch_size_per_step = batch_size * num_batches_per_step
+        batches = self.get_batches(batch_size_per_step, num_batches=num_steps, shuffle=shuffle, cluster=cluster)
+        multi_batches = (tuple(zip(grouper(idxs, batch_size, shorten=True, num_groups=num_batches_per_step),
+                         data_set.divide(num_batches_per_step))) for idxs, data_set in batches)
+        return multi_batches
+
+    def get_empty(self):
+        """Return a ``DataSet`` with the same fields / backing store type but no examples."""
+        if isinstance(self.data, dict):
+            data = {key: [] for key in self.data}
+        elif isinstance(self.data, Data):
+            data = self.data.get_empty()
+        else:
+            raise self._unsupported_data_error()
+        return DataSet(data, self.data_type, shared=self.shared)
+
+    def __add__(self, other):
+        """Concatenate two data sets; ``other``'s valid indices are shifted by ``self.num_examples``."""
+        if isinstance(self.data, dict):
+            data = {key: val + other.data[key] for key, val in self.data.items()}
+        elif isinstance(self.data, Data):
+            data = self.data + other.data
+        else:
+            raise self._unsupported_data_error()
+
+        valid_idxs = list(self.valid_idxs) + [valid_idx + self.num_examples for valid_idx in other.valid_idxs]
+        return DataSet(data, self.data_type, shared=self.shared, valid_idxs=valid_idxs)
+
+    def divide(self, integer):
+        """Split the valid indices into ``integer`` groups and return one ``DataSet`` per group."""
+        batch_size = int(math.ceil(self.num_examples / integer))
+        idxs_gen = grouper(self.valid_idxs, batch_size, shorten=True, num_groups=integer)
+        data_gen = (self.get_by_idxs(idxs) for idxs in idxs_gen)
+        ds_tuple = tuple(DataSet(data, self.data_type, shared=self.shared) for data in data_gen)
+        return ds_tuple
+
+
+class MyDataSet(DataSet):
+    """``DataSet`` variant that records 'max_num_sents' in ``shared`` and clusters by index."""
+
+    def __init__(self, data, data_type, shared=None, valid_idxs=None):
+        super(MyDataSet, self).__init__(data, data_type, shared=shared, valid_idxs=valid_idxs)
+        # The 'x' length of the last example is stored as the maximum number of sentences.
+        last_idx = self.num_examples - 1
+        last_example = self.get_one(last_idx)
+        shared['max_num_sents'] = len(last_example['x'])
+
+    def _sort_key(self, idx):
+        """Sort key used for clustering: the index itself."""
+        return idx
+
+
+def read_data(config, data_type, ref, data_filter=None):
+    """
+    Load the shared resources for ``data_type`` and wrap the example files in a ``MyDataSet``.
+
+    :param config: configuration object; reads ``data_dir``, ``root_dir``, ``out_dir``, ``shared_path``,
+                   ``filter_ratio``, ``lower_word``, ``finetune``, ``known_if_glove``, ``use_glove_for_unk``,
+                   ``word_count_th`` and ``char_count_th``
+    :param data_type: name used in "shared_<data_type>.json" and as sub-directory of ``config.root_dir``
+    :param ref: when falsy, the word / char vocabularies are built and written to the shared path;
+                when truthy, they are read back from that path instead
+    :param data_filter: accepted for interface compatibility; not used
+    :return: ``MyDataSet`` over the (possibly truncated) list ``shared['sorted']``
+    """
+    shared_path = os.path.join(config.data_dir, "shared_{}.json".format(data_type))
+    with open(shared_path, 'r') as fh:
+        shared = json.load(fh)
+
+    paths = shared['sorted']
+    if config.filter_ratio < 1.0:
+        # Keep only the leading fraction of the list.
+        stop = int(round(len(paths) * config.filter_ratio))
+        paths = paths[:stop]
+    num_examples = len(paths)
+    valid_idxs = range(num_examples)
+
+    print("Loaded {}/{} examples from {}".format(len(valid_idxs), num_examples, data_type))
+
+    # From here on shared_path names the vocabulary file, not the per-data-type shared file.
+    shared_path = config.shared_path or os.path.join(config.out_dir, "shared.json")
+    if not ref:
+        word2vec_dict = shared['lower_word2vec'] if config.lower_word else shared['word2vec']
+        word_counter = shared['lower_word_counter'] if config.lower_word else shared['word_counter']
+        char_counter = shared['char_counter']
+        # Word indices start at 3 and char indices at 2; the lower ones are reserved below.
+        if config.finetune:
+            shared['word2idx'] = {word: idx + 3 for idx, word in
+                                  enumerate(word for word, count in word_counter.items()
+                                            if count > config.word_count_th or (config.known_if_glove and word in word2vec_dict))}
+        else:
+            assert config.known_if_glove
+            assert config.use_glove_for_unk
+            shared['word2idx'] = {word: idx + 3 for idx, word in
+                                  enumerate(word for word, count in word_counter.items()
+                                            if count > config.word_count_th and word not in word2vec_dict)}
+        shared['char2idx'] = {char: idx + 2 for idx, char in
+                              enumerate(char for char, count in char_counter.items()
+                                        if count > config.char_count_th)}
+        NULL = "-NULL-"
+        UNK = "-UNK-"
+        ENT = "-ENT-"
+        shared['word2idx'][NULL] = 0
+        shared['word2idx'][UNK] = 1
+        shared['word2idx'][ENT] = 2
+        shared['char2idx'][NULL] = 0
+        shared['char2idx'][UNK] = 1
+
+        # Use a context manager so the vocabulary file is flushed and closed before anyone reads it.
+        with open(shared_path, 'w') as fh:
+            json.dump({'word2idx': shared['word2idx'], 'char2idx': shared['char2idx']}, fh)
+    else:
+        with open(shared_path, 'r') as fh:
+            new_shared = json.load(fh)
+        for key, val in new_shared.items():
+            shared[key] = val
+
+    if config.use_glove_for_unk:
+        # create new word2idx and word2vec for the words that have a vector but no entry in word2idx
+        word2vec_dict = shared['lower_word2vec'] if config.lower_word else shared['word2vec']
+        new_word2idx_dict = {word: idx for idx, word in enumerate(word for word in word2vec_dict.keys() if word not in shared['word2idx'])}
+        shared['new_word2idx'] = new_word2idx_dict
+        idx2vec_dict = {idx: word2vec_dict[word] for word, idx in new_word2idx_dict.items()}
+        # print("{}/{} unique words have corresponding glove vectors.".format(len(idx2vec_dict), len(word2idx_dict)))
+        # Row i of the matrix is the vector of the word with new index i.
+        new_emb_mat = np.array([idx2vec_dict[idx] for idx in range(len(idx2vec_dict))], dtype='float32')
+        shared['new_emb_mat'] = new_emb_mat
+
+    data = MyData(config, os.path.join(config.root_dir, data_type), paths)
+    data_set = MyDataSet(data, data_type, shared=shared, valid_idxs=valid_idxs)
+    return data_set
+
+
+def get_cnn_data_filter(config):
+    """Return the data filter for this data set: always ``True``; ``config`` is not consulted."""
+    accept_all = True
+    return accept_all
+
+
+def update_config(config, data_sets):
+    """
+    Derive size and vocabulary settings from the data sets and store them on ``config``.
+
+    ``max_sent_size``, ``max_ques_size``, ``max_word_size`` and ``max_num_sents`` become the
+    maximum of the corresponding ``shared`` entries over all data sets (0 if there are none);
+    ``max_word_size`` is then capped at ``config.word_size_th``. The vocabulary sizes and the
+    word embedding size are taken from the first data set only.
+
+    :param config: configuration object, mutated in place
+    :param data_sets: non-empty sequence of objects with a ``shared`` dict
+    """
+    size_fields = ('max_sent_size', 'max_ques_size', 'max_word_size', 'max_num_sents')
+    for field in size_fields:
+        setattr(config, field, 0)
+    for data_set in data_sets:
+        shared = data_set.shared
+        for field in size_fields:
+            largest = max(getattr(config, field), shared[field])
+            setattr(config, field, largest)
+
+    config.max_word_size = min(config.max_word_size, config.word_size_th)
+
+    first_shared = data_sets[0].shared
+    config.char_vocab_size = len(first_shared['char2idx'])
+    # Any vector will do: the embedding size is the length of one word vector.
+    some_vector = next(iter(first_shared['word2vec'].values()))
+    config.word_emb_size = len(some_vector)
+    config.word_vocab_size = len(first_shared['word2idx'])
+
--- a/tensorflow/SQuAD/basic_cnn/superhighway.py
+++ b/tensorflow/SQuAD/basic_cnn/superhighway.py
@ -0,0 +1,47 @@
+import tensorflow as tf
+from tensorflow.python.ops.rnn_cell import RNNCell
+
+from my.tensorflow.nn import linear
+
+
+class SHCell(RNNCell):
+    """
+    Super-Highway Cell
+    """
+    # Recurrent cell that blends the previous state with half of its input through a
+    # learned sigmoid gate. State size and output size both equal `input_size`.
+    def __init__(self, input_size, logit_func='tri_linear', scalar=False):
+        # logit_func selects how the gate is computed in __call__:
+        # one of 'mul_linear', 'linear', 'tri_linear', 'double'.
+        # scalar=True makes the gate one value per example instead of one per state unit
+        # (except for 'double', whose outer projection always has state size).
+        self._state_size = input_size
+        self._output_size = input_size
+        self._logit_func = logit_func
+        self._scalar = scalar
+
+    @property
+    def state_size(self):
+        return self._state_size
+
+    @property
+    def output_size(self):
+        return self._output_size
+
+    def __call__(self, inputs, state, scope=None):
+        # Returns (outputs, new_state); raises Exception for an unknown logit_func.
+        with tf.variable_scope(scope or "SHCell"):
+            a_size = 1 if self._scalar else self._state_size
+            # The input is split into two equal halves along axis 1.
+            h, u = tf.split(axis=1, num_or_size_splits=2, value=inputs)
+            # `a` is the gate; the branches differ only in the features fed to `linear`.
+            if self._logit_func == 'mul_linear':
+                args = [h * u, state * u]
+                a = tf.nn.sigmoid(linear(args, a_size, True))
+            elif self._logit_func == 'linear':
+                args = [h, u, state]
+                a = tf.nn.sigmoid(linear(args, a_size, True))
+            elif self._logit_func == 'tri_linear':
+                args = [h, u, state, h * u, state * u]
+                a = tf.nn.sigmoid(linear(args, a_size, True))
+            elif self._logit_func == 'double':
+                # Two stacked projections with a tanh in between.
+                args = [h, u, state]
+                a = tf.nn.sigmoid(linear(tf.tanh(linear(args, a_size, True)), self._state_size, True))
+
+            else:
+                raise Exception()
+            # Gate `a` keeps the old state, (1 - a) lets the first input half through.
+            new_state = a * state + (1 - a) * h
+            # The emitted output is the state from before this update.
+            outputs = state
+            return outputs, new_state
+
--- a/tensorflow/SQuAD/basic_cnn/templates/visualizer.html
+++ b/tensorflow/SQuAD/basic_cnn/templates/visualizer.html
@ -0,0 +1,76 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>{{ title }}</title>
+    {# Expects `title` and `rows`; the `zip` global must be registered on the Jinja environment. #}
+    <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.0/jquery.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/chroma-js/1.1.1/chroma.min.js"></script>
+    <script>
+        $(document).ready(function(){
+            // One white-to-red scale shared by every cell.
+            var scale = chroma.scale(['white', 'red']);
+            $(".att").each(function() {
+                // Each cell carries its value in the "color" attribute;
+                // turn it into a background colour.
+                var val = parseFloat($(this).attr("color"));
+                var color = scale(val).hex();
+                $(this).attr("bgcolor", color);
+            });
+        })
+    </script>
+    <style>
+        table, th, td {border: 1px solid black}
+    </style>
+</head>
+<body>
+    <h2>{{ title }}</h2>
+    <table>
+        <tr>
+            <th>ID</th>
+            <th>Question</th>
+            <th>Answers</th>
+            <th>Predicted</th>
+            <th>Score</th>
+            <th>Paragraph</th>
+        </tr>
+        {% for row in rows %}
+            <tr>
+                <td>{{ row.id }}</td>
+                <td>
+                    {% for qj in row.ques %}
+                        {{ qj }}
+                    {% endfor %}
+                </td>
+                <td>
+                    {% for aa in row.a %}
+                        <li>{{ aa }}</li>
+                    {% endfor %}
+                </td>
+                <td>{{ row.ap }}</td>
+                <td>{{ row.score }}</td>
+                <td>
+                    <table>
+                    {# Two table rows per sentence: words coloured by yp, then "-" cells coloured by yp2. #}
+                    {% for xj, ypj, yp2j in zip(row.para, row.yp, row.yp2) %}
+                        <tr>
+                        {% set rowloop = loop %}
+                        {% for xjk, ypjk in zip(xj, ypj) %}
+                            <td class="att" color="{{ ypjk }}">
+                            {% if row.y[0][0] == rowloop.index0 and row.y[0][1] <= loop.index0 <= row.y[1][1] %}
+                                <b>{{ xjk }}</b>
+                            {% else %}
+                                {{ xjk }}
+                            {% endif %}
+                            </td>
+                        {% endfor %}
+                        </tr>
+                        <tr>
+                        {% for xjk, yp2jk in zip(xj, yp2j) %}
+                            <td class="att" color="{{ yp2jk }}">-</td>
+                        {% endfor %}
+                        </tr>
+                    {% endfor %}
+                    </table>
+                </td>
+            </tr>
+        {% endfor %}
+    </table>
+</body>
+</html>
--- a/tensorflow/SQuAD/basic_cnn/trainer.py
+++ b/tensorflow/SQuAD/basic_cnn/trainer.py
@ -0,0 +1,73 @@
+import tensorflow as tf
+
+from basic_cnn.model import Model
+from my.tensorflow import average_gradients
+
+
+class Trainer(object):
+    """Single-model trainer that applies Adadelta updates to the model's loss.
+
+    The optimizer is created with learning rate ``config.init_lr``; gradients
+    are taken with respect to ``model.get_var_list()`` and applied together
+    with an increment of the model's global step.
+    """
+
+    def __init__(self, config, model):
+        assert isinstance(model, Model)
+        self.config = config
+        self.model = model
+        self.opt = tf.train.AdadeltaOptimizer(config.init_lr)
+        self.loss = model.get_loss()
+        self.var_list = model.get_var_list()
+        self.global_step = model.get_global_step()
+        self.summary = model.summary
+        self.grads = self.opt.compute_gradients(self.loss, var_list=self.var_list)
+        self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
+
+    def get_train_op(self):
+        """Return the op that applies one gradient update."""
+        return self.train_op
+
+    def step(self, sess, batch, get_summary=False):
+        """Run one training step and return ``(loss, summary, train_op_result)``.
+
+        ``batch`` is a 2-tuple whose second element is passed to
+        ``model.get_feed_dict``. ``summary`` is ``None`` unless
+        ``get_summary`` is true.
+        """
+        assert isinstance(sess, tf.Session)
+        _unused, data_set = batch
+        # NOTE(review): the second argument is presumably the "is training" flag - confirm in Model.
+        feed_dict = self.model.get_feed_dict(data_set, True)
+        if not get_summary:
+            loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
+            return loss, None, train_op
+        fetches = [self.loss, self.summary, self.train_op]
+        loss, summary, train_op = sess.run(fetches, feed_dict=feed_dict)
+        return loss, summary, train_op
+
+
+class MultiGPUTrainer(object):
+    """Trainer that averages per-GPU losses and gradients over several models.
+
+    One gradient computation is placed on ``/gpu:<idx>`` for each model in
+    ``models``. The variable list, global step and summary are taken from the
+    first model.
+    """
+
+    def __init__(self, config, models):
+        first_model = models[0]
+        assert isinstance(first_model, Model)
+        self.config = config
+        self.model = first_model
+        self.opt = tf.train.AdadeltaOptimizer(config.init_lr)
+        self.var_list = first_model.get_var_list()
+        self.global_step = first_model.get_global_step()
+        self.summary = first_model.summary
+        self.models = models
+
+        tower_losses = []
+        tower_grads = []
+        for gpu_idx, tower_model in enumerate(models):
+            scope_name = "grads_{}".format(gpu_idx)
+            device_name = "/gpu:{}".format(gpu_idx)
+            with tf.name_scope(scope_name), tf.device(device_name):
+                tower_loss = tower_model.get_loss()
+                tower_losses.append(tower_loss)
+                tower_grads.append(self.opt.compute_gradients(tower_loss, var_list=self.var_list))
+
+        # Mean loss and mean gradient over all towers.
+        self.loss = tf.add_n(tower_losses)/len(tower_losses)
+        self.grads = average_gradients(tower_grads)
+        self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
+
+    def step(self, sess, batches, get_summary=False):
+        """Run one training step and return ``(loss, summary, train_op_result)``.
+
+        ``batches`` is paired element-wise with the models; each batch is a
+        2-tuple whose second element is passed to that model's
+        ``get_feed_dict``. ``summary`` is ``None`` unless ``get_summary`` is true.
+        """
+        assert isinstance(sess, tf.Session)
+        feed_dict = {}
+        for batch, tower_model in zip(batches, self.models):
+            _unused, data_set = batch
+            feed_dict.update(tower_model.get_feed_dict(data_set, True))
+
+        if not get_summary:
+            loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
+            return loss, None, train_op
+        fetches = [self.loss, self.summary, self.train_op]
+        loss, summary, train_op = sess.run(fetches, feed_dict=feed_dict)
+        return loss, summary, train_op
--- a/tensorflow/SQuAD/basic_cnn/visualizer.py
+++ b/tensorflow/SQuAD/basic_cnn/visualizer.py
@ -0,0 +1,137 @@
+import shutil
+from collections import OrderedDict
+import http.server
+import socketserver
+import argparse
+import json
+import os
+import numpy as np
+from tqdm import tqdm
+
+from jinja2 import Environment, FileSystemLoader
+
+from basic_cnn.evaluator import get_span_score_pairs, get_best_span
+
+
+def bool_(string):
+    """Convert the literal strings ``'True'`` / ``'False'`` to a bool.
+
+    Raises:
+        ValueError: If ``string`` is anything else.
+    """
+    if string == 'True':
+        return True
+    elif string == 'False':
+        return False
+    else:
+        # ValueError is still an Exception, so existing handlers keep working.
+        raise ValueError("expected 'True' or 'False', got {!r}".format(string))
+
+def get_args():
+    """Parse the visualizer's command-line options and return the namespace."""
+    # (flag, type, default) for every supported option.
+    option_specs = (
+        ("--model_name", str, 'basic'),
+        ("--data_type", str, 'dev'),
+        ("--step", int, 5000),
+        ("--template_name", str, "visualizer.html"),
+        ("--num_per_page", int, 100),
+        ("--data_dir", str, "data/squad"),
+        ("--port", int, 8000),
+        ("--host", str, "0.0.0.0"),
+        ("--open", str, 'False'),
+        ("--run_id", str, "0"),
+    )
+    parser = argparse.ArgumentParser()
+    for flag, kind, default in option_specs:
+        parser.add_argument(flag, type=kind, default=default)
+    return parser.parse_args()
+
+
+def _decode(decoder, sent):
+    """Look up every index of ``sent`` in ``decoder`` and join the results with spaces."""
+    tokens = (decoder[idx] for idx in sent)
+    return " ".join(tokens)
+
+
+def accuracy2_visualizer(args):
+    """Render evaluation results as paginated HTML and serve them over HTTP.
+
+    Loads ``out/<model_name>/<run_id>/eval/<data_type>-<step>.json`` together
+    with ``data_<data_type>.json`` and ``shared_<data_type>.json`` from
+    ``args.data_dir``, writes one HTML page per ``args.num_per_page`` examples
+    into a fresh ``/tmp/list_results<N>`` directory, then changes into that
+    directory and serves it on ``args.host:args.port`` until interrupted.
+
+    Args:
+        args: Namespace as returned by ``get_args``.
+    """
+    model_name = args.model_name
+    data_type = args.data_type
+    num_per_page = args.num_per_page
+    data_dir = args.data_dir
+    run_id = args.run_id.zfill(2)
+    step = args.step
+
+    eval_path = os.path.join("out", model_name, run_id, "eval", "{}-{}.json".format(data_type, str(step).zfill(6)))
+    print("loading {}".format(eval_path))
+    with open(eval_path, 'r') as fh:
+        eval_ = json.load(fh)
+
+    # Use the first /tmp/list_results<N> that does not exist yet, so earlier
+    # results are never overwritten.
+    _id = 0
+    html_dir = "/tmp/list_results%d" % _id
+    while os.path.exists(html_dir):
+        _id += 1
+        html_dir = "/tmp/list_results%d" % _id
+    os.mkdir(html_dir)
+
+    cur_dir = os.path.dirname(os.path.realpath(__file__))
+    templates_dir = os.path.join(cur_dir, 'templates')
+    env = Environment(loader=FileSystemLoader(templates_dir))
+    # The template calls zip() directly, so expose it as a template global.
+    env.globals.update(zip=zip, reversed=reversed)
+    template = env.get_template(args.template_name)
+
+    data_path = os.path.join(data_dir, "data_{}.json".format(data_type))
+    shared_path = os.path.join(data_dir, "shared_{}.json".format(data_type))
+    print("loading {}".format(data_path))
+    with open(data_path, 'r') as fh:
+        data = json.load(fh)
+    print("loading {}".format(shared_path))
+    with open(shared_path, 'r') as fh:
+        shared = json.load(fh)
+
+    rows = []
+    for i, (idx, yi, ypi, yp2i) in tqdm(enumerate(zip(*[eval_[key] for key in ('idxs', 'y', 'yp', 'yp2')])), total=len(eval_['idxs'])):
+        id_, q, rx, answers = (data[key][idx] for key in ('ids', 'q', '*x', 'answerss'))
+        x = shared['x'][rx[0]][rx[1]]
+        ques = [" ".join(q)]
+        para = [list(sent) for sent in x]
+        span = get_best_span(ypi, yp2i)
+        ap = get_segment(para, span)
+        # Score = start probability times the probability at the last span
+        # position (the stop index is used as exclusive, hence the -1).
+        score = "{:.3f}".format(ypi[span[0][0]][span[0][1]] * yp2i[span[1][0]][span[1][1]-1])
+
+        row = {
+            'id': id_,
+            'title': "Hello world!",
+            'ques': ques,
+            'para': para,
+            'y': yi[0][0],
+            'y2': yi[0][1],
+            'yp': ypi,
+            'yp2': yp2i,
+            'a': answers,
+            'ap': ap,
+            'score': score
+               }
+        rows.append(row)
+
+        # The page is named after the index of its first example.
+        if i % num_per_page == 0:
+            html_path = os.path.join(html_dir, "%s.html" % str(i).zfill(8))
+
+        # Flush when the page is full or the last example has been added.
+        if (i + 1) % num_per_page == 0 or (i + 1) == len(eval_['y']):
+            var_dict = {'title': "Accuracy Visualization",
+                        'rows': rows
+                        }
+            with open(html_path, "wb") as f:
+                f.write(template.render(**var_dict).encode('UTF-8'))
+            rows = []
+
+    os.chdir(html_dir)
+    port = args.port
+    host = args.host
+    # Overriding to suppress log message
+    class MyHandler(http.server.SimpleHTTPRequestHandler):
+        def log_message(self, format, *args):
+            pass
+    httpd = socketserver.TCPServer((host, port), MyHandler)
+    if args.open == 'True':
+        os.system("open http://%s:%d" % (args.host, args.port))
+    print("serving at %s:%d" % (host, port))
+    httpd.serve_forever()
+
+
+def get_segment(para, span):
+    """Return the words of ``span`` joined by spaces.
+
+    ``span`` is ``((sent_idx, start), (_, stop))``: words ``start`` up to but
+    not including ``stop`` are taken from sentence ``sent_idx`` of ``para``.
+    Only the start's sentence index is used.
+    """
+    sent_idx = span[0][0]
+    start = span[0][1]
+    stop = span[1][1]
+    words = para[sent_idx][start:stop]
+    return " ".join(words)
+
+
+if __name__ == "__main__":
+    # Script entry point: parse the CLI options and start the visualizer.
+    accuracy2_visualizer(get_args())
--- a/tensorflow/SQuAD/cnn_dm/init.py
+++ b/tensorflow/SQuAD/cnn_dm/init.py
--- a/tensorflow/SQuAD/cnn_dm/eda.ipynb
+++ b/tensorflow/SQuAD/cnn_dm/eda.ipynb
--- a/tensorflow/SQuAD/cnn_dm/evaluate.py
+++ b/tensorflow/SQuAD/cnn_dm/evaluate.py
@ -0,0 +1,38 @@
+import json
+import os
+import sys
+
+# Usage: evaluate.py <root_dir> <answer_path>
+# root_dir holds the *.question files; answer_path is a JSON object mapping
+# a question file name to the predicted answer.
+root_dir = sys.argv[1]
+answer_path = sys.argv[2]
+file_names = os.listdir(root_dir)
+
+num_correct = 0
+num_wrong = 0
+
+with open(answer_path, 'r') as fh:
+    id2answer_dict = json.load(fh)
+
+for file_name in file_names:
+    if not file_name.endswith(".question"):
+        continue
+    with open(os.path.join(root_dir, file_name), 'r') as fh:
+        # The gold answer is on the 7th line; the six lines before it are not needed.
+        for _ in range(6):
+            fh.readline()
+        answer = fh.readline().strip()
+    # A missing prediction counts as wrong, exactly like a mismatching one.
+    if file_name in id2answer_dict and id2answer_dict[file_name] == answer:
+        num_correct += 1
+    else:
+        num_wrong += 1
+
+total = num_correct + num_wrong
+# Avoid ZeroDivisionError when the directory holds no .question files.
+acc = float(num_correct) / total if total else 0.0
+print("{} = {} / {}".format(acc, num_correct, total))
--- a/tensorflow/SQuAD/cnn_dm/prepro.py
+++ b/tensorflow/SQuAD/cnn_dm/prepro.py
@ -0,0 +1,185 @@
+import argparse
+import json
+import os
+# data: q, cq, (dq), (pq), y, *x, *cx
+# shared: x, cx, (dx), (px), word_counter, char_counter, word2vec
+# no metadata
+from collections import Counter
+
+from tqdm import tqdm
+
+from my.utils import process_tokens
+from squad.utils import get_word_span, process_tokens
+
+
+def bool_(arg):
+    """Map the literal strings ``'True'`` / ``'False'`` to the matching bool.
+
+    Any other value raises ``Exception`` carrying the offending argument.
+    """
+    if arg not in ('True', 'False'):
+        raise Exception(arg)
+    return arg == 'True'
+
+
+def main():
+    """Parse the command line and run the preprocessing."""
+    prepro(get_args())
+
+
+def get_args():
+    """Parse the preprocessing options; path defaults live under the user's home directory."""
+    home_dir = os.path.expanduser("~")
+    # (flag, add_argument keyword arguments) for every supported option.
+    option_specs = [
+        ("--source_dir", {'default': os.path.join(home_dir, "data", "cnn", 'questions')}),
+        ("--target_dir", {'default': "data/cnn"}),
+        ("--glove_dir", {'default': os.path.join(home_dir, "data", "glove")}),
+        ("--glove_corpus", {'default': '6B'}),
+        ("--glove_vec_size", {'default': 100, 'type': int}),
+        ("--debug", {'default': False, 'type': bool_}),
+        ("--num_sents_th", {'default': 200, 'type': int}),
+        ("--ques_size_th", {'default': 30, 'type': int}),
+        ("--width", {'default': 5, 'type': int}),
+    ]
+    parser = argparse.ArgumentParser()
+    for flag, options in option_specs:
+        parser.add_argument(flag, **options)
+    # TODO : put more args here
+    return parser.parse_args()
+
+
+def prepro(args):
+    """Preprocess the train, dev and test splits, in that order."""
+    for mode in ('train', 'dev', 'test'):
+        prepro_each(args, mode)
+
+
+def para2sents(para, width):
+    """
+    Turn para into a list of word windows, one per entity token.
+    An entity token is a word starting with "@"; its window holds the token
+    plus up to `width` neighbouring words on each side (clipped at the
+    paragraph boundaries).
+    :param para: paragraph string; words are separated by single spaces
+    :param width: number of neighbours kept on each side of an entity
+    :return: list of word lists, in order of entity occurrence
+    """
+    tokens = para.split(" ")
+    num_tokens = len(tokens)
+    return [tokens[max(pos - width, 0):min(pos + width + 1, num_tokens)]
+            for pos, token in enumerate(tokens)
+            if token.startswith("@")]
+
+
+def get_word2vec(args, word_counter):
+    """Load GloVe vectors for the words that occur in ``word_counter``.
+
+    Reads ``glove.<glove_corpus>.<glove_vec_size>d.txt`` from
+    ``args.glove_dir``. Each GloVe word is matched against the vocabulary as
+    is, then capitalized, lower-cased and upper-cased; the first variant found
+    in ``word_counter`` becomes the key.
+
+    Args:
+        args: Namespace with ``glove_dir``, ``glove_corpus`` and ``glove_vec_size``.
+        word_counter: Container of vocabulary words (only membership is used).
+
+    Returns:
+        Dict mapping a vocabulary word to its vector (list of floats).
+    """
+    glove_path = os.path.join(args.glove_dir, "glove.{}.{}d.txt".format(args.glove_corpus, args.glove_vec_size))
+    sizes = {'6B': int(4e5), '42B': int(1.9e6), '840B': int(2.2e6), '2B': int(1.2e6)}
+    # The size only feeds the progress bar; an unlisted corpus simply gets no
+    # total instead of failing with KeyError.
+    total = sizes.get(args.glove_corpus)
+    word2vec_dict = {}
+    with open(glove_path, 'r', encoding='utf-8') as fh:
+        for line in tqdm(fh, total=total):
+            array = line.strip().split(" ")
+            word = array[0]
+            vector = list(map(float, array[1:]))
+            if word in word_counter:
+                word2vec_dict[word] = vector
+            elif word.capitalize() in word_counter:
+                word2vec_dict[word.capitalize()] = vector
+            elif word.lower() in word_counter:
+                word2vec_dict[word.lower()] = vector
+            elif word.upper() in word_counter:
+                word2vec_dict[word.upper()] = vector
+
+    print("{}/{} of word vocab have corresponding vectors in {}".format(len(word2vec_dict), len(word_counter), glove_path))
+    return word2vec_dict
+
+
+def _count_words(words, word_counter, lower_word_counter, ent_counter, char_counter):
+    """Update the vocabulary counters in place for one list of tokens."""
+    for word in words:
+        word_counter[word] += 1
+        if word.startswith("@"):
+            # Entity tokens are counted as entities, not as lower-cased words or characters.
+            ent_counter[word] += 1
+        else:
+            lower_word_counter[word.lower()] += 1
+            char_counter.update(word)
+
+
+def prepro_each(args, mode):
+    """Collect vocabulary statistics for one split and write ``shared_<mode>.json``.
+
+    Reads every ``*.question`` file in ``<args.source_dir>/<mode>`` (only the
+    first 1000 directory entries when ``args.debug`` is set). The paragraph is
+    the 3rd line of a file and the question the 5th. Examples with more than
+    ``args.num_sents_th`` entity windows or more than ``args.ques_size_th``
+    question words are dropped.
+
+    Args:
+        args: Namespace as returned by ``get_args``.
+        mode: Split name; used as sub-directory and in the output file name.
+    """
+    source_dir = os.path.join(args.source_dir, mode)
+    word_counter = Counter()
+    lower_word_counter = Counter()
+    ent_counter = Counter()
+    char_counter = Counter()
+    max_sent_size = 0
+    max_word_size = 0
+    max_ques_size = 0
+    max_num_sents = 0
+
+    file_names = list(os.listdir(source_dir))
+    if args.debug:
+        file_names = file_names[:1000]
+    lens = []
+
+    out_file_names = []
+    for file_name in tqdm(file_names, total=len(file_names)):
+        if not file_name.endswith(".question"):
+            continue
+        with open(os.path.join(source_dir, file_name), 'r') as fh:
+            fh.readline()
+            fh.readline()
+            para = fh.readline().strip()
+            fh.readline()
+            ques = fh.readline().strip()
+        sents = para2sents(para, args.width)
+        ques_words = ques.split(" ")
+
+        # Filtering
+        if len(sents) > args.num_sents_th or len(ques_words) > args.ques_size_th:
+            continue
+
+        # default=0 keeps paragraphs without any entity (no windows) from
+        # crashing max() on an empty sequence.
+        max_sent_size = max(max(map(len, sents), default=0), max_sent_size)
+        max_ques_size = max(len(ques_words), max_ques_size)
+        max_word_size = max(max((len(word) for sent in sents for word in sent), default=0), max_word_size)
+        max_num_sents = max(len(sents), max_num_sents)
+
+        _count_words(ques_words, word_counter, lower_word_counter, ent_counter, char_counter)
+        for sent in sents:
+            _count_words(sent, word_counter, lower_word_counter, ent_counter, char_counter)
+
+        out_file_names.append(file_name)
+        lens.append(len(sents))
+    num_examples = len(out_file_names)
+
+    assert len(out_file_names) == len(lens)
+    if out_file_names:
+        # Order the examples by their number of windows, shortest first.
+        sorted_file_names, lens = zip(*sorted(zip(out_file_names, lens), key=lambda each: each[1]))
+        assert lens[-1] == max_num_sents
+    else:
+        # zip(*[]) cannot be unpacked into two names; an empty split is still valid output.
+        sorted_file_names = ()
+
+    word2vec_dict = get_word2vec(args, word_counter)
+    lower_word2vec_dict = get_word2vec(args, lower_word_counter)
+
+    shared = {'word_counter': word_counter, 'ent_counter': ent_counter, 'char_counter': char_counter,
+              'lower_word_counter': lower_word_counter,
+              'max_num_sents': max_num_sents, 'max_sent_size': max_sent_size, 'max_word_size': max_word_size,
+              'max_ques_size': max_ques_size,
+              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict, 'sorted': sorted_file_names,
+              'num_examples': num_examples}
+
+    print("max num sents: {}".format(max_num_sents))
+    print("max ques size: {}".format(max_ques_size))
+
+    if not os.path.exists(args.target_dir):
+        os.makedirs(args.target_dir)
+    shared_path = os.path.join(args.target_dir, "shared_{}.json".format(mode))
+    with open(shared_path, 'w') as fh:
+        json.dump(shared, fh)
+
+
+if __name__ == "__main__":
+    main()
--- a/tensorflow/SQuAD/download.sh
+++ b/tensorflow/SQuAD/download.sh
@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+# Downloads SQuAD v1.1, GloVe 6B and the NLTK punkt tokenizer under $HOME.
+# Paths are quoted so a home directory containing spaces works, and
+# `mkdir -p` lets the script be re-run when the directories already exist.
+
+DATA_DIR="$HOME/data"
+mkdir -p "$DATA_DIR"
+
+# Download SQuAD
+SQUAD_DIR="$DATA_DIR/squad"
+mkdir -p "$SQUAD_DIR"
+wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O "$SQUAD_DIR/train-v1.1.json"
+wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O "$SQUAD_DIR/dev-v1.1.json"
+
+
+# Download CNN and DailyMail
+# Download at: http://cs.nyu.edu/~kcho/DMQA/
+
+
+# Download GloVe
+GLOVE_DIR="$DATA_DIR/glove"
+mkdir -p "$GLOVE_DIR"
+wget http://nlp.stanford.edu/data/glove.6B.zip -O "$GLOVE_DIR/glove.6B.zip"
+unzip "$GLOVE_DIR/glove.6B.zip" -d "$GLOVE_DIR"
+
+# Download NLTK (for tokenizer)
+# Make sure that nltk is installed!
+python3 -m nltk.downloader -d "$HOME/nltk_data" punkt
--- a/tensorflow/SQuAD/my/init.py
+++ b/tensorflow/SQuAD/my/init.py
--- a/tensorflow/SQuAD/my/corenlp_interface.py
+++ b/tensorflow/SQuAD/my/corenlp_interface.py
@ -0,0 +1,55 @@
+import logging
+
+import requests
+import nltk
+import json
+import networkx as nx
+import time
+
+
+class CoreNLPInterface(object):
+    """Small HTTP client for a CoreNLP wrapper server.
+
+    Requests are POSTed to ``http://<url>:<port>/<type_>``.
+    """
+
+    def __init__(self, url, port):
+        self._url = url
+        self._port = port
+
+    def get(self, type_, in_, num_max_requests=100):
+        """POST ``in_`` (UTF-8 encoded) to the ``type_`` endpoint.
+
+        A failed attempt is retried after a one-second pause, up to
+        ``num_max_requests`` attempts in total.
+
+        Returns:
+            The decoded response body, or ``None`` if the server answered
+            with the literal string ``'error'`` or every attempt failed.
+        """
+        in_ = in_.encode("utf-8")
+        url = "http://{}:{}/{}".format(self._url, self._port, type_)
+        out = None
+        for _ in range(num_max_requests):
+            try:
+                r = requests.post(url, data=in_)
+                out = r.content.decode('utf-8')
+                if out == 'error':
+                    out = None
+                break
+            except Exception:
+                # Best-effort retry. Unlike a bare ``except``, this lets
+                # KeyboardInterrupt / SystemExit stop the loop.
+                time.sleep(1)
+        return out
+
+    def split_doc(self, doc):
+        """Return the JSON-decoded answer of the ``doc`` endpoint, or ``None``."""
+        out = self.get("doc", doc)
+        return out if out is None else json.loads(out)
+
+    def split_sent(self, sent):
+        """Return the JSON-decoded answer of the ``sent`` endpoint, or ``None``."""
+        out = self.get("sent", sent)
+        return out if out is None else json.loads(out)
+
+    def get_dep(self, sent):
+        """Return the JSON-decoded answer of the ``dep`` endpoint, or ``None``."""
+        out = self.get("dep", sent)
+        return out if out is None else json.loads(out)
+
+    def get_const(self, sent):
+        """Return the raw string answer of the ``const`` endpoint, or ``None``."""
+        out = self.get("const", sent)
+        return out
+
+    def get_const_tree(self, sent):
+        """Return the ``const`` answer parsed into an ``nltk.tree.Tree``, or ``None``."""
+        out = self.get_const(sent)
+        return out if out is None else nltk.tree.Tree.fromstring(out)
+
+    @staticmethod
+    def dep2tree(dep):
+        """Build a directed graph from ``(dependent, i, governor, j, label)`` tuples.
+
+        Every tuple adds an edge governor -> dependent carrying ``label``;
+        the two index fields are ignored.
+        """
+        tree = nx.DiGraph()
+        # Loop names deliberately differ from the ``dep`` parameter.
+        for dependent, _i, governor, _j, label in dep:
+            tree.add_edge(governor, dependent, label=label)
+        return tree
--- a/tensorflow/SQuAD/my/nltk_utils.py
+++ b/tensorflow/SQuAD/my/nltk_utils.py
@ -0,0 +1,129 @@
+import nltk
+import numpy as np
+
+
+def _set_span(t, i):
+    """Recursively attach a ``span`` attribute to ``t`` and all of its subtrees.
+
+    ``i`` is the index of the first leaf under ``t``. A node whose first child
+    is a string gets ``(i, i + len(t))``; any other node gets the start of its
+    first child's span and the end of its last child's span. Returns ``t.span``.
+    """
+    if isinstance(t[0], str):
+        t.span = (i, i+len(t))
+        return t.span
+
+    cursor = i
+    child_spans = []
+    for child in t:
+        child_span = _set_span(child, cursor)
+        # The next child starts where this one ended.
+        cursor = child_span[1]
+        child_spans.append(child_span)
+    t.span = (child_spans[0][0], child_spans[-1][1])
+    return t.span
+
+
+def set_span(t):
+    """Attach ``span`` attributes to every subtree of ``t`` and return the root span.
+
+    On failure the offending tree is printed and the original exception is
+    re-raised, so the traceback stays available and the process no longer
+    terminates through a bare ``exit()``.
+    """
+    assert isinstance(t, nltk.tree.Tree)
+    try:
+        return _set_span(t, 0)
+    except Exception:
+        print(t)
+        raise
+
+
+def tree_contains_span(tree, span):
+    """
+    Assumes that spans have already been set on the tree with set_span
+    Returns true if some subtree of the tree has exactly the given span
+    :param tree:
+    :param span:
+    :return bool:
+    """
+    known_spans = {subtree.span for subtree in tree.subtrees()}
+    return span in known_spans
+
+
+def span_len(span):
+    """Return the length of a ``(start, stop)`` span, i.e. ``stop - start``."""
+    start = span[0]
+    stop = span[1]
+    return stop - start
+
+
+def span_overlap(s1, s2):
+    """Return the ``(start, stop)`` intersection of two spans, or ``None`` if it is empty."""
+    start, stop = max(s1[0], s2[0]), min(s1[1], s2[1])
+    return (start, stop) if stop > start else None
+
+
+def span_prec(true_span, pred_span):
+    """Return the fraction of ``pred_span`` covered by ``true_span`` (0 without overlap)."""
+    shared = span_overlap(true_span, pred_span)
+    return 0 if shared is None else span_len(shared) / span_len(pred_span)
+
+
+def span_recall(true_span, pred_span):
+    """Return the fraction of ``true_span`` covered by ``pred_span`` (0 without overlap)."""
+    shared = span_overlap(true_span, pred_span)
+    return 0 if shared is None else span_len(shared) / span_len(true_span)
+
+
+def span_f1(true_span, pred_span):
+    """Return the harmonic mean of span precision and recall (0.0 if either is 0)."""
+    precision = span_prec(true_span, pred_span)
+    recall = span_recall(true_span, pred_span)
+    if precision != 0 and recall != 0:
+        return 2 * precision * recall / (precision + recall)
+    return 0.0
+
+
+def find_max_f1_span(tree, span):
+    """Return the span of the subtree whose span has the highest F1 against ``span``."""
+    best_subtree = find_max_f1_subtree(tree, span)
+    return best_subtree.span
+
+
+def find_max_f1_subtree(tree, span):
+    """Return the subtree whose span has the highest F1 against ``span``.
+
+    On ties the first such subtree in ``tree.subtrees()`` order wins.
+    """
+    def _score(subtree):
+        return span_f1(span, subtree.span)
+
+    return max(tree.subtrees(), key=_score)
+
+
+def tree2matrix(tree, node2num, row_size=None, col_size=None, dtype='int32'):
+    """Encode a tree as a (height x leaf) matrix plus a boolean mask.
+
+    Spans are set on the tree first. Each subtree is written at row
+    ``height - 2`` and column ``span[0]`` with the value ``node2num(subtree)``.
+    For every descendant of that subtree the mask entry
+    ``[row, col, descendant.span[0]]`` is set.
+
+    Args:
+        tree: Tree to encode.
+        node2num: Callable mapping a subtree to the number stored for it.
+        row_size: Number of rows; defaults to ``tree.height() - 1``.
+        col_size: Number of columns; defaults to the number of leaves.
+        dtype: dtype of the value matrix.
+
+    Returns:
+        ``(matrix, mask)`` of shapes ``[rows, cols]`` and ``[rows, cols, cols]``.
+    """
+    set_span(tree)
+    depth = tree.height() - 1
+    num_leaves = len(tree.leaves())
+    num_rows = row_size or depth
+    num_cols = col_size or num_leaves
+    matrix = np.zeros([num_rows, num_cols], dtype=dtype)
+    mask = np.zeros([num_rows, num_cols, num_cols], dtype='bool')
+
+    for node in tree.subtrees():
+        row = node.height() - 2
+        col = node.span[0]
+        matrix[row, col] = node2num(node)
+        for descendant in node.subtrees():
+            if not isinstance(descendant, nltk.tree.Tree):
+                mask[row, col, col] = True
+                continue
+            start = descendant.span[0]
+            mask[row, col, start] = True
+            if not isinstance(descendant[0], nltk.tree.Tree):
+                # The descendant's first child is not a Tree: also mark the
+                # diagonal entry of this column in every row below `row`.
+                for lower_row in range(row):
+                    mask[lower_row, start, start] = True
+
+    return matrix, mask
+
+
+def load_compressed_tree(s):
+    """Parse a bracketed tree string and collapse chains of single-child nodes.
+
+    A node whose only child is itself a tree is replaced by the (compressed)
+    child; a node whose only child is not a tree is kept unchanged.
+    """
+
+    def compress_tree(tree):
+        assert not isinstance(tree, str)
+        if len(tree) != 1:
+            # Several children: compress each tree-valued child in place.
+            for pos, child in enumerate(tree):
+                tree[pos] = compress_tree(child) if isinstance(child, nltk.tree.Tree) else child
+            return tree
+        only_child = tree[0]
+        if isinstance(only_child, nltk.tree.Tree):
+            return compress_tree(only_child)
+        return tree
+
+    parsed = nltk.tree.Tree.fromstring(s)
+    return compress_tree(parsed)
+
+
+
--- a/tensorflow/SQuAD/my/tensorflow/init.py
+++ b/tensorflow/SQuAD/my/tensorflow/init.py
@ -0,0 +1 @@
+from my.tensorflow.general import *
--- a/tensorflow/SQuAD/my/tensorflow/general.py
+++ b/tensorflow/SQuAD/my/tensorflow/general.py
@ -0,0 +1,177 @@
+from itertools import zip_longest
+
+import itertools
+import tensorflow as tf
+from functools import reduce
+from operator import mul
+import numpy as np
+
+VERY_BIG_NUMBER = 1e30
+VERY_SMALL_NUMBER = 1e-30
+VERY_POSITIVE_NUMBER = VERY_BIG_NUMBER
+VERY_NEGATIVE_NUMBER = -VERY_BIG_NUMBER
+
+
+def get_initializer(matrix):
+    """Return an initializer callable that ignores its arguments and yields ``matrix``."""
+    def _initializer(shape, dtype=None, partition_info=None, **kwargs):
+        # Shape, dtype and partition info are accepted but never used.
+        return matrix
+
+    return _initializer
+
+
+def variable_on_cpu(name, shape, initializer):
+    """Create (or fetch) a variable placed on ``/cpu:0``.
+
+    Args:
+      name: name of the variable
+      shape: list of ints
+      initializer: initializer for Variable
+
+    Returns:
+      Variable Tensor
+    """
+    with tf.device('/cpu:0'):
+        return tf.get_variable(name, shape, initializer=initializer)
+
+
+def variable_with_weight_decay(name, shape, stddev, wd):
+    """Create a truncated-normal initialized CPU variable with optional L2 decay.
+
+    When ``wd`` is truthy, ``wd * l2_loss(var)`` is added to the ``'losses'``
+    collection under the op name ``'weight_loss'``.
+
+    Args:
+      name: name of the variable
+      shape: list of ints
+      stddev: standard deviation of a truncated Gaussian
+      wd: L2 weight-decay factor. If None (or otherwise falsy), no decay
+          term is added for this Variable.
+
+    Returns:
+      Variable Tensor
+    """
+    initializer = tf.truncated_normal_initializer(stddev=stddev)
+    var = variable_on_cpu(name, shape, initializer)
+    if not wd:
+        return var
+    decay_term = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
+    tf.add_to_collection('losses', decay_term)
+    return var
+
+
+def average_gradients(tower_grads):
+    """Average the gradient of every shared variable over all towers.
+
+    Args:
+      tower_grads: One list per tower, each holding (gradient, variable)
+        tuples in the same variable order.
+    Returns:
+       List of (gradient, variable) pairs where the gradient is the mean over
+       the towers and the variable is the one from the first tower.
+    """
+    average_grads = []
+    # zip(*...) regroups the per-tower lists by variable:
+    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
+    for per_variable in zip(*tower_grads):
+        stacked = []
+        for g, var in per_variable:
+            assert g is not None, var.name
+            # Leading axis of size 1 = the tower axis to average over.
+            stacked.append(tf.expand_dims(g, 0))
+
+        joined = tf.concat(axis=0, values=stacked)
+        mean_grad = tf.reduce_mean(joined, 0)
+
+        # The variables are shared across towers, so the first tower's
+        # reference stands for all of them.
+        shared_var = per_variable[0][1]
+        average_grads.append((mean_grad, shared_var))
+    return average_grads
+
+
+def mask(val, mask, name=None):
+    """Multiply ``val`` by ``mask`` cast to float; the op is named ``name`` or ``'mask'``."""
+    op_name = 'mask' if name is None else name
+    float_mask = tf.cast(mask, 'float')
+    return tf.multiply(val, float_mask, name=op_name)
+
+
+def exp_mask(val, mask, name=None):
+    """Add a very negative number to the elements of val where mask is False.
+    The added amount is (1 - mask) * VERY_NEGATIVE_NUMBER, so elements where
+    mask is True are left unchanged.
+    Typically, this effectively masks in exponential space (e.g. softmax)
+    Args:
+        val: values to be masked
+        mask: masking boolean tensor, same shape as tensor
+        name: name for output tensor
+
+    Returns:
+        Same shape as val, where masked-out elements are very negative
+        (exponentially zero)
+    """
+    op_name = "exp_mask" if name is None else name
+    penalty = (1 - tf.cast(mask, 'float')) * VERY_NEGATIVE_NUMBER
+    return tf.add(val, penalty, name=op_name)
+
+
+def flatten(tensor, keep):
+    """Collapse all but the last ``keep`` dimensions of ``tensor`` into one.
+
+    Statically known dimensions are used as they are; unknown ones fall back
+    to the dynamic ``tf.shape`` value.
+    """
+    static_shape = tensor.get_shape().as_list()
+    rank = len(static_shape)
+    split = rank - keep
+
+    def _dim(axis):
+        return static_shape[axis] or tf.shape(tensor)[axis]
+
+    leading = reduce(mul, [_dim(axis) for axis in range(split)])
+    trailing = [_dim(axis) for axis in range(split, rank)]
+    return tf.reshape(tensor, [leading] + trailing)
+
+
+def reconstruct(tensor, ref, keep):
+    """Reshape ``tensor`` to ``ref``'s leading dims plus its own last ``keep`` dims.
+
+    The leading dimensions are all of ``ref``'s except its last ``keep``.
+    Statically known sizes are used as they are; unknown ones fall back to
+    the dynamic ``tf.shape`` value.
+    """
+    ref_static = ref.get_shape().as_list()
+    tensor_static = tensor.get_shape().as_list()
+    num_leading = len(ref_static) - keep
+    first_kept = len(tensor_static) - keep
+
+    leading_dims = [ref_static[axis] or tf.shape(ref)[axis]
+                    for axis in range(num_leading)]
+    kept_dims = [tensor_static[axis] or tf.shape(tensor)[axis]
+                 for axis in range(first_kept, len(tensor_static))]
+    return tf.reshape(tensor, leading_dims + kept_dims)
+
+
+def add_wd(wd, scope=None):
+    """Add ``wd * l2_loss(var)`` to the ``'losses'`` collection for each trainable variable.
+
+    Only variables under ``scope`` are considered; it defaults to the name of
+    the current variable scope.
+    """
+    target_scope = scope or tf.get_variable_scope().name
+    trainables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=target_scope)
+    with tf.name_scope("weight_decay"):
+        for var in trainables:
+            term_name = "{}/wd".format(var.op.name)
+            decay_term = tf.multiply(tf.nn.l2_loss(var), wd, name=term_name)
+            tf.add_to_collection('losses', decay_term)
+
+
+def grouper(iterable, n, fillvalue=None, shorten=False, num_groups=None):
+    """Split ``iterable`` into consecutive tuples of length ``n``.
+
+    The last tuple is padded with ``fillvalue``. With ``num_groups`` the
+    result is padded with all-``fillvalue`` tuples up to at least that many
+    groups (it is never truncated). With ``shorten`` the ``None`` entries are
+    dropped from every tuple, which requires ``fillvalue`` to be ``None``; in
+    that case a generator is returned instead of a list.
+    """
+    source = iter(iterable)
+    # n references to one iterator make zip_longest pull n items per tuple.
+    groups = list(zip_longest(*([source] * n), fillvalue=fillvalue))
+    if num_groups is not None:
+        assert isinstance(num_groups, int)
+        padding = (fillvalue, ) * n
+        missing = num_groups - len(groups)
+        if missing > 0:
+            groups.extend([padding] * missing)
+    if not shorten:
+        return groups
+    assert fillvalue is None
+    return (tuple(item for item in group if item is not None) for group in groups)
+
+def padded_reshape(tensor, shape, mode='CONSTANT', name=None):
+    """Pad ``tensor`` at the end of every axis so that it reaches ``shape``.
+
+    For axis ``i`` nothing is added in front and ``shape[i] - current size``
+    elements are added behind; ``mode`` and ``name`` are passed to ``tf.pad``.
+    """
+    paddings = []
+    for axis in range(len(shape)):
+        paddings.append([0, shape[axis] - tf.shape(tensor)[axis]])
+    return tf.pad(tensor, paddings, mode=mode, name=name)
+
+
+def get_num_params():
+    """Return the total number of elements over all trainable variables."""
+    return sum(
+        reduce(mul, [dim.value for dim in variable.get_shape()], 1)
+        for variable in tf.trainable_variables()
+    )
--- a/tensorflow/SQuAD/my/tensorflow/nn.py
+++ b/tensorflow/SQuAD/my/tensorflow/nn.py
@ -0,0 +1,180 @@
+from tensorflow.python.ops.rnn_cell_impl import _linear
+from tensorflow.python.util import nest
+import tensorflow as tf
+
+from my.tensorflow import flatten, reconstruct, add_wd, exp_mask
+
+
+def linear(args, output_size, bias, bias_start=0.0, scope=None, squeeze=False, wd=0.0, input_keep_prob=1.0,
+           is_train=None):
+    if args is None or (nest.is_sequence(args) and not args):
+        raise ValueError("`args` must be specified")
+    if not nest.is_sequence(args):
+        args = [args]
+
+    flat_args = [flatten(arg, 1) for arg in args]
+    if input_keep_prob < 1.0:
+        assert is_train is not None
+        flat_args = [tf.cond(is_train, lambda: tf.nn.dropout(arg, input_keep_prob), lambda: arg)
+                     for arg in flat_args]
+    with tf.variable_scope(scope or 'Linear'):
+        flat_out = _linear(flat_args, output_size, bias, bias_initializer=tf.constant_initializer(bias_start))
+    out = reconstruct(flat_out, args[0], 1)
+    if squeeze:
+        out = tf.squeeze(out, [len(args[0].get_shape().as_list())-1])
+    if wd:
+        add_wd(wd)
+
+    return out
+
+
+def dropout(x, keep_prob, is_train, noise_shape=None, seed=None, name=None):
+    with tf.name_scope(name or "dropout"):
+        if keep_prob < 1.0:
+            d = tf.nn.dropout(x, keep_prob, noise_shape=noise_shape, seed=seed)
+            out = tf.cond(is_train, lambda: d, lambda: x)
+            return out
+        return x
+
+
+def softmax(logits, mask=None, scope=None):
+    with tf.name_scope(scope or "Softmax"):
+        if mask is not None:
+            logits = exp_mask(logits, mask)
+        flat_logits = flatten(logits, 1)
+        flat_out = tf.nn.softmax(flat_logits)
+        out = reconstruct(flat_out, logits, 1)
+
+        return out
+
+
+def softsel(target, logits, mask=None, scope=None):
+    """
+
+    :param target: [ ..., J, d] dtype=float
+    :param logits: [ ..., J], dtype=float
+    :param mask: [ ..., J], dtype=bool
+    :param scope:
+    :return: [..., d], dtype=float
+    """
+    with tf.name_scope(scope or "Softsel"):
+        a = softmax(logits, mask=mask)
+        target_rank = len(target.get_shape().as_list())
+        out = tf.reduce_sum(tf.expand_dims(a, -1) * target, target_rank - 2)
+        return out
+
+
+def double_linear_logits(args, size, bias, bias_start=0.0, scope=None, mask=None, wd=0.0, input_keep_prob=1.0, is_train=None):
+    with tf.variable_scope(scope or "Double_Linear_Logits"):
+        first = tf.tanh(linear(args, size, bias, bias_start=bias_start, scope='first',
+                               wd=wd, input_keep_prob=input_keep_prob, is_train=is_train))
+        second = linear(first, 1, bias, bias_start=bias_start, squeeze=True, scope='second',
+                        wd=wd, input_keep_prob=input_keep_prob, is_train=is_train)
+        if mask is not None:
+            second = exp_mask(second, mask)
+        return second
+
+
+def linear_logits(args, bias, bias_start=0.0, scope=None, mask=None, wd=0.0, input_keep_prob=1.0, is_train=None):
+    with tf.variable_scope(scope or "Linear_Logits"):
+        logits = linear(args, 1, bias, bias_start=bias_start, squeeze=True, scope='first',
+                        wd=wd, input_keep_prob=input_keep_prob, is_train=is_train)
+        if mask is not None:
+            logits = exp_mask(logits, mask)
+        return logits
+
+
+def sum_logits(args, mask=None, name=None):
+    with tf.name_scope(name or "sum_logits"):
+        if args is None or (nest.is_sequence(args) and not args):
+            raise ValueError("`args` must be specified")
+        if not nest.is_sequence(args):
+            args = [args]
+        rank = len(args[0].get_shape())
+        logits = sum(tf.reduce_sum(arg, rank-1) for arg in args)
+        if mask is not None:
+            logits = exp_mask(logits, mask)
+        return logits
+
+
+def get_logits(args, size, bias, bias_start=0.0, scope=None, mask=None, wd=0.0, input_keep_prob=1.0, is_train=None, func=None):
+    if func is None:
+        func = "sum"
+    if func == 'sum':
+        return sum_logits(args, mask=mask, name=scope)
+    elif func == 'linear':
+        return linear_logits(args, bias, bias_start=bias_start, scope=scope, mask=mask, wd=wd, input_keep_prob=input_keep_prob,
+                             is_train=is_train)
+    elif func == 'double':
+        return double_linear_logits(args, size, bias, bias_start=bias_start, scope=scope, mask=mask, wd=wd, input_keep_prob=input_keep_prob,
+                                    is_train=is_train)
+    elif func == 'dot':
+        assert len(args) == 2
+        arg = args[0] * args[1]
+        return sum_logits([arg], mask=mask, name=scope)
+    elif func == 'mul_linear':
+        assert len(args) == 2
+        arg = args[0] * args[1]
+        return linear_logits([arg], bias, bias_start=bias_start, scope=scope, mask=mask, wd=wd, input_keep_prob=input_keep_prob,
+                             is_train=is_train)
+    elif func == 'proj':
+        assert len(args) == 2
+        d = args[1].get_shape()[-1]
+        proj = linear([args[0]], d, False, bias_start=bias_start, scope=scope, wd=wd, input_keep_prob=input_keep_prob,
+                      is_train=is_train)
+        return sum_logits([proj * args[1]], mask=mask)
+    elif func == 'tri_linear':
+        assert len(args) == 2
+        new_arg = args[0] * args[1]
+        return linear_logits([args[0], args[1], new_arg], bias, bias_start=bias_start, scope=scope, mask=mask, wd=wd, input_keep_prob=input_keep_prob,
+                             is_train=is_train)
+    else:
+        raise Exception()
+
+
+def highway_layer(arg, bias, bias_start=0.0, scope=None, wd=0.0, input_keep_prob=1.0, is_train=None):
+    with tf.variable_scope(scope or "highway_layer"):
+        d = arg.get_shape()[-1]
+        trans = linear([arg], d, bias, bias_start=bias_start, scope='trans', wd=wd, input_keep_prob=input_keep_prob, is_train=is_train)
+        trans = tf.nn.relu(trans)
+        gate = linear([arg], d, bias, bias_start=bias_start, scope='gate', wd=wd, input_keep_prob=input_keep_prob, is_train=is_train)
+        gate = tf.nn.sigmoid(gate)
+        out = gate * trans + (1 - gate) * arg
+        return out
+
+
+def highway_network(arg, num_layers, bias, bias_start=0.0, scope=None, wd=0.0, input_keep_prob=1.0, is_train=None):
+    with tf.variable_scope(scope or "highway_network"):
+        prev = arg
+        cur = None
+        for layer_idx in range(num_layers):
+            cur = highway_layer(prev, bias, bias_start=bias_start, scope="layer_{}".format(layer_idx), wd=wd,
+                                input_keep_prob=input_keep_prob, is_train=is_train)
+            prev = cur
+        return cur
+
+
+def conv1d(in_, filter_size, height, padding, is_train=None, keep_prob=1.0, scope=None):
+    with tf.variable_scope(scope or "conv1d"):
+        num_channels = in_.get_shape()[-1]
+        filter_ = tf.get_variable("filter", shape=[1, height, num_channels, filter_size], dtype='float')
+        bias = tf.get_variable("bias", shape=[filter_size], dtype='float')
+        strides = [1, 1, 1, 1]
+        if is_train is not None and keep_prob < 1.0:
+            in_ = dropout(in_, keep_prob, is_train)
+        xxc = tf.nn.conv2d(in_, filter_, strides, padding) + bias  # [N*M, JX, W/filter_stride, d]
+        out = tf.reduce_max(tf.nn.relu(xxc), 2)  # [-1, JX, d]
+        return out
+
+
+def multi_conv1d(in_, filter_sizes, heights, padding, is_train=None, keep_prob=1.0, scope=None):
+    with tf.variable_scope(scope or "multi_conv1d"):
+        assert len(filter_sizes) == len(heights)
+        outs = []
+        for filter_size, height in zip(filter_sizes, heights):
+            if filter_size == 0:
+                continue
+            out = conv1d(in_, filter_size, height, padding, is_train=is_train, keep_prob=keep_prob, scope="conv1d_{}".format(height))
+            outs.append(out)
+        concat_out = tf.concat(axis=2, values=outs)
+        return concat_out
--- a/tensorflow/SQuAD/my/tensorflow/rnn.py
+++ b/tensorflow/SQuAD/my/tensorflow/rnn.py
@ -0,0 +1,81 @@
+import tensorflow as tf
+from tensorflow.python.ops.rnn import dynamic_rnn as _dynamic_rnn, \
+    bidirectional_dynamic_rnn as _bidirectional_dynamic_rnn
+
+from my.tensorflow import flatten, reconstruct
+
+
+def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
+                dtype=None, parallel_iterations=None, swap_memory=False,
+                time_major=False, scope=None):
+    assert not time_major  # TODO : to be implemented later!
+    flat_inputs = flatten(inputs, 2)  # [-1, J, d]
+    flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64')
+
+    flat_outputs, final_state = _dynamic_rnn(cell, flat_inputs, sequence_length=flat_len,
+                                             initial_state=initial_state, dtype=dtype,
+                                             parallel_iterations=parallel_iterations, swap_memory=swap_memory,
+                                             time_major=time_major, scope=scope)
+
+    outputs = reconstruct(flat_outputs, inputs, 2)
+    return outputs, final_state
+
+
+def bw_dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
+                   dtype=None, parallel_iterations=None, swap_memory=False,
+                   time_major=False, scope=None):
+    assert not time_major  # TODO : to be implemented later!
+
+    flat_inputs = flatten(inputs, 2)  # [-1, J, d]
+    flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64')
+
+    flat_inputs = tf.reverse(flat_inputs, 1) if sequence_length is None \
+        else tf.reverse_sequence(flat_inputs, sequence_length, 1)
+    flat_outputs, final_state = _dynamic_rnn(cell, flat_inputs, sequence_length=flat_len,
+                                             initial_state=initial_state, dtype=dtype,
+                                             parallel_iterations=parallel_iterations, swap_memory=swap_memory,
+                                             time_major=time_major, scope=scope)
+    flat_outputs = tf.reverse(flat_outputs, 1) if sequence_length is None \
+        else tf.reverse_sequence(flat_outputs, sequence_length, 1)
+
+    outputs = reconstruct(flat_outputs, inputs, 2)
+    return outputs, final_state
+
+
+def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
+                              initial_state_fw=None, initial_state_bw=None,
+                              dtype=None, parallel_iterations=None,
+                              swap_memory=False, time_major=False, scope=None):
+    assert not time_major
+
+    flat_inputs = flatten(inputs, 2)  # [-1, J, d]
+    flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64')
+
+    (flat_fw_outputs, flat_bw_outputs), final_state = \
+        _bidirectional_dynamic_rnn(cell_fw, cell_bw, flat_inputs, sequence_length=flat_len,
+                                   initial_state_fw=initial_state_fw, initial_state_bw=initial_state_bw,
+                                   dtype=dtype, parallel_iterations=parallel_iterations, swap_memory=swap_memory,
+                                   time_major=time_major, scope=scope)
+
+    fw_outputs = reconstruct(flat_fw_outputs, inputs, 2)
+    bw_outputs = reconstruct(flat_bw_outputs, inputs, 2)
+    # FIXME : final state is not reshaped!
+    return (fw_outputs, bw_outputs), final_state
+
+
+def bidirectional_rnn(cell_fw, cell_bw, inputs,
+                      initial_state_fw=None, initial_state_bw=None,
+                      dtype=None, sequence_length=None, scope=None):
+
+    flat_inputs = flatten(inputs, 2)  # [-1, J, d]
+    flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64')
+
+    (flat_fw_outputs, flat_bw_outputs), final_state = \
+        tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, flat_inputs, sequence_length=flat_len,
+                                        initial_state_fw=initial_state_fw, initial_state_bw=initial_state_bw,
+                                        dtype=dtype, scope=scope)
+
+    fw_outputs = reconstruct(flat_fw_outputs, inputs, 2)
+    bw_outputs = reconstruct(flat_bw_outputs, inputs, 2)
+    # FIXME : final state is not reshaped!
+    return (fw_outputs, bw_outputs), final_state
--- a/tensorflow/SQuAD/my/tensorflow/rnn_cell.py
+++ b/tensorflow/SQuAD/my/tensorflow/rnn_cell.py
@ -0,0 +1,223 @@
+import tensorflow as tf
+from tensorflow.contrib.rnn import DropoutWrapper, RNNCell, LSTMStateTuple
+
+from my.tensorflow import exp_mask, flatten
+from my.tensorflow.nn import linear, softsel, double_linear_logits
+
+
+class SwitchableDropoutWrapper(DropoutWrapper):
+    """DropoutWrapper whose dropout is switched on or off at run time by the `is_train` tensor."""
+    def __init__(self, cell, is_train, input_keep_prob=1.0, output_keep_prob=1.0,
+             seed=None):
+        """
+        :param cell: RNN cell to wrap
+        :param is_train: boolean tensor; when true the dropped-out results are used
+        :param input_keep_prob: forwarded to DropoutWrapper
+        :param output_keep_prob: forwarded to DropoutWrapper
+        :param seed: forwarded to DropoutWrapper
+        """
+        super(SwitchableDropoutWrapper, self).__init__(cell, input_keep_prob=input_keep_prob, output_keep_prob=output_keep_prob,
+                                                       seed=seed)
+        self.is_train = is_train
+
+    def __call__(self, inputs, state, scope=None):
+        """
+        Run the cell once with dropout and once without, then pick one result per `is_train`.
+
+        :return: (outputs, new_state)
+        """
+        # First pass: the parent DropoutWrapper call (with dropout).
+        outputs_do, new_state_do = super(SwitchableDropoutWrapper, self).__call__(inputs, state, scope=scope)
+        # Switch the current scope to reuse so the second pass shares the first pass's variables.
+        tf.get_variable_scope().reuse_variables()
+        # Second pass: the wrapped cell called directly, bypassing dropout.
+        outputs, new_state = self._cell(inputs, state, scope)
+        outputs = tf.cond(self.is_train, lambda: outputs_do, lambda: outputs)
+        if isinstance(state, tuple):
+            # Tuple states are switched element by element and rebuilt with the same tuple class.
+            new_state = state.__class__(*[tf.cond(self.is_train, lambda: new_state_do_i, lambda: new_state_i)
+                                          for new_state_do_i, new_state_i in zip(new_state_do, new_state)])
+        else:
+            new_state = tf.cond(self.is_train, lambda: new_state_do, lambda: new_state)
+        return outputs, new_state
+
+
+class TreeRNNCell(RNNCell):
+    def __init__(self, cell, input_size, reduce_func):
+        self._cell = cell
+        self._input_size = input_size
+        self._reduce_func = reduce_func
+
+    def __call__(self, inputs, state, scope=None):
+        """
+        :param inputs: [N*B, I + B]
+        :param state: [N*B, d]
+        :param scope:
+        :return: [N*B, d]
+        """
+        with tf.variable_scope(scope or self.__class__.__name__):
+            d = self.state_size
+            x = tf.slice(inputs, [0, 0], [-1, self._input_size])  # [N*B, I]
+            mask = tf.slice(inputs, [0, self._input_size], [-1, -1])  # [N*B, B]
+            B = tf.shape(mask)[1]
+            prev_state = tf.expand_dims(tf.reshape(state, [-1, B, d]), 1)  # [N, B, d] -> [N, 1, B, d]
+            mask = tf.tile(tf.expand_dims(tf.reshape(mask, [-1, B, B]), -1), [1, 1, 1, d])  # [N, B, B, d]
+            # prev_state = self._reduce_func(tf.tile(prev_state, [1, B, 1, 1]), 2)
+            prev_state = self._reduce_func(exp_mask(prev_state, mask), 2)  # [N, B, d]
+            prev_state = tf.reshape(prev_state, [-1, d])  # [N*B, d]
+            return self._cell(x, prev_state)
+
+    @property
+    def state_size(self):
+        return self._cell.state_size
+
+    @property
+    def output_size(self):
+        return self._cell.output_size
+
+
+class NoOpCell(RNNCell):
+    def __init__(self, num_units):
+        self._num_units = num_units
+
+    def __call__(self, inputs, state, scope=None):
+        return state, state
+
+    @property
+    def state_size(self):
+        return self._num_units
+
+    @property
+    def output_size(self):
+        return self._num_units
+
+
+class MatchCell(RNNCell):
+    def __init__(self, cell, input_size, q_len):
+        self._cell = cell
+        self._input_size = input_size
+        # FIXME : This won't be needed with good shape guessing
+        self._q_len = q_len
+
+    @property
+    def state_size(self):
+        return self._cell.state_size
+
+    @property
+    def output_size(self):
+        return self._cell.output_size
+
+    def __call__(self, inputs, state, scope=None):
+        """
+
+        :param inputs: [N, d + JQ + JQ * d]
+        :param state: [N, d]
+        :param scope:
+        :return:
+        """
+        with tf.variable_scope(scope or self.__class__.__name__):
+            c_prev, h_prev = state
+            x = tf.slice(inputs, [0, 0], [-1, self._input_size])
+            q_mask = tf.slice(inputs, [0, self._input_size], [-1, self._q_len])  # [N, JQ]
+            qs = tf.slice(inputs, [0, self._input_size + self._q_len], [-1, -1])
+            qs = tf.reshape(qs, [-1, self._q_len, self._input_size])  # [N, JQ, d]
+            x_tiled = tf.tile(tf.expand_dims(x, 1), [1, self._q_len, 1])  # [N, JQ, d]
+            h_prev_tiled = tf.tile(tf.expand_dims(h_prev, 1), [1, self._q_len, 1])  # [N, JQ, d]
+            f = tf.tanh(linear([qs, x_tiled, h_prev_tiled], self._input_size, True, scope='f'))  # [N, JQ, d]
+            a = tf.nn.softmax(exp_mask(linear(f, 1, True, squeeze=True, scope='a'), q_mask))  # [N, JQ]
+            q = tf.reduce_sum(qs * tf.expand_dims(a, -1), 1)
+            z = tf.concat(axis=1, values=[x, q])  # [N, 2d]
+            return self._cell(z, state)
+
+
+class AttentionCell(RNNCell):
+    def __init__(self, cell, memory, mask=None, controller=None, mapper=None, input_keep_prob=1.0, is_train=None):
+        """
+        Early fusion attention cell: uses the (inputs, state) to control the current attention.
+
+        :param cell:
+        :param memory: [N, M, m]
+        :param mask:
+        :param controller: (inputs, prev_state, memory) -> memory_logits
+        """
+        self._cell = cell
+        self._memory = memory
+        self._mask = mask
+        self._flat_memory = flatten(memory, 2)
+        self._flat_mask = flatten(mask, 1)
+        if controller is None:
+            controller = AttentionCell.get_linear_controller(True, is_train=is_train)
+        self._controller = controller
+        if mapper is None:
+            mapper = AttentionCell.get_concat_mapper()
+        elif mapper == 'sim':
+            mapper = AttentionCell.get_sim_mapper()
+        self._mapper = mapper
+
+    @property
+    def state_size(self):
+        return self._cell.state_size
+
+    @property
+    def output_size(self):
+        return self._cell.output_size
+
+    def __call__(self, inputs, state, scope=None):
+        with tf.variable_scope(scope or "AttentionCell"):
+            memory_logits = self._controller(inputs, state, self._flat_memory)
+            sel_mem = softsel(self._flat_memory, memory_logits, mask=self._flat_mask)  # [N, m]
+            new_inputs, new_state = self._mapper(inputs, state, sel_mem)
+            return self._cell(new_inputs, state)
+
+    @staticmethod
+    def get_double_linear_controller(size, bias, input_keep_prob=1.0, is_train=None):
+        def double_linear_controller(inputs, state, memory):
+            """
+
+            :param inputs: [N, i]
+            :param state: [N, d]
+            :param memory: [N, M, m]
+            :return: [N, M]
+            """
+            rank = len(memory.get_shape())
+            _memory_size = tf.shape(memory)[rank-2]
+            tiled_inputs = tf.tile(tf.expand_dims(inputs, 1), [1, _memory_size, 1])
+            if isinstance(state, tuple):
+                tiled_states = [tf.tile(tf.expand_dims(each, 1), [1, _memory_size, 1])
+                                for each in state]
+            else:
+                tiled_states = [tf.tile(tf.expand_dims(state, 1), [1, _memory_size, 1])]
+
+            # [N, M, d]
+            in_ = tf.concat([tiled_inputs] + tiled_states + [memory], axis=2)
+            out = double_linear_logits(in_, size, bias, input_keep_prob=input_keep_prob,
+                                       is_train=is_train)
+            return out
+        return double_linear_controller
+
+    @staticmethod
+    def get_linear_controller(bias, input_keep_prob=1.0, is_train=None):
+        def linear_controller(inputs, state, memory):
+            rank = len(memory.get_shape())
+            _memory_size = tf.shape(memory)[rank-2]
+            tiled_inputs = tf.tile(tf.expand_dims(inputs, 1), [1, _memory_size, 1])
+            if isinstance(state, tuple):
+                tiled_states = [tf.tile(tf.expand_dims(each, 1), [1, _memory_size, 1])
+                                for each in state]
+            else:
+                tiled_states = [tf.tile(tf.expand_dims(state, 1), [1, _memory_size, 1])]
+
+            # [N, M, d]
+            in_ = tf.concat([tiled_inputs] + tiled_states + [memory], axis=2)
+            out = linear(in_, 1, bias, squeeze=True, input_keep_prob=input_keep_prob, is_train=is_train)
+            return out
+        return linear_controller
+
+    @staticmethod
+    def get_concat_mapper():
+        def concat_mapper(inputs, state, sel_mem):
+            """
+
+            :param inputs: [N, i]
+            :param state: [N, d]
+            :param sel_mem: [N, m]
+            :return: (new_inputs, new_state) tuple
+            """
+            return tf.concat(axis=1, values=[inputs, sel_mem]), state
+        return concat_mapper
+
+    @staticmethod
+    def get_sim_mapper():
+        def sim_mapper(inputs, state, sel_mem):
+            """
+            Assume that inputs and sel_mem are the same size
+            :param inputs: [N, i]
+            :param state: [N, d]
+            :param sel_mem: [N, i]
+            :return: (new_inputs, new_state) tuple
+            """
+            return tf.concat(axis=1, values=[inputs, sel_mem, inputs * sel_mem, tf.abs(inputs - sel_mem)]), state
+        return sim_mapper
--- a/tensorflow/SQuAD/my/utils.py
+++ b/tensorflow/SQuAD/my/utils.py
@ -0,0 +1,58 @@
+import json
+from collections import deque
+
+import numpy as np
+from tqdm import tqdm
+
+
+def mytqdm(list_, desc="", show=True):
+    if show:
+        pbar = tqdm(list_)
+        pbar.set_description(desc)
+        return pbar
+    return list_
+
+
+def json_pretty_dump(obj, fh):
+    return json.dump(obj, fh, sort_keys=True, indent=2, separators=(',', ': '))
+
+
+def index(l, i):
+    return index(l[i[0]], i[1:]) if len(i) > 1 else l[i[0]]
+
+
+def fill(l, shape, dtype=None):
+    out = np.zeros(shape, dtype=dtype)
+    stack = deque()
+    stack.appendleft(((), l))
+    while len(stack) > 0:
+        indices, cur = stack.pop()
+        if len(indices) < shape:
+            for i, sub in enumerate(cur):
+                stack.appendleft([indices + (i,), sub])
+        else:
+            out[indices] = cur
+    return out
+
+
+def short_floats(o, precision):
+    class ShortFloat(float):
+        def __repr__(self):
+            return '%.{}g'.format(precision) % self
+
+    def _short_floats(obj):
+        if isinstance(obj, float):
+            return ShortFloat(obj)
+        elif isinstance(obj, dict):
+            return dict((k, _short_floats(v)) for k, v in obj.items())
+        elif isinstance(obj, (list, tuple)):
+            return tuple(map(_short_floats, obj))
+        return obj
+
+    return _short_floats(o)
+
+
+def argmax(x):
+    return np.unravel_index(x.argmax(), x.shape)
+
+
--- a/tensorflow/SQuAD/my/zip_save.py
+++ b/tensorflow/SQuAD/my/zip_save.py
@ -0,0 +1,50 @@
+import argparse
+import os
+
+import shutil
+from zipfile import ZipFile
+
+from tqdm import tqdm
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('paths', nargs='+')
+    parser.add_argument('-o', '--out', default='save.zip')
+    args = parser.parse_args()
+    return args
+
+
+def zip_save(args):
+    temp_dir = "."
+    save_dir = os.path.join(temp_dir, "save")
+    if not os.path.exists(save_dir):
+        os.makedirs(save_dir)
+    for save_source_path in tqdm(args.paths):
+        # path = "out/basic/30/save/basic-18000"
+        # target_path = "save_dir/30/save"
+        # also output full path name to "save_dir/30/readme.txt
+        # need to also extract "out/basic/30/shared.json"
+        temp, _ = os.path.split(save_source_path)  # "out/basic/30/save", _
+        model_dir, _ = os.path.split(temp)  # "out/basic/30, _
+        _, model_name = os.path.split(model_dir)
+        cur_dir = os.path.join(save_dir, model_name)
+        if not os.path.exists(cur_dir):
+            os.makedirs(cur_dir)
+        save_target_path = os.path.join(cur_dir, "save")
+        shared_target_path = os.path.join(cur_dir, "shared.json")
+        readme_path = os.path.join(cur_dir, "readme.txt")
+        shared_source_path = os.path.join(model_dir, "shared.json")
+        shutil.copy(save_source_path, save_target_path)
+        shutil.copy(shared_source_path, shared_target_path)
+        with open(readme_path, 'w') as fh:
+            fh.write(save_source_path)
+
+    os.system("zip {} -r {}".format(args.out, save_dir))
+
+def main():
+    args = get_args()
+    zip_save(args)
+
+if __name__ == "__main__":
+    main()
--- a/tensorflow/SQuAD/requirements.txt
+++ b/tensorflow/SQuAD/requirements.txt
@ -0,0 +1,3 @@
+nltk
+tqdm
+jinja2
--- a/tensorflow/SQuAD/run_training.sh
+++ b/tensorflow/SQuAD/run_training.sh
@ -0,0 +1 @@
+python3 -m basic.cli --mode train --noload --len_opt --cluster
--- a/tensorflow/SQuAD/squad/init.py
+++ b/tensorflow/SQuAD/squad/init.py
--- a/tensorflow/SQuAD/squad/aug_squad.py
+++ b/tensorflow/SQuAD/squad/aug_squad.py
@ -0,0 +1,157 @@
+import json
+import sys
+
+from tqdm import tqdm
+
+from my.corenlp_interface import CoreNLPInterface
+
+in_path = sys.argv[1]
+out_path = sys.argv[2]
+url = sys.argv[3]
+port = int(sys.argv[4])
+data = json.load(open(in_path, 'r'))
+
+h = CoreNLPInterface(url, port)
+
+
+def find_all(a_str, sub):
+    start = 0
+    while True:
+        start = a_str.find(sub, start)
+        if start == -1: return
+        yield start
+        start += len(sub)  # use start += 1 to find overlapping matches
+
+
+def to_hex(s):
+    return " ".join(map(hex, map(ord, s)))
+
+
+def handle_nobreak(cand, text):
+    if cand == text:
+        return cand
+    if cand.replace(u'\u00A0', ' ') == text:
+        return cand
+    elif cand == text.replace(u'\u00A0', ' '):
+        return text
+    raise Exception("{} '{}' {} '{}'".format(cand, to_hex(cand), text, to_hex(text)))
+
+
+# resolving unicode complication
+# First pass over the dataset: normalise contexts and answers in place and make sure
+# every answer's `answer_start` really points at its text inside the context.
+
+wrong_loc_count = 0  # answers whose recorded start offset did not match the context
+loc_diffs = []  # distance moved for answers that had several candidate positions
+
+for article in data['data']:
+    for para in article['paragraphs']:
+        # Drop line feeds and turn no-break spaces into plain spaces.
+        para['context'] = para['context'].replace(u'\u000A', '')
+        para['context'] = para['context'].replace(u'\u00A0', ' ')
+        context = para['context']
+        for qa in para['qas']:
+            for answer in qa['answers']:
+                answer['text'] = answer['text'].replace(u'\u00A0', ' ')
+                text = answer['text']
+                answer_start = answer['answer_start']
+                if context[answer_start:answer_start + len(text)] == text:
+                    # Offset is right; only strip leading whitespace and shift the start accordingly.
+                    if text.lstrip() == text:
+                        pass
+                    else:
+                        answer_start += len(text) - len(text.lstrip())
+                        answer['answer_start'] = answer_start
+                        text = text.lstrip()
+                        answer['text'] = text
+                else:
+                    # Offset is wrong: search the context for the (left-stripped) answer text.
+                    wrong_loc_count += 1
+                    text = text.lstrip()
+                    answer['text'] = text
+                    starts = list(find_all(context, text))
+                    if len(starts) == 1:
+                        answer_start = starts[0]
+                    elif len(starts) > 1:
+                        # Several matches: take the one closest to the recorded offset.
+                        new_answer_start = min(starts, key=lambda s: abs(s - answer_start))
+                        loc_diffs.append(abs(new_answer_start - answer_start))
+                        answer_start = new_answer_start
+                    else:
+                        # The answer text does not occur in the context at all.
+                        raise Exception()
+                    answer['answer_start'] = answer_start
+
+                answer_stop = answer_start + len(text)
+                answer['answer_stop'] = answer_stop
+                assert para['context'][answer_start:answer_stop] == answer['text'], "{} {}".format(
+                    para['context'][answer_start:answer_stop], answer['text'])
+
+print(wrong_loc_count, loc_diffs)
+
+# Second pass: annotate every paragraph and question through the CoreNLP interface `h`
+# and map each answer's character span onto (sentence index, word index) positions.
+mismatch_count = 0  # answers whose text does not start with the first matched word
+dep_fail_count = 0  # incremented once per (answer, sentence) pair whose dependency parse is None
+no_answer_count = 0  # answers for which no word overlapped the character span
+
+size = sum(len(article['paragraphs']) for article in data['data'])
+pbar = tqdm(range(size))
+
+for ai, article in enumerate(data['data']):
+    for pi, para in enumerate(article['paragraphs']):
+        context = para['context']
+        sents = h.split_doc(context)
+        words = h.split_sent(context)
+        # Locate each sentence in the context, searching from the end of the previous one.
+        sent_starts = []
+        ref_idx = 0
+        for sent in sents:
+            new_idx = context.find(sent, ref_idx)
+            sent_starts.append(new_idx)
+            ref_idx = new_idx + len(sent)
+        para['sents'] = sents
+        para['words'] = words
+        para['sent_starts'] = sent_starts
+
+        consts = list(map(h.get_const, sents))
+        para['consts'] = consts
+        deps = list(map(h.get_dep, sents))
+        para['deps'] = deps
+
+        for qa in para['qas']:
+            question = qa['question']
+            question_const = h.get_const(question)
+            qa['const'] = question_const
+            question_dep = h.get_dep(question)
+            qa['dep'] = question_dep
+            qa['words'] = h.split_sent(question)
+
+            for answer in qa['answers']:
+                answer_start = answer['answer_start']
+                text = answer['text']
+                answer_stop = answer_start + len(text)
+                # answer_words = h.split_sent(text)
+                word_idxs = []
+                answer_words = []
+                for sent_idx, (sent, sent_start, dep) in enumerate(zip(sents, sent_starts, deps)):
+                    if dep is None:
+                        print("dep parse failed at {} {} {}".format(ai, pi, sent_idx))
+                        dep_fail_count += 1
+                        continue
+                    nodes, edges = dep
+                    # NOTE(review): rebinds `words`; the new value is not read again in this loop.
+                    words = [node[0] for node in nodes]
+
+                    # Each node is a 5-tuple with the word first and its start offset fourth;
+                    # the offset is added to sent_start to get a context-level position.
+                    for word_idx, (word, _, _, start, _) in enumerate(nodes):
+                        global_start = sent_start + start
+                        global_stop = global_start + len(word)
+                        # Keep the word if its start or its end falls inside the answer span.
+                        if answer_start <= global_start < answer_stop or answer_start < global_stop <= answer_stop:
+                            word_idxs.append((sent_idx, word_idx))
+                            answer_words.append(word)
+                if len(word_idxs) > 0:
+                    answer['answer_word_start'] = word_idxs[0]
+                    # Stop is exclusive: one past the last matched word in its sentence.
+                    answer['answer_word_stop'] = word_idxs[-1][0], word_idxs[-1][1] + 1
+                    if not text.startswith(answer_words[0]):
+                        print("'{}' '{}'".format(text, ' '.join(answer_words)))
+                        mismatch_count += 1
+                else:
+                    answer['answer_word_start'] = None
+                    answer['answer_word_stop'] = None
+                    no_answer_count += 1
+        pbar.update(1)
+pbar.close()
+
+print(mismatch_count, dep_fail_count, no_answer_count)
+
+print("saving...")
+json.dump(data, open(out_path, 'w'))
--- a/tensorflow/SQuAD/squad/eda_aug_dev.ipynb
+++ b/tensorflow/SQuAD/squad/eda_aug_dev.ipynb
@ -0,0 +1,271 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "aug_data_path = \"/Users/minjoons/data/squad/dev-v1.0-aug.json\"\n",
+    "aug_data = json.load(open(aug_data_path, 'r'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(['Denver', 'Broncos'], 'Denver Broncos')\n",
+      "(['Denver', 'Broncos'], 'Denver Broncos')\n",
+      "(['Denver', 'Broncos'], 'Denver Broncos ')\n",
+      "(['Carolina', 'Panthers'], 'Carolina Panthers')\n"
+     ]
+    }
+   ],
+   "source": [
+    "def compare_answers():\n",
+    "    # Yield (tokens covered by the stored word span, raw answer text) pairs.\n",
+    "    for article in aug_data['data']:\n",
+    "        for para in article['paragraphs']:\n",
+    "            deps = para['deps']\n",
+    "            nodess = []\n",
+    "            for dep in deps:\n",
+    "                # A failed parse is stored as None; unpack only real parses.\n",
+    "                if dep is not None:\n",
+    "                    nodes, edges = dep\n",
+    "                    nodess.append(nodes)\n",
+    "                else:\n",
+    "                    nodess.append([])\n",
+    "            wordss = [[node[0] for node in nodes] for nodes in nodess]\n",
+    "            for qa in para['qas']:\n",
+    "                for answer in qa['answers']:\n",
+    "                    text = answer['text']\n",
+    "                    word_start = answer['answer_word_start']\n",
+    "                    word_stop = answer['answer_word_stop']\n",
+    "                    if word_start is None:\n",
+    "                        # No token overlapped this answer; nothing to compare.\n",
+    "                        continue\n",
+    "                    answer_words = wordss[word_start[0]][word_start[1]:word_stop[1]]\n",
+    "                    yield answer_words, text\n",
+    "\n",
+    "ca = compare_answers()\n",
+    "print(next(ca))\n",
+    "print(next(ca))\n",
+    "print(next(ca))\n",
+    "print(next(ca))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "8\n"
+     ]
+    }
+   ],
+   "source": [
+    "def counter():\n",
+    "    count = 0\n",
+    "    for article in aug_data['data']:\n",
+    "        for para in article['paragraphs']:\n",
+    "            deps = para['deps']\n",
+    "            nodess = []\n",
+    "            for dep in deps:\n",
+    "                if dep is None:\n",
+    "                    count += 1\n",
+    "    print(count)\n",
+    "counter()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0\n"
+     ]
+    }
+   ],
+   "source": [
+    "def bad_node_counter():\n",
+    "    count = 0\n",
+    "    for article in aug_data['data']:\n",
+    "        for para in article['paragraphs']:\n",
+    "            sents = para['sents']\n",
+    "            deps = para['deps']\n",
+    "            nodess = []\n",
+    "            for dep in deps:\n",
+    "                if dep is not None:\n",
+    "                    nodes, edges = dep\n",
+    "                    for node in nodes:\n",
+    "                        if len(node) != 5:\n",
+    "                            count += 1\n",
+    "    print(count)\n",
+    "bad_node_counter()  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "7\n"
+     ]
+    }
+   ],
+   "source": [
+    "def noanswer_counter():\n",
+    "    count = 0\n",
+    "    for article in aug_data['data']:\n",
+    "        for para in article['paragraphs']:\n",
+    "            deps = para['deps']\n",
+    "            nodess = []\n",
+    "            for dep in deps:\n",
+    "                if dep is not None:\n",
+    "                    nodes, edges = dep\n",
+    "                    nodess.append(nodes)\n",
+    "                else:\n",
+    "                    nodess.append([])\n",
+    "            wordss = [[node[0] for node in nodes] for nodes in nodess]\n",
+    "            for qa in para['qas']:\n",
+    "                for answer in qa['answers']:\n",
+    "                    text = answer['text']\n",
+    "                    word_start = answer['answer_word_start']\n",
+    "                    word_stop = answer['answer_word_stop']\n",
+    "                    if word_start is None:\n",
+    "                        count += 1\n",
+    "    print(count)\n",
+    "noanswer_counter()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "10600\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(sum(len(para['qas']) for a in aug_data['data'] for para in a['paragraphs']))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "10348\n"
+     ]
+    }
+   ],
+   "source": [
+    "import nltk\n",
+    "\n",
+    "def _set_span(t, i):\n",
+    "    if isinstance(t[0], str):\n",
+    "        t.span = (i, i+len(t))\n",
+    "    else:\n",
+    "        first = True\n",
+    "        for c in t:\n",
+    "            cur_span = _set_span(c, i)\n",
+    "            i = cur_span[1]\n",
+    "            if first:\n",
+    "                min_ = cur_span[0]\n",
+    "                first = False\n",
+    "        max_ = cur_span[1]\n",
+    "        t.span = (min_, max_)\n",
+    "    return t.span\n",
+    "\n",
+    "\n",
+    "def set_span(t):\n",
+    "    assert isinstance(t, nltk.tree.Tree)\n",
+    "    try:\n",
+    "        return _set_span(t, 0)\n",
+    "    except:\n",
+    "        print(t)\n",
+    "        exit()\n",
+    "\n",
+    "def same_span_counter():\n",
+    "    count = 0\n",
+    "    for article in aug_data['data']:\n",
+    "        for para in article['paragraphs']:\n",
+    "            consts = para['consts']\n",
+    "            for const in consts:\n",
+    "                tree = nltk.tree.Tree.fromstring(const)\n",
+    "                set_span(tree)\n",
+    "                if len(list(tree.subtrees())) > len(set(t.span for t in tree.subtrees())):\n",
+    "                    count += 1\n",
+    "    print(count)\n",
+    "same_span_counter()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
--- a/tensorflow/SQuAD/squad/eda_aug_train.ipynb
+++ b/tensorflow/SQuAD/squad/eda_aug_train.ipynb
@ -0,0 +1,314 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "aug_data_path = \"/Users/minjoons/data/squad/train-v1.0-aug.json\"\n",
+    "aug_data = json.load(open(aug_data_path, 'r'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(['Saint', 'Bernadette', 'Soubirous'], 'Saint Bernadette Soubirous')\n",
+      "(['a', 'copper', 'statue', 'of', 'Christ'], 'a copper statue of Christ')\n",
+      "(['the', 'Main', 'Building'], 'the Main Building')\n",
+      "(['a', 'Marian', 'place', 'of', 'prayer', 'and', 'reflection'], 'a Marian place of prayer and reflection')\n"
+     ]
+    }
+   ],
+   "source": [
+    "def compare_answers():\n",
+    "    # Yield (tokens covered by the stored word span, raw answer text) pairs.\n",
+    "    for article in aug_data['data']:\n",
+    "        for para in article['paragraphs']:\n",
+    "            deps = para['deps']\n",
+    "            nodess = []\n",
+    "            for dep in deps:\n",
+    "                # A failed parse is stored as None; unpack only real parses.\n",
+    "                if dep is not None:\n",
+    "                    nodes, edges = dep\n",
+    "                    nodess.append(nodes)\n",
+    "                else:\n",
+    "                    nodess.append([])\n",
+    "            wordss = [[node[0] for node in nodes] for nodes in nodess]\n",
+    "            for qa in para['qas']:\n",
+    "                for answer in qa['answers']:\n",
+    "                    text = answer['text']\n",
+    "                    word_start = answer['answer_word_start']\n",
+    "                    word_stop = answer['answer_word_stop']\n",
+    "                    if word_start is None:\n",
+    "                        # No token overlapped this answer; nothing to compare.\n",
+    "                        continue\n",
+    "                    answer_words = wordss[word_start[0]][word_start[1]:word_stop[1]]\n",
+    "                    yield answer_words, text\n",
+    "\n",
+    "ca = compare_answers()\n",
+    "print(next(ca))\n",
+    "print(next(ca))\n",
+    "print(next(ca))\n",
+    "print(next(ca))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "q: k\n",
+      "q: j\n",
+      "q: n\n",
+      "q: b\n",
+      "q: v\n",
+      "x: .\n",
+      "x: :208\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "q: dd\n",
+      "q: dd\n",
+      "q: dd\n",
+      "q: dd\n",
+      "q: d\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: :411\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: :40\n",
+      "x: .\n",
+      "x: *\n",
+      "x: :14\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: :131\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "x: .\n",
+      "53 10\n"
+     ]
+    }
+   ],
+   "source": [
+    "def nodep_counter():\n",
+    "    x_count = 0\n",
+    "    q_count = 0\n",
+    "    for article in aug_data['data']:\n",
+    "        for para in article['paragraphs']:\n",
+    "            deps = para['deps']\n",
+    "            nodess = []\n",
+    "            for sent, dep in zip(para['sents'], deps):\n",
+    "                if dep is None:\n",
+    "                    print(\"x:\", sent)\n",
+    "                    x_count += 1\n",
+    "            for qa in para['qas']:\n",
+    "                if qa['dep'] is None:\n",
+    "                    print(\"q:\", qa['question'])\n",
+    "                    q_count += 1\n",
+    "    print(x_count, q_count)\n",
+    "nodep_counter()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0\n"
+     ]
+    }
+   ],
+   "source": [
+    "def bad_node_counter():\n",
+    "    count = 0\n",
+    "    for article in aug_data['data']:\n",
+    "        for para in article['paragraphs']:\n",
+    "            sents = para['sents']\n",
+    "            deps = para['deps']\n",
+    "            nodess = []\n",
+    "            for dep in deps:\n",
+    "                if dep is not None:\n",
+    "                    nodes, edges = dep\n",
+    "                    for node in nodes:\n",
+    "                        if len(node) != 5:\n",
+    "                            count += 1\n",
+    "    print(count)\n",
+    "bad_node_counter()  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "36\n"
+     ]
+    }
+   ],
+   "source": [
+    "def noanswer_counter():\n",
+    "    count = 0\n",
+    "    for article in aug_data['data']:\n",
+    "        for para in article['paragraphs']:\n",
+    "            deps = para['deps']\n",
+    "            nodess = []\n",
+    "            for dep in deps:\n",
+    "                if dep is not None:\n",
+    "                    nodes, edges = dep\n",
+    "                    nodess.append(nodes)\n",
+    "                else:\n",
+    "                    nodess.append([])\n",
+    "            wordss = [[node[0] for node in nodes] for nodes in nodess]\n",
+    "            for qa in para['qas']:\n",
+    "                for answer in qa['answers']:\n",
+    "                    text = answer['text']\n",
+    "                    word_start = answer['answer_word_start']\n",
+    "                    word_stop = answer['answer_word_stop']\n",
+    "                    if word_start is None:\n",
+    "                        count += 1\n",
+    "    print(count)\n",
+    "noanswer_counter()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "106\n"
+     ]
+    }
+   ],
+   "source": [
+    "def mult_sent_answer_counter():\n",
+    "    count = 0\n",
+    "    for article in aug_data['data']:\n",
+    "        for para in article['paragraphs']:\n",
+    "            for qa in para['qas']:\n",
+    "                for answer in qa['answers']:\n",
+    "                    text = answer['text']\n",
+    "                    word_start = answer['answer_word_start']\n",
+    "                    word_stop = answer['answer_word_stop']\n",
+    "                    if word_start is not None and word_start[0] != word_stop[0]:\n",
+    "                        count += 1\n",
+    "    print(count)\n",
+    "mult_sent_answer_counter()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
--- a/tensorflow/SQuAD/squad/evaluate-v1.1.py
+++ b/tensorflow/SQuAD/squad/evaluate-v1.1.py
@ -0,0 +1,94 @@
+""" Official evaluation script for v1.1 of the SQuAD dataset. """
+from __future__ import print_function
+from collections import Counter
+import string
+import re
+import argparse
+import json
+import sys
+
+
+def normalize_answer(s):
+    """Lower text and remove punctuation, articles and extra whitespace."""
+    def remove_articles(text):
+        return re.sub(r'\b(a|an|the)\b', ' ', text)
+
+    def white_space_fix(text):
+        return ' '.join(text.split())
+
+    def remove_punc(text):
+        exclude = set(string.punctuation)
+        return ''.join(ch for ch in text if ch not in exclude)
+
+    def lower(text):
+        return text.lower()
+
+    return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+def f1_score(prediction, ground_truth):
+    prediction_tokens = normalize_answer(prediction).split()
+    ground_truth_tokens = normalize_answer(ground_truth).split()
+    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
+    num_same = sum(common.values())
+    if num_same == 0:
+        return 0
+    precision = 1.0 * num_same / len(prediction_tokens)
+    recall = 1.0 * num_same / len(ground_truth_tokens)
+    f1 = (2 * precision * recall) / (precision + recall)
+    return f1
+
+
+def exact_match_score(prediction, ground_truth):
+    return (normalize_answer(prediction) == normalize_answer(ground_truth))
+
+
+def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
+    scores_for_ground_truths = []
+    for ground_truth in ground_truths:
+        score = metric_fn(prediction, ground_truth)
+        scores_for_ground_truths.append(score)
+    return max(scores_for_ground_truths)
+
+
+def evaluate(dataset, predictions):
+    f1 = exact_match = total = 0
+    for article in dataset:
+        for paragraph in article['paragraphs']:
+            for qa in paragraph['qas']:
+                total += 1
+                if qa['id'] not in predictions:
+                    message = 'Unanswered question ' + qa['id'] + \
+                              ' will receive score 0.'
+                    print(message, file=sys.stderr)
+                    continue
+                ground_truths = list(map(lambda x: x['text'], qa['answers']))
+                prediction = predictions[qa['id']]
+                exact_match += metric_max_over_ground_truths(
+                    exact_match_score, prediction, ground_truths)
+                f1 += metric_max_over_ground_truths(
+                    f1_score, prediction, ground_truths)
+
+    exact_match = 100.0 * exact_match / total
+    f1 = 100.0 * f1 / total
+
+    return {'exact_match': exact_match, 'f1': f1}
+
+
+if __name__ == '__main__':
+    expected_version = '1.1'
+    parser = argparse.ArgumentParser(
+        description='Evaluation for SQuAD ' + expected_version)
+    parser.add_argument('dataset_file', help='Dataset file')
+    parser.add_argument('prediction_file', help='Prediction File')
+    args = parser.parse_args()
+    with open(args.dataset_file) as dataset_file:
+        dataset_json = json.load(dataset_file)
+        if (dataset_json['version'] != expected_version):
+            print('Evaluation expects v-' + expected_version +
+                  ', but got dataset with v-' + dataset_json['version'],
+                  file=sys.stderr)
+        dataset = dataset_json['data']
+    with open(args.prediction_file) as prediction_file:
+        predictions = json.load(prediction_file)
+    print(json.dumps(evaluate(dataset, predictions)))
--- a/tensorflow/SQuAD/squad/evaluate.py
+++ b/tensorflow/SQuAD/squad/evaluate.py
@ -0,0 +1,94 @@
+""" Official evaluation script for v1.1 of the SQuAD dataset. [Changed name for external importing]"""
+from __future__ import print_function
+from collections import Counter
+import string
+import re
+import argparse
+import json
+import sys
+
+
+def normalize_answer(s):
+    """Lower text and remove punctuation, articles and extra whitespace."""
+    def remove_articles(text):
+        return re.sub(r'\b(a|an|the)\b', ' ', text)
+
+    def white_space_fix(text):
+        return ' '.join(text.split())
+
+    def remove_punc(text):
+        exclude = set(string.punctuation)
+        return ''.join(ch for ch in text if ch not in exclude)
+
+    def lower(text):
+        return text.lower()
+
+    return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+def f1_score(prediction, ground_truth):
+    prediction_tokens = normalize_answer(prediction).split()
+    ground_truth_tokens = normalize_answer(ground_truth).split()
+    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
+    num_same = sum(common.values())
+    if num_same == 0:
+        return 0
+    precision = 1.0 * num_same / len(prediction_tokens)
+    recall = 1.0 * num_same / len(ground_truth_tokens)
+    f1 = (2 * precision * recall) / (precision + recall)
+    return f1
+
+
+def exact_match_score(prediction, ground_truth):
+    return (normalize_answer(prediction) == normalize_answer(ground_truth))
+
+
+def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
+    scores_for_ground_truths = []
+    for ground_truth in ground_truths:
+        score = metric_fn(prediction, ground_truth)
+        scores_for_ground_truths.append(score)
+    return max(scores_for_ground_truths)
+
+
+def evaluate(dataset, predictions):
+    f1 = exact_match = total = 0
+    for article in dataset:
+        for paragraph in article['paragraphs']:
+            for qa in paragraph['qas']:
+                total += 1
+                if qa['id'] not in predictions:
+                    message = 'Unanswered question ' + qa['id'] + \
+                              ' will receive score 0.'
+                    print(message, file=sys.stderr)
+                    continue
+                ground_truths = list(map(lambda x: x['text'], qa['answers']))
+                prediction = predictions[qa['id']]
+                exact_match += metric_max_over_ground_truths(
+                    exact_match_score, prediction, ground_truths)
+                f1 += metric_max_over_ground_truths(
+                    f1_score, prediction, ground_truths)
+
+    exact_match = 100.0 * exact_match / total
+    f1 = 100.0 * f1 / total
+
+    return {'exact_match': exact_match, 'f1': f1}
+
+
+if __name__ == '__main__':
+    expected_version = '1.1'
+    parser = argparse.ArgumentParser(
+        description='Evaluation for SQuAD ' + expected_version)
+    parser.add_argument('dataset_file', help='Dataset file')
+    parser.add_argument('prediction_file', help='Prediction File')
+    args = parser.parse_args()
+    with open(args.dataset_file) as dataset_file:
+        dataset_json = json.load(dataset_file)
+        if (dataset_json['version'] != expected_version):
+            print('Evaluation expects v-' + expected_version +
+                  ', but got dataset with v-' + dataset_json['version'],
+                  file=sys.stderr)
+        dataset = dataset_json['data']
+    with open(args.prediction_file) as prediction_file:
+        predictions = json.load(prediction_file)
+    print(json.dumps(evaluate(dataset, predictions)))
--- a/tensorflow/SQuAD/squad/neg_squad.py
+++ b/tensorflow/SQuAD/squad/neg_squad.py
@ -0,0 +1,50 @@
+import argparse
+import json
+import os
+# data: q, cq, (dq), (pq), y, *x, *cx
+# shared: x, cx, (dx), (px), word_counter, char_counter, word2vec
+# no metadata
+import random
+from collections import Counter
+
+from tqdm import tqdm
+
+from squad.utils import get_word_span, get_word_idx, process_tokens
+
+
+def main():
+    args = get_args()
+    neg_squad(args)
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    home = os.path.expanduser("~")
+    parser.add_argument("source_path")
+    parser.add_argument("target_path")
+    parser.add_argument('-d', "--debug", action='store_true')
+    parser.add_argument('-r', "--aug_ratio", default=1, type=int)
+    # TODO : put more args here
+    return parser.parse_args()
+
+
+def neg_squad(args):
+    with open(args.source_path, 'r') as fp:
+        squad = json.load(fp)
+    with open(args.source_path, 'r') as fp:
+        ref_squad = json.load(fp)
+
+    for ai, article in enumerate(ref_squad['data']):
+        for pi, para in enumerate(article['paragraphs']):
+            cands = list(range(pi)) + list(range(pi+1, len(article['paragraphs'])))
+            samples = random.sample(cands, args.aug_ratio)
+            for sample in samples:
+                for qi, ques in enumerate(article['paragraphs'][sample]['qas']):
+                    new_ques = {'question': ques['question'], 'answers': [], 'answer_start': 0, 'id': "neg_" + ques['id']}
+                    squad['data'][ai]['paragraphs'][pi]['qas'].append(new_ques)
+
+    with open(args.target_path, 'w') as fp:
+        json.dump(squad, fp)
+
+if __name__ == "__main__":
+    main()
--- a/tensorflow/SQuAD/squad/prepro.py
+++ b/tensorflow/SQuAD/squad/prepro.py
@ -0,0 +1,241 @@
+import argparse
+import json
+import os
+# data: q, cq, (dq), (pq), y, *x, *cx
+# shared: x, cx, (dx), (px), word_counter, char_counter, word2vec
+# no metadata
+from collections import Counter
+
+from tqdm import tqdm
+
+from squad.utils import get_word_span, get_word_idx, process_tokens
+
+
+def main():
+    args = get_args()
+    prepro(args)
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    home = os.path.expanduser("~")
+    source_dir = os.path.join(home, "data", "squad")
+    target_dir = "data/squad"
+    glove_dir = os.path.join(home, "data", "glove")
+    parser.add_argument('-s', "--source_dir", default=source_dir)
+    parser.add_argument('-t', "--target_dir", default=target_dir)
+    parser.add_argument("--train_name", default='train-v1.1.json')
+    parser.add_argument('-d', "--debug", action='store_true')
+    parser.add_argument("--train_ratio", default=0.9, type=int)
+    parser.add_argument("--glove_corpus", default="6B")
+    parser.add_argument("--glove_dir", default=glove_dir)
+    parser.add_argument("--glove_vec_size", default=100, type=int)
+    parser.add_argument("--mode", default="full", type=str)
+    parser.add_argument("--single_path", default="", type=str)
+    parser.add_argument("--tokenizer", default="PTB", type=str)
+    parser.add_argument("--url", default="vision-server2.corp.ai2", type=str)
+    parser.add_argument("--port", default=8000, type=int)
+    parser.add_argument("--split", action='store_true')
+    parser.add_argument("--suffix", default="")
+    # TODO : put more args here
+    return parser.parse_args()
+
+
+def create_all(args):
+    out_path = os.path.join(args.source_dir, "all-v1.1.json")
+    if os.path.exists(out_path):
+        return
+    train_path = os.path.join(args.source_dir, args.train_name)
+    train_data = json.load(open(train_path, 'r'))
+    dev_path = os.path.join(args.source_dir, args.dev_name)
+    dev_data = json.load(open(dev_path, 'r'))
+    train_data['data'].extend(dev_data['data'])
+    print("dumping all data ...")
+    json.dump(train_data, open(out_path, 'w'))
+
+
+def prepro(args):
+    if not os.path.exists(args.target_dir):
+        os.makedirs(args.target_dir)
+
+    if args.mode == 'full':
+        prepro_each(args, 'train', out_name='train')
+        prepro_each(args, 'dev', out_name='dev')
+        prepro_each(args, 'dev', out_name='test')
+    elif args.mode == 'all':
+        create_all(args)
+        prepro_each(args, 'dev', 0.0, 0.0, out_name='dev')
+        prepro_each(args, 'dev', 0.0, 0.0, out_name='test')
+        prepro_each(args, 'all', out_name='train')
+    elif args.mode == 'single':
+        assert len(args.single_path) > 0
+        prepro_each(args, "NULL", out_name="single", in_path=args.single_path)
+    else:
+        prepro_each(args, 'train', 0.0, args.train_ratio, out_name='train')
+        prepro_each(args, 'train', args.train_ratio, 1.0, out_name='dev')
+        prepro_each(args, 'dev', out_name='test')
+
+
+def save(args, data, shared, data_type):
+    data_path = os.path.join(args.target_dir, "data_{}.json".format(data_type))
+    shared_path = os.path.join(args.target_dir, "shared_{}.json".format(data_type))
+    json.dump(data, open(data_path, 'w'))
+    json.dump(shared, open(shared_path, 'w'))
+
+
+def get_word2vec(args, word_counter):
+    glove_path = os.path.join(args.glove_dir, "glove.{}.{}d.txt".format(args.glove_corpus, args.glove_vec_size))
+    sizes = {'6B': int(4e5), '42B': int(1.9e6), '840B': int(2.2e6), '2B': int(1.2e6)}
+    total = sizes[args.glove_corpus]
+    word2vec_dict = {}
+    with open(glove_path, 'r', encoding='utf-8') as fh:
+        for line in tqdm(fh, total=total):
+            array = line.lstrip().rstrip().split(" ")
+            word = array[0]
+            vector = list(map(float, array[1:]))
+            if word in word_counter:
+                word2vec_dict[word] = vector
+            elif word.capitalize() in word_counter:
+                word2vec_dict[word.capitalize()] = vector
+            elif word.lower() in word_counter:
+                word2vec_dict[word.lower()] = vector
+            elif word.upper() in word_counter:
+                word2vec_dict[word.upper()] = vector
+
+    print("{}/{} of word vocab have corresponding vectors in {}".format(len(word2vec_dict), len(word_counter), glove_path))
+    return word2vec_dict
+
+
+def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
+    """Tokenize one SQuAD-format JSON file and save it via ``save``.
+
+    Args:
+        args: parsed command-line namespace; reads ``tokenizer``, ``url``,
+            ``port``, ``split``, ``source_dir``, ``suffix`` and ``debug``
+            here, and is forwarded to ``get_word2vec`` and ``save``.
+        data_type: prefix of the source file name (ignored for locating the
+            file when ``in_path`` is given).
+        start_ratio: fraction of the article list at which processing starts.
+        stop_ratio: fraction of the article list at which processing stops.
+        out_name: split name passed to ``save``.
+        in_path: explicit source file path; overrides the derived path.
+
+    Raises:
+        Exception: when ``args.tokenizer`` is neither "PTB" nor "Stanford".
+    """
+    # Pick the sentence splitter and word tokenizer.
+    if args.tokenizer == "PTB":
+        import nltk
+        sent_tokenize = nltk.sent_tokenize
+        def word_tokenize(tokens):
+            # Map the `` and '' quote tokens to a plain double quote.
+            return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
+    elif args.tokenizer == 'Stanford':
+        from my.corenlp_interface import CoreNLPInterface
+        interface = CoreNLPInterface(args.url, args.port)
+        sent_tokenize = interface.split_doc
+        word_tokenize = interface.split_sent
+    else:
+        raise Exception()
+
+    # Without --split the whole paragraph is treated as a single sentence.
+    if not args.split:
+        sent_tokenize = lambda para: [para]
+
+    source_path = in_path or os.path.join(args.source_dir, "{}-{}v1.1.json".format(data_type, args.suffix))
+    source_data = json.load(open(source_path, 'r'))
+
+    # Per-question lists (one entry appended per qa) ...
+    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
+    na = []
+    cy = []
+    # ... and per-article lists (one entry appended per article).
+    x, cx = [], []
+    answerss = []
+    p = []
+    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
+    # Article index range selected by the two ratios.
+    start_ai = int(round(len(source_data['data']) * start_ratio))
+    stop_ai = int(round(len(source_data['data']) * stop_ratio))
+    # ai counts from 0 within the selected slice, so it also indexes x/cx/p.
+    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
+        xp, cxp = [], []
+        pp = []
+        x.append(xp)
+        cx.append(cxp)
+        p.append(pp)
+        for pi, para in enumerate(article['paragraphs']):
+            # wordss
+            context = para['context']
+            # Each two-character quote marker becomes a two-character
+            # replacement, so the context length is unchanged.
+            context = context.replace("''", '" ')
+            context = context.replace("``", '" ')
+            xi = list(map(word_tokenize, sent_tokenize(context)))
+            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
+            # given xi, add chars
+            cxi = [[list(xijk) for xijk in xij] for xij in xi]
+            xp.append(xi)
+            cxp.append(cxi)
+            pp.append(context)
+
+            # Context tokens and characters are counted once per question
+            # attached to this paragraph.
+            for xij in xi:
+                for xijk in xij:
+                    word_counter[xijk] += len(para['qas'])
+                    lower_word_counter[xijk.lower()] += len(para['qas'])
+                    for xijkl in xijk:
+                        char_counter[xijkl] += len(para['qas'])
+
+            # [article index, paragraph index] reference stored per question.
+            rxi = [ai, pi]
+            assert len(x) - 1 == ai
+            assert len(x[ai]) - 1 == pi
+            for qa in para['qas']:
+                # get words
+                qi = word_tokenize(qa['question'])
+                qi = process_tokens(qi)
+                cqi = [list(qij) for qij in qi]
+                yi = []
+                cyi = []
+                answers = []
+                for answer in qa['answers']:
+                    answer_text = answer['text']
+                    answers.append(answer_text)
+                    answer_start = answer['answer_start']
+                    answer_stop = answer_start + len(answer_text)
+                    # TODO : put some function that gives word_start, word_stop here
+                    # yi0 / yi1 are indexed below as (sentence, word) pairs;
+                    # yi1's word index is used as an exclusive stop.
+                    yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
+                    # yi0 = answer['answer_word_start'] or [0, 0]
+                    # yi1 = answer['answer_word_stop'] or [0, 1]
+                    assert len(xi[yi0[0]]) > yi0[1]
+                    assert len(xi[yi1[0]]) >= yi1[1]
+                    # First and last token of the answer span.
+                    w0 = xi[yi0[0]][yi0[1]]
+                    w1 = xi[yi1[0]][yi1[1]-1]
+                    # NOTE(review): get_word_idx presumably returns the
+                    # character offset of the given token in context -- confirm.
+                    i0 = get_word_idx(context, xi, yi0)
+                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
+                    # Offsets of the answer's first / last character relative
+                    # to i0 / i1; used below to index into w0 / w1.
+                    cyi0 = answer_start - i0
+                    cyi1 = answer_stop - i1 - 1
+                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
+                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
+                    assert answer_text[-1] == w1[cyi1]
+                    assert cyi0 < 32, (answer_text, w0)
+                    assert cyi1 < 32, (answer_text, w1)
+
+                    yi.append([yi0, yi1])
+                    cyi.append([cyi0, cyi1])
+
+                # Questions without answers get a placeholder span and are
+                # flagged in `na`.
+                if len(qa['answers']) == 0:
+                    yi.append([(0, 0), (0, 1)])
+                    cyi.append([0, 1])
+                    na.append(True)
+                else:
+                    na.append(False)
+
+                for qij in qi:
+                    word_counter[qij] += 1
+                    lower_word_counter[qij.lower()] += 1
+                    for qijk in qij:
+                        char_counter[qijk] += 1
+
+                q.append(qi)
+                cq.append(cqi)
+                y.append(yi)
+                cy.append(cyi)
+                rx.append(rxi)
+                rcx.append(rxi)
+                ids.append(qa['id'])
+                idxs.append(len(idxs))
+                answerss.append(answers)
+
+        # Debug mode stops after the first article.
+        if args.debug:
+            break
+
+    word2vec_dict = get_word2vec(args, word_counter)
+    lower_word2vec_dict = get_word2vec(args, lower_word_counter)
+
+    # add context here
+    # '*x' and '*p' share the same reference list (rx); '*cx' uses rcx,
+    # which holds the same [ai, pi] entries.
+    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
+            'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx, 'na': na}
+    shared = {'x': x, 'cx': cx, 'p': p,
+              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
+              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}
+
+    print("saving ...")
+    save(args, data, shared, out_name)
+
+if __name__ == "__main__":
+    main()
--- a/tensorflow/SQuAD/squad/prepro_aug.py
+++ b/tensorflow/SQuAD/squad/prepro_aug.py
@ -0,0 +1,183 @@
+import argparse
+import json
+import os
+# data: q, cq, (dq), (pq), y, *x, *cx
+# shared: x, cx, (dx), (px), word_counter, char_counter, word2vec
+# no metadata
+from collections import Counter
+
+import nltk
+from tqdm import tqdm
+
+from my.nltk_utils import load_compressed_tree
+
+
+def bool_(arg):
+    """
+    Convert the literal strings 'True' / 'False' into the matching boolean.
+
+    Intended as an argparse ``type=`` converter (see ``get_args``).
+    :param arg: string to convert; must be exactly 'True' or 'False'
+    :return: True or False
+    :raises ValueError: for any other input. argparse reports a ValueError raised by
+        a converter as a normal usage error instead of an unexplained traceback.
+    """
+    if arg == 'True':
+        return True
+    if arg == 'False':
+        return False
+    raise ValueError("expected 'True' or 'False', got {!r}".format(arg))
+
+
+def main():
+    """Parse the command-line options and hand them to the preprocessing routine."""
+    prepro(get_args())
+
+
+def get_args():
+    """
+    Build the command-line parser for the preprocessing script and parse ``sys.argv``.
+
+    Defaults: source data under ``~/data/squad``, GloVe vectors under ``~/data/glove``,
+    output written to the relative directory ``data/squad``.
+    :return: the ``argparse.Namespace`` holding all options
+    """
+    parser = argparse.ArgumentParser()
+    home = os.path.expanduser("~")
+    source_dir = os.path.join(home, "data", "squad")
+    target_dir = "data/squad"
+    glove_dir = os.path.join(home, "data", "glove")
+    parser.add_argument("--source_dir", default=source_dir)
+    parser.add_argument("--target_dir", default=target_dir)
+    parser.add_argument("--debug", default=False, type=bool_)
+    # The ratio is a fraction (default 0.9): with type=int a value such as "0.8"
+    # could not be parsed at all, so it has to be parsed as a float.
+    parser.add_argument("--train_ratio", default=0.9, type=float)
+    parser.add_argument("--glove_corpus", default="6B")
+    parser.add_argument("--glove_dir", default=glove_dir)
+    parser.add_argument("--glove_vec_size", default=100, type=int)
+    parser.add_argument("--full_train", default=False, type=bool_)
+    # TODO : put more args here
+    return parser.parse_args()
+
+
+def prepro(args):
+    """
+    Produce and save the train / dev / test splits.
+
+    With ``args.full_train`` the whole 'train' file is the training split and the 'dev'
+    file is the dev split; otherwise the 'train' file is divided at ``args.train_ratio``.
+    The test split always comes from the 'dev' file.
+    :param args: parsed command-line options
+    """
+    out_dir = args.target_dir
+    if not os.path.exists(out_dir):
+        os.makedirs(out_dir)
+
+    if args.full_train:
+        train_split = prepro_each(args, 'train')
+        dev_split = prepro_each(args, 'dev')
+    else:
+        train_split = prepro_each(args, 'train', 0.0, args.train_ratio)
+        dev_split = prepro_each(args, 'train', args.train_ratio, 1.0)
+    test_split = prepro_each(args, 'dev')
+
+    print("saving ...")
+    for split_name, (data, shared) in (('train', train_split), ('dev', dev_split), ('test', test_split)):
+        save(args, data, shared, split_name)
+
+
+def save(args, data, shared, data_type):
+    """
+    Write one split to ``data_<data_type>.json`` and ``shared_<data_type>.json``.
+
+    :param args: options object; only ``args.target_dir`` is read
+    :param data: JSON-serializable per-question data
+    :param shared: JSON-serializable data shared between questions
+    :param data_type: split name used in the two file names
+    """
+    data_path = os.path.join(args.target_dir, "data_{}.json".format(data_type))
+    shared_path = os.path.join(args.target_dir, "shared_{}.json".format(data_type))
+    # Context managers make sure both files are flushed and closed even if dumping fails.
+    with open(data_path, 'w') as data_fh:
+        json.dump(data, data_fh)
+    with open(shared_path, 'w') as shared_fh:
+        json.dump(shared, shared_fh)
+
+
+def get_word2vec(args, word_counter):
+    """
+    Collect GloVe vectors for the words present in ``word_counter``.
+
+    Reads ``glove.<glove_corpus>.<glove_vec_size>d.txt`` from ``args.glove_dir``. Each line
+    is a word followed by space-separated floats. A vector is stored under the first of
+    word / capitalized / lower-cased / upper-cased form that is found in ``word_counter``.
+    :param args: options object providing glove_dir, glove_corpus and glove_vec_size
+    :param word_counter: container of vocabulary words (only membership is tested)
+    :return: dict mapping vocabulary word to its vector (list of floats)
+    """
+    glove_path = os.path.join(args.glove_dir, "glove.{}.{}d.txt".format(args.glove_corpus, args.glove_vec_size))
+    sizes = {'6B': int(4e5), '42B': int(1.9e6), '840B': int(2.2e6), '2B': int(1.2e6)}
+    # The line count only feeds the progress bar, so an unlisted corpus tag should not
+    # abort the run; tqdm accepts total=None.
+    total = sizes.get(args.glove_corpus)
+    word2vec_dict = {}
+    # GloVe files contain non-ASCII tokens; decode them as UTF-8 regardless of the locale.
+    with open(glove_path, 'r', encoding='utf-8') as fh:
+        for line in tqdm(fh, total=total):
+            array = line.strip().split(" ")
+            word = array[0]
+            vector = list(map(float, array[1:]))
+            if word in word_counter:
+                word2vec_dict[word] = vector
+            elif word.capitalize() in word_counter:
+                word2vec_dict[word.capitalize()] = vector
+            elif word.lower() in word_counter:
+                word2vec_dict[word.lower()] = vector
+            elif word.upper() in word_counter:
+                word2vec_dict[word.upper()] = vector
+
+    print("{}/{} of word vocab have corresponding vectors in {}".format(len(word2vec_dict), len(word_counter), glove_path))
+    return word2vec_dict
+
+
+def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0):
+    """
+    Convert one slice of an augmented SQuAD file into the ``data`` / ``shared`` dicts.
+
+    Reads ``<source_dir>/<data_type>-v1.0-aug.json`` and processes the articles whose
+    index lies in ``[round(n * start_ratio), round(n * stop_ratio))``.
+    :param args: options object (source_dir, debug and the GloVe settings are used)
+    :param data_type: prefix of the source file name, e.g. 'train' or 'dev'
+    :param start_ratio: fraction of the article list at which the slice starts
+    :param stop_ratio: fraction of the article list at which the slice stops
+    :return: ``(data, shared)``; ``data`` holds per-question lists (q, cq, y, ids, idxs,
+        answerss, and the ``*x``-style [article index, paragraph index] references) and
+        ``shared`` holds the per-paragraph contents plus counters and word vectors
+    """
+    source_path = os.path.join(args.source_dir, "{}-v1.0-aug.json".format(data_type))
+    # Close the source file as soon as it has been parsed instead of leaking the handle.
+    with open(source_path, 'r') as fh:
+        source_data = json.load(fh)
+
+    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
+    x, cx, tx, stx = [], [], [], []
+    answerss = []
+    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
+    pos_counter = Counter()
+    start_ai = int(round(len(source_data['data']) * start_ratio))
+    stop_ai = int(round(len(source_data['data']) * stop_ratio))
+    # ai counts from 0 within the slice, so it indexes x / cx / tx / stx directly.
+    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
+        xp, cxp, txp, stxp = [], [], [], []
+        x.append(xp)
+        cx.append(cxp)
+        tx.append(txp)
+        stx.append(stxp)
+        for pi, para in enumerate(article['paragraphs']):
+            # Words of each sentence: first field of every node of the dependency parse.
+            xi = []
+            for dep in para['deps']:
+                if dep is None:
+                    xi.append([])
+                else:
+                    xi.append([node[0] for node in dep[0]])
+            cxi = [[list(xijk) for xijk in xij] for xij in xi]
+            xp.append(xi)
+            cxp.append(cxi)
+            txp.append(para['consts'])
+            stxp.append([str(load_compressed_tree(s)) for s in para['consts']])
+            trees = map(nltk.tree.Tree.fromstring, para['consts'])
+            for tree in trees:
+                for subtree in tree.subtrees():
+                    pos_counter[subtree.label()] += 1
+
+            # Context words are counted once per question attached to the paragraph.
+            for xij in xi:
+                for xijk in xij:
+                    word_counter[xijk] += len(para['qas'])
+                    lower_word_counter[xijk.lower()] += len(para['qas'])
+                    for xijkl in xijk:
+                        char_counter[xijkl] += len(para['qas'])
+
+            rxi = [ai, pi]
+            assert len(x) - 1 == ai
+            assert len(x[ai]) - 1 == pi
+            for qa in para['qas']:
+                dep = qa['dep']
+                qi = [] if dep is None else [node[0] for node in dep[0]]
+                cqi = [list(qij) for qij in qi]
+                yi = []
+                answers = []
+                for answer in qa['answers']:
+                    answers.append(answer['text'])
+                    # Missing word positions fall back to the span [0, 0] .. [0, 1].
+                    yi0 = answer['answer_word_start'] or [0, 0]
+                    yi1 = answer['answer_word_stop'] or [0, 1]
+                    assert len(xi[yi0[0]]) > yi0[1]
+                    assert len(xi[yi1[0]]) >= yi1[1]
+                    yi.append([yi0, yi1])
+
+                for qij in qi:
+                    word_counter[qij] += 1
+                    lower_word_counter[qij.lower()] += 1
+                    for qijk in qij:
+                        char_counter[qijk] += 1
+
+                q.append(qi)
+                cq.append(cqi)
+                y.append(yi)
+                rx.append(rxi)
+                rcx.append(rxi)
+                ids.append(qa['id'])
+                idxs.append(len(idxs))
+                answerss.append(answers)
+
+            # NOTE(review): this leaves only the paragraph loop, so debug mode still
+            # visits the first paragraph of every article - confirm that is intended.
+            if args.debug:
+                break
+
+    word2vec_dict = get_word2vec(args, word_counter)
+    lower_word2vec_dict = get_word2vec(args, lower_word_counter)
+
+    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, '*tx': rx, '*stx': rx,
+            'idxs': idxs, 'ids': ids, 'answerss': answerss}
+    shared = {'x': x, 'cx': cx, 'tx': tx, 'stx': stx,
+              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
+              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict, 'pos_counter': pos_counter}
+
+    return data, shared
+
+
+# Script entry point: parse the arguments and preprocess only when executed directly.
+if __name__ == "__main__":
+    main()
--- a/tensorflow/SQuAD/squad/utils.py
+++ b/tensorflow/SQuAD/squad/utils.py
@ -0,0 +1,146 @@
+import re
+import numpy as np
+
+
+def get_2d_spans(text, tokenss):
+    """
+    Locate every token of a sentence-split, tokenized text as a character span.
+
+    Tokens are searched left to right; each search starts where the previous token ended,
+    so repeated tokens map to successive occurrences.
+    :param text: the raw text
+    :param tokenss: list of sentences, each a list of token strings, in text order
+    :return: list (per sentence) of lists of ``(start, stop)`` character offsets
+    :raises Exception: if a token cannot be found at or after the current position
+    """
+    spanss = []
+    cur_idx = 0
+    for tokens in tokenss:
+        spans = []
+        for token in tokens:
+            # Search once; the result is both the failure check and the span start.
+            start = text.find(token, cur_idx)
+            if start < 0:
+                print(tokens)
+                print("{} {} {}".format(token, cur_idx, text))
+                raise Exception("token {!r} not found at or after index {}".format(token, cur_idx))
+            cur_idx = start + len(token)
+            spans.append((start, cur_idx))
+        spanss.append(spans)
+    return spanss
+
+
+def get_word_span(context, wordss, start, stop):
+    """
+    Map a character range onto the words it overlaps.
+
+    :param context: the raw text
+    :param wordss: list of sentences, each a list of words
+    :param start: first character offset of the range
+    :param stop: character offset one past the end of the range
+    :return: ``(first, last)`` where ``first`` is the (sent_idx, word_idx) of the first
+        overlapping word and ``last`` is (sent_idx, word_idx + 1) of the last one
+    """
+    spanss = get_2d_spans(context, wordss)
+    overlapping = [(sent_idx, word_idx)
+                   for sent_idx, spans in enumerate(spanss)
+                   for word_idx, span in enumerate(spans)
+                   if span[0] < stop and start < span[1]]
+
+    assert len(overlapping) > 0, "{} {} {} {}".format(context, spanss, start, stop)
+    first, last = overlapping[0], overlapping[-1]
+    return first, (last[0], last[1] + 1)
+
+
+def get_phrase(context, wordss, span):
+    """
+    Obtain phrase as substring of context given start and stop indices in word level
+    :param context: the raw text the words were taken from
+    :param wordss: list of sentences, each a list of words, in text order
+    :param span: pair ``(start, stop)``; each is ``[sent_idx, word_idx]`` and stop is
+        exclusive (the phrase ends with the word just before it)
+    :return: the substring of context from the first to the last word of the span
+    """
+    start, stop = span
+    flat_start = get_flat_idx(wordss, start)
+    flat_stop = get_flat_idx(wordss, stop)
+    # Flatten in linear time; sum(wordss, []) copies the growing list for every sentence.
+    words = [word for sent_words in wordss for word in sent_words]
+    char_idx = 0
+    char_start, char_stop = None, None
+    for word_idx, word in enumerate(words):
+        char_idx = context.find(word, char_idx)
+        assert char_idx >= 0
+        if word_idx == flat_start:
+            char_start = char_idx
+        char_idx += len(word)
+        if word_idx == flat_stop - 1:
+            char_stop = char_idx
+    assert char_start is not None
+    assert char_stop is not None
+    return context[char_start:char_stop]
+
+
+def get_flat_idx(wordss, idx):
+    """
+    Turn a (sent_idx, word_idx) position into an index over all words concatenated.
+    :param wordss: list of sentences, each a sized collection of words
+    :param idx: indexable pair, sentence index first, word index second
+    :return: number of words in the preceding sentences plus the word index
+    """
+    preceding = 0
+    for words in wordss[:idx[0]]:
+        preceding += len(words)
+    return preceding + idx[1]
+
+
+def get_word_idx(context, wordss, idx):
+    """
+    Return the character offset at which a word starts in the context.
+    :param context: the raw text
+    :param wordss: list of sentences, each a list of words
+    :param idx: indexable pair, sentence index first, word index second
+    :return: start offset of the addressed word
+    """
+    spanss = get_2d_spans(context, wordss)
+    sent_spans = spanss[idx[0]]
+    word_span = sent_spans[idx[1]]
+    return word_span[0]
+
+
+def process_tokens(temp_tokens):
+    """
+    Split tokens further at dash, slash, tilde, quote and degree characters.
+
+    The separators are kept as tokens of their own. Because the split uses a capturing
+    group, empty strings appear where a separator sits at a token boundary
+    (e.g. "-" becomes ["", "-", ""]).
+    :param temp_tokens: iterable of token strings
+    :return: flat list of the resulting pieces, in order
+    """
+    # \u2013 is en-dash. Used for number to number
+    # "-" stays first so that it is a literal inside the character class, not a range.
+    separators = ("-", "\u2212", "\u2014", "\u2013", "/", "~", '"', "'", "\u201C", "\u2019", "\u201D", "\u2018", "\u00B0")
+    # Build and compile the pattern once instead of once per token.
+    splitter = re.compile("([{}])".format("".join(separators)))
+    tokens = []
+    for token in temp_tokens:
+        tokens.extend(splitter.split(token))
+    return tokens
+
+
+def get_best_span(ypi, yp2i):
+    """
+    Pick the single-sentence span maximising start score times stop score.
+
+    For every stop position the best start at or before it (first maximum) is used.
+    :param ypi: per-sentence sequences of start scores
+    :param yp2i: per-sentence sequences of stop scores
+    :return: ``(((sent, start), (sent, stop + 1)), score)``; when no product exceeds 0
+        the result is ``(((0, 0), (0, 2)), 0.0)``
+    """
+    top_score = 0
+    top_sent = 0
+    top_start, top_stop = 0, 1
+    for sent_idx, (start_scores, stop_scores) in enumerate(zip(ypi, yp2i)):
+        best_start = 0
+        for stop_idx in range(len(start_scores)):
+            # Advance the running argmax only on a strictly larger start score.
+            if start_scores[best_start] < start_scores[stop_idx]:
+                best_start = stop_idx
+            candidate = start_scores[best_start] * stop_scores[stop_idx]
+            if candidate > top_score:
+                top_score = candidate
+                top_sent = sent_idx
+                top_start, top_stop = best_start, stop_idx
+    return ((top_sent, top_start), (top_sent, top_stop + 1)), float(top_score)
+
+
+def get_best_span_wy(wypi, th):
+    """
+    Find the run of consecutive above-threshold words with the highest mean score.
+
+    The threshold is capped at the largest score in ``wypi``. Runs never cross a
+    sentence boundary.
+    :param wypi: per-sentence sequences of word scores
+    :param th: score threshold
+    :return: ``(((sent, start), (sent, stop)), mean_score)`` of the best run, stop exclusive
+    """
+    threshold = min(th, np.max(wypi))
+    chunk_spans = []
+    scores = []
+    run_start = None
+    run_sum = 0
+    run_len = 0
+    for f, sent_scores in enumerate(wypi):
+        for j, value in enumerate(sent_scores):
+            if value >= threshold:
+                if run_start is None:
+                    run_start = (f, j)
+                run_sum += value
+                run_len += 1
+                continue
+            if run_start is None:
+                continue
+            # A below-threshold word closes the open run just before it.
+            chunk_spans.append((run_start, (f, j)))
+            scores.append(run_sum / run_len)
+            run_start, run_sum, run_len = None, 0, 0
+        if run_start is not None:
+            # The run reaches the end of the sentence; j is still its last word index.
+            chunk_spans.append((run_start, (f, j + 1)))
+            scores.append(run_sum / run_len)
+            run_start, run_sum, run_len = None, 0, 0
+
+    best = max(zip(chunk_spans, scores), key=lambda pair: pair[1])
+    return best
+
+
+def get_span_score_pairs(ypi, yp2i):
+    """
+    Enumerate every within-sentence span together with its score.
+    :param ypi: per-sentence sequences of start scores
+    :param yp2i: per-sentence sequences of stop scores
+    :return: list of ``(((sent, start), (sent, stop + 1)), start_score * stop_score)``
+        for every start and every stop at or after it
+    """
+    return [(((f, j), (f, k + 1)), start_scores[j] * stop_scores[k])
+            for f, (start_scores, stop_scores) in enumerate(zip(ypi, yp2i))
+            for j in range(len(start_scores))
+            for k in range(j, len(stop_scores))]
+
+
--- a/tensorflow/SQuAD/tree/init.py
+++ b/tensorflow/SQuAD/tree/init.py
--- a/tensorflow/SQuAD/tree/cli.py
+++ b/tensorflow/SQuAD/tree/cli.py
@ -0,0 +1,57 @@
+import os
+from pprint import pprint
+
+import tensorflow as tf
+
+from tree.main import main as m
+
+# Command-line flags; the bracketed value in each help string repeats the default.
+flags = tf.app.flags
+
+# Run identification and data location
+flags.DEFINE_string("model_name", "tree", "Model name [tree]")
+flags.DEFINE_string("data_dir", "data/squad", "Data dir [data/squad]")
+flags.DEFINE_integer("run_id", 0, "Run ID [0]")
+
+# Training schedule
+flags.DEFINE_integer("batch_size", 128, "Batch size [128]")
+flags.DEFINE_float("init_lr", 0.5, "Initial learning rate [0.5]")
+flags.DEFINE_integer("num_epochs", 50, "Total number of epochs for training [50]")
+flags.DEFINE_integer("num_steps", 0, "Number of steps [0]")
+flags.DEFINE_integer("eval_num_batches", 100, "eval num batches [100]")
+flags.DEFINE_integer("load_step", 0, "load step [0]")
+flags.DEFINE_integer("early_stop", 4, "early stop [4]")
+
+# Run mode, logging and checkpointing
+flags.DEFINE_string("mode", "test", "train | test | forward [test]")
+flags.DEFINE_boolean("load", True, "load saved data? [True]")
+flags.DEFINE_boolean("progress", True, "Show progress? [True]")
+flags.DEFINE_integer("log_period", 100, "Log period [100]")
+flags.DEFINE_integer("eval_period", 1000, "Eval period [1000]")
+flags.DEFINE_integer("save_period", 1000, "Save Period [1000]")
+flags.DEFINE_float("decay", 0.9, "Exponential moving average decay [0.9]")
+
+flags.DEFINE_boolean("draft", False, "Draft for quick testing? [False]")
+
+# Model hyper-parameters
+flags.DEFINE_integer("hidden_size", 32, "Hidden size [32]")
+flags.DEFINE_float("input_keep_prob", 0.5, "Input keep prob [0.5]")
+flags.DEFINE_integer("char_emb_size", 8, "Char emb size [8]")
+flags.DEFINE_integer("char_filter_height", 5, "Char filter height [5]")
+# The help text used to advertise 0.001 although the default is 0.0001.
+flags.DEFINE_float("wd", 0.0001, "Weight decay [0.0001]")
+flags.DEFINE_bool("lower_word", True, "lower word [True]")
+flags.DEFINE_bool("dump_eval", True, "dump eval? [True]")
+
+# Thresholds
+flags.DEFINE_integer("word_count_th", 100, "word count th [100]")
+flags.DEFINE_integer("char_count_th", 500, "char count th [500]")
+flags.DEFINE_integer("sent_size_th", 64, "sent size th [64]")
+flags.DEFINE_integer("num_sents_th", 8, "num sents th [8]")
+flags.DEFINE_integer("ques_size_th", 64, "ques size th [64]")
+flags.DEFINE_integer("word_size_th", 16, "word size th [16]")
+flags.DEFINE_integer("tree_height_th", 16, "tree height th [16]")
+
+
+def main(_):
+    """
+    Derive the output directory from the flags and start the run.
+
+    The directory is ``out/<model_name>/<run_id>`` with the run id zero-padded to two digits.
+    :param _: unused positional argument
+    """
+    config = flags.FLAGS
+    run_dir = str(config.run_id).zfill(2)
+    config.out_dir = os.path.join("out", config.model_name, run_dir)
+    m(config)
+
+# Script entry point. Presumably tf.app.run() parses the flags and then calls main(_) - confirm against the TensorFlow docs.
+if __name__ == "__main__":
+    tf.app.run()
--- a/tensorflow/SQuAD/tree/evaluator.py
+++ b/tensorflow/SQuAD/tree/evaluator.py
@ -0,0 +1,197 @@
+import numpy as np
+import tensorflow as tf
+
+from tree.read_data import DataSet
+from my.nltk_utils import span_f1
+
+
+class Evaluation(object):
+    """
+    Predictions of one evaluation pass, mergeable with ``+`` and ``sum()``.
+
+    ``dict`` mirrors the attributes; ``summaries`` starts as None.
+    """
+
+    def __init__(self, data_type, global_step, idxs, yp):
+        self.data_type = data_type
+        self.global_step = global_step
+        self.idxs = idxs
+        self.yp = yp
+        self.num_examples = len(yp)
+        self.summaries = None
+        self.dict = {}
+        self.dict['data_type'] = data_type
+        self.dict['global_step'] = global_step
+        self.dict['yp'] = yp
+        self.dict['idxs'] = idxs
+        self.dict['num_examples'] = self.num_examples
+
+    def __repr__(self):
+        return "{} step {}".format(self.data_type, self.global_step)
+
+    def __add__(self, other):
+        # sum() starts from the integer 0, which acts as the neutral element here.
+        if other == 0:
+            return self
+        assert self.data_type == other.data_type
+        assert self.global_step == other.global_step
+        merged_yp = self.yp + other.yp
+        merged_idxs = self.idxs + other.idxs
+        return Evaluation(self.data_type, self.global_step, merged_idxs, merged_yp)
+
+    def __radd__(self, other):
+        # Dispatch through __add__ so subclasses that override it are honoured.
+        return self.__add__(other)
+
+
+class LabeledEvaluation(Evaluation):
+    """Evaluation that additionally carries the ground-truth labels ``y``."""
+
+    def __init__(self, data_type, global_step, idxs, yp, y):
+        super(LabeledEvaluation, self).__init__(data_type, global_step, idxs, yp)
+        self.y = y
+        self.dict['y'] = y
+
+    def __add__(self, other):
+        # sum() starts from the integer 0, which acts as the neutral element here.
+        if other == 0:
+            return self
+        assert self.data_type == other.data_type
+        assert self.global_step == other.global_step
+        merged_yp = self.yp + other.yp
+        merged_y = self.y + other.y
+        merged_idxs = self.idxs + other.idxs
+        return LabeledEvaluation(self.data_type, self.global_step, merged_idxs, merged_yp, merged_y)
+
+
+class AccuracyEvaluation(LabeledEvaluation):
+    """
+    Labeled evaluation that also tracks per-example correctness, loss and accuracy.
+
+    Builds two scalar TensorFlow summaries (loss and accuracy) in ``summaries``.
+    """
+    def __init__(self, data_type, global_step, idxs, yp, y, correct, loss):
+        super(AccuracyEvaluation, self).__init__(data_type, global_step, idxs, yp, y)
+        self.loss = loss
+        self.correct = correct
+        # Fraction of truthy entries in correct; raises ZeroDivisionError if correct is empty.
+        self.acc = sum(correct) / len(correct)
+        self.dict['loss'] = loss
+        self.dict['correct'] = correct
+        self.dict['acc'] = self.acc
+        # NOTE(review): the tags are hard-coded to 'dev/...' regardless of data_type.
+        loss_summary = tf.Summary(value=[tf.Summary.Value(tag='dev/loss', simple_value=self.loss)])
+        acc_summary = tf.Summary(value=[tf.Summary.Value(tag='dev/acc', simple_value=self.acc)])
+        self.summaries = [loss_summary, acc_summary]
+
+    def __repr__(self):
+        return "{} step {}: accuracy={}, loss={}".format(self.data_type, self.global_step, self.acc, self.loss)
+
+    def __add__(self, other):
+        # sum() starts from the integer 0, which acts as the neutral element here.
+        if other == 0:
+            return self
+        assert self.data_type == other.data_type
+        assert self.global_step == other.global_step
+        new_idxs = self.idxs + other.idxs
+        new_yp = self.yp + other.yp
+        new_y = self.y + other.y
+        new_correct = self.correct + other.correct
+        # Combine the two losses weighted by the number of examples behind each.
+        new_loss = (self.loss * self.num_examples + other.loss * other.num_examples) / len(new_correct)
+        return AccuracyEvaluation(self.data_type, self.global_step, new_idxs, new_yp, new_y, new_correct, new_loss)
+
+
+class Evaluator(object):
+    """Runs the model on batches and collects the predictions as ``Evaluation`` objects."""
+
+    def __init__(self, config, model):
+        self.config = config
+        self.model = model
+
+    def get_evaluation(self, sess, batch):
+        """
+        Evaluate a single batch.
+        :param sess: session used to run the model
+        :param batch: pair ``(idxs, data_set)``
+        :return: Evaluation holding the first ``data_set.num_examples`` predictions
+        """
+        idxs, data_set = batch
+        feed_dict = self.model.get_feed_dict(data_set, False, supervised=False)
+        fetches = [self.model.global_step, self.model.yp]
+        global_step, yp = sess.run(fetches, feed_dict=feed_dict)
+        predictions = yp[:data_set.num_examples]
+        return Evaluation(data_set.data_type, int(global_step), idxs, predictions.tolist())
+
+    def get_evaluation_from_batches(self, sess, batches):
+        """
+        Evaluate every batch and merge the results with ``sum()``.
+        :return: the merged evaluation (the integer 0 when ``batches`` is empty)
+        """
+        return sum(self.get_evaluation(sess, batch) for batch in batches)
+
+
+class LabeledEvaluator(Evaluator):
+    """Evaluator that returns the labels from the feed dict alongside the predictions."""
+    def get_evaluation(self, sess, batch):
+        """
+        Evaluate a single batch.
+        :param sess: session used to run the model
+        :param batch: pair ``(idxs, data_set)``
+        :return: LabeledEvaluation with predictions and labels
+        """
+        idxs, data_set = batch
+        # NOTE(review): the feed dict is built with supervised=False, yet y is read from it
+        # below - confirm that get_feed_dict still provides model.y in that case.
+        feed_dict = self.model.get_feed_dict(data_set, False, supervised=False)
+        global_step, yp = sess.run([self.model.global_step, self.model.yp], feed_dict=feed_dict)
+        # Only yp is cut to the number of real examples; y is used as fed.
+        yp = yp[:data_set.num_examples]
+        y = feed_dict[self.model.y]
+        e = LabeledEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), y.tolist())
+        return e
+
+
+class AccuracyEvaluator(LabeledEvaluator):
+    """Evaluator that also fetches the loss and scores each prediction with ``compare``."""
+    def get_evaluation(self, sess, batch):
+        """
+        Evaluate a single batch.
+        :param sess: session used to run the model
+        :param batch: pair ``(idxs, data_set)``; data_set must be a DataSet
+        :return: AccuracyEvaluation with predictions, labels, correctness flags and loss
+        """
+        idxs, data_set = batch
+        assert isinstance(data_set, DataSet)
+        feed_dict = self.model.get_feed_dict(data_set, False)
+        global_step, yp, loss = sess.run([self.model.global_step, self.model.yp, self.model.loss], feed_dict=feed_dict)
+        y = feed_dict[self.model.y]
+        yp = yp[:data_set.num_examples]
+        # self.__class__.compare lets subclasses swap in their own comparison;
+        # zip stops at the shorter of y and the truncated yp.
+        correct = [self.__class__.compare(yi, ypi) for yi, ypi in zip(y, yp)]
+        e = AccuracyEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), y.tolist(), correct, float(loss))
+        return e
+
+    @staticmethod
+    def compare(yi, ypi):
+        """Return True when label and prediction have their maximum at the same index."""
+        return int(np.argmax(yi)) == int(np.argmax(ypi))
+
+
+class AccuracyEvaluator2(AccuracyEvaluator):
+    """Accuracy evaluator that flattens label and prediction before comparing argmaxes."""
+
+    @staticmethod
+    def compare(yi, ypi):
+        """Return True when the flattened label and prediction peak at the same index."""
+        true_pos = int(np.argmax(yi.flatten()))
+        pred_pos = int(np.argmax(ypi.flatten()))
+        return true_pos == pred_pos
+
+
+class TempEvaluation(AccuracyEvaluation):
+    """
+    Accuracy evaluation extended with a second label/prediction pair and span F1 scores.
+
+    Adds an F1 scalar summary to the summaries created by the parent class.
+    """
+    def __init__(self, data_type, global_step, idxs, yp, yp2, y, y2, correct, loss, f1s):
+        super(TempEvaluation, self).__init__(data_type, global_step, idxs, yp, y, correct, loss)
+        self.y2 = y2
+        self.yp2 = yp2
+        self.f1s = f1s
+        # Mean of the per-example F1 scores.
+        self.f1 = float(np.mean(f1s))
+        self.dict['y2'] = y2
+        self.dict['yp2'] = yp2
+        self.dict['f1s'] = f1s
+        self.dict['f1'] = self.f1
+        f1_summary = tf.Summary(value=[tf.Summary.Value(tag='dev/f1', simple_value=self.f1)])
+        self.summaries.append(f1_summary)
+
+    def __add__(self, other):
+        # sum() starts from the integer 0, which acts as the neutral element here.
+        if other == 0:
+            return self
+        assert self.data_type == other.data_type
+        assert self.global_step == other.global_step
+        new_idxs = self.idxs + other.idxs
+        new_yp = self.yp + other.yp
+        new_yp2 = self.yp2 + other.yp2
+        new_y = self.y + other.y
+        new_y2 = self.y2 + other.y2
+        new_correct = self.correct + other.correct
+        new_f1s = self.f1s + other.f1s
+        # Combine the two losses weighted by the number of examples behind each.
+        new_loss = (self.loss * self.num_examples + other.loss * other.num_examples) / len(new_correct)
+        return TempEvaluation(self.data_type, self.global_step, new_idxs, new_yp, new_yp2, new_y, new_y2, new_correct, new_loss, new_f1s)
+
+
+class TempEvaluator(LabeledEvaluator):
+    """Evaluator for models with two outputs (yp, yp2); reports correctness and span F1."""
+    def get_evaluation(self, sess, batch):
+        """
+        Evaluate a single batch.
+        :param sess: session used to run the model
+        :param batch: pair ``(idxs, data_set)``; data_set must be a DataSet
+        :return: TempEvaluation with both predictions, both labels, correctness, loss and F1s
+        """
+        idxs, data_set = batch
+        assert isinstance(data_set, DataSet)
+        feed_dict = self.model.get_feed_dict(data_set, False)
+        global_step, yp, yp2, loss = sess.run([self.model.global_step, self.model.yp, self.model.yp2, self.model.loss], feed_dict=feed_dict)
+        y, y2 = feed_dict[self.model.y], feed_dict[self.model.y2]
+        yp, yp2 = yp[:data_set.num_examples], yp2[:data_set.num_examples]
+        correct = [self.__class__.compare(yi, y2i, ypi, yp2i) for yi, y2i, ypi, yp2i in zip(y, y2, yp, yp2)]
+        f1s = [self.__class__.span_f1(yi, y2i, ypi, yp2i) for yi, y2i, ypi, yp2i in zip(y, y2, yp, yp2)]
+        e = TempEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), yp2.tolist(), y.tolist(), y2.tolist(), correct, float(loss), f1s)
+        return e
+
+    @staticmethod
+    def compare(yi, y2i, ypi, yp2i):
+        """Return True when both flattened argmaxes (y vs yp and y2 vs yp2) agree."""
+        i = int(np.argmax(yi.flatten()))
+        j = int(np.argmax(ypi.flatten()))
+        k = int(np.argmax(y2i.flatten()))
+        l = int(np.argmax(yp2i.flatten()))
+        # print(i, j, i == j)
+        return i == j and k == l
+
+    @staticmethod
+    def span_f1(yi, y2i, ypi, yp2i):
+        """Return the F1 between the true and predicted (start, stop + 1) flat spans."""
+        true_span = (np.argmax(yi.flatten()), np.argmax(y2i.flatten())+1)
+        pred_span = (np.argmax(ypi.flatten()), np.argmax(yp2i.flatten())+1)
+        # Inside the method body this name resolves to the module-level span_f1 import,
+        # not to this static method.
+        f1 = span_f1(true_span, pred_span)
+        return f1
+
--- a/tensorflow/SQuAD/tree/graph_handler.py
+++ b/tensorflow/SQuAD/tree/graph_handler.py
@ -0,0 +1,54 @@
+import json
+from json import encoder
+import os
+
+import tensorflow as tf
+
+from tree.evaluator import Evaluation
+from my.utils import short_floats
+
+
+class GraphHandler(object):
+    """Handles variable initialisation, checkpoint save/load, summaries and eval dumps."""
+    def __init__(self, config):
+        self.config = config
+        self.saver = tf.train.Saver()
+        # Stays None until initialize() runs in 'train' mode.
+        self.writer = None
+        self.save_path = os.path.join(config.save_dir, config.model_name)
+
+    def initialize(self, sess):
+        """Restore a checkpoint if config.load is set, otherwise initialise all variables."""
+        if self.config.load:
+            self._load(sess)
+        else:
+            sess.run(tf.global_variables_initializer())
+
+        # The summary writer is only created for training runs.
+        if self.config.mode == 'train':
+            self.writer = tf.summary.FileWriter(self.config.log_dir, graph=tf.get_default_graph())
+
+    def save(self, sess, global_step=None):
+        """Save the session's variables under save_path, passing global_step on to the saver."""
+        self.saver.save(sess, self.save_path, global_step=global_step)
+
+    def _load(self, sess):
+        """
+        Restore variables from a checkpoint in config.save_dir.
+
+        Uses "<model_name>-<load_step>" when config.load_step > 0, otherwise the path
+        recorded in the checkpoint state of the directory.
+        """
+        config = self.config
+        if config.load_step > 0:
+            save_path = os.path.join(config.save_dir, "{}-{}".format(config.model_name, config.load_step))
+        else:
+            save_dir = config.save_dir
+            checkpoint = tf.train.get_checkpoint_state(save_dir)
+            assert checkpoint is not None, "cannot load checkpoint at {}".format(save_dir)
+            save_path = checkpoint.model_checkpoint_path
+        print("Loading saved model from {}".format(save_path))
+        self.saver.restore(sess, save_path)
+
+    def add_summary(self, summary, global_step):
+        """Write one summary; requires the writer created by initialize() in 'train' mode."""
+        self.writer.add_summary(summary, global_step)
+
+    def add_summaries(self, summaries, global_step):
+        """Write each summary of an iterable via add_summary."""
+        for summary in summaries:
+            self.add_summary(summary, global_step)
+
+    def dump_eval(self, e, precision=2):
+        """
+        Dump e.dict as JSON to "<eval_dir>/<data_type>-<global_step>.json".
+
+        The step is zero-padded to six digits; the dict is passed through short_floats
+        with the given precision first.
+        """
+        assert isinstance(e, Evaluation)
+        path = os.path.join(self.config.eval_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6)))
+        with open(path, 'w') as fh:
+            json.dump(short_floats(e.dict, precision), fh)
+
--- a/tensorflow/SQuAD/tree/main.py
+++ b/tensorflow/SQuAD/tree/main.py
@ -0,0 +1,187 @@
+import argparse
+import json
+import math
+import os
+import shutil
+from pprint import pprint
+
+import tensorflow as tf
+from tqdm import tqdm
+import numpy as np
+
+from tree.evaluator import AccuracyEvaluator2, Evaluator
+from tree.graph_handler import GraphHandler
+from tree.model import Model
+from tree.trainer import Trainer
+
+from tree.read_data import load_metadata, read_data, get_squad_data_filter, update_config
+
+
+def main(config):
+    """
+    Prepare the output directories and dispatch on ``config.mode``.
+    :param config: configuration object; mode must be 'train', 'test' or 'forward'
+    :raises ValueError: for any other mode
+    """
+    set_dirs(config)
+    mode = config.mode
+    if mode not in ('train', 'test', 'forward'):
+        raise ValueError("invalid value for 'mode': {}".format(mode))
+    if mode == 'train':
+        _train(config)
+    elif mode == 'test':
+        _test(config)
+    else:
+        _forward(config)
+
+
+def _config_draft(config):
+    """
+    Shrink the run to a quick smoke test when ``config.draft`` is set.
+
+    Does nothing otherwise.
+    """
+    if not config.draft:
+        return
+    draft_settings = (
+        ('num_steps', 10),
+        ('eval_period', 10),
+        ('log_period', 1),
+        ('save_period', 10),
+        ('eval_num_batches', 1),
+    )
+    for name, value in draft_settings:
+        setattr(config, name, value)
+
+
+def _train(config):
+    """
+    Train the model, with periodic checkpointing, dev evaluation and early stopping.
+    :param config: configuration object; extended in place (e.g. with emb_mat)
+    """
+    # load_metadata(config, 'train')  # this updates the config file according to metadata file
+
+    data_filter = get_squad_data_filter(config)
+    train_data = read_data(config, 'train', config.load, data_filter=data_filter)
+    dev_data = read_data(config, 'dev', True, data_filter=data_filter)
+    update_config(config, [train_data, dev_data])
+
+    _config_draft(config)
+
+    # Embedding matrix: the GloVe vector where one exists for the word index,
+    # otherwise a sample from a standard multivariate normal.
+    word2vec_dict = train_data.shared['lower_word2vec'] if config.lower_word else train_data.shared['word2vec']
+    word2idx_dict = train_data.shared['word2idx']
+    idx2vec_dict = {word2idx_dict[word]: vec for word, vec in word2vec_dict.items() if word in word2idx_dict}
+    print("{}/{} unique words have corresponding glove vectors.".format(len(idx2vec_dict), len(word2idx_dict)))
+    emb_mat = np.array([idx2vec_dict[idx] if idx in idx2vec_dict
+                        else np.random.multivariate_normal(np.zeros(config.word_emb_size), np.eye(config.word_emb_size))
+                        for idx in range(config.word_vocab_size)])
+    config.emb_mat = emb_mat
+
+    # construct model graph and variables (using default graph)
+    pprint(config.__flags, indent=2)
+    model = Model(config)
+    trainer = Trainer(config, model)
+    evaluator = AccuracyEvaluator2(config, model)
+    graph_handler = GraphHandler(config)  # controls all tensors and variables in the graph, including loading /saving
+
+    # Variables
+    sess = tf.Session()
+    graph_handler.initialize(sess)
+
+    # begin training
+    # An explicit (non-zero) num_steps wins over the value derived from num_epochs.
+    num_steps = config.num_steps or int(config.num_epochs * train_data.num_examples / config.batch_size)
+    max_acc = 0
+    noupdate_count = 0
+    global_step = 0
+    for _, batch in tqdm(train_data.get_batches(config.batch_size, num_batches=num_steps, shuffle=True), total=num_steps):
+        global_step = sess.run(model.global_step) + 1  # +1 because all calculations are done after step
+        get_summary = global_step % config.log_period == 0
+        loss, summary, train_op = trainer.step(sess, batch, get_summary=get_summary)
+        if get_summary:
+            graph_handler.add_summary(summary, global_step)
+
+        # Occasional evaluation and saving
+        if global_step % config.save_period == 0:
+            graph_handler.save(sess, global_step=global_step)
+        if global_step % config.eval_period == 0:
+            # Evaluate on at most eval_num_batches dev batches (all of them if that is 0).
+            num_batches = math.ceil(dev_data.num_examples / config.batch_size)
+            if 0 < config.eval_num_batches < num_batches:
+                num_batches = config.eval_num_batches
+            e = evaluator.get_evaluation_from_batches(
+                sess, tqdm(dev_data.get_batches(config.batch_size, num_batches=num_batches), total=num_batches))
+            graph_handler.add_summaries(e.summaries, global_step)
+            # Early stopping: quit after early_stop consecutive evaluations without a new best accuracy.
+            if e.acc > max_acc:
+                max_acc = e.acc
+                noupdate_count = 0
+            else:
+                noupdate_count += 1
+                if noupdate_count == config.early_stop:
+                    # Note: this skips the dump_eval call below for the final evaluation.
+                    break
+            if config.dump_eval:
+                graph_handler.dump_eval(e)
+    # Save the final state unless the last step was already saved inside the loop.
+    if global_step % config.save_period != 0:
+        graph_handler.save(sess, global_step=global_step)
+
+
+def _test(config):
+    """
+    Evaluate the model on the 'test' data, print the result and optionally dump it.
+    :param config: configuration object
+    """
+    test_data = read_data(config, 'test', True)
+    update_config(config, [test_data])
+
+    _config_draft(config)
+
+    pprint(config.__flags, indent=2)
+    model = Model(config)
+    evaluator = AccuracyEvaluator2(config, model)
+    graph_handler = GraphHandler(config)  # controls all tensors and variables in the graph, including loading /saving
+
+    sess = tf.Session()
+    graph_handler.initialize(sess)
+
+    # Use every batch unless a smaller positive eval_num_batches caps the count.
+    num_batches = math.ceil(test_data.num_examples / config.batch_size)
+    if 0 < config.eval_num_batches < num_batches:
+        num_batches = config.eval_num_batches
+    batches = test_data.get_batches(config.batch_size, num_batches=num_batches)
+    progress = tqdm(batches, total=num_batches)
+    e = evaluator.get_evaluation_from_batches(sess, progress)
+    print(e)
+    if config.dump_eval:
+        graph_handler.dump_eval(e)
+
+
+def _forward(config):
+    """
+    Run the model on the 'forward' data without labels; print and optionally dump the result.
+    :param config: configuration object
+    """
+
+    forward_data = read_data(config, 'forward', True)
+    # NOTE(review): unlike _test, update_config(config, [forward_data]) is not called
+    # here - confirm the config already carries the values Model(config) reads.
+
+    _config_draft(config)
+
+    # The attribute is __flags (as in _train and _test); __flag raised AttributeError.
+    pprint(config.__flags, indent=2)
+    model = Model(config)
+    evaluator = Evaluator(config, model)
+    graph_handler = GraphHandler(config)  # controls all tensors and variables in the graph, including loading /saving
+
+    sess = tf.Session()
+    graph_handler.initialize(sess)
+
+    # Use every batch unless a smaller positive eval_num_batches caps the count.
+    num_batches = math.ceil(forward_data.num_examples / config.batch_size)
+    if 0 < config.eval_num_batches < num_batches:
+        num_batches = config.eval_num_batches
+    e = evaluator.get_evaluation_from_batches(sess, tqdm(forward_data.get_batches(config.batch_size, num_batches=num_batches), total=num_batches))
+    print(e)
+    if config.dump_eval:
+        graph_handler.dump_eval(e)
+
+
+def set_dirs(config):
+    """
+    Derive save/log/eval directories from ``config.out_dir`` and make sure they exist.
+
+    When ``config.load`` is falsy an existing out_dir is removed first, so the run
+    starts from scratch. Sets config.save_dir, config.log_dir and config.eval_dir.
+    :param config: configuration object providing load and out_dir
+    """
+    # create directories
+    if not config.load and os.path.exists(config.out_dir):
+        shutil.rmtree(config.out_dir)
+
+    config.save_dir = os.path.join(config.out_dir, "save")
+    config.log_dir = os.path.join(config.out_dir, "log")
+    config.eval_dir = os.path.join(config.out_dir, "eval")
+    # Each directory is checked and created on its own. Previously the log_dir check
+    # guarded the creation of eval_dir, so log_dir was never created and mkdir failed
+    # when eval_dir existed while log_dir did not.
+    for path in (config.out_dir, config.save_dir, config.log_dir, config.eval_dir):
+        if not os.path.exists(path):
+            os.makedirs(path)
+
+
+def _get_args():
+    """
+    Parse the command line, which consists of one positional ``config_path``.
+    :return: the parsed ``argparse.Namespace``
+    """
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument("config_path")
+    parsed = arg_parser.parse_args()
+    return parsed
+
+
+class Config(object):
+    """Plain attribute bag: every keyword argument becomes an instance attribute."""
+
+    def __init__(self, **entries):
+        vars(self).update(entries)
+
+
+def _run():
+    """Load the JSON config named on the command line and run ``main`` with it."""
+    args = _get_args()
+    with open(args.config_path, 'r') as fh:
+        config = Config(**json.load(fh))
+    # main() runs outside the with-block, so the config file is closed before the
+    # (potentially long) run starts instead of staying open throughout.
+    main(config)
+
+
+# Script entry point: run from a JSON config file only when executed directly.
+if __name__ == "__main__":
+    _run()
--- a/tensorflow/SQuAD/tree/model.py
+++ b/tensorflow/SQuAD/tree/model.py
@ -0,0 +1,248 @@
+import nltk
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.ops.rnn_cell import BasicLSTMCell
+
+from my.nltk_utils import tree2matrix, find_max_f1_subtree, load_compressed_tree, set_span
+from tree.read_data import DataSet
+from my.tensorflow import exp_mask, get_initializer
+from my.tensorflow.nn import linear
+from my.tensorflow.rnn import bidirectional_dynamic_rnn, dynamic_rnn
+from my.tensorflow.rnn_cell import SwitchableDropoutWrapper, NoOpCell, TreeRNNCell
+
+
+class Model(object):
+    def __init__(self, config):
+        self.config = config
+        self.global_step = tf.get_variable('global_step', shape=[], dtype='int32',
+                                           initializer=tf.constant_initializer(0), trainable=False)
+
+        # Define forward inputs here
+        N, M, JX, JQ, VW, VC, W, H = \
+            config.batch_size, config.max_num_sents, config.max_sent_size, \
+            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.max_word_size, config.max_tree_height
+        self.x = tf.placeholder('int32', [None, M, JX], name='x')
+        self.cx = tf.placeholder('int32', [None, M, JX, W], name='cx')
+        self.q = tf.placeholder('int32', [None, JQ], name='q')
+        self.cq = tf.placeholder('int32', [None, JQ, W], name='cq')
+        self.tx = tf.placeholder('int32', [None, M, H, JX], name='tx')
+        self.tx_edge_mask = tf.placeholder('bool', [None, M, H, JX, JX], name='tx_edge_mask')
+        self.y = tf.placeholder('bool', [None, M, H, JX], name='y')
+        self.is_train = tf.placeholder('bool', [], name='is_train')
+
+        # Define misc
+
+        # Forward outputs / loss inputs
+        self.logits = None
+        self.yp = None
+        self.var_list = None
+
+        # Loss outputs
+        self.loss = None
+
+        self._build_forward()
+        self._build_loss()
+
+        self.ema_op = self._get_ema_op()
+        self.summary = tf.summary.merge_all()
+
+    def _build_forward(self):
+        config = self.config
+        N, M, JX, JQ, VW, VC, d, dc, W = \
+            config.batch_size, config.max_num_sents, config.max_sent_size, \
+            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
+            config.char_emb_size, config.max_word_size
+        H = config.max_tree_height
+
+        x_mask = self.x > 0
+        q_mask = self.q > 0
+        tx_mask = self.tx > 0  # [N, M, H, JX]
+
+        with tf.variable_scope("char_emb"):
+            char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
+            Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
+            Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
+
+            filter = tf.get_variable("filter", shape=[1, config.char_filter_height, dc, d], dtype='float')
+            bias = tf.get_variable("bias", shape=[d], dtype='float')
+            strides = [1, 1, 1, 1]
+            Acx = tf.reshape(Acx, [-1, JX, W, dc])
+            Acq = tf.reshape(Acq, [-1, JQ, W, dc])
+            xxc = tf.nn.conv2d(Acx, filter, strides, "VALID") + bias  # [N*M, JX, W/filter_stride, d]
+            qqc = tf.nn.conv2d(Acq, filter, strides, "VALID") + bias  # [N, JQ, W/filter_stride, d]
+            xxc = tf.reshape(tf.reduce_max(tf.nn.relu(xxc), 2), [-1, M, JX, d])
+            qqc = tf.reshape(tf.reduce_max(tf.nn.relu(qqc), 2), [-1, JQ, d])
+
+        with tf.variable_scope("word_emb"):
+            if config.mode == 'train':
+                word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, config.word_emb_size], initializer=get_initializer(config.emb_mat))
+            else:
+                word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, config.word_emb_size], dtype='float')
+            Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
+            Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
+            # Ax = linear([Ax], d, False, scope='Ax_reshape')
+            # Aq = linear([Aq], d, False, scope='Aq_reshape')
+
+        xx = tf.concat(axis=3, values=[xxc, Ax])  # [N, M, JX, 2d]
+        qq = tf.concat(axis=2, values=[qqc, Aq])  # [N, JQ, 2d]
+        D = d + config.word_emb_size
+
+        with tf.variable_scope("pos_emb"):
+            pos_emb_mat = tf.get_variable("pos_emb_mat", shape=[config.pos_vocab_size, d], dtype='float')
+            Atx = tf.nn.embedding_lookup(pos_emb_mat, self.tx)  # [N, M, H, JX, d]
+
+        cell = BasicLSTMCell(D, state_is_tuple=True)
+        cell = SwitchableDropoutWrapper(cell, self.is_train, input_keep_prob=config.input_keep_prob)
+        x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 2)  # [N, M]
+        q_len = tf.reduce_sum(tf.cast(q_mask, 'int32'), 1)  # [N]
+
+        with tf.variable_scope("rnn"):
+            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len, dtype='float', scope='start')  # [N, M, JX, 2d]
+            tf.get_variable_scope().reuse_variables()
+            (fw_us, bw_us), (_, (fw_u, bw_u)) = bidirectional_dynamic_rnn(cell, cell, qq, q_len, dtype='float', scope='start')  # [N, J, d], [N, d]
+            u = (fw_u + bw_u) / 2.0
+            h = (fw_h + bw_h) / 2.0
+
+        with tf.variable_scope("h"):
+            no_op_cell = NoOpCell(D)
+            tree_rnn_cell = TreeRNNCell(no_op_cell, d, tf.reduce_max)
+            initial_state = tf.reshape(h, [N*M*JX, D])  # [N*M*JX, D]
+            inputs = tf.concat(axis=4, values=[Atx, tf.cast(self.tx_edge_mask, 'float')])  # [N, M, H, JX, d+JX]
+            inputs = tf.reshape(tf.transpose(inputs, [0, 1, 3, 2, 4]), [N*M*JX, H, d + JX])  # [N*M*JX, H, d+JX]
+            length = tf.reshape(tf.reduce_sum(tf.cast(tx_mask, 'int32'), 2), [N*M*JX])
+            # length = tf.reshape(tf.reduce_sum(tf.cast(tf.transpose(tx_mask, [0, 1, 3, 2]), 'float'), 3), [-1])
+            h, _ = dynamic_rnn(tree_rnn_cell, inputs, length, initial_state=initial_state)  # [N*M*JX, H, D]
+            h = tf.transpose(tf.reshape(h, [N, M, JX, H, D]), [0, 1, 3, 2, 4])  # [N, M, H, JX, D]
+
+        u = tf.expand_dims(tf.expand_dims(tf.expand_dims(u, 1), 1), 1)  # [N, 1, 1, 1, 4d]
+        dot = linear(h * u, 1, True, squeeze=True, scope='dot')  # [N, M, H, JX]
+        # self.logits = tf.reshape(dot, [N, M * H * JX])
+        self.logits = tf.reshape(exp_mask(dot, tx_mask), [N, M * H * JX])  # [N, M, H, JX]
+        self.yp = tf.reshape(tf.nn.softmax(self.logits), [N, M, H, JX])
+
+    def _build_loss(self):
+        config = self.config
+        N, M, JX, JQ, VW, VC = \
+            config.batch_size, config.max_num_sents, config.max_sent_size, \
+            config.max_ques_size, config.word_vocab_size, config.char_vocab_size
+        H = config.max_tree_height
+        ce_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
+            logits=self.logits, labels=tf.cast(tf.reshape(self.y, [N, M * H * JX]), 'float')))
+        tf.add_to_collection('losses', ce_loss)
+        self.loss = tf.add_n(tf.get_collection('losses'), name='loss')
+        tf.summary.scalar(self.loss.op.name, self.loss)
+        tf.add_to_collection('ema/scalar', self.loss)
+
+    def _get_ema_op(self):
+        ema = tf.train.ExponentialMovingAverage(self.config.decay)
+        ema_op = ema.apply(tf.get_collection("ema/scalar") + tf.get_collection("ema/histogram"))
+        for var in tf.get_collection("ema/scalar"):
+            ema_var = ema.average(var)
+            tf.summary.scalar(ema_var.op.name, ema_var)
+        for var in tf.get_collection("ema/histogram"):
+            ema_var = ema.average(var)
+            tf.summary.histogram(ema_var.op.name, ema_var)
+        return ema_op
+
+    def get_loss(self):
+        return self.loss
+
+    def get_global_step(self):
+        return self.global_step
+
+    def get_var_list(self):
+        return self.var_list
+
+    def get_feed_dict(self, batch, is_train, supervised=True):
+        assert isinstance(batch, DataSet)
+        config = self.config
+        N, M, JX, JQ, VW, VC, d, W, H = \
+            config.batch_size, config.max_num_sents, config.max_sent_size, \
+            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, config.max_word_size, \
+            config.max_tree_height
+        feed_dict = {}
+
+        x = np.zeros([N, M, JX], dtype='int32')
+        cx = np.zeros([N, M, JX, W], dtype='int32')
+        q = np.zeros([N, JQ], dtype='int32')
+        cq = np.zeros([N, JQ, W], dtype='int32')
+        tx = np.zeros([N, M, H, JX], dtype='int32')
+        tx_edge_mask = np.zeros([N, M, H, JX, JX], dtype='bool')
+
+        feed_dict[self.x] = x
+        feed_dict[self.cx] = cx
+        feed_dict[self.q] = q
+        feed_dict[self.cq] = cq
+        feed_dict[self.tx] = tx
+        feed_dict[self.tx_edge_mask] = tx_edge_mask
+        feed_dict[self.is_train] = is_train
+
+        def _get_word(word):
+            d = batch.shared['word2idx']
+            for each in (word, word.lower(), word.capitalize(), word.upper()):
+                if each in d:
+                    return d[each]
+            return 1
+
+        def _get_char(char):
+            d = batch.shared['char2idx']
+            if char in d:
+                return d[char]
+            return 1
+
+        def _get_pos(tree):
+            d = batch.shared['pos2idx']
+            if tree.label() in d:
+                return d[tree.label()]
+            return 1
+
+        for i, xi in enumerate(batch.data['x']):
+            for j, xij in enumerate(xi):
+                for k, xijk in enumerate(xij):
+                    x[i, j, k] = _get_word(xijk)
+
+        for i, cxi in enumerate(batch.data['cx']):
+            for j, cxij in enumerate(cxi):
+                for k, cxijk in enumerate(cxij):
+                    for l, cxijkl in enumerate(cxijk):
+                        cx[i, j, k, l] = _get_char(cxijkl)
+                        if l + 1 == config.max_word_size:
+                            break
+
+        for i, qi in enumerate(batch.data['q']):
+            for j, qij in enumerate(qi):
+                q[i, j] = _get_word(qij)
+
+        for i, cqi in enumerate(batch.data['cq']):
+            for j, cqij in enumerate(cqi):
+                for k, cqijk in enumerate(cqij):
+                    cq[i, j, k] = _get_char(cqijk)
+                    if k + 1 == config.max_word_size:
+                        break
+
+        for i, txi in enumerate(batch.data['stx']):
+            for j, txij in enumerate(txi):
+                txij_mat, txij_mask = tree2matrix(nltk.tree.Tree.fromstring(txij), _get_pos, row_size=H, col_size=JX)
+                tx[i, j, :, :], tx_edge_mask[i, j, :, :, :] = txij_mat, txij_mask
+
+        if supervised:
+            y = np.zeros([N, M, H, JX], dtype='bool')
+            feed_dict[self.y] = y
+            for i, yi in enumerate(batch.data['y']):
+                start_idx, stop_idx = yi
+                sent_idx = start_idx[0]
+                if start_idx[0] == stop_idx[0]:
+                    span = [start_idx[1], stop_idx[1]]
+                else:
+                    span = [start_idx[1], len(batch.data['x'][sent_idx])]
+                tree = nltk.tree.Tree.fromstring(batch.data['stx'][i][sent_idx])
+                set_span(tree)
+                best_subtree = find_max_f1_subtree(tree, span)
+
+                def _get_y(t):
+                    return t == best_subtree
+
+                yij, _ = tree2matrix(tree, _get_y, H, JX, dtype='bool')
+                y[i, sent_idx, :, :] = yij
+
+        return feed_dict
--- a/tensorflow/SQuAD/tree/read_data.py
+++ b/tensorflow/SQuAD/tree/read_data.py
@ -0,0 +1,159 @@
+import json
+import os
+import random
+import itertools
+import math
+
+import nltk
+
+from my.nltk_utils import load_compressed_tree
+from my.utils import index
+
+
+class DataSet(object):
+    def __init__(self, data, data_type, shared=None, valid_idxs=None):
+        total_num_examples = len(next(iter(data.values())))
+        self.data = data  # e.g. {'X': [0, 1, 2], 'Y': [2, 3, 4]}
+        self.data_type = data_type
+        self.shared = shared
+        self.valid_idxs = range(total_num_examples) if valid_idxs is None else valid_idxs
+        self.num_examples = len(self.valid_idxs)
+
+    def get_batches(self, batch_size, num_batches=None, shuffle=False):
+        num_batches_per_epoch = int(math.ceil(self.num_examples / batch_size))
+        if num_batches is None:
+            num_batches = num_batches_per_epoch
+        num_epochs = int(math.ceil(num_batches / num_batches_per_epoch))
+
+        idxs = itertools.chain.from_iterable(random.sample(self.valid_idxs, len(self.valid_idxs))
+                                             if shuffle else self.valid_idxs
+                                             for _ in range(num_epochs))
+        for _ in range(num_batches):
+            batch_idxs = tuple(itertools.islice(idxs, batch_size))
+            batch_data = {}
+            for key, val in self.data.items():
+                if key.startswith('*'):
+                    assert self.shared is not None
+                    shared_key = key[1:]
+                    batch_data[shared_key] = [index(self.shared[shared_key], val[idx]) for idx in batch_idxs]
+                else:
+                    batch_data[key] = list(map(val.__getitem__, batch_idxs))
+
+            batch_ds = DataSet(batch_data, self.data_type, shared=self.shared)
+            yield batch_idxs, batch_ds
+
+
+class SquadDataSet(DataSet):
+    def __init__(self, data, data_type, shared=None, valid_idxs=None):
+        super(SquadDataSet, self).__init__(data, data_type, shared=shared, valid_idxs=valid_idxs)
+
+
+def load_metadata(config, data_type):
+    metadata_path = os.path.join(config.data_dir, "metadata_{}.json".format(data_type))
+    with open(metadata_path, 'r') as fh:
+        metadata = json.load(fh)
+        for key, val in metadata.items():
+            config.__setattr__(key, val)
+        return metadata
+
+
+def read_data(config, data_type, ref, data_filter=None):
+    data_path = os.path.join(config.data_dir, "data_{}.json".format(data_type))
+    shared_path = os.path.join(config.data_dir, "shared_{}.json".format(data_type))
+    with open(data_path, 'r') as fh:
+        data = json.load(fh)
+    with open(shared_path, 'r') as fh:
+        shared = json.load(fh)
+
+    num_examples = len(next(iter(data.values())))
+    if data_filter is None:
+        valid_idxs = range(num_examples)
+    else:
+        mask = []
+        keys = data.keys()
+        values = data.values()
+        for vals in zip(*values):
+            each = {key: val for key, val in zip(keys, vals)}
+            mask.append(data_filter(each, shared))
+        valid_idxs = [idx for idx in range(len(mask)) if mask[idx]]
+
+    print("Loaded {}/{} examples from {}".format(len(valid_idxs), num_examples, data_type))
+
+    shared_path = os.path.join(config.out_dir, "shared.json")
+    if not ref:
+        word_counter = shared['lower_word_counter'] if config.lower_word else shared['word_counter']
+        char_counter = shared['char_counter']
+        pos_counter = shared['pos_counter']
+        shared['word2idx'] = {word: idx + 2 for idx, word in
+                              enumerate(word for word, count in word_counter.items()
+                                        if count > config.word_count_th)}
+        shared['char2idx'] = {char: idx + 2 for idx, char in
+                              enumerate(char for char, count in char_counter.items()
+                                        if count > config.char_count_th)}
+        shared['pos2idx'] = {pos: idx + 2 for idx, pos in enumerate(pos_counter.keys())}
+        NULL = "-NULL-"
+        UNK = "-UNK-"
+        shared['word2idx'][NULL] = 0
+        shared['word2idx'][UNK] = 1
+        shared['char2idx'][NULL] = 0
+        shared['char2idx'][UNK] = 1
+        shared['pos2idx'][NULL] = 0
+        shared['pos2idx'][UNK] = 1
+        json.dump({'word2idx': shared['word2idx'], 'char2idx': shared['char2idx'],
+                   'pos2idx': shared['pos2idx']}, open(shared_path, 'w'))
+    else:
+        new_shared = json.load(open(shared_path, 'r'))
+        for key, val in new_shared.items():
+            shared[key] = val
+
+    data_set = DataSet(data, data_type, shared=shared, valid_idxs=valid_idxs)
+    return data_set
+
+
+def get_squad_data_filter(config):
+    def data_filter(data_point, shared):
+        assert shared is not None
+        rx, rcx, q, cq, y  = (data_point[key] for key in ('*x', '*cx', 'q', 'cq', 'y'))
+        x, cx, stx = shared['x'], shared['cx'], shared['stx']
+        if len(q) > config.ques_size_th:
+            return False
+        xi = x[rx[0]][rx[1]]
+        if len(xi) > config.num_sents_th:
+            return False
+        if any(len(xij) > config.sent_size_th for xij in xi):
+            return False
+        stxi = stx[rx[0]][rx[1]]
+        if any(nltk.tree.Tree.fromstring(s).height() > config.tree_height_th for s in stxi):
+            return False
+        return True
+    return data_filter
+
+
+def update_config(config, data_sets):
+    config.max_num_sents = 0
+    config.max_sent_size = 0
+    config.max_ques_size = 0
+    config.max_word_size = 0
+    config.max_tree_height = 0
+    for data_set in data_sets:
+        data = data_set.data
+        shared = data_set.shared
+        for idx in data_set.valid_idxs:
+            rx = data['*x'][idx]
+            q = data['q'][idx]
+            sents = shared['x'][rx[0]][rx[1]]
+            trees = map(nltk.tree.Tree.fromstring, shared['stx'][rx[0]][rx[1]])
+            config.max_tree_height = max(config.max_tree_height, max(tree.height() for tree in trees))
+            config.max_num_sents = max(config.max_num_sents, len(sents))
+            config.max_sent_size = max(config.max_sent_size, max(map(len, sents)))
+            config.max_word_size = max(config.max_word_size, max(len(word) for sent in sents for word in sent))
+            if len(q) > 0:
+                config.max_ques_size = max(config.max_ques_size, len(q))
+                config.max_word_size = max(config.max_word_size, max(len(word) for word in q))
+
+    config.max_word_size = min(config.max_word_size, config.word_size_th)
+
+    config.char_vocab_size = len(data_sets[0].shared['char2idx'])
+    config.word_emb_size = len(next(iter(data_sets[0].shared['word2vec'].values())))
+    config.word_vocab_size = len(data_sets[0].shared['word2idx'])
+    config.pos_vocab_size = len(data_sets[0].shared['pos2idx'])
--- a/tensorflow/SQuAD/tree/templates/visualizer.html
+++ b/tensorflow/SQuAD/tree/templates/visualizer.html
@ -0,0 +1,67 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>{{ title }}</title>
+    <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.0/jquery.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/chroma-js/1.1.1/chroma.min.js"></script>
+    <script>
+        $(document).ready(function(){
+            $(".att").each(function() {
+                // var val = parseFloat($(this).text());
+                var val = parseFloat($(this).attr("color"));
+                var scale = chroma.scale(['white', 'red']);
+                var color = scale(val).hex();
+                $(this).attr("bgcolor", color);
+            });
+        })
+    </script>
+</head>
+<style>
+    table, th, td {border: 1px solid black}
+</style>
+<body>
+    <h2>{{ title }}</h2>
+    <table>
+        <tr>
+            <th>ID</th>
+            <th>Question</th>
+            <th>Answer</th>
+            <th>Paragraph</th>
+        </tr>
+        {% for row in rows %}
+            <tr>
+                <td>{{ row.id }}</td>
+                <td>
+                    {% for qj in row.ques %}
+                        {{ qj }}
+                    {% endfor %}
+                </td>
+                <td>{{ row.a }}</td>
+                <td>
+                    <table>
+                    {% for xj, yj, y2j, ypj, yp2j in zip(row.para, row.y, row.y2, row.yp, row.yp2) %}
+                        <tr>
+                        {% for xjk, yjk, y2jk, ypjk in zip(xj, yj, y2j, ypj) %}
+                            <td class="att" color="{{ ypjk }}">
+                            {% if yjk or y2jk %}
+                                <b>{{ xjk }}</b>
+                            {% else %}
+                                {{ xjk }}
+                            {% endif %}
+                            </td>
+                        {% endfor %}
+                        </tr>
+                        <tr>
+                        {% for xjk, yp2jk in zip(xj, yp2j) %}
+                            <td class="att" color="{{ yp2jk }}">-</td>
+                        {% endfor %}
+                        </tr>
+                    {% endfor %}
+                    </table>
+                </td>
+            </tr>
+        {% endfor %}
+    </table>
+</body>
+</html>
--- a/tensorflow/SQuAD/tree/test.ipynb
+++ b/tensorflow/SQuAD/tree/test.ipynb
@ -0,0 +1,294 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import nltk\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(S (PRP I) (VP (VBP am) (NNP Sam)) (. .))\n",
+      "(PRP I)\n",
+      "(VP (VBP am) (NNP Sam))\n",
+      "(VBP am)\n",
+      "(NNP Sam)\n",
+      "(. .)\n",
+      "(S (PRP I) (VP (VBP am) (NNP Sam)) (. .))\n"
+     ]
+    }
+   ],
+   "source": [
+    "string = \"(ROOT(S(NP (PRP I))(VP (VBP am)(NP (NNP Sam)))(. .)))\"\n",
+    "tree = nltk.tree.Tree.fromstring(string)\n",
+    "\n",
+    "def load_compressed_tree(s):\n",
+    "\n",
+    "    def compress_tree(tree):\n",
+    "        if len(tree) == 1:\n",
+    "            if isinstance(tree[0], nltk.tree.Tree):\n",
+    "                return compress_tree(tree[0])\n",
+    "            else:\n",
+    "                return tree\n",
+    "        else:\n",
+    "            for i, t in enumerate(tree):\n",
+    "                tree[i] = compress_tree(t)\n",
+    "            return tree\n",
+    "\n",
+    "    return compress_tree(nltk.tree.Tree.fromstring(s))\n",
+    "tree = load_compressed_tree(string)\n",
+    "for t in tree.subtrees():\n",
+    "    print(t)\n",
+    "    \n",
+    "print(str(tree))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(ROOT I am Sam .)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(tree.flatten())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['ROOT', 'S', 'NP', 'PRP', 'VP', 'VBP', 'NP', 'NNP', '.']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(list(t.label() for t in tree.subtrees()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "d = json.load(open(\"data/squad/shared_dev.json\", 'r'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "73"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(d['pos_counter'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'#': 6,\n",
+       " '$': 80,\n",
+       " \"''\": 1291,\n",
+       " ',': 14136,\n",
+       " '-LRB-': 1926,\n",
+       " '-RRB-': 1925,\n",
+       " '.': 9505,\n",
+       " ':': 1455,\n",
+       " 'ADJP': 3426,\n",
+       " 'ADVP': 4936,\n",
+       " 'CC': 9300,\n",
+       " 'CD': 6216,\n",
+       " 'CONJP': 191,\n",
+       " 'DT': 26286,\n",
+       " 'EX': 288,\n",
+       " 'FRAG': 107,\n",
+       " 'FW': 96,\n",
+       " 'IN': 32564,\n",
+       " 'INTJ': 12,\n",
+       " 'JJ': 21452,\n",
+       " 'JJR': 563,\n",
+       " 'JJS': 569,\n",
+       " 'LS': 7,\n",
+       " 'LST': 1,\n",
+       " 'MD': 1051,\n",
+       " 'NAC': 19,\n",
+       " 'NN': 34750,\n",
+       " 'NNP': 28392,\n",
+       " 'NNPS': 1400,\n",
+       " 'NNS': 16716,\n",
+       " 'NP': 91636,\n",
+       " 'NP-TMP': 236,\n",
+       " 'NX': 108,\n",
+       " 'PDT': 89,\n",
+       " 'POS': 1451,\n",
+       " 'PP': 33278,\n",
+       " 'PRN': 2085,\n",
+       " 'PRP': 2320,\n",
+       " 'PRP$': 1959,\n",
+       " 'PRT': 450,\n",
+       " 'QP': 838,\n",
+       " 'RB': 7611,\n",
+       " 'RBR': 301,\n",
+       " 'RBS': 252,\n",
+       " 'ROOT': 9587,\n",
+       " 'RP': 454,\n",
+       " 'RRC': 19,\n",
+       " 'S': 21557,\n",
+       " 'SBAR': 5009,\n",
+       " 'SBARQ': 6,\n",
+       " 'SINV': 135,\n",
+       " 'SQ': 5,\n",
+       " 'SYM': 17,\n",
+       " 'TO': 5167,\n",
+       " 'UCP': 143,\n",
+       " 'UH': 15,\n",
+       " 'VB': 4197,\n",
+       " 'VBD': 8377,\n",
+       " 'VBG': 3570,\n",
+       " 'VBN': 7218,\n",
+       " 'VBP': 2897,\n",
+       " 'VBZ': 4146,\n",
+       " 'VP': 33696,\n",
+       " 'WDT': 1368,\n",
+       " 'WHADJP': 5,\n",
+       " 'WHADVP': 439,\n",
+       " 'WHNP': 1927,\n",
+       " 'WHPP': 153,\n",
+       " 'WP': 482,\n",
+       " 'WP$': 50,\n",
+       " 'WRB': 442,\n",
+       " 'X': 23,\n",
+       " '``': 1269}"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "d['pos_counter']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[False False False False]\n",
+      " [False  True False False]\n",
+      " [False False False False]]\n",
+      "[[0 2 2 0]\n",
+      " [2 2 0 2]\n",
+      " [2 0 0 0]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from my.nltk_utils import tree2matrix, load_compressed_tree, find_max_f1_subtree, set_span\n",
+    "string = \"(ROOT(S(NP (PRP I))(VP (VBP am)(NP (NNP Sam)))(. .)))\"\n",
+    "tree = load_compressed_tree(string)\n",
+    "span = (1, 3)\n",
+    "set_span(tree)\n",
+    "subtree = find_max_f1_subtree(tree, span)\n",
+    "f = lambda t: t == subtree\n",
+    "g = lambda t: 1 if isinstance(t, str) else 2\n",
+    "a, b = tree2matrix(tree, f, dtype='bool')\n",
+    "c, d = tree2matrix(tree, g, dtype='int32')\n",
+    "print(a)\n",
+    "print(c)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
--- a/tensorflow/SQuAD/tree/trainer.py
+++ b/tensorflow/SQuAD/tree/trainer.py
@ -0,0 +1,36 @@
+import tensorflow as tf
+
+from tree.model import Model
+
+
+class Trainer(object):
+    def __init__(self, config, model):
+        assert isinstance(model, Model)
+        self.config = config
+        self.model = model
+        self.opt = tf.train.AdagradOptimizer(config.init_lr)
+        self.loss = model.get_loss()
+        self.var_list = model.get_var_list()
+        self.global_step = model.get_global_step()
+        self.ema_op = model.ema_op
+        self.summary = model.summary
+        self.grads = self.opt.compute_gradients(self.loss, var_list=self.var_list)
+        opt_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
+
+        # Define train op
+        with tf.control_dependencies([opt_op]):
+            self.train_op = tf.group(self.ema_op)
+
+    def get_train_op(self):
+        return self.train_op
+
+    def step(self, sess, batch, get_summary=False):
+        assert isinstance(sess, tf.Session)
+        feed_dict = self.model.get_feed_dict(batch, True)
+        if get_summary:
+            loss, summary, train_op = \
+                sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict)
+        else:
+            loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
+            summary = None
+        return loss, summary, train_op
--- a/tensorflow/SQuAD/tree/visualizer.py
+++ b/tensorflow/SQuAD/tree/visualizer.py
@ -0,0 +1,122 @@
+import shutil
+from collections import OrderedDict
+import http.server
+import socketserver
+import argparse
+import json
+import os
+import numpy as np
+from tqdm import tqdm
+
+from jinja2 import Environment, FileSystemLoader
+
+
+def bool_(string):
+    if string == 'True':
+        return True
+    elif string == 'False':
+        return False
+    else:
+        raise Exception()
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_name", type=str, default='basic')
+    parser.add_argument("--data_type", type=str, default='dev')
+    parser.add_argument("--step", type=int, default=5000)
+    parser.add_argument("--template_name", type=str, default="visualizer.html")
+    parser.add_argument("--num_per_page", type=int, default=100)
+    parser.add_argument("--data_dir", type=str, default="data/squad")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--host", type=str, default="0.0.0.0")
+    parser.add_argument("--open", type=str, default='False')
+    parser.add_argument("--run_id", type=str, default="0")
+
+    args = parser.parse_args()
+    return args
+
+
+def _decode(decoder, sent):
+    return " ".join(decoder[idx] for idx in sent)
+
+
+def accuracy2_visualizer(args):
+    model_name = args.model_name
+    data_type = args.data_type
+    num_per_page = args.num_per_page
+    data_dir = args.data_dir
+    run_id = args.run_id.zfill(2)
+    step = args.step
+
+    eval_path =os.path.join("out", model_name, run_id, "eval", "{}-{}.json".format(data_type, str(step).zfill(6)))
+    eval_ = json.load(open(eval_path, 'r'))
+
+    _id = 0
+    html_dir = "/tmp/list_results%d" % _id
+    while os.path.exists(html_dir):
+        _id += 1
+        html_dir = "/tmp/list_results%d" % _id
+
+    if os.path.exists(html_dir):
+        shutil.rmtree(html_dir)
+    os.mkdir(html_dir)
+
+    cur_dir = os.path.dirname(os.path.realpath(__file__))
+    templates_dir = os.path.join(cur_dir, 'templates')
+    env = Environment(loader=FileSystemLoader(templates_dir))
+    env.globals.update(zip=zip, reversed=reversed)
+    template = env.get_template(args.template_name)
+
+    data_path = os.path.join(data_dir, "data_{}.json".format(data_type))
+    shared_path = os.path.join(data_dir, "shared_{}.json".format(data_type))
+    data = json.load(open(data_path, 'r'))
+    shared = json.load(open(shared_path, 'r'))
+
+    rows = []
+    for i, (idx, yi, ypi) in enumerate(zip(*[eval_[key] for key in ('idxs', 'y', 'yp')])):
+        id_, q, rx = (data[key][idx] for key in ('ids', 'q', '*x'))
+        x = shared['x'][rx[0]][rx[1]]
+        ques = [" ".join(q)]
+        para = [[word for word in sent] for sent in x]
+        row = {
+            'id': id_,
+            'title': "Hello world!",
+            'ques': ques,
+            'para': para,
+            'y': yi,
+            'y2': yi,
+            'yp': ypi,
+            'yp2': ypi,
+            'a': ""
+               }
+        rows.append(row)
+
+        if i % num_per_page == 0:
+            html_path = os.path.join(html_dir, "%s.html" % str(i).zfill(8))
+
+        if (i + 1) % num_per_page == 0 or (i + 1) == len(eval_['y']):
+            var_dict = {'title': "Accuracy Visualization",
+                        'rows': rows
+                        }
+            with open(html_path, "wb") as f:
+                f.write(template.render(**var_dict).encode('UTF-8'))
+            rows = []
+
+    os.chdir(html_dir)
+    port = args.port
+    host = args.host
+    # Overriding to suppress log message
+    class MyHandler(http.server.SimpleHTTPRequestHandler):
+        def log_message(self, format, *args):
+            pass
+    handler = MyHandler
+    httpd = socketserver.TCPServer((host, port), handler)
+    if args.open == 'True':
+        os.system("open http://%s:%d" % (args.host, args.port))
+    print("serving at %s:%d" % (host, port))
+    httpd.serve_forever()
+
+
+# Script entry point: parse the CLI options, then render and serve the pages.
+if __name__ == "__main__":
+    ARGS = get_args()
+    accuracy2_visualizer(ARGS)
--- a/tensorflow/SQuAD/visualization/compare_models.py
+++ b/tensorflow/SQuAD/visualization/compare_models.py
@ -0,0 +1,244 @@
+import numpy as np
+from collections import Counter
+import string
+import re
+import argparse
+import os
+import json
+import nltk
+from matplotlib_venn import venn2
+from matplotlib import pyplot as plt
+
+
+class Question:
+    """One dataset question together with two models' answers and scores.
+
+    ``em`` and ``f1`` are length-2 arrays: index 0 belongs to the first answer
+    passed to :meth:`add_answers`, index 1 to the second.
+    """
+
+    def __init__(self, id, question_text, ground_truth, model_names):
+        """Store the question and precompute its leading n-grams.
+
+        Args:
+            id: Identifier of the question.
+            question_text: Raw question string; it is stored normalized
+                (see :meth:`normalize_answer`).
+            ground_truth: Iterable of reference answer strings.
+            model_names: Display names of the two models being compared.
+        """
+        self.id = id
+        self.question_text = self.normalize_answer(question_text)
+        self.question_tokens = nltk.word_tokenize(self.question_text)
+        # Entry n is the first n tokens joined by spaces, so entry 0 is ''.
+        self.question_head_ngram = [' '.join(self.question_tokens[0:nc]) for nc in range(3)]
+        self.ground_truth = ground_truth
+        self.model_names = model_names
+        self.em = np.zeros(2)
+        self.f1 = np.zeros(2)
+        self.answer_text = []
+
+    def add_answers(self, answer_model_1, answer_model_2):
+        """Record both models' answers and (re)compute their scores.
+
+        The answers replace any previously stored pair, so calling this
+        method again re-scores the question against the new answers instead
+        of silently keeping the first pair.
+        """
+        self.answer_text = [answer_model_1, answer_model_2]
+        self.eval()
+
+    def eval(self):
+        """Fill ``em`` and ``f1`` with the best score over all ground truths."""
+        for model_count in range(2):
+            answer = self.answer_text[model_count]
+            self.em[model_count] = self.metric_max_over_ground_truths(self.exact_match_score, answer, self.ground_truth)
+            self.f1[model_count] = self.metric_max_over_ground_truths(self.f1_score, answer, self.ground_truth)
+
+    def normalize_answer(self, s):
+        """Lower text and remove punctuation, articles and extra whitespace."""
+        exclude = set(string.punctuation)
+        text = s.lower()
+        text = ''.join(ch for ch in text if ch not in exclude)
+        # Articles are replaced by a space so neighbouring words stay apart;
+        # the split/join below then collapses the extra whitespace.
+        text = re.sub(r'\b(a|an|the)\b', ' ', text)
+        return ' '.join(text.split())
+
+    def f1_score(self, prediction, ground_truth):
+        """Return the token-level F1 between two normalized strings.
+
+        Returns 0 when the strings share no token (including the case where
+        either one normalizes to nothing).
+        """
+        prediction_tokens = self.normalize_answer(prediction).split()
+        ground_truth_tokens = self.normalize_answer(ground_truth).split()
+        # Multiset intersection: a token counts as often as it occurs in both.
+        common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
+        num_same = sum(common.values())
+        if num_same == 0:
+            return 0
+        precision = 1.0 * num_same / len(prediction_tokens)
+        recall = 1.0 * num_same / len(ground_truth_tokens)
+        return (2 * precision * recall) / (precision + recall)
+
+    def exact_match_score(self, prediction, ground_truth):
+        """Return True when both strings are equal after normalization."""
+        return self.normalize_answer(prediction) == self.normalize_answer(ground_truth)
+
+    def metric_max_over_ground_truths(self, metric_fn, prediction, ground_truths):
+        """Return the highest ``metric_fn(prediction, gt)`` over ``ground_truths``.
+
+        A question without any ground truth scores 0 instead of raising
+        ``ValueError`` from ``max()`` on an empty sequence.
+        """
+        return max((metric_fn(prediction, ground_truth) for ground_truth in ground_truths), default=0)
+
+
+def safe_dict_access(in_dict, in_key, default_string='some junk string'):
+    """Return ``in_dict[in_key]``, or ``default_string`` when the key is absent."""
+    return in_dict[in_key] if in_key in in_dict else default_string
+
+
+def aggregate_metrics(questions):
+    """Print the overall EM and F1 percentages of both models.
+
+    Args:
+        questions: Mapping of question id to an object exposing ``em``,
+            ``f1`` (indexable by model slot 0/1) and ``model_names``.
+    """
+    num_questions = len(questions)
+    em_percent = np.zeros(2)
+    f1_percent = np.zeros(2)
+
+    for slot in range(2):
+        em_values = np.array([questions[qid].em[slot] for qid in questions])
+        em_percent[slot] = 100 * np.sum(em_values) / num_questions
+        f1_values = np.array([questions[qid].f1[slot] for qid in questions])
+        f1_percent[slot] = 100 * np.sum(f1_values) / num_questions
+
+    # Every question carries the same pair of names; read them off the first.
+    first_id = list(questions.keys())[0]
+    names = questions[first_id].model_names
+    print('\nAggregate Scores:')
+    for slot in range(2):
+        print('Model {0} EM = {1:.2f}'.format(names[slot], em_percent[slot]))
+        print('Model {0} F1 = {1:.2f}'.format(names[slot], f1_percent[slot]))
+
+
+def venn_diagram(questions, output_dir):
+    """Print and plot how the two models' exact-match successes overlap.
+
+    Writes ``venn_diagram.png`` into ``output_dir``.
+
+    Returns:
+        A 5-tuple of question-id lists: correct for model 1, correct for
+        model 2, correct for both, correct only for model 1, correct only
+        for model 2.
+    """
+    right_1 = [qid for qid in questions if questions[qid].em[0] == 1]
+    right_2 = [qid for qid in questions if questions[qid].em[1] == 1]
+    first_id = list(questions.keys())[0]
+    names = questions[first_id].model_names
+    print('\nVenn diagram')
+
+    ids_1 = set(right_1)
+    ids_2 = set(right_2)
+    both = list(ids_1.intersection(ids_2))
+    only_1 = list(ids_1 - ids_2)
+    only_2 = list(ids_2 - ids_1)
+
+    print('{0} answers correctly = {1}'.format(names[0], len(right_1)))
+    print('{0} answers correctly = {1}'.format(names[1], len(right_2)))
+    print('Both answer correctly = {1}'.format(names[0], len(both)))
+    print('{0} correct & {1} incorrect = {2}'.format(names[0], names[1], len(only_1)))
+    print('{0} correct & {1} incorrect = {2}'.format(names[1], names[0], len(only_2)))
+
+    plt.clf()
+    # venn2 takes the region sizes as (only first, only second, both).
+    region_sizes = (len(only_1), len(only_2), len(both))
+    region_labels = ('{0} correct'.format(names[0]), '{0} correct'.format(names[1]), 'Both correct')
+    venn2(
+        subsets=region_sizes,
+        set_labels=region_labels,
+        set_colors=('r', 'b'),
+        alpha=0.3,
+        normalize_to=1
+    )
+    target_path = os.path.join(output_dir, 'venn_diagram.png')
+    plt.savefig(target_path)
+    plt.close()
+    return right_1, right_2, both, only_1, only_2
+
+
+def get_head_ngrams(questions, num_grams):
+    """Collect entry ``num_grams`` of ``question_head_ngram`` from every question.
+
+    The result keeps the mapping's iteration order and may contain repeats.
+    """
+    return [item.question_head_ngram[num_grams] for item in questions.values()]
+
+
+def get_head_ngram_frequencies(questions, head_ngrams, num_grams):
+    """Count how many ``questions`` start with each n-gram in ``head_ngrams``.
+
+    Every n-gram in ``head_ngrams`` gets an entry, 0 if no question has it.
+    A question whose head n-gram is not listed raises ``KeyError``.
+    """
+    frequencies = dict.fromkeys(head_ngrams, 0)
+    for item in questions.values():
+        frequencies[item.question_head_ngram[num_grams]] += 1
+    return frequencies
+
+
+def get_head_ngram_statistics(questions, correct_model1, correct_model2, correct_model1_and_model2, correct_model1_and_not_model2, correct_model2_and_not_model1, output_dir, num_grams=2, top_count=25):
+    """Plot per-model accuracy for the most frequent question head n-grams.
+
+    For the ``top_count`` most frequent head n-grams (fewer if the questions
+    do not contain that many distinct ones) a horizontal bar chart shows the
+    percentage of questions each model answered with an exact match. The
+    chart is saved as ``ngram_stats_<num_grams>.png`` in ``output_dir``.
+
+    Args:
+        questions: Mapping of question id to question object.
+        correct_model1: Ids of questions model 1 answered correctly.
+        correct_model2: Ids of questions model 2 answered correctly.
+        correct_model1_and_model2: Accepted for interface compatibility;
+            not used by the plot.
+        correct_model1_and_not_model2: Accepted for interface compatibility;
+            not used by the plot.
+        correct_model2_and_not_model1: Accepted for interface compatibility;
+            not used by the plot.
+        output_dir: Directory the PNG is written to.
+        num_grams: Index into each question's ``question_head_ngram`` list.
+        top_count: Maximum number of n-grams to show.
+    """
+    head_ngrams = get_head_ngrams(questions, num_grams)
+
+    # Head n-gram frequencies (hnf) over all questions and per model.
+    hnf_all = get_head_ngram_frequencies(questions, head_ngrams, num_grams)
+    hnf_correct_model1 = get_head_ngram_frequencies({qid: questions[qid] for qid in correct_model1}, head_ngrams, num_grams)
+    hnf_correct_model2 = get_head_ngram_frequencies({qid: questions[qid] for qid in correct_model2}, head_ngrams, num_grams)
+
+    sorted_ngrams_all = sorted(hnf_all.items(), key=lambda x: x[1], reverse=True)
+    top_ngrams = [x[0] for x in sorted_ngrams_all[0:top_count]]
+    # There may be fewer distinct n-grams than top_count; size the chart by
+    # what is actually plotted so positions and values always line up.
+    num_bars = len(top_ngrams)
+
+    counts_total = [hnf_all[x] for x in top_ngrams]
+    counts_model1 = [hnf_correct_model1[x] for x in top_ngrams]
+    counts_model2 = [hnf_correct_model2[x] for x in top_ngrams]
+
+    top_ngrams_with_counts = ['{0} ({1})'.format(ngram, count) for ngram, count in zip(top_ngrams, counts_total)]
+
+    # plt.subplots() always opens its own figure, so no prior plt.clf() is
+    # needed (it would only create an extra figure that is never closed).
+    fig, ax = plt.subplots(figsize=(6, 10))
+
+    ylocs = list(range(num_bars))
+    # Every plotted n-gram occurs at least once, so counts_total has no zeros.
+    counts_model1_percent = 100 * np.array(counts_model1) / np.array(counts_total)
+    plt.barh([num_bars - x for x in ylocs], counts_model1_percent, height=0.4, alpha=0.5, color='#EE3224', label=top_ngrams)
+    counts_model2_percent = 100 * np.array(counts_model2) / np.array(counts_total)
+    # Model 2 bars are shifted by one bar height so the pairs sit side by side.
+    plt.barh([num_bars - x + 0.4 for x in ylocs], counts_model2_percent, height=0.4, alpha=0.5, color='#2432EE', label=top_ngrams)
+    ax.set_yticks([num_bars - x + 0.4 for x in ylocs])
+    ax.set_yticklabels(top_ngrams_with_counts)
+    ax.set_ylim([0.5, num_bars + 1])
+    ax.set_xlim([0, 100])
+    plt.subplots_adjust(left=0.28, right=0.9, top=0.9, bottom=0.1)
+    plt.xlabel('Percentage of questions with correct answers')
+    plt.ylabel('Top N-grams')
+    plt.savefig(os.path.join(output_dir, 'ngram_stats_{0}.png'.format(num_grams)))
+    plt.close(fig)
+
+
+def read_json(filename):
+    """Load and return the JSON document stored in ``filename``.
+
+    The file is decoded as UTF-8 explicitly so that non-ASCII text is read
+    the same way regardless of the platform's default locale encoding.
+    """
+    with open(filename, encoding='utf-8') as filepoint:
+        return json.load(filepoint)
+
+
+def compare_models(dataset_file, predictions_m1_file, predictions_m2_file, output_dir, name_m1='Model 1', name_m2='Model 2'):
+    """Score two prediction files against a dataset and write comparison plots.
+
+    Prints aggregate EM/F1 scores and the overlap of correct answers, and
+    saves a Venn diagram plus head unigram/bigram accuracy charts.
+
+    Args:
+        dataset_file: Path to a JSON dataset with a top-level ``data`` list of
+            articles, each holding ``paragraphs`` -> ``qas`` entries.
+        predictions_m1_file: Path to a JSON mapping question id -> answer text
+            for the first model.
+        predictions_m2_file: Same, for the second model.
+        output_dir: Directory for the generated images. ``None`` means the
+            current directory; the directory is created if it does not exist.
+        name_m1: Display name of the first model.
+        name_m2: Display name of the second model.
+    """
+    # The command line leaves the output directory as None when it is not
+    # given; joining paths onto None would raise a TypeError while saving.
+    if output_dir is None:
+        output_dir = os.curdir
+    os.makedirs(output_dir, exist_ok=True)
+
+    dataset = read_json(dataset_file)['data']
+    predictions_m1 = read_json(predictions_m1_file)
+    predictions_m2 = read_json(predictions_m2_file)
+
+    # Read in data
+    total = 0
+    questions = {}
+    for article in dataset:
+        for paragraph in article['paragraphs']:
+            for qa in paragraph['qas']:
+                current_question = Question(id=qa['id'], question_text=qa['question'], ground_truth=[answer['text'] for answer in qa['answers']], model_names=[name_m1, name_m2])
+                # A question missing from a prediction file is scored against
+                # safe_dict_access's placeholder string.
+                current_question.add_answers(answer_model_1=safe_dict_access(predictions_m1, qa['id']), answer_model_2=safe_dict_access(predictions_m2, qa['id']))
+                questions[current_question.id] = current_question
+                total += 1
+    print('Read in {0} questions'.format(total))
+
+    # Aggregate scores
+    aggregate_metrics(questions)
+
+    # Venn diagram
+    correct_model1, correct_model2, correct_model1_and_model2, correct_model1_and_not_model2, correct_model2_and_not_model1 = venn_diagram(questions, output_dir=output_dir)
+
+    # Head Unigram statistics
+    get_head_ngram_statistics(questions, correct_model1, correct_model2, correct_model1_and_model2, correct_model1_and_not_model2,
+                              correct_model2_and_not_model1, output_dir, num_grams=1, top_count=10)
+
+    # Head Bigram statistics
+    get_head_ngram_statistics(questions, correct_model1, correct_model2, correct_model1_and_model2, correct_model1_and_not_model2,
+                              correct_model2_and_not_model1, output_dir, num_grams=2, top_count=10)
+
+
+# Command-line entry point: compare two prediction files on one dataset.
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Compare two QA models')
+    parser.add_argument('-dataset', action='store', dest='dataset', required=True, help='Dataset file')
+    parser.add_argument('-model1', action='store', dest='predictions_m1', required=True, help='Prediction file for model 1')
+    parser.add_argument('-model2', action='store', dest='predictions_m2', required=True, help='Prediction file for model 2')
+    # Each name has its own default, so supplying only one of them is honoured
+    # instead of both being silently replaced by the generic names.
+    parser.add_argument('-name1', action='store', dest='name_m1', default='Model 1', help='Name for model 1')
+    parser.add_argument('-name2', action='store', dest='name_m2', default='Model 2', help='Name for model 2')
+    # Default to the current directory; a None directory cannot be joined
+    # with the image file names when the plots are saved.
+    parser.add_argument('-output', action='store', dest='output_dir', default='.', help='Output directory for visualizations')
+    results = parser.parse_args()
+
+    compare_models(dataset_file=results.dataset, predictions_m1_file=results.predictions_m1, predictions_m2_file=results.predictions_m2, output_dir=results.output_dir, name_m1=results.name_m1, name_m2=results.name_m2)
				`@ -0,0 +1 @@`
				`python3 -m basic.cli --mode train --noload --len_opt --cluster`