First commit

This commit is contained in:
Deepak Narayanan 2017-08-17 11:43:17 -07:00
commit b7e1e0fa0f
98 changed files with 42749 additions and 0 deletions

5
.gitignore vendored Normal file
View file

@ -0,0 +1,5 @@
*.pyc
__pycache__/
.eggs/
*.egg-info/
.cache

6
pytorch/.gitignore vendored Normal file
View file

@ -0,0 +1,6 @@
*.pyc
__pycache__/
.eggs/
*.egg-info/
.cache
data/

View file

View file

@ -0,0 +1,140 @@
import os
import timeit
from glob import glob
from collections import OrderedDict
import click
import torch
import numpy as np
from torch.autograd import Variable
from torchvision import transforms
from torchvision import datasets
from benchmark.train import load, MEAN, STD, save_result, MODELS
class PyTorchEngine:
    """Inference wrapper around a model restored from a checkpoint file.

    The checkpoint at ``filename`` is read with ``load``; the restored model
    is moved to the GPU when ``use_cuda`` is true and to the CPU otherwise.
    The epoch and accuracy stored in the checkpoint are kept as attributes,
    and ``name`` is a free-form label used when reporting results.
    """

    def __init__(self, filename, use_cuda=False, name=None):
        self.filename = filename
        self.use_cuda = use_cuda
        self.name = name

        restored, stored_epoch, stored_accuracy = load(self.filename)
        self.model = restored.cuda() if self.use_cuda else restored.cpu()
        self.epoch = stored_epoch
        self.accuracy = stored_accuracy

    def pred(self, inputs):
        """Run the model on ``inputs`` and return its output as a NumPy array."""
        batch = Variable(inputs, requires_grad=False, volatile=True)
        if not self.use_cuda:
            return self.model(batch).data.numpy()
        # GPU path: move the batch over, then bring the result back to host
        # memory before converting it.
        return self.model(batch.cuda()).data.cpu().numpy()
def time_batch_size(dataset, batch_size, pred, use_cuda, repeat=100, bestof=3):
    """Time ``pred`` on the first ``batch_size`` samples of ``dataset``.

    Args:
        dataset: Dataset yielding ``(input, target)`` pairs.
        batch_size: Number of samples in the timed batch; the dataset must
            hold at least this many.
        pred: Callable invoked as ``pred(inputs)``.
        use_cuda: Forwarded to the loader as ``pin_memory``.
        repeat: Number of timings to collect.
        bestof: Unused; kept so existing callers keep working.

    Returns:
        A list of ``repeat`` wall-clock durations in seconds, each covering
        a single ``pred(inputs)`` call.
    """
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                         shuffle=False, pin_memory=use_cuda)
    # Use the Python 3 iterator protocol; `.next()` is the Python 2 spelling
    # and is not guaranteed to exist on the loader's iterator.
    inputs, _ = next(iter(loader))
    assert inputs.size(0) == batch_size
    # Hand timeit exactly the two names the timed statement needs.
    return timeit.repeat('pred(inputs)',
                         globals={'pred': pred, 'inputs': inputs},
                         repeat=repeat, number=1)
def infer_cifar10(dataset, engine, start=1, end=128, repeat=100, log2=True,
                  output=None):
    """Benchmark ``engine.pred`` over a sweep of batch sizes.

    With ``log2`` the sweep covers the powers of two from
    ``2**floor(log2(start))`` through ``2**ceil(log2(end))``; otherwise it
    covers every integer in ``[start, end]``.  For each batch size one row
    of statistics is appended to ``output`` through ``save_result`` (when
    ``output`` is given), printed, and collected.

    Returns:
        The list of per-batch-size ``OrderedDict`` rows.
    """
    if not log2:
        batch_sizes = range(start, end + 1)
    else:
        low = int(np.floor(np.log2(start)))
        high = int(np.ceil(np.log2(end)))
        assert low >= 0
        assert high >= low
        batch_sizes = [2 ** exponent for exponent in range(low, high + 1)]

    summary = ('batch_size: {batch_size:4d}'
               ' - mean: {mean:.4f}'
               ' - std: {std:.4f}'
               ' - throughput: {throughput:.4f}')

    results = []
    for batch_size in batch_sizes:
        times = time_batch_size(dataset, batch_size, engine.pred,
                                engine.use_cuda, repeat=repeat)
        mean_time = np.mean(times)
        result = OrderedDict([
            ('nodename', os.uname().nodename),
            ('model', engine.name),
            ('use_cuda', engine.use_cuda),
            ('batch_size', batch_size),
            ('mean', mean_time),
            ('std', np.std(times)),
            # Samples processed per second at this batch size.
            ('throughput', batch_size / mean_time),
            ('filename', engine.filename),
        ])

        if output is not None:
            save_result(result, output)

        print(summary.format(**result))
        results.append(result)

    return results
@click.command()
@click.option('--dataset-dir', default='./data/cifar10')
@click.option('--run-dir', default='./run/')
@click.option('--output-file', default='inference.csv')
@click.option('--start', '-s', default=1)
@click.option('--end', '-e', default=128)
@click.option('--repeat', '-r', default=100)
@click.option('--log2/--no-log2', default=True)
@click.option('--cpu/--no-cpu', default=True)
@click.option('--gpu/--no-gpu', default=True)
@click.option('--append', is_flag=True)
@click.option('--models', '-m', type=click.Choice(list(MODELS.keys())),
              multiple=True)
def infer(dataset_dir, run_dir, output_file, start, end, repeat, log2,
          cpu, gpu, append, models):
    """Benchmark inference speed of trained models on the CIFAR10 test set.

    For every selected model the best checkpoint below
    RUN_DIR/<model>/<run>/ is loaded and timed on the CPU and/or GPU over a
    sweep of batch sizes.  Results are appended to RUN_DIR/OUTPUT_FILE.
    """
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(MEAN, STD)
    ])
    testset = datasets.CIFAR10(root=dataset_dir, train=False, download=True,
                               transform=transform_test)

    if not models:
        # Only sub-directories are model runs.  run_dir also holds plain
        # files (e.g. the results file written by this command), which
        # must not be treated as model names.
        models = sorted(entry for entry in os.listdir(run_dir)
                        if os.path.isdir(os.path.join(run_dir, entry)))

    output_path = os.path.join(run_dir, output_file)
    assert not os.path.exists(output_path) or append, \
        '{} already exists; pass --append to add to it'.format(output_path)

    for model in models:
        model_dir = os.path.join(run_dir, model)
        # glob order is unspecified; sort so the choice is deterministic and
        # take the lexicographically last run directory.
        paths = sorted(glob(f"{model_dir}/*/checkpoint_best_model.t7"))
        assert len(paths) > 0, \
            'no checkpoint_best_model.t7 found under {}'.format(model_dir)
        path = os.path.abspath(paths[-1])
        print(f'Model: {model}')
        print(f'Path: {path}')

        if cpu:
            print('With CPU:')
            engine = PyTorchEngine(path, use_cuda=False, name=model)
            infer_cifar10(testset, engine, start=start, end=end, log2=log2,
                          repeat=repeat, output=output_path)

        if gpu and torch.cuda.is_available():
            print('With GPU:')
            engine = PyTorchEngine(path, use_cuda=True, name=model)
            # Warmup: one untimed-for-reporting call before the real sweep.
            time_batch_size(testset, 1, engine.pred, engine.use_cuda, repeat=1)
            infer_cifar10(testset, engine, start=start, end=end, log2=log2,
                          repeat=repeat, output=output_path)


if __name__ == '__main__':
    infer()

View file

@ -0,0 +1,108 @@
'''DenseNet in PyTorch.'''
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
class Bottleneck(nn.Module):
    """Dense layer: BN-ReLU-1x1 conv, then BN-ReLU-3x3 conv.

    The ``growth_rate`` new feature maps are concatenated in front of the
    input along the channel dimension.
    """

    def __init__(self, in_planes, growth_rate):
        super(Bottleneck, self).__init__()
        inner_planes = 4 * growth_rate
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, inner_planes, kernel_size=1,
                               bias=False)
        self.bn2 = nn.BatchNorm2d(inner_planes)
        self.conv2 = nn.Conv2d(inner_planes, growth_rate, kernel_size=3,
                               padding=1, bias=False)

    def forward(self, x):
        squeezed = self.conv1(F.relu(self.bn1(x)))
        new_features = self.conv2(F.relu(self.bn2(squeezed)))
        return torch.cat([new_features, x], 1)
class Transition(nn.Module):
    """BN-ReLU, an optional 1x1 convolution, then average pooling.

    When ``last`` is true no convolution is created or applied, so the
    channel count is unchanged and ``out_planes`` is ignored.
    """

    def __init__(self, in_planes, out_planes, last=False, pool_size=2):
        super(Transition, self).__init__()
        self.last = last
        self.pool_size = pool_size
        self.bn = nn.BatchNorm2d(in_planes)
        if not self.last:
            self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1,
                                  bias=False)

    def forward(self, x):
        activated = F.relu(self.bn(x))
        features = activated if self.last else self.conv(activated)
        return F.avg_pool2d(features, self.pool_size)
class DenseNet(nn.Module):
    """DenseNet classifier: stem convolution, three dense stages, linear head.

    Args:
        block: Dense-layer class, constructed as ``block(in_planes, growth_rate)``.
        nblocks: Number of dense layers in each of the three stages.
        growth_rate: Channels added by every dense layer.
        reduction: Channel compression factor of the first two transitions.
        num_classes: Size of the output layer.
    """

    def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10):
        super(DenseNet, self).__init__()
        # TODO: Add drop for CIFAR10 without data augmentation
        self.growth_rate = growth_rate

        channels = 2 * growth_rate
        self.conv1 = nn.Conv2d(3, channels, kernel_size=3, padding=1, bias=False)

        # Stages 1 and 2: dense layers followed by a compressing transition.
        for stage in (1, 2):
            depth = nblocks[stage - 1]
            setattr(self, 'dense{}'.format(stage),
                    self._make_dense_layers(block, channels, depth))
            channels += depth * growth_rate
            compressed = int(math.floor(channels * reduction))
            setattr(self, 'trans{}'.format(stage),
                    Transition(channels, compressed))
            channels = compressed

        # Stage 3: the final transition has no convolution and pools with
        # an 8x8 window.
        self.dense3 = self._make_dense_layers(block, channels, nblocks[2])
        channels += nblocks[2] * growth_rate
        self.trans3 = Transition(channels, channels, last=True, pool_size=8)
        self.linear = nn.Linear(channels, num_classes)

        # Re-initialise convolutions from N(0, sqrt(2 / (k_h * k_w * out)))
        # and reset batch-norm layers to scale 1, shift 0.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                kernel_h, kernel_w = module.kernel_size[0], module.kernel_size[1]
                fan = kernel_h * kernel_w * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan))
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()

    def _make_dense_layers(self, block, in_planes, nblock):
        """Stack ``nblock`` dense layers, each widening the input by ``growth_rate``."""
        stacked = []
        width = in_planes
        for _ in range(nblock):
            stacked.append(block(width, self.growth_rate))
            width += self.growth_rate
        return nn.Sequential(*stacked)

    def forward(self, x):
        features = self.conv1(x)
        features = self.trans1(self.dense1(features))
        features = self.trans2(self.dense2(features))
        features = self.trans3(self.dense3(features))
        flat = features.view(features.size(0), -1)
        return self.linear(flat)
def DenseNetBC(L, k):
    """Build a ``Bottleneck`` DenseNet with reduction 0.5 for depth ``L`` and growth rate ``k``.

    ``L - 4`` must be a multiple of 6; each of the three stages gets
    ``(L - 4) / 6`` dense layers.
    """
    assert (L - 4) % 6 == 0
    layers_per_stage = (L - 4) // 6
    return DenseNet(Bottleneck, [layers_per_stage] * 3, growth_rate=k,
                    reduction=0.5)


def DenseNetBC100():
    """DenseNetBC with L=100, k=12."""
    return DenseNetBC(100, 12)


def DenseNetBC250():
    """DenseNetBC with L=250, k=24."""
    return DenseNetBC(250, 24)


def DenseNetBC190():
    """DenseNetBC with L=190, k=40."""
    return DenseNetBC(190, 40)

View file

@ -0,0 +1,372 @@
import math
from functools import partial
from torch import nn
from torch.nn import functional as F
class BasicBlock(nn.Module):
    """Two 3x3 conv-BN layers with a residual connection and a final ReLU.

    The shortcut is a 1x1 conv + BN projection when the stride or channel
    count changes, and an empty ``nn.Sequential`` (identity) otherwise.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1):
        super().__init__()
        out_planes = planes * self.expansion
        self.conv1 = nn.Conv2d(inplanes, planes, 3, stride=stride, padding=1,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        needs_projection = stride != 1 or inplanes != out_planes
        projection = []
        if needs_projection:
            projection = [
                nn.Conv2d(inplanes, out_planes, 1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            ]
        self.shortcut = nn.Sequential(*projection)

    def forward(self, inputs):
        residual = F.relu(self.bn1(self.conv1(inputs)))
        residual = self.bn2(self.conv2(residual))
        residual += self.shortcut(inputs)
        return F.relu(residual)
class PreActBlock(nn.Module):
    """Pre-activation residual block: BN-ReLU precedes each 3x3 convolution.

    When the stride or channel count changes (``self.increasing``), the
    shortcut is a 1x1 convolution applied to the activated input; otherwise
    the raw input is added back unchanged.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1):
        super().__init__()
        out_planes = planes * self.expansion
        self.bn1 = nn.BatchNorm2d(inplanes)
        self.conv1 = nn.Conv2d(inplanes, planes, 3, stride=stride, padding=1,
                               bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)

        self.increasing = stride != 1 or inplanes != out_planes
        projection = []
        if self.increasing:
            projection.append(
                nn.Conv2d(inplanes, out_planes, 1, stride=stride, bias=False))
        self.shortcut = nn.Sequential(*projection)

    def forward(self, inputs):
        activated = F.relu(self.bn1(inputs))
        # A projection shortcut branches off after the first activation.
        skip_source = activated if self.increasing else inputs
        residual = self.conv1(activated)
        residual = self.conv2(F.relu(self.bn2(residual)))
        residual += self.shortcut(skip_source)
        return residual
class Bottleneck(nn.Module):
    """1x1 -> 3x3 -> 1x1 residual block whose output has ``planes * 4`` channels.

    The stride is applied in the 3x3 convolution.  The shortcut is a 1x1
    conv + BN projection when the stride or channel count changes, and an
    identity otherwise.
    """
    expansion = 4

    def __init__(self, inplanes, planes, stride=1):
        super().__init__()
        out_planes = planes * self.expansion
        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, 3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, 1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4)

        projection = []
        if stride != 1 or inplanes != out_planes:
            projection = [
                nn.Conv2d(inplanes, out_planes, 1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            ]
        self.shortcut = nn.Sequential(*projection)

    def forward(self, inputs):
        residual = F.relu(self.bn1(self.conv1(inputs)))
        residual = F.relu(self.bn2(self.conv2(residual)))
        residual = self.bn3(self.conv3(residual))
        residual += self.shortcut(inputs)
        return F.relu(residual)
class ResNeXtBottleneck(nn.Module):
    """Bottleneck block whose 3x3 convolution is grouped into ``cardinality`` paths.

    Each path is ``floor(planes * base_width / 64)`` channels wide; the
    block outputs ``planes * 4`` channels.
    """
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, cardinality=32,
                 base_width=4):
        super().__init__()
        width = math.floor(planes * (base_width / 64.0))
        grouped_planes = width * cardinality
        out_planes = planes * self.expansion

        self.conv1 = nn.Conv2d(inplanes, grouped_planes, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(grouped_planes)
        self.conv2 = nn.Conv2d(grouped_planes, grouped_planes, 3,
                               groups=cardinality, padding=1, stride=stride,
                               bias=False)
        self.bn2 = nn.BatchNorm2d(grouped_planes)
        self.conv3 = nn.Conv2d(grouped_planes, planes * 4, 1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4)

        projection = []
        if stride != 1 or inplanes != out_planes:
            projection = [
                nn.Conv2d(inplanes, out_planes, 1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            ]
        self.shortcut = nn.Sequential(*projection)

    def forward(self, inputs):
        residual = F.relu(self.bn1(self.conv1(inputs)))
        residual = F.relu(self.bn2(self.conv2(residual)))
        residual = self.bn3(self.conv3(residual))
        residual += self.shortcut(inputs)
        return F.relu(residual)
class PreActBottleneck(nn.Module):
    """Pre-activation 1x1 -> 3x3 -> 1x1 block with ``planes * 4`` output channels.

    When the stride or channel count changes (``self.increasing``), the
    shortcut is a 1x1 convolution applied to the activated input; otherwise
    the raw input is added back unchanged.
    """
    expansion = 4

    def __init__(self, inplanes, planes, stride=1):
        super().__init__()
        out_planes = planes * self.expansion
        self.bn1 = nn.BatchNorm2d(inplanes)
        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, stride=stride,
                               bias=False)
        self.bn3 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, 1, bias=False)

        self.increasing = stride != 1 or inplanes != out_planes
        projection = []
        if self.increasing:
            projection.append(
                nn.Conv2d(inplanes, out_planes, 1, stride=stride, bias=False))
        self.shortcut = nn.Sequential(*projection)

    def forward(self, inputs):
        activated = F.relu(self.bn1(inputs))
        # A projection shortcut branches off after the first activation.
        skip_source = activated if self.increasing else inputs
        residual = self.conv1(activated)
        residual = self.conv2(F.relu(self.bn2(residual)))
        residual = self.conv3(F.relu(self.bn3(residual)))
        residual += self.shortcut(skip_source)
        return residual
class ResNet(nn.Module):
    """Residual network assembled from sections of ``Block`` instances.

    Args:
        Block: Block class; must expose ``expansion`` and ``__name__`` and be
            constructible as ``Block(inplanes, planes, stride=...)``.  A
            name containing ``'Pre'`` selects the pre-activation layout, in
            which the network-level BN-ReLU comes after the sections
            instead of right after the stem convolution.
        layers: Number of blocks in each section.
        filters: ``planes`` passed to the blocks of each section.
        num_classes: Size of the output layer.
        inplanes: Stem width; defaults to ``filters[0]``.
    """

    def __init__(self, Block, layers, filters, num_classes=10, inplanes=None):
        super().__init__()
        self.inplanes = inplanes or filters[0]
        self.pre_act = 'Pre' in Block.__name__

        self.conv1 = nn.Conv2d(3, self.inplanes, 3, padding=1, bias=False)
        if not self.pre_act:
            self.bn1 = nn.BatchNorm2d(self.inplanes)

        self.num_sections = len(layers)
        for section_index, (size, planes) in enumerate(zip(layers, filters)):
            blocks = []
            for layer_index in range(size):
                # Every section but the first halves the resolution in its
                # first block.
                downsample = section_index != 0 and layer_index == 0
                blocks.append(Block(self.inplanes, planes,
                                    stride=2 if downsample else 1))
                self.inplanes = planes * Block.expansion
            setattr(self, f'section_{section_index}', nn.Sequential(*blocks))

        if self.pre_act:
            self.bn1 = nn.BatchNorm2d(self.inplanes)

        self.fc = nn.Linear(filters[-1] * Block.expansion, num_classes)

        # Re-initialise convolutions from N(0, sqrt(2 / (k_h * k_w * out)))
        # and reset batch-norm layers to scale 1, shift 0.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                kernel_h, kernel_w = module.kernel_size[0], module.kernel_size[1]
                fan = kernel_h * kernel_w * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan))
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()

    def forward(self, inputs):
        features = self.conv1(inputs)
        if not self.pre_act:
            features = F.relu(self.bn1(features))

        for section_index in range(self.num_sections):
            section = getattr(self, f'section_{section_index}')
            features = section(features)

        if self.pre_act:
            features = F.relu(self.bn1(features))

        # Average over the whole remaining spatial extent.
        pooled = F.avg_pool2d(features, features.size()[2:])
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(flat)
# CIFAR ResNets from "Deep Residual Learning for Image Recognition":
# three sections of BasicBlocks with 16, 32 and 64 filters.
def ResNet20():
    """BasicBlock ResNet with 3 blocks per section."""
    return ResNet(BasicBlock, filters=[16, 32, 64], layers=[3, 3, 3])


def ResNet32():
    """BasicBlock ResNet with 5 blocks per section."""
    return ResNet(BasicBlock, filters=[16, 32, 64], layers=[5, 5, 5])


def ResNet44():
    """BasicBlock ResNet with 7 blocks per section."""
    return ResNet(BasicBlock, filters=[16, 32, 64], layers=[7, 7, 7])


def ResNet56():
    """BasicBlock ResNet with 9 blocks per section."""
    return ResNet(BasicBlock, filters=[16, 32, 64], layers=[9, 9, 9])


def ResNet110():
    """BasicBlock ResNet with 18 blocks per section."""
    return ResNet(BasicBlock, filters=[16, 32, 64], layers=[18, 18, 18])


def ResNet1202():
    """BasicBlock ResNet with 200 blocks per section."""
    return ResNet(BasicBlock, filters=[16, 32, 64], layers=[200, 200, 200])
# Variants based on, but not taken from, "Identity Mappings in Deep Residual
# Networks": pre-activation networks built from PreActBlock.
def PreActResNet20():
    """PreActBlock ResNet with 3 blocks per section."""
    return ResNet(PreActBlock, filters=[16, 32, 64], layers=[3, 3, 3])


def PreActResNet56():
    """PreActBlock ResNet with 9 blocks per section."""
    return ResNet(PreActBlock, filters=[16, 32, 64], layers=[9, 9, 9])


def PreActResNet164Basic():
    """PreActBlock ResNet with 27 blocks per section."""
    return ResNet(PreActBlock, filters=[16, 32, 64], layers=[27, 27, 27])


# From "Identity Mappings in Deep Residual Networks"
def PreActResNet110():
    """PreActBlock ResNet with 18 blocks per section."""
    return ResNet(PreActBlock, filters=[16, 32, 64], layers=[18, 18, 18])


def PreActResNet164():
    """PreActBottleneck ResNet with 18 blocks per section."""
    return ResNet(PreActBottleneck, filters=[16, 32, 64], layers=[18, 18, 18])


def PreActResNet1001():
    """PreActBottleneck ResNet with 111 blocks per section."""
    return ResNet(PreActBottleneck, filters=[16, 32, 64],
                  layers=[111, 111, 111])
# From "Wide Residual Networks"
def WRN(n, k):
    """Build a wide ResNet of depth ``n`` and widening factor ``k``.

    ``n - 4`` must be a multiple of 6.  The stem keeps 16 channels while the
    section widths (16, 32, 64) are multiplied by ``k``.
    """
    assert (n - 4) % 6 == 0
    widened = [base * k for base in (16, 32, 64)]
    # (n - 4) / 2 two-convolution blocks in total, spread over three sections.
    blocks_per_section = (n - 4) // 6
    return ResNet(PreActBlock, layers=[blocks_per_section] * 3,
                  filters=widened, inplanes=16)


def WRN_40_4():
    """WRN with n=40, k=4."""
    return WRN(40, 4)


def WRN_16_8():
    """WRN with n=16, k=8."""
    return WRN(16, 8)


def WRN_28_10():
    """WRN with n=28, k=10."""
    return WRN(28, 10)
# From "Aggregated Residual Transformations for Deep Neural Networks"
def ResNeXt29(cardinality, base_width):
    """Build a three-section ResNeXt with the given cardinality and base width."""
    block_factory = partial(ResNeXtBottleneck, base_width=base_width,
                            cardinality=cardinality)
    # ResNet reads ``__name__`` and ``expansion`` from the block class; a
    # ``partial`` object has neither, so copy them over.
    block_factory.__name__ = ResNeXtBottleneck.__name__
    block_factory.expansion = ResNeXtBottleneck.expansion
    return ResNet(block_factory, filters=[64, 128, 256], layers=[3, 3, 3])
# From kuangliu/pytorch-cifar: four-section networks with 64-512 filters.
def ResNet18():
    """BasicBlock ResNet with sections of 2, 2, 2 and 2 blocks."""
    return ResNet(BasicBlock, filters=[64, 128, 256, 512],
                  layers=[2, 2, 2, 2])


def ResNet34():
    """BasicBlock ResNet with sections of 3, 4, 6 and 3 blocks."""
    return ResNet(BasicBlock, filters=[64, 128, 256, 512],
                  layers=[3, 4, 6, 3])


def ResNet50():
    """Bottleneck ResNet with sections of 3, 4, 6 and 3 blocks."""
    return ResNet(Bottleneck, filters=[64, 128, 256, 512],
                  layers=[3, 4, 6, 3])


def ResNet101():
    """Bottleneck ResNet with sections of 3, 4, 23 and 3 blocks."""
    return ResNet(Bottleneck, filters=[64, 128, 256, 512],
                  layers=[3, 4, 23, 3])


def ResNet152():
    """Bottleneck ResNet with sections of 3, 8, 36 and 3 blocks."""
    return ResNet(Bottleneck, filters=[64, 128, 256, 512],
                  layers=[3, 8, 36, 3])

View file

@ -0,0 +1,336 @@
import os
import re
import json
from functools import reduce
from datetime import datetime
from collections import OrderedDict
import click
import torch
import progressbar
from torch import nn, optim
from torch.autograd import Variable
from torchvision import transforms
from torchvision import datasets as dset
from benchmark.models import resnet, densenet
# Per-channel mean and standard deviation handed to transforms.Normalize.
# NOTE(review): presumably CIFAR10 training-set statistics -- confirm.
MEAN = (0.4914, 0.4822, 0.4465)
STD = (0.2023, 0.1994, 0.2010)
# Model names accepted on the command line, mapped to callables that build
# the network when called with no arguments.
MODELS = {
    # "Deep Residual Learning for Image Recognition"
    'resnet20': resnet.ResNet20,
    'resnet32': resnet.ResNet32,
    'resnet44': resnet.ResNet44,
    'resnet56': resnet.ResNet56,
    'resnet110': resnet.ResNet110,
    'resnet1202': resnet.ResNet1202,
    # "Wide Residual Networks"
    'wrn-40-4': resnet.WRN_40_4,
    'wrn-16-8': resnet.WRN_16_8,
    'wrn-28-10': resnet.WRN_28_10,
    # Based on "Identity Mappings in Deep Residual Networks"
    'preact20': resnet.PreActResNet20,
    'preact56': resnet.PreActResNet56,
    'preact164-basic': resnet.PreActResNet164Basic,
    # "Identity Mappings in Deep Residual Networks"
    'preact110': resnet.PreActResNet110,
    'preact164': resnet.PreActResNet164,
    'preact1001': resnet.PreActResNet1001,
    # "Aggregated Residual Transformations for Deep Neural Networks"
    # (arguments are cardinality and base width)
    'resnext29-8-64': lambda _=None: resnet.ResNeXt29(8, 64),
    'resnext29-16-64': lambda _=None: resnet.ResNeXt29(16, 64),
    # "Densely Connected Convolutional Networks"
    'densenetbc100': densenet.DenseNetBC100,
    'densenetbc250': densenet.DenseNetBC250,
    'densenetbc190': densenet.DenseNetBC190,
    # Kuangliu/pytorch-cifar
    # NOTE(review): resnet.ResNet34 is defined but not registered here --
    # confirm whether that is intentional.
    'resnet18': resnet.ResNet18,
    'resnet50': resnet.ResNet50,
    'resnet101': resnet.ResNet101,
    'resnet152': resnet.ResNet152,
}
def count_parameters(model):
    """Return the total number of scalar entries over ``model.parameters()``.

    Uses ``numel()`` so that zero-dimensional parameters count as one entry
    instead of failing on an empty size tuple.
    """
    return sum(parameter.numel() for parameter in model.parameters())
def correct(outputs, targets, top=(1, )):
    """Count top-k hits in a batch for every ``k`` in ``top``.

    Args:
        outputs: Per-class scores with one row per sample.
        targets: Class index of each sample.
        top: The values of ``k`` to evaluate.

    Returns:
        A list of Python ints, one per entry of ``top``: the number of
        samples whose target is among the ``k`` highest-scoring classes.
    """
    _, predictions = outputs.topk(max(top), dim=1, largest=True, sorted=True)
    expanded = targets.view(-1, 1).expand_as(predictions)
    # Accumulate in int64 so per-column counts cannot wrap in a narrow dtype.
    hits = predictions.eq(expanded).long().cpu()
    # After cumsum, column j is 1 when the target is within the first j + 1
    # predictions.  Flatten so indexing works whether or not the reduction
    # kept the summed dimension.
    counts = hits.cumsum(1).sum(0).data.view(-1).tolist()
    return [counts[k - 1] for k in top]
def save_result(result, path):
    """Append ``result`` as one CSV row to ``path``.

    A heading row with the keys of ``result`` is written first when the
    file does not exist yet or is empty.  Values are converted with
    ``str`` and quoted by the ``csv`` module where needed, so a value
    containing a comma cannot corrupt the column layout.

    Args:
        result: Mapping of column name to value, in column order.
        path: CSV file to append to.
    """
    import csv

    write_heading = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, mode='a') as out:
        writer = csv.writer(out, lineterminator='\n')
        if write_heading:
            writer.writerow([str(key) for key in result])
        writer.writerow([str(value) for value in result.values()])
def run(epoch, model, loader, criterion=None, optimizer=None, top=(1, 5),
        use_cuda=False, tracking=None, max_value=None, train=True):
    """Run one pass over ``loader``, training or evaluating ``model``.

    Args:
        epoch: Epoch number, recorded in the tracking rows.
        model: Network to run; switched to train or eval mode accordingly.
        loader: Iterable of ``(inputs, targets)`` batches.
        criterion: Loss function; required when ``train`` is true.
        optimizer: Optimizer; required when ``train`` is true.
        top: Values of ``k`` for which top-k accuracy is accumulated.
        use_cuda: Move every batch to the GPU before the forward pass.
        tracking: Path of a CSV file receiving one row per batch, or None.
        max_value: Number of batches, for the progress bar; unknown if None.
        train: Train the model when true, only evaluate when false.

    Returns:
        ``(accuracy, num_batches)`` where ``accuracy`` is the fraction of
        samples counted correct for ``top[0]``.
    """
    assert criterion is not None or not train, 'Need criterion to train model'
    assert optimizer is not None or not train, 'Need optimizer to train model'
    max_value = max_value or progressbar.UnknownLength
    bar = progressbar.ProgressBar(max_value=max_value)
    total = 0
    correct_counts = {}
    if train:
        model.train()
    else:
        model.eval()

    start = datetime.now()
    for batch_index, (inputs, targets) in enumerate(loader):
        inputs = Variable(inputs, requires_grad=False, volatile=not train)
        targets = Variable(targets, requires_grad=False, volatile=not train)

        if use_cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()

        outputs = model(inputs)

        if train:
            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        batch_size = targets.size(0)
        top_correct = correct(outputs, targets, top=top)
        total += batch_size
        for k, count in zip(top, top_correct):
            correct_counts[k] = correct_counts.get(k, 0) + count

        end = datetime.now()
        if tracking is not None:
            result = OrderedDict()
            result['timestamp'] = datetime.now()
            # Measured from the end of the previous batch, so it includes
            # the time spent fetching this batch from the loader.
            result['batch_duration'] = end - start
            result['epoch'] = epoch
            result['batch'] = batch_index
            result['batch_size'] = batch_size
            for i, k in enumerate(top):
                result['top{}_correct'.format(k)] = top_correct[i]
            if train:
                result['loss'] = loss.data[0]
            save_result(result, tracking)

        bar.update(batch_index + 1)
        start = datetime.now()

    print()
    if train:
        message = 'Training accuracy of'
    else:
        message = 'Test accuracy of'
    for k in top:
        accuracy = correct_counts[k] / total
        message += ' top-{}: {}'.format(k, accuracy)
    print(message)
    return (1. * correct_counts[top[0]]) / total, batch_index + 1
def save(model, directory, epoch, accuracy, use_cuda=False, filename=None):
    """Write a checkpoint holding the model, its epoch and its accuracy.

    With ``use_cuda`` the object stored is ``model.module`` rather than
    ``model`` itself.  The file is named ``checkpoint_<epoch>.t7`` unless
    ``filename`` is given.
    """
    if filename:
        target_name = filename
    else:
        target_name = 'checkpoint_{}.t7'.format(epoch)

    stored_model = model.module if use_cuda else model
    checkpoint = {
        'model': stored_model,
        'epoch': epoch,
        'accuracy': accuracy
    }
    torch.save(checkpoint, os.path.join(directory, target_name))
def save_config(config, run_dir):
    """Dump ``config`` as one line of JSON to ``config_<timestamp>.json`` in ``run_dir``.

    The timestamp in the file name is read from ``config['timestamp']``.
    """
    config_name = "config_{}.json".format(config['timestamp'])
    with open(os.path.join(run_dir, config_name), 'w') as handle:
        json.dump(config, handle)
        handle.write('\n')
def load(path):
    """Read a checkpoint and return ``(model, epoch, accuracy)``.

    Tensors are always mapped to the CPU while loading, so a checkpoint
    saved from a GPU can be read on a machine without one.  Callers move
    the model to the device they need afterwards.

    Raises:
        AssertionError: if ``path`` does not exist.
    """
    assert os.path.exists(path)
    # Keep every storage where the deserialiser first puts it (the CPU)
    # instead of restoring it to the device it was saved from.
    state = torch.load(path, map_location=lambda storage, location: storage)
    return state['model'], state['epoch'], state['accuracy']
def latest_file(model):
    """Return the highest-numbered checkpoint of the most recent run of ``model``.

    Runs live in ``./run/<model>/<timestamp>/``; the run whose directory
    name sorts last is used.  Only files named exactly
    ``checkpoint_<number>.t7`` are considered, so ``checkpoint_best_model.t7``
    and stray files such as backups are ignored.

    Raises:
        AssertionError: if there is no run directory or no numbered checkpoint.
    """
    restore = f'./run/{model}'
    timestamps = sorted(os.listdir(restore))
    assert len(timestamps) > 0, f'no runs found in {restore}'
    run_dir = os.path.join(restore, timestamps[-1])

    pattern = re.compile(r'checkpoint_(\d+)\.t7')
    numbered = {}
    for filename in os.listdir(run_dir):
        match = pattern.fullmatch(filename)
        if match:
            numbered[int(match.group(1))] = filename

    assert numbered, f'no numbered checkpoints found in {run_dir}'
    return os.path.join(run_dir, numbered[max(numbered)])
@click.command()
@click.option('--dataset-dir', default='./data/cifar10')
@click.option('--checkpoint', '-c', type=click.Choice(['best', 'all', 'last']),
              default='last')
@click.option('--restore', '-r')
@click.option('--tracking/--no-tracking', default=True)
@click.option('--cuda/--no-cuda', default=True)
@click.option('--epochs', '-e', default=200)
@click.option('--batch-size', '-b', default=32)
@click.option('--learning-rate', '-l', default=1e-3)
@click.option('--sgd', 'optimizer', flag_value='sgd')
@click.option('--adam', 'optimizer', flag_value='adam', default=True)
@click.option('--augmentation/--no-augmentation', default=True)
@click.option('--num-workers', type=int)
@click.option('--weight-decay', default=5e-4)
@click.option('--model', '-m', type=click.Choice(list(MODELS.keys())),
              default='resnet20')
def main(dataset_dir, checkpoint, restore, tracking, cuda, epochs,
         batch_size, learning_rate, optimizer, augmentation, num_workers,
         weight_decay, model):
    """Train a model on CIFAR10, evaluating on the test set after every epoch.

    Checkpoints, the run configuration and per-batch tracking files are
    written to ./run/<model>/<timestamp>/, or to the directory of the
    restored checkpoint when --restore is given ('latest' picks the newest
    numbered checkpoint of the model).
    """
    # NOTE(review): utcnow() is naive, so .timestamp() interprets it as local
    # time; the value only serves as a sortable run id here.
    timestamp = "{:.0f}".format(datetime.utcnow().timestamp())
    # Snapshot the command-line arguments (plus the timestamp) before any
    # other local is defined.
    config = dict(locals())

    use_cuda = cuda and torch.cuda.is_available()
    if use_cuda:
        num_workers = num_workers or torch.cuda.device_count()
    else:
        num_workers = num_workers or 1

    print(f"using {num_workers} workers for data loading")

    print("Preparing data:")

    if augmentation:
        transform_train = [
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip()
        ]
    else:
        transform_train = []

    transform_train = transforms.Compose(transform_train + [
        transforms.ToTensor(),
        transforms.Normalize(MEAN, STD),
    ])

    trainset = dset.CIFAR10(root=dataset_dir, train=True, download=True,
                            transform=transform_train)
    train_loader = torch.utils.data.DataLoader(
        trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers,
        pin_memory=use_cuda)

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(MEAN, STD),
    ])

    testset = dset.CIFAR10(root=dataset_dir, train=False, download=True,
                           transform=transform_test)
    test_loader = torch.utils.data.DataLoader(
        testset, batch_size=batch_size, shuffle=False, num_workers=num_workers,
        pin_memory=use_cuda)

    if restore is not None:
        if restore == 'latest':
            restore = latest_file(model)
        print(f'Restoring model from {restore}')
        model, start_epoch, best_accuracy = load(restore)
        start_epoch += 1
        print('Starting accuracy is {}'.format(best_accuracy))
        run_dir = os.path.split(restore)[0]
    else:
        print(f'Building {model} model')
        best_accuracy = -1
        start_epoch = 1
        run_dir = f"./run/{model}/{timestamp}"
        model = MODELS[model]()

    if not os.path.exists(run_dir):
        os.makedirs(run_dir)
    save_config(config, run_dir)

    print(model)
    print("{} parameters".format(count_parameters(model)))
    print(f"Run directory set to {run_dir}")

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    if tracking:
        train_results_file = os.path.join(run_dir, 'train_results.csv')
        test_results_file = os.path.join(run_dir, 'test_results.csv')
    else:
        train_results_file = None
        test_results_file = None

    if use_cuda:
        print('Copying model to GPU')
        model.cuda()
        model = torch.nn.DataParallel(
            model, device_ids=list(range(torch.cuda.device_count())))
    criterion = nn.CrossEntropyLoss()

    # Other parameters?  (weight_decay is only applied with SGD.)
    if optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=learning_rate,
                              momentum=0.9,
                              weight_decay=weight_decay)
    else:
        raise NotImplementedError("Unknown optimizer: {}".format(optimizer))

    # Batch counts are unknown for the first epoch; each run() reports its
    # count so later epochs get a sized progress bar.
    train_max_value = None
    test_max_value = None
    end_epoch = start_epoch + epochs
    for epoch in range(start_epoch, end_epoch):
        print('Epoch {} of {}'.format(epoch, end_epoch - 1))
        train_acc, train_max_value = run(epoch, model, train_loader, criterion,
                                         optimizer, use_cuda=use_cuda,
                                         tracking=train_results_file,
                                         max_value=train_max_value, train=True)

        test_acc, test_max_value = run(epoch, model, test_loader,
                                       use_cuda=use_cuda,
                                       tracking=test_results_file,
                                       max_value=test_max_value, train=False)

        if test_acc > best_accuracy:
            print('New best model!')
            save(model, run_dir, epoch, test_acc, use_cuda=use_cuda,
                 filename='checkpoint_best_model.t7')
            best_accuracy = test_acc

        last_epoch = epoch == (end_epoch - 1)
        if checkpoint == 'all' or (checkpoint == 'last' and last_epoch):
            save(model, run_dir, epoch, test_acc, use_cuda=use_cuda)


if __name__ == '__main__':
    main()

20
pytorch/CIFAR10/setup.py Normal file
View file

@ -0,0 +1,20 @@
from setuptools import setup
# Package metadata for the CIFAR10 benchmark.
setup(
    name='benchmark',
    version='0.0.0',
    url='http://www.codycoleman.com',
    author='Cody Austun Coleman',
    author_email='cody.coleman@cs.stanford.edu',
    packages=['benchmark'],
    # Installs a `bench` command that invokes benchmark.train:main.
    entry_points={
        'console_scripts': [
            'bench = benchmark.train:main'
        ]
    },
    install_requires=[
        'torchvision',
        'click',
        'progressbar2'
    ]
)

View file

@ -0,0 +1,18 @@
# ResNets on TensorFlow
To train a ResNet, run:
```bash
python3 resnet/resnet_main.py --train_data_path=cifar10/data_batch* --log_root=data/resnet20/log_root \
--train_dir=data/resnet20/log_root/train --dataset='cifar10' --model=resnet20 \
--num_gpus=1 --checkpoint_dir=data/resnet20/checkpoints --data_format=NCHW
```
To evaluate the resulting checkpoints, run:
```bash
python3 eval_checkpoints.py -i data/resnet20/checkpoints \
-c "python3 resnet/resnet_main.py --mode=eval --eval_data_path=cifar10/test_batch.bin --eval_dir=data/resnet20/log_root/eval --dataset='cifar10' --model=resnet20 --num_gpus=1 --eval_batch_count=100 --eval_once=True --data_format=NCHW"
```
Before running either command, follow the instructions in `resnet/README.md` to install the prerequisites and download the dataset.

View file

@ -0,0 +1,59 @@
import argparse
import os
import subprocess
import sys
def _load_cumulative_times(checkpoints_path):
    """Parse ``times.log`` into a ``{step: cumulative seconds}`` mapping."""
    times = {}
    cum_time = 0.0
    with open(os.path.join(checkpoints_path, "times.log"), 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Tolerate blank lines and an empty log.
                continue
            step_field, time_field = line.split('\t')
            step = int(step_field.split(': ')[1])
            cum_time += float(time_field.split(': ')[1])
            times[step] = cum_time
    return times


def main(checkpoints_path, command, start_cnt):
    """Evaluate consecutive checkpoint directories and print one row per result.

    Checkpoints are the zero-padded five-digit sub-directories of
    ``checkpoints_path``, visited from ``start_cnt`` until the first missing
    one.  Directories holding two entries or fewer are skipped.  For every
    other directory ``command`` is run through the shell with
    ``--log_root=<directory>`` appended, and each output line mentioning
    both "Precision" and "Recall" is reported together with the cumulative
    time recorded in ``times.log`` for its step.

    Args:
        checkpoints_path: Directory holding ``times.log`` and the checkpoints.
        command: Shell command that evaluates a single checkpoint.
        start_cnt: Number of the first checkpoint to evaluate.
    """
    import shlex

    times = _load_cumulative_times(checkpoints_path)

    print("Time (in secs)\tNumber of minibatches\tTop 1 accuracy\tTop 5 accuracy")
    cnt = start_cnt
    while True:
        full_ckpt_path = os.path.join(checkpoints_path, "%05d" % cnt)
        if not os.path.exists(full_ckpt_path):
            break
        if len(os.listdir(full_ckpt_path)) <= 2:
            cnt += 1
            continue

        # `command` is a user-supplied shell snippet by design; only the
        # path appended here is quoted so unusual characters survive.
        full_command = "%s --log_root=%s 2>/dev/null" % (
            command, shlex.quote(full_ckpt_path))
        output = subprocess.check_output(full_command, shell=True)
        output = output.decode('utf8').strip()
        for line in output.split('\n'):
            if "Precision" in line and "Recall" in line:
                tokens = line.split(", ")  # TODO: Nasty hack, make more robust.
                precision_at_1 = float(tokens[0].split()[-1])
                recall_at_5 = float(tokens[1].split()[-1])
                step = int(tokens[2].split()[3])
                stats = [times[step], step, precision_at_1, recall_at_5]
                print("\t".join([str(stat) for stat in stats]))
                sys.stdout.flush()
        cnt += 1


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Evaluate dumped model checkpoints one after another"
    )
    parser.add_argument('-i', "--checkpoints_path", type=str, required=True,
                        help="Path to dumped model checkpoints")
    parser.add_argument('-c', "--command", type=str, required=True,
                        help="Command to evaluate each individual checkpoint")
    parser.add_argument('-s', "--start_cnt", type=int, default=1,
                        help="Count to start evaluating checkpoints from")

    cmdline_args = parser.parse_args()
    main(cmdline_args.checkpoints_path, cmdline_args.command,
         cmdline_args.start_cnt)

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,88 @@
# ResNet on CIFAR10 and CIFAR100
(Borrowed from the tensorflow/models repository)
## Dataset
https://www.cs.toronto.edu/~kriz/cifar.html
## Related papers
- [Identity Mappings in Deep Residual Networks](https://arxiv.org/pdf/1603.05027v2.pdf)
- [Deep Residual Learning for Image Recognition](https://arxiv.org/pdf/1512.03385v1.pdf)
- [Wide Residual Networks](https://arxiv.org/pdf/1605.07146v1.pdf)
## Setting
* Pad to 36x36 and random crop. Horizontal flip. Per-image whitening.
* Momentum optimizer (momentum = 0.9).
* Learning rate schedule: 0.01 (1 epoch), 0.1 (90 epochs), 0.01 (45 epochs), 0.001 (45 epochs).
* L2 weight decay: 0.005.
* Batch size: 128. (28-10 wide and 1001 layer bottleneck use 64)
## Results
CIFAR-10 Model|Best Precision|Steps
--------------|--------------|------
32 layer|92.5%|~80k
110 layer|93.6%|~80k
164 layer bottleneck|94.5%|~80k
1001 layer bottleneck|94.9%|~80k
28-10 wide|95%|~90k
CIFAR-100 Model|Best Precision|Steps
---------------|--------------|-----
32 layer|68.1%|~45k
110 layer|71.3%|~60k
164 layer bottleneck|75.7%|~50k
1001 layer bottleneck|78.2%|~70k
28-10 wide|78.3%|~70k
## Prerequisites
1. Install TensorFlow 1.2 (preferably from source for higher performance) and Python 3.6.2.
2. Download CIFAR-10/CIFAR-100 dataset.
```shell
curl -o cifar-10-binary.tar.gz https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz
curl -o cifar-100-binary.tar.gz https://www.cs.toronto.edu/~kriz/cifar-100-binary.tar.gz
```
## How to run
```shell
# cd to the models repository and run with bash. Expected command output shown.
# The directory should contain an empty WORKSPACE file, the resnet code, and the cifar10 dataset.
# Note: you can hold out 5k examples from the training set to use as an evaluation set.
$ ls -R
.:
cifar10 resnet WORKSPACE
./cifar10:
data_batch_1.bin data_batch_2.bin data_batch_3.bin data_batch_4.bin
data_batch_5.bin test_batch.bin
./resnet:
cifar_input.py README.md resnet_main.py resnet_model.py
# Train the model.
$ python3 resnet/resnet_main.py --train_data_path=cifar10/data_batch* \
--log_root=/tmp/resnet_model \
--train_dir=/tmp/resnet_model/train \
--dataset='cifar10' \
--num_gpus=1
# While the model is training, you can also check on its progress using tensorboard:
$ tensorboard --logdir=/tmp/resnet_model
# Evaluate the model.
# Avoid running on the same GPU as the training job at the same time,
# otherwise, you might run out of memory.
$ python3 resnet/resnet_main.py --eval_data_path=cifar10/test_batch.bin \
--log_root=/tmp/resnet_model \
--eval_dir=/tmp/resnet_model/test \
--mode=eval \
--dataset='cifar10' \
--num_gpus=0
```

View file

@ -0,0 +1,121 @@
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""CIFAR dataset input module.
"""
import tensorflow as tf
def build_input(dataset, data_path, batch_size, mode, data_format):
    """Build batched CIFAR images and one-hot labels from binary record files.

    Args:
      dataset: Either 'cifar10' or 'cifar100'.
      data_path: Glob pattern matching the binary data file(s).
      batch_size: Input batch size.
      mode: 'train' builds the augmented, shuffled pipeline; any other value
        builds the un-augmented FIFO pipeline used for evaluation.
      data_format: 'NCHW' transposes the batch to channels-first; any other
        value leaves it channels-last (NHWC).
    Returns:
      images: Batches of images. [batch_size, image_size, image_size, 3] for
        NHWC, [batch_size, 3, image_size, image_size] for NCHW.
      labels: Batches of one-hot labels. [batch_size, num_classes]
    Raises:
      ValueError: when the specified dataset is not supported.
    """
    # Validate the dataset before any graph construction so a bad value fails
    # fast without creating ops.
    if dataset == 'cifar10':
        label_bytes = 1
        label_offset = 0
        num_classes = 10
    elif dataset == 'cifar100':
        # One extra leading byte is skipped before the label that is used.
        label_bytes = 1
        label_offset = 1
        num_classes = 100
    else:
        raise ValueError('Not supported dataset %s' % dataset)

    image_size = 32
    depth = 3
    image_bytes = image_size * image_size * depth
    record_bytes = label_bytes + label_offset + image_bytes

    with tf.device('/cpu:0'):
        data_files = tf.gfile.Glob(data_path)
        file_queue = tf.train.string_input_producer(data_files, shuffle=True)
        # Read examples from files in the filename queue.
        reader = tf.FixedLengthRecordReader(record_bytes=record_bytes)
        _, value = reader.read(file_queue)

        # Convert these examples to dense labels and processed images.
        record = tf.reshape(tf.decode_raw(value, tf.uint8), [record_bytes])
        label = tf.cast(
            tf.slice(record, [label_offset], [label_bytes]), tf.int32)
        # The pixel data starts after BOTH the skipped offset and the label
        # byte; starting at label_bytes alone would be one byte early for
        # cifar100.
        # Convert from string to [depth * height * width] to
        # [depth, height, width].
        depth_major = tf.reshape(
            tf.slice(record, [label_offset + label_bytes], [image_bytes]),
            [depth, image_size, image_size])
        # Convert from [depth, height, width] to [height, width, depth].
        image = tf.cast(tf.transpose(depth_major, [1, 2, 0]), tf.float32)

        if mode == 'train':
            # Pad by 4 pixels, then take a random crop back to the original
            # size and randomly mirror.
            image = tf.image.resize_image_with_crop_or_pad(
                image, image_size + 4, image_size + 4)
            image = tf.random_crop(image, [image_size, image_size, 3])
            image = tf.image.random_flip_left_right(image)
            # Brightness/saturation/contrast provides small gains .2%~.5% on
            # cifar.
            # image = tf.image.random_brightness(image, max_delta=63. / 255.)
            # image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
            # image = tf.image.random_contrast(image, lower=0.2, upper=1.8)
            image = tf.image.per_image_standardization(image)

            example_queue = tf.RandomShuffleQueue(
                capacity=16 * batch_size,
                min_after_dequeue=8 * batch_size,
                dtypes=[tf.float32, tf.int32],
                shapes=[[image_size, image_size, depth], [1]])
            num_threads = 16
        else:
            image = tf.image.resize_image_with_crop_or_pad(
                image, image_size, image_size)
            image = tf.image.per_image_standardization(image)

            example_queue = tf.FIFOQueue(
                3 * batch_size,
                dtypes=[tf.float32, tf.int32],
                shapes=[[image_size, image_size, depth], [1]])
            num_threads = 1

        example_enqueue_op = example_queue.enqueue([image, label])
        tf.train.add_queue_runner(tf.train.queue_runner.QueueRunner(
            example_queue, [example_enqueue_op] * num_threads))

        # Read 'batch' labels + images from the example queue.
        images, labels = example_queue.dequeue_many(batch_size)
        labels = tf.reshape(labels, [batch_size, 1])
        indices = tf.reshape(tf.range(0, batch_size, 1), [batch_size, 1])
        # Scatter 1.0 at (row, label) to obtain one-hot labels.
        labels = tf.sparse_to_dense(
            tf.concat(values=[indices, labels], axis=1),
            [batch_size, num_classes], 1.0, 0.0)

        if data_format == 'NCHW':
            images = tf.transpose(images, [0, 3, 1, 2])

        assert len(images.get_shape()) == 4
        assert images.get_shape()[0] == batch_size
        if data_format == 'NCHW':
            assert images.get_shape()[1] == 3
        else:
            assert images.get_shape()[-1] == 3
        assert len(labels.get_shape()) == 2
        assert labels.get_shape()[0] == batch_size
        assert labels.get_shape()[1] == num_classes

        return images, labels

View file

@ -0,0 +1,302 @@
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ResNet Train/Eval module.
"""
import os
import six
import subprocess
import sys
import time
import cifar_input
import numpy as np
import resnet_model
import tensorflow as tf
# Command-line flags for this script; values are read through FLAGS.
FLAGS = tf.app.flags.FLAGS
# Dataset, run mode and model selection. 'model' has no usable default:
# main() raises when it is left empty.
tf.app.flags.DEFINE_string('dataset', 'cifar10', 'cifar10 or cifar100.')
tf.app.flags.DEFINE_string('mode', 'train', 'train or eval.')
tf.app.flags.DEFINE_string('model', '', 'model to train.')
tf.app.flags.DEFINE_string('data_format', 'NHWC',
"""Data layout to use: NHWC (TF native)
or NCHW (cuDNN native).""")
# Input data locations (file patterns).
tf.app.flags.DEFINE_string('train_data_path', '',
'Filepattern for training data.')
tf.app.flags.DEFINE_string('eval_data_path', '',
'Filepattern for eval data')
tf.app.flags.DEFINE_integer('image_size', 32, 'Image side length.')
# Output directories for summaries, logs and checkpoints.
tf.app.flags.DEFINE_string('train_dir', '',
'Directory to keep training outputs.')
tf.app.flags.DEFINE_string('eval_dir', '',
'Directory to keep eval outputs.')
# Evaluation controls.
tf.app.flags.DEFINE_integer('eval_batch_count', 50,
'Number of batches to eval.')
tf.app.flags.DEFINE_bool('eval_once', False,
'Whether evaluate the model only once.')
tf.app.flags.DEFINE_string('log_root', '',
'Should be a parent directory of FLAGS.train_dir/eval_dir.')
# NOTE: train()'s _SaverHook removes and recreates this directory when it
# begins, so do not point it at anything worth keeping.
tf.app.flags.DEFINE_string('checkpoint_dir', '',
'Directory to store the checkpoints')
tf.app.flags.DEFINE_integer('num_gpus', 0,
'Number of gpus used for training. (0 or 1)')
tf.app.flags.DEFINE_bool('use_bottleneck', False,
'Use bottleneck module or not.')
# When set, evaluate() skips accuracy bookkeeping and prints elapsed time only.
tf.app.flags.DEFINE_bool('time_inference', False,
'Time inference.')
# -1 means "use the per-mode default" chosen in main() (128 train, 100 eval).
tf.app.flags.DEFINE_integer('batch_size', -1,
'Batch size to use.')
def train(hps):
    """Training loop.

    Builds the CIFAR input pipeline and the ResNet graph, prints parameter and
    FLOP statistics, then runs 181 epochs' worth of training steps inside a
    MonitoredTrainingSession. The learning rate is fed on every step by
    _LearningRateSetterHook and a checkpoint is written to
    FLAGS.checkpoint_dir at every epoch boundary by _SaverHook.

    Args:
      hps: resnet_model.HParams with the model/training hyperparameters.
    """
    images, labels = cifar_input.build_input(
        FLAGS.dataset, FLAGS.train_data_path, hps.batch_size, FLAGS.mode,
        hps.data_format)
    model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
    model.build_graph()

    param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.
        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
    sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)

    tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)

    truth = tf.argmax(model.labels, axis=1)
    predictions = tf.argmax(model.predictions, axis=1)
    precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

    summary_hook = tf.train.SummarySaverHook(
        save_steps=100,
        output_dir=FLAGS.train_dir,
        summary_op=tf.summary.merge([model.summaries,
                                     tf.summary.scalar('Precision', precision)]))

    # presumably ceil(50000 / 128) for CIFAR with the default batch size --
    # TODO confirm when a different --batch_size is used.
    num_steps_per_epoch = 391  # TODO: Don't hardcode this.

    logging_hook = tf.train.LoggingTensorHook(
        tensors={'step': model.global_step,
                 'loss': model.cost,
                 'precision': precision},
        every_n_iter=100)

    class _LearningRateSetterHook(tf.train.SessionRunHook):
        """Sets learning_rate based on global step."""

        def begin(self):
            self._lrn_rate = 0.01

        def before_run(self, run_context):
            return tf.train.SessionRunArgs(
                model.global_step,  # Asks for global step value.
                feed_dict={model.lrn_rate: self._lrn_rate})  # Sets learning rate

        def after_run(self, run_context, run_values):
            # Schedule: 0.01 for 1 epoch, 0.1 for 90, 0.01 for 45,
            # 0.001 for 45, then 0.0001.
            train_step = run_values.results
            if train_step < num_steps_per_epoch:
                self._lrn_rate = 0.01
            elif train_step < (91 * num_steps_per_epoch):
                self._lrn_rate = 0.1
            elif train_step < (136 * num_steps_per_epoch):
                self._lrn_rate = 0.01
            elif train_step < (181 * num_steps_per_epoch):
                self._lrn_rate = 0.001
            else:
                self._lrn_rate = 0.0001

    class _SaverHook(tf.train.SessionRunHook):
        """Saves a checkpoint and logs elapsed time at every epoch boundary."""

        def begin(self):
            # Local import: shutil is only needed here.
            import shutil
            self.saver = tf.train.Saver(max_to_keep=10000)
            # Start from an empty checkpoint directory. Done with os/shutil
            # rather than a shell string so paths containing spaces or shell
            # metacharacters cannot be misinterpreted.
            if FLAGS.checkpoint_dir:
                shutil.rmtree(FLAGS.checkpoint_dir, ignore_errors=True)
                if not os.path.isdir(FLAGS.checkpoint_dir):
                    os.makedirs(FLAGS.checkpoint_dir)
            self.f = open(os.path.join(FLAGS.checkpoint_dir, "times.log"), 'w')

        def after_create_session(self, sess, coord):
            self.sess = sess
            self.start_time = time.time()

        def before_run(self, run_context):
            return tf.train.SessionRunArgs(
                model.global_step  # Asks for global step value.
            )

        def after_run(self, run_context, run_values):
            train_step = run_values.results
            if train_step % num_steps_per_epoch != 0:
                return
            # Integer division: the epoch index names the directory.
            epoch = train_step // num_steps_per_epoch
            end_time = time.time()
            directory = os.path.join(FLAGS.checkpoint_dir, "%05d" % epoch)
            if not os.path.isdir(directory):
                os.makedirs(directory)
            ckpt_name = 'model.ckpt'
            self.saver.save(self.sess, os.path.join(directory, ckpt_name),
                            global_step=train_step)
            self.f.write("Step: %d\tTime: %s\n" %
                         (train_step, end_time - self.start_time))
            print("Saved checkpoint after %d epoch(s) to %s..." %
                  (epoch, directory))
            sys.stdout.flush()
            # Restart the clock so checkpoint I/O is excluded from the next
            # interval.
            self.start_time = time.time()

        def end(self, sess):
            self.f.close()

    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=FLAGS.log_root,
            hooks=[logging_hook, _LearningRateSetterHook()],
            chief_only_hooks=[summary_hook, _SaverHook()],
            save_checkpoint_secs=None,
            # Since we provide a SummarySaverHook, we need to disable default
            # SummarySaverHook. To do that we pass None for both summary
            # intervals.
            save_summaries_steps=None,
            save_summaries_secs=None,
            config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
        for _ in range(num_steps_per_epoch * 181):
            mon_sess.run(model.train_op)
def evaluate(hps):
    """Eval loop.

    Restores the latest checkpoint found under FLAGS.log_root, runs
    FLAGS.eval_batch_count batches and either prints the elapsed time
    (FLAGS.time_inference) or computes top-1 / top-5 accuracy and writes
    summaries to FLAGS.eval_dir. Repeats every 60 seconds unless
    FLAGS.eval_once is set; stops when no checkpoint is available.

    Args:
      hps: resnet_model.HParams with the model hyperparameters.
    """
    images, labels = cifar_input.build_input(
        FLAGS.dataset, FLAGS.eval_data_path, hps.batch_size, FLAGS.mode,
        hps.data_format)
    model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
    model.build_graph()
    saver = tf.train.Saver()
    summary_writer = tf.summary.FileWriter(FLAGS.eval_dir)

    session_config = tf.ConfigProto(allow_soft_placement=True)
    sess = tf.Session(config=session_config)
    tf.train.start_queue_runners(sess)

    # Tensors fetched for every evaluation batch.
    fetches = [model.summaries, model.cost, model.predictions,
               model.labels, model.global_step]

    best_precision = 0.0
    while True:
        try:
            ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
        except tf.errors.OutOfRangeError as e:
            tf.logging.error('Cannot restore checkpoint: %s', e)
            continue
        ckpt_path = ckpt_state.model_checkpoint_path if ckpt_state else None
        if not ckpt_path:
            tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
            break

        tf.logging.info('Loading checkpoint %s', ckpt_path)
        saver.restore(sess, ckpt_path)

        # The step number is the suffix after the last '-' of the file name.
        step_token = ckpt_path.split('/')[-1].split('-')[-1]
        global_step = int(step_token) if step_token.isdigit() else 0

        num_seen = 0
        num_top1 = 0
        num_top5 = 0
        start_time = time.time()
        for _ in six.moves.range(FLAGS.eval_batch_count):
            summaries, _loss, predictions, truth, train_step = sess.run(fetches)
            if FLAGS.time_inference:
                # Timing runs skip the accuracy bookkeeping.
                continue
            for one_hot, scores in zip(truth, predictions):
                true_class = np.argmax(one_hot)
                ranked = np.argsort(scores)  # ascending; best class is last
                num_top1 += (true_class == ranked[-1])
                if true_class in ranked[-5:]:
                    num_top5 += 1
                num_seen += 1

        if FLAGS.time_inference:
            print("Time for inference: %.4f" % (time.time() - start_time))
        else:
            precision = 1.0 * num_top1 / num_seen
            precision_top5 = 1.0 * num_top5 / num_seen
            best_precision = max(precision, best_precision)

            for tag, value in (('Precision', precision),
                               ('Best Precision', best_precision)):
                scalar_summ = tf.Summary()
                scalar_summ.value.add(tag=tag, simple_value=value)
                summary_writer.add_summary(scalar_summ, train_step)
            summary_writer.add_summary(summaries, train_step)
            print('Precision @ 1 = %.4f, Recall @ 5 = %.4f, Global step = %d' %
                  (precision, precision_top5, global_step))
        summary_writer.flush()

        if FLAGS.eval_once:
            break
        time.sleep(60)
def main(_):
    """Validate flags, assemble hyperparameters and dispatch to train/eval.

    Raises:
      Exception: if --model is missing or names an unsupported architecture.
      ValueError: if --num_gpus, --mode or --dataset has an unsupported value.
    """
    if FLAGS.model == '':
        raise Exception('--model must be specified.')

    if FLAGS.num_gpus == 0:
        dev = '/cpu:0'
    elif FLAGS.num_gpus == 1:
        dev = '/gpu:0'
    else:
        raise ValueError('Only support 0 or 1 gpu.')

    # Reject unknown modes up front; otherwise batch_size could be left
    # unbound or the script would silently do nothing.
    if FLAGS.mode not in ('train', 'eval'):
        raise ValueError("Invalid mode '%s' -- only train and eval supported"
                         % FLAGS.mode)

    if FLAGS.batch_size == -1:
        # -1 selects the per-mode default.
        batch_size = 128 if FLAGS.mode == 'train' else 100
    else:
        batch_size = FLAGS.batch_size

    if FLAGS.dataset == 'cifar10':
        num_classes = 10
    elif FLAGS.dataset == 'cifar100':
        num_classes = 100
    else:
        raise ValueError("Invalid dataset '%s' -- only cifar10 and cifar100 "
                         "supported" % FLAGS.dataset)

    if FLAGS.model == 'resnet20':
        num_residual_units = 3
    elif FLAGS.model == 'resnet56':
        num_residual_units = 9
    elif FLAGS.model == 'resnet164' and FLAGS.use_bottleneck:
        num_residual_units = 18
    elif FLAGS.model == 'resnet164' and not FLAGS.use_bottleneck:
        num_residual_units = 27
    else:
        raise Exception("Invalid model -- only resnet20, resnet56 and resnet164 supported")

    data_format = FLAGS.data_format

    hps = resnet_model.HParams(batch_size=batch_size,
                               num_classes=num_classes,
                               min_lrn_rate=0.0001,
                               lrn_rate=0.1,
                               num_residual_units=num_residual_units,
                               use_bottleneck=FLAGS.use_bottleneck,
                               weight_decay_rate=0.0005,
                               relu_leakiness=0.1,
                               optimizer='mom',
                               data_format=data_format)

    with tf.device(dev):
        if FLAGS.mode == 'train':
            train(hps)
        elif FLAGS.mode == 'eval':
            evaluate(hps)


if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.app.run()

View file

@ -0,0 +1,281 @@
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ResNet model.
Related papers:
https://arxiv.org/pdf/1603.05027v2.pdf
https://arxiv.org/pdf/1512.03385v1.pdf
https://arxiv.org/pdf/1605.07146v1.pdf
"""
from collections import namedtuple
import numpy as np
import tensorflow as tf
import six
from tensorflow.python.training import moving_averages
# Immutable bundle of model and training hyperparameters.
HParams = namedtuple('HParams', [
    'batch_size',
    'num_classes',
    'min_lrn_rate',
    'lrn_rate',
    'num_residual_units',
    'use_bottleneck',
    'weight_decay_rate',
    'relu_leakiness',
    'optimizer',
    'data_format',
])
class ResNet(object):
    """ResNet model."""

    def __init__(self, hps, images, labels, mode):
        """ResNet constructor.

        Args:
          hps: Hyperparameters.
          images: Batches of images. [batch_size, image_size, image_size, 3]
          labels: Batches of labels. [batch_size, num_classes]
          mode: One of 'train' and 'eval'.
        """
        self.hps = hps
        self._images = images
        self.labels = labels
        self.mode = mode

        self._extra_train_ops = []

    def build_graph(self):
        """Build a whole graph for the model."""
        self.global_step = tf.contrib.framework.get_or_create_global_step()
        self._build_model()
        if self.mode == 'train':
            self._build_train_op()
        self.summaries = tf.summary.merge_all()

    def _stride_arr(self, stride):
        """Map a stride scalar to the stride array for tf.nn.conv2d.

        Raises:
          ValueError: if hps.data_format is neither 'NHWC' nor 'NCHW'.
        """
        if self.hps.data_format == 'NHWC':
            return [1, stride, stride, 1]
        elif self.hps.data_format == 'NCHW':
            return [1, 1, stride, stride]
        else:
            raise ValueError("Invalid data_format")

    def _build_model(self):
        """Build the core model within the graph.

        Sets self.predictions (softmax output) and self.cost (cross entropy
        plus L2 weight decay).
        """
        with tf.variable_scope('init'):
            x = self._images
            x = self._conv('init_conv', x, 3, 3, 16, self._stride_arr(1))

        strides = [1, 2, 2]
        activate_before_residual = [True, False, False]
        if self.hps.use_bottleneck:
            res_func = self._bottleneck_residual
            filters = [16, 64, 128, 256]
        else:
            res_func = self._residual
            filters = [16, 16, 32, 64]
            # Uncomment the following codes to use w28-10 wide residual
            # network. It is more memory efficient than very deep residual
            # network and has comparably good performance.
            # https://arxiv.org/pdf/1605.07146v1.pdf
            # filters = [16, 160, 320, 640]
            # Update hps.num_residual_units to 4

        # Three stages; the first unit of each stage changes the filter count
        # (and stride), the remaining units keep both fixed.
        with tf.variable_scope('unit_1_0'):
            x = res_func(x, filters[0], filters[1],
                         self._stride_arr(strides[0]),
                         activate_before_residual[0])
        for i in six.moves.range(1, self.hps.num_residual_units):
            with tf.variable_scope('unit_1_%d' % i):
                x = res_func(x, filters[1], filters[1],
                             self._stride_arr(1), False)

        with tf.variable_scope('unit_2_0'):
            x = res_func(x, filters[1], filters[2],
                         self._stride_arr(strides[1]),
                         activate_before_residual[1])
        for i in six.moves.range(1, self.hps.num_residual_units):
            with tf.variable_scope('unit_2_%d' % i):
                x = res_func(x, filters[2], filters[2],
                             self._stride_arr(1), False)

        with tf.variable_scope('unit_3_0'):
            x = res_func(x, filters[2], filters[3],
                         self._stride_arr(strides[2]),
                         activate_before_residual[2])
        for i in six.moves.range(1, self.hps.num_residual_units):
            with tf.variable_scope('unit_3_%d' % i):
                x = res_func(x, filters[3], filters[3],
                             self._stride_arr(1), False)

        with tf.variable_scope('unit_last'):
            x = self._batch_norm('final_bn', x)
            x = self._relu(x, self.hps.relu_leakiness)
            x = self._global_avg_pool(x)

        with tf.variable_scope('logit'):
            logits = self._fully_connected(x, self.hps.num_classes)
            self.predictions = tf.nn.softmax(logits)

        with tf.variable_scope('costs'):
            xent = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits, labels=self.labels)
            self.cost = tf.reduce_mean(xent, name='xent')
            self.cost += self._decay()

            tf.summary.scalar('cost', self.cost)

    def _build_train_op(self):
        """Build training specific ops for the graph.

        Raises:
          ValueError: if hps.optimizer is neither 'sgd' nor 'mom'.
        """
        self.lrn_rate = tf.constant(self.hps.lrn_rate, tf.float32)
        tf.summary.scalar('learning_rate', self.lrn_rate)

        trainable_variables = tf.trainable_variables()
        grads = tf.gradients(self.cost, trainable_variables)

        if self.hps.optimizer == 'sgd':
            optimizer = tf.train.GradientDescentOptimizer(self.lrn_rate)
        elif self.hps.optimizer == 'mom':
            optimizer = tf.train.MomentumOptimizer(self.lrn_rate, 0.9)
        else:
            # Without this branch an unknown value surfaced as a NameError.
            raise ValueError("Invalid optimizer '%s' -- only sgd and mom "
                             "supported" % self.hps.optimizer)

        apply_op = optimizer.apply_gradients(
            zip(grads, trainable_variables),
            global_step=self.global_step, name='train_step')

        train_ops = [apply_op] + self._extra_train_ops
        self.train_op = tf.group(*train_ops)

    # TODO(xpan): Consider batch_norm in contrib/layers/python/layers/layers.py
    def _batch_norm(self, name, x):
        """Batch normalization."""
        with tf.variable_scope(name) as scope:
            output = tf.contrib.layers.batch_norm(
                x,
                decay=0.9,
                epsilon=0.001,
                data_format=self.hps.data_format,
                scope=scope,
                is_training=(self.mode == 'train'),
                fused=True,
                updates_collections=None)
        return output

    def _residual(self, x, in_filter, out_filter, stride,
                  activate_before_residual=False):
        """Residual unit with 2 sub layers."""
        if activate_before_residual:
            # The shortcut branch sees the activated input.
            with tf.variable_scope('shared_activation'):
                x = self._batch_norm('init_bn', x)
                x = self._relu(x, self.hps.relu_leakiness)
                orig_x = x
        else:
            # The shortcut branch keeps the raw input.
            with tf.variable_scope('residual_only_activation'):
                orig_x = x
                x = self._batch_norm('init_bn', x)
                x = self._relu(x, self.hps.relu_leakiness)

        with tf.variable_scope('sub1'):
            x = self._conv('conv1', x, 3, in_filter, out_filter, stride)

        with tf.variable_scope('sub2'):
            x = self._batch_norm('bn2', x)
            x = self._relu(x, self.hps.relu_leakiness)
            x = self._conv('conv2', x, 3, out_filter, out_filter, [1, 1, 1, 1])

        with tf.variable_scope('sub_add'):
            if in_filter != out_filter:
                # Match the shortcut to the new shape: pool spatially, then
                # zero-pad the channel axis evenly on both sides.
                orig_x = tf.nn.avg_pool(orig_x, stride, stride, 'VALID',
                                        data_format=self.hps.data_format)
                pad = (out_filter - in_filter) // 2
                if self.hps.data_format == 'NHWC':
                    orig_x = tf.pad(
                        orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]])
                elif self.hps.data_format == 'NCHW':
                    orig_x = tf.pad(
                        orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]])
            x += orig_x

        tf.logging.debug('image after unit %s', x.get_shape())
        return x

    def _bottleneck_residual(self, x, in_filter, out_filter, stride,
                             activate_before_residual=False):
        """Bottleneck residual unit with 3 sub layers."""
        if activate_before_residual:
            with tf.variable_scope('common_bn_relu'):
                x = self._batch_norm('init_bn', x)
                x = self._relu(x, self.hps.relu_leakiness)
                orig_x = x
        else:
            with tf.variable_scope('residual_bn_relu'):
                orig_x = x
                x = self._batch_norm('init_bn', x)
                x = self._relu(x, self.hps.relu_leakiness)

        # Integer division keeps the channel counts ints under Python 3
        # ('/' would produce floats in the kernel shapes).
        mid_filter = out_filter // 4

        with tf.variable_scope('sub1'):
            x = self._conv('conv1', x, 1, in_filter, mid_filter, stride)

        with tf.variable_scope('sub2'):
            x = self._batch_norm('bn2', x)
            x = self._relu(x, self.hps.relu_leakiness)
            x = self._conv('conv2', x, 3, mid_filter, mid_filter, [1, 1, 1, 1])

        with tf.variable_scope('sub3'):
            x = self._batch_norm('bn3', x)
            x = self._relu(x, self.hps.relu_leakiness)
            x = self._conv('conv3', x, 1, mid_filter, out_filter, [1, 1, 1, 1])

        with tf.variable_scope('sub_add'):
            if in_filter != out_filter:
                # 1x1 projection so the shortcut matches the new shape.
                orig_x = self._conv('project', orig_x, 1, in_filter,
                                    out_filter, stride)
            x += orig_x

        tf.logging.info('image after unit %s', x.get_shape())
        return x

    def _decay(self):
        """L2 weight decay loss."""
        costs = []
        for var in tf.trainable_variables():
            # Only variables named 'DW' (conv kernels and the final dense
            # weights created below) are regularized.
            if var.op.name.find(r'DW') > 0:
                costs.append(tf.nn.l2_loss(var))
                # tf.summary.histogram(var.op.name, var)

        return tf.multiply(self.hps.weight_decay_rate, tf.add_n(costs))

    def _conv(self, name, x, filter_size, in_filters, out_filters, strides):
        """Convolution."""
        with tf.variable_scope(name):
            # Initializer stddev is sqrt(2 / (k * k * out_filters)).
            n = filter_size * filter_size * out_filters
            kernel = tf.get_variable(
                'DW', [filter_size, filter_size, in_filters, out_filters],
                tf.float32, initializer=tf.random_normal_initializer(
                    stddev=np.sqrt(2.0 / n)))
            return tf.nn.conv2d(x, kernel, strides, padding='SAME',
                                data_format=self.hps.data_format)

    def _relu(self, x, leakiness=0.0):
        """Relu, with optional leaky support."""
        return tf.where(tf.less(x, 0.0), leakiness * x, x, name='leaky_relu')

    def _fully_connected(self, x, out_dim):
        """FullyConnected layer for final output."""
        x = tf.reshape(x, [self.hps.batch_size, -1])
        w = tf.get_variable(
            'DW', [x.get_shape()[1], out_dim],
            initializer=tf.uniform_unit_scaling_initializer(factor=1.0))
        b = tf.get_variable('biases', [out_dim],
                            initializer=tf.constant_initializer())
        return tf.nn.xw_plus_b(x, w, b)

    def _global_avg_pool(self, x):
        """Average over the two spatial axes of a rank-4 tensor.

        Raises:
          ValueError: if hps.data_format is neither 'NHWC' nor 'NCHW'.
        """
        assert x.get_shape().ndims == 4
        if self.hps.data_format == 'NHWC':
            return tf.reduce_mean(x, [1, 2])
        elif self.hps.data_format == 'NCHW':
            return tf.reduce_mean(x, [2, 3])
        else:
            # Previously fell through and returned None.
            raise ValueError("Invalid data_format")

View file

@ -0,0 +1,51 @@
import argparse
import os
import subprocess
import sys
def main(checkpoint_path, model, use_bottleneck):
    """Time inference for a range of batch sizes and print a TSV table.

    For each batch size, runs resnet/resnet_main.py in eval mode with
    --time_inference and prints "<batch_size>\t<seconds per batch>". The time
    column is left blank when the run fails or reports no timing line.

    Args:
      checkpoint_path: Directory passed to the eval run as --log_root.
      model: Model name passed as --model.
      use_bottleneck: Whether to pass --use_bottleneck=True.
    """
    print("Number of images\tInference time")
    num_trials = 10
    for batch_size in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048,
                       4096, 8192]:
        # Argument list (no shell): values are passed verbatim, so paths with
        # spaces or shell metacharacters are safe.
        # --eval_batch_count is given exactly once and equals num_trials, so
        # dividing the reported total by num_trials yields a per-batch time.
        command = [
            "python3", "resnet/resnet_main.py",
            "--mode=eval",
            "--eval_data_path=cifar10/test_batch.bin",
            "--eval_dir=data/%s/log_root/eval" % model,
            "--dataset=cifar10",
            "--model=%s" % model,
            "--use_bottleneck=%s" % ("True" if use_bottleneck else "False"),
            "--eval_batch_count=%d" % num_trials,
            "--eval_once=True",
            "--num_gpus=1",
            "--data_format=NHWC",
            "--time_inference=True",
            "--batch_size=%d" % batch_size,
            "--log_root=%s" % checkpoint_path,
        ]
        inference_time = ""
        try:
            output = subprocess.check_output(command,
                                             stderr=subprocess.DEVNULL)
            for line in output.decode('utf8').splitlines():
                if "Time for inference" in line:
                    inference_time = (
                        float(line.strip().split(": ")[1]) / num_trials)
                    break
        except (subprocess.CalledProcessError, OSError, ValueError,
                IndexError):
            # Best effort: a failed or unparsable run gives a blank cell.
            inference_time = ""
        print("\t".join([str(batch_size), str(inference_time)]))
        sys.stdout.flush()
def parse_bool_flag(value):
    """Convert a command-line string such as 'True' or 'false' to a bool.

    argparse's type=bool treats every non-empty string (including 'False')
    as True; this converter interprets the text instead.

    Raises:
      argparse.ArgumentTypeError: if the text is not a recognized boolean.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string stays False, as it was under type=bool.
    if lowered in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        "Expected a boolean value, got %r" % value)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=("Time ResNet inference over a range of batch sizes")
    )
    parser.add_argument('-i', "--checkpoint_path", type=str, required=True,
                        help="Path to dumped model checkpoints")
    parser.add_argument('-m', "--model", type=str, required=True,
                        help="Model name")
    parser.add_argument('-b', "--use_bottleneck", type=parse_bool_flag,
                        default=False,
                        help="Use bottleneck")

    cmdline_args = parser.parse_args()
    opt_dict = vars(cmdline_args)

    checkpoint_path = opt_dict["checkpoint_path"]
    model = opt_dict["model"]
    use_bottleneck = opt_dict["use_bottleneck"]

    main(checkpoint_path, model, use_bottleneck)

3
tensorflow/SQuAD/.gitignore vendored Normal file
View file

@ -0,0 +1,3 @@
out/
data/
*/__pycache__/

165
tensorflow/SQuAD/README.md Normal file
View file

@ -0,0 +1,165 @@
# Bi-directional Attention Flow for Machine Comprehension
- This is the original implementation of [Bi-directional Attention Flow for Machine Comprehension][paper] (Seo et al., 2016).
- This is the tensorflow v1.1.0 compatible version. It is not compatible with previously trained models,
so if you want to use them, go to [v0.2.1][v0.2.1].
- The CodaLab worksheet for the [SQuAD Leaderboard][squad] submission is available [here][worksheet].
- Please contact [Minjoon Seo][minjoon] ([@seominjoon][minjoon-github]) for questions and suggestions.
## 0. Requirements
#### General
- Python (developed on 3.5.2. Issues have been reported with Python 2!)
- unzip
#### Python Packages
- tensorflow (deep learning library, verified on 1.1.0)
- nltk (NLP tools, verified on 3.2.1)
- tqdm (progress bar, verified on 4.7.4)
- jinja2 (for visualization; if you only train and test, not needed)
## 1. Pre-processing
First, prepare data. Download SQuAD data and GloVe and nltk corpus
(~850 MB, this will download files to `$HOME/data`):
```
chmod +x download.sh; ./download.sh
```
Second, preprocess the Stanford QA dataset (along with GloVe vectors) and save the results in `$PWD/data/squad` (~5 minutes):
```
python -m squad.prepro
```
## 2. Training
The model was trained with NVidia Titan X (Pascal Architecture, 2016).
The model requires at least 12GB of GPU RAM.
If your GPU RAM is smaller than 12GB, you can either decrease batch size (performance might degrade),
or you can use multi GPU (see below).
The training converges at ~18k steps, and it took ~4s per step (i.e. ~20 hours).
Before training, it is recommended to first try the following code to verify everything is okay and memory is sufficient:
```
python -m basic.cli --mode train --noload --debug
```
Then to fully train, run:
```
python -m basic.cli --mode train --noload
```
You can speed up the training process with optimization flags:
```
python -m basic.cli --mode train --noload --len_opt --cluster
```
You can still omit them, but training will be much slower.
## 3. Test
To test, run:
```
python -m basic.cli
```
Similarly to training, you can give the optimization flags to speed up test (5 minutes on dev data):
```
python -m basic.cli --len_opt --cluster
```
This command loads the most recently saved model during training and begins testing on the test data.
After the process ends, it prints F1 and EM scores, and also outputs a json file (`$PWD/out/basic/00/answer/test-####.json`,
where `####` is the step # that the model was saved).
Note that the printed scores are not official (our scoring scheme is a bit harsher).
To obtain the official number, use the official evaluator (copied in `squad` folder) and the output json file:
```
python squad/evaluate-v1.1.py $HOME/data/squad/dev-v1.1.json out/basic/00/answer/test-####.json
```
### 3.1 Loading from pre-trained weights
NOTE: this version is not compatible with the following trained models.
For compatibility, use [v0.2.1][v0.2.1].
Instead of training the model yourself, you can choose to use pre-trained weights that were used for [SQuAD Leaderboard][squad] submission.
Refer to [this worksheet][worksheet] in CodaLab to reproduce the results.
If you are unfamiliar with CodaLab, follow these simple steps (given that you met all prereqs above):
1. Download `save.zip` from the [worksheet][worksheet] and unzip it in the current directory.
2. Copy `glove.6B.100d.txt` from your glove data folder (`$HOME/data/glove/`) to the current directory.
3. To reproduce single model:
```
basic/run_single.sh $HOME/data/squad/dev-v1.1.json single.json
```
This writes the answers to `single.json` in the current directory. You can then use the official evaluator to obtain EM and F1 scores. If you want to run on GPU (~5 mins), change the value of batch_size flag in the shell file to a higher number (60 for 12GB GPU RAM).
4. Similarly, to reproduce ensemble method:
```
basic/run_ensemble.sh $HOME/data/squad/dev-v1.1.json ensemble.json
```
If you want to run on GPU, you should run the script sequentially by removing '&' in the for loop, or you will need to specify different GPUs for each run of the for loop.
## Results
### Dev Data
| | EM (%) | F1 (%) |
| -------- |:------:|:------:|
| single | 67.8 | 77.4 |
### Dev Data (old)
NOTE: These numbers are from [v0.2.1][v0.2.1].
| | EM (%) | F1 (%) |
| -------- |:------:|:------:|
| single | 67.7 | 77.3 |
| ensemble | 72.6 | 80.7 |
### Test Data (old)
NOTE: These numbers are from [v0.2.1][v0.2.1].
| | EM (%) | F1 (%) |
| -------- |:------:|:------:|
| single | 68.0 | 77.3 |
| ensemble | 73.3 | 81.1 |
Refer to [our paper][paper] for more details.
See [SQuAD Leaderboard][squad] to compare with other models.
<!--
## Using Pre-trained Model
If you would like to use pre-trained model, it's very easy!
You can download the model weights [here][save] (make sure that its commit id matches the source code's).
Extract them and put them in `$PWD/out/basic/00/save` directory, with names unchanged.
Then do the testing again, but you need to specify the step # that you are loading from:
```
python -m basic.cli --mode test --batch_size 8 --eval_num_batches 0 --load_step ####
```
-->
## Multi-GPU Training & Testing
Our model supports multi-GPU training.
We follow the parallelization paradigm described in [TensorFlow Tutorial][multi-gpu].
In short, if you want to use a batch size of 60 (default) but have 3 GPUs with 4GB of RAM each,
then you initialize each GPU with batch size of 20, and combine the gradients on CPU.
This can be easily done by running:
```
python -m basic.cli --mode train --noload --num_gpus 3 --batch_size 20
```
Similarly, you can speed up your testing by:
```
python -m basic.cli --num_gpus 3 --batch_size 20
```
[multi-gpu]: https://www.tensorflow.org/versions/r0.11/tutorials/deep_cnn/index.html#training-a-model-using-multiple-gpu-cards
[squad]: http://stanford-qa.com
[paper]: https://arxiv.org/abs/1611.01603
[worksheet]: https://worksheets.codalab.org/worksheets/0x37a9b8c44f6845c28866267ef941c89d/
[minjoon]: https://seominjoon.github.io
[minjoon-github]: https://github.com/seominjoon
[v0.2.1]: https://github.com/allenai/bi-att-flow/tree/v0.2.1

View file

View file

@ -0,0 +1,112 @@
import os
import tensorflow as tf
from basic.main import main as m
# Shorthand for TensorFlow's command-line flag module; the DEFINE_* calls below
# declare the options that main() later reads through flags.FLAGS.
flags = tf.app.flags
# Names and directories
# out_base_dir, model_name and run_id are joined into config.out_dir by main() below.
flags.DEFINE_string("model_name", "basic", "Model name [basic]")
flags.DEFINE_string("data_dir", "data/squad", "Data dir [data/squad]")
flags.DEFINE_string("run_id", "0", "Run ID [0]")
flags.DEFINE_string("out_base_dir", "out", "out base dir [out]")
flags.DEFINE_string("forward_name", "single", "Forward name [single]")
# The path flags below default to the empty string.
flags.DEFINE_string("answer_path", "", "Answer path []")
flags.DEFINE_string("eval_path", "", "Eval path []")
flags.DEFINE_string("load_path", "", "Load path []")
flags.DEFINE_string("shared_path", "", "Shared path []")
# Device placement
flags.DEFINE_string("device", "/cpu:0", "default device for summing gradients. [/cpu:0]")
flags.DEFINE_string("device_type", "gpu", "device for computing gradients (parallelization). cpu | gpu [gpu]")
flags.DEFINE_integer("num_gpus", 1, "num of gpus or cpus for computing gradients [1]")
# Essential training and test options
# "mode" selects the entry point run by basic.main.main, which accepts exactly
# 'train', 'test' or 'forward' (anything else raises ValueError there).
flags.DEFINE_string("mode", "test", "train | test | forward [test]")
flags.DEFINE_boolean("load", True, "load saved data? [True]")
flags.DEFINE_bool("single", False, "supervise only the answer sentence? [False]")
flags.DEFINE_boolean("debug", False, "Debugging mode? [False]")
flags.DEFINE_bool('load_ema', True, "load exponential average of variables when testing? [True]")
flags.DEFINE_bool("eval", True, "eval? [True]")
flags.DEFINE_bool("wy", False, "Use wy for loss / eval? [False]")
flags.DEFINE_bool("na", False, "Enable no answer strategy and learn bias? [False]")
flags.DEFINE_float("th", 0.5, "Threshold [0.5]")
# Training / test parameters
flags.DEFINE_integer("batch_size", 60, "Batch size [60]")
flags.DEFINE_integer("val_num_batches", 100, "validation num batches [100]")
flags.DEFINE_integer("test_num_batches", 0, "test num batches [0]")
# In basic.main._train a non-zero num_steps takes precedence; num_epochs is only
# used to derive the step count when num_steps is 0.
flags.DEFINE_integer("num_epochs", 12, "Total number of epochs for training [12]")
flags.DEFINE_integer("num_steps", 20000, "Number of steps [20000]")
# load_step is only honoured when > 0 and load_path is empty; otherwise
# GraphHandler._load falls back to the latest checkpoint in the save directory.
flags.DEFINE_integer("load_step", 0, "load step [0]")
flags.DEFINE_float("init_lr", 0.001, "Initial learning rate [0.001]")
flags.DEFINE_float("input_keep_prob", 0.8, "Input keep prob for the dropout of LSTM weights [0.8]")
flags.DEFINE_float("keep_prob", 0.8, "Keep prob for the dropout of Char-CNN weights [0.8]")
flags.DEFINE_float("wd", 0.0, "L2 weight decay for regularization [0.0]")
flags.DEFINE_integer("hidden_size", 100, "Hidden size [100]")
flags.DEFINE_integer("char_out_size", 100, "char-level word embedding size [100]")
flags.DEFINE_integer("char_emb_size", 8, "Char emb size [8]")
# The two Char-CNN flags below are comma-separated lists encoded as strings.
flags.DEFINE_string("out_channel_dims", "100", "Out channel dims of Char-CNN, separated by commas [100]")
flags.DEFINE_string("filter_heights", "5", "Filter heights of Char-CNN, separated by commas [5]")
flags.DEFINE_bool("finetune", False, "Finetune word embeddings? [False]")
flags.DEFINE_bool("highway", True, "Use highway? [True]")
flags.DEFINE_integer("highway_num_layers", 2, "highway num layers [2]")
flags.DEFINE_bool("share_cnn_weights", True, "Share Char-CNN weights [True]")
flags.DEFINE_bool("share_lstm_weights", True, "Share pre-processing (phrase-level) LSTM weights [True]")
flags.DEFINE_float("var_decay", 0.999, "Exponential moving average decay for variables [0.999]")
# Optimizations
flags.DEFINE_bool("cluster", False, "Cluster data for faster training [False]")
flags.DEFINE_bool("len_opt", False, "Length optimization? [False]")
flags.DEFINE_bool("cpu_opt", False, "CPU optimization? GPU computation can be slower [False]")
# Logging and saving options
# In debug mode basic.main._config_debug overrides log_period, eval_period and
# save_period (together with num_steps and the *_num_batches flags).
flags.DEFINE_boolean("progress", True, "Show progress? [True]")
flags.DEFINE_integer("log_period", 100, "Log period [100]")
flags.DEFINE_integer("eval_period", 1000, "Eval period [1000]")
flags.DEFINE_integer("save_period", 1000, "Save Period [1000]")
flags.DEFINE_integer("max_to_keep", 20, "Max recent saves to keep [20]")
flags.DEFINE_bool("dump_eval", True, "dump eval? [True]")
flags.DEFINE_bool("dump_answer", True, "dump answer? [True]")
flags.DEFINE_bool("vis", False, "output visualization numbers? [False]")
# dump_pickle chooses between a gzip-compressed pickle and JSON in GraphHandler.dump_eval.
flags.DEFINE_bool("dump_pickle", True, "Dump pickle instead of json? [True]")
flags.DEFINE_float("decay", 0.9, "Exponential moving average decay for logging values [0.9]")
# Thresholds for speed and less memory usage
# The bracketed value in each help string documents the flag's actual default.
flags.DEFINE_integer("word_count_th", 10, "word count th [10]")
flags.DEFINE_integer("char_count_th", 50, "char count th [50]")
flags.DEFINE_integer("sent_size_th", 400, "sent size th [400]")
flags.DEFINE_integer("num_sents_th", 8, "num sents th [8]")
flags.DEFINE_integer("ques_size_th", 30, "ques size th [30]")
flags.DEFINE_integer("word_size_th", 16, "word size th [16]")
flags.DEFINE_integer("para_size_th", 256, "para size th [256]")
# Advanced training options
flags.DEFINE_bool("lower_word", True, "lower word [True]")
flags.DEFINE_bool("squash", False, "squash the sentences into one? [False]")
flags.DEFINE_bool("swap_memory", True, "swap memory? [True]")
flags.DEFINE_string("data_filter", "max", "max | valid | semi [max]")
flags.DEFINE_bool("use_glove_for_unk", True, "use glove for unk [True]")
flags.DEFINE_bool("known_if_glove", True, "consider as known if present in glove [True]")
flags.DEFINE_string("logit_func", "tri_linear", "logit func [tri_linear]")
flags.DEFINE_string("answer_func", "linear", "answer logit func [linear]")
flags.DEFINE_string("sh_logit_func", "tri_linear", "sh logit func [tri_linear]")
# Ablation options
# Boolean switches for individual model components; by default every component
# listed here is enabled except dynamic attention.
flags.DEFINE_bool("use_char_emb", True, "use char emb? [True]")
flags.DEFINE_bool("use_word_emb", True, "use word embedding? [True]")
flags.DEFINE_bool("q2c_att", True, "question-to-context attention? [True]")
flags.DEFINE_bool("c2q_att", True, "context-to-question attention? [True]")
flags.DEFINE_bool("dynamic_att", False, "Dynamic attention [False]")
def main(_):
    """Derive the run's output directory from the flags and start the model.

    The positional argument is ignored. The output directory is
    ``<out_base_dir>/<model_name>/<run_id zero-padded to 2 chars>``.
    """
    config = flags.FLAGS
    padded_run_id = str(config.run_id).zfill(2)
    config.out_dir = os.path.join(config.out_base_dir, config.model_name, padded_run_id)
    m(config)


if __name__ == "__main__":
    tf.app.run()

View file

@ -0,0 +1,116 @@
import argparse
import functools
import gzip
import json
import pickle
from collections import defaultdict
from operator import mul
from tqdm import tqdm
from squad.utils import get_phrase, get_best_span, get_span_score_pairs
def get_args():
    """Parse the ensembling script's command line.

    :return: namespace with ``paths`` (one or more evaluation dumps), ``out``,
        ``data_path`` and ``shared_path``.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('paths', nargs='+')
    arg_parser.add_argument('-o', '--out', default='ensemble.json')
    path_defaults = (
        ("--data_path", "data/squad/data_test.json"),
        ("--shared_path", "data/squad/shared_test.json"),
    )
    for option, default in path_defaults:
        arg_parser.add_argument(option, default=default)
    return arg_parser.parse_args()
def ensemble(args):
    """Combine several models' evaluation dumps into one answer file.

    :param args: namespace with ``paths`` (gzip-compressed pickled evaluation
        dicts holding at least ``'yp'`` and ``'yp2'``), ``data_path``,
        ``shared_path`` (JSON files) and ``out`` (destination JSON file).
    :raises ValueError: if ``args.paths`` is empty.
    """
    if not args.paths:
        raise ValueError("at least one evaluation dump path is required")

    e_list = []
    for path in tqdm(args.paths):
        with gzip.open(path, 'r') as fh:
            # SECURITY: unpickling runs arbitrary code; only feed this script
            # dumps produced by your own runs.
            e_list.append(pickle.load(fh))

    with open(args.data_path, 'r') as fh:
        data = json.load(fh)

    with open(args.shared_path, 'r') as fh:
        shared = json.load(fh)

    # The last dump decides how many examples are ensembled (this used to be
    # read implicitly from the leaked loop variable of the loading loop).
    num_examples = len(e_list[-1]['yp'])

    out = {}
    for idx, (id_, rx) in tqdm(enumerate(zip(data['ids'], data['*x'])), total=num_examples):
        if idx >= num_examples:
            # for debugging purpose
            break
        # rx is used as a two-level index into the shared paragraph / token tables.
        context = shared['p'][rx[0]][rx[1]]
        wordss = shared['x'][rx[0]][rx[1]]
        yp_list = [e['yp'][idx] for e in e_list]
        yp2_list = [e['yp2'][idx] for e in e_list]
        answer = ensemble4(context, wordss, yp_list, yp2_list)
        out[id_] = answer

    with open(args.out, 'w') as fh:
        json.dump(out, fh)
def ensemble1(context, wordss, y1_list, y2_list):
    """Ensemble by combining the models' probabilities, then picking one span.

    :param context: Original context
    :param wordss: tokenized words (nested 2D list)
    :param y1_list: list of start index probs (each element corresponds to probs from a single model)
    :param y2_list: list of stop index probs
    :return: the phrase for the best span under the combined probabilities
    """
    combined_start = combine_y_list(y1_list)
    combined_stop = combine_y_list(y2_list)
    best_span, _score = get_best_span(combined_start, combined_stop)
    return get_phrase(context, wordss, best_span)
def ensemble2(context, wordss, y1_list, y2_list):
    """Ensemble by voting on start and stop positions independently.

    Each model contributes its best span; the probability it assigned to that
    span's start (resp. stop) position is added to the position's vote total,
    and the highest-scoring start and stop are paired up.

    NOTE(review): if get_best_span returns an exclusive stop index, the stop
    lookup below reads the probability one past the chosen end -- confirm.
    """
    start_votes = defaultdict(float)
    stop_votes = defaultdict(float)
    for start_probs, stop_probs in zip(y1_list, y2_list):
        span, _score = get_best_span(start_probs, stop_probs)
        start_pos, stop_pos = span[0], span[1]
        start_votes[start_pos] += start_probs[start_pos[0]][start_pos[1]]
        stop_votes[stop_pos] += stop_probs[stop_pos[0]][stop_pos[1]]
    best_start = max(start_votes, key=start_votes.get)
    best_stop = max(stop_votes, key=stop_votes.get)
    return get_phrase(context, wordss, (best_start, best_stop))
def ensemble3(context, wordss, y1_list, y2_list):
    """Ensemble by voting on answer phrases.

    Each model's best span is turned into a phrase; span scores are summed per
    phrase and the phrase with the largest total is returned.
    """
    phrase_scores = defaultdict(float)
    for start_probs, stop_probs in zip(y1_list, y2_list):
        span, score = get_best_span(start_probs, stop_probs)
        phrase_scores[get_phrase(context, wordss, span)] += score
    return max(phrase_scores, key=phrase_scores.get)
def ensemble4(context, wordss, y1_list, y2_list):
    """Ensemble by summing scores over every (span, score) pair of every model.

    :return: the phrase of the span with the largest accumulated score.
    """
    span_scores = defaultdict(float)
    for start_probs, stop_probs in zip(y1_list, y2_list):
        for candidate, score in get_span_score_pairs(start_probs, stop_probs):
            span_scores[candidate] += score
    best_span = max(span_scores, key=span_scores.get)
    return get_phrase(context, wordss, best_span)
def combine_y_list(y_list, op='*'):
    """Combine several nested 2D lists element-wise.

    :param y_list: list of 2D lists with matching layout (one per model)
    :param op: ``'+'`` to sum, ``'*'`` to multiply, or a callable that reduces
        a tuple holding one value per model
    :return: a 2D list with the same layout as each element of ``y_list``
    """
    def _product(values):
        return functools.reduce(mul, values)

    reducer = sum if op == '+' else (_product if op == '*' else op)

    combined = []
    for rows_across_models in zip(*y_list):
        combined.append([reducer(cells) for cells in zip(*rows_across_models)])
    return combined
def main():
    """Script entry point: parse the command line and run the ensembling."""
    ensemble(get_args())


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,39 @@
import sys
import json
from collections import Counter, defaultdict
import re
def key_func(pair):
    """Return the second element of ``pair`` (usable as a sort/max key)."""
    second = pair[1]
    return second
def get_func(vals, probs):
    """Pick the answer with the largest accumulated score.

    Scores of identical answers are summed. The empty answer's total is reset
    to 0 so it cannot win on accumulated score alone.

    :param vals: iterable of candidate answers (hashable), one per model
    :param probs: iterable of scores aligned with ``vals``
    :return: the answer with the highest total score (``''`` if there are no
        candidates)
    """
    totals = defaultdict(float)
    for val, prob in zip(vals, probs):
        totals[val] += prob
    totals[''] = 0
    return max(totals.items(), key=lambda pair: pair[1])[0]
# Usage: <script> OUTPUT.json ANSWERS_1.json [ANSWERS_2.json ...]
# Every input maps question ids to answers and carries a 'scores' sub-dict with
# one score per id; the output holds the winning answer for each id.
third_path = sys.argv[1]
other_paths = sys.argv[2:]

others = []
for path in other_paths:
    # Close each input promptly instead of leaving the handle to the GC.
    with open(path, 'r') as fh:
        others.append(json.load(fh))

c = {}

# All answer files must contain the same number of entries.
assert min(map(len, others)) == max(map(len, others)), list(map(len, others))

for key in others[0].keys():
    if key == 'scores':
        continue
    probs = [other['scores'][key] for other in others]
    vals = [other[key] for other in others]
    largest_val = get_func(vals, probs)
    c[key] = largest_val

# The context manager guarantees the output is flushed and closed.
with open(third_path, 'w') as fh:
    json.dump(c, fh)

View file

@ -0,0 +1,453 @@
import numpy as np
import tensorflow as tf
from basic.read_data import DataSet
from my.nltk_utils import span_f1
from my.tensorflow import padded_reshape
from my.utils import argmax
from squad.utils import get_phrase, get_best_span, get_best_span_wy
class Evaluation(object):
    """Predictions for a set of examples, mergeable with ``+`` / ``sum()``.

    :param data_type: label of the data split the predictions belong to
    :param global_step: training step at which the predictions were made
    :param idxs: example indices, aligned with ``yp``
    :param yp: per-example predictions (a sequence supporting ``len`` and ``+``)
    :param tensor_dict: optional mapping of extra values; array values are
        converted with ``.tolist()``, values without ``.tolist()`` are stored
        unchanged
    """

    def __init__(self, data_type, global_step, idxs, yp, tensor_dict=None):
        self.data_type = data_type
        self.global_step = global_step
        self.idxs = idxs
        self.yp = yp
        self.num_examples = len(yp)
        self.tensor_dict = None
        # Flat view of everything that gets dumped to disk.
        self.dict = {'data_type': data_type,
                     'global_step': global_step,
                     'yp': yp,
                     'idxs': idxs,
                     'num_examples': self.num_examples}
        if tensor_dict is not None:
            # __add__ hands back values that are already plain lists, so only
            # call .tolist() on objects that provide it.
            self.tensor_dict = {key: val.tolist() if hasattr(val, 'tolist') else val
                                for key, val in tensor_dict.items()}
            for key, val in self.tensor_dict.items():
                self.dict[key] = val
        self.summaries = None

    def __repr__(self):
        return "{} step {}".format(self.data_type, self.global_step)

    def __add__(self, other):
        """Concatenate two evaluations of the same split and step.

        ``other == 0`` returns ``self`` unchanged so that ``sum()`` (which
        starts from 0) works on a sequence of evaluations.
        """
        if other == 0:
            return self
        assert self.data_type == other.data_type
        assert self.global_step == other.global_step
        new_yp = self.yp + other.yp
        new_idxs = self.idxs + other.idxs
        new_tensor_dict = None
        if self.tensor_dict is not None:
            # Stored values are lists, so "+" concatenates along the first axis.
            new_tensor_dict = {key: val + other.tensor_dict[key] for key, val in self.tensor_dict.items()}
        return Evaluation(self.data_type, self.global_step, new_idxs, new_yp, tensor_dict=new_tensor_dict)

    def __radd__(self, other):
        return self.__add__(other)
class LabeledEvaluation(Evaluation):
    """Evaluation that additionally carries the gold labels ``y``."""

    def __init__(self, data_type, global_step, idxs, yp, y, tensor_dict=None):
        super(LabeledEvaluation, self).__init__(data_type, global_step, idxs, yp, tensor_dict=tensor_dict)
        self.y = y
        self.dict['y'] = y

    def __add__(self, other):
        """Concatenate two labeled evaluations of the same split and step.

        ``other == 0`` returns ``self`` so that ``sum()`` works.
        """
        if other == 0:
            return self
        assert self.data_type == other.data_type
        assert self.global_step == other.global_step
        new_yp = self.yp + other.yp
        new_y = self.y + other.y
        new_idxs = self.idxs + other.idxs
        # Must be bound even when no tensor values were recorded; otherwise the
        # constructor call below fails with UnboundLocalError.
        new_tensor_dict = None
        if self.tensor_dict is not None:
            new_tensor_dict = {key: np.concatenate((val, other.tensor_dict[key]), axis=0) for key, val in self.tensor_dict.items()}
        return LabeledEvaluation(self.data_type, self.global_step, new_idxs, new_yp, new_y, tensor_dict=new_tensor_dict)
class AccuracyEvaluation(LabeledEvaluation):
    """Labeled evaluation with per-example correctness, accuracy and loss.

    :param correct: per-example truth values (must be non-empty; accuracy is
        ``sum(correct) / len(correct)``)
    :param loss: scalar loss for these examples
    """

    def __init__(self, data_type, global_step, idxs, yp, y, correct, loss, tensor_dict=None):
        super(AccuracyEvaluation, self).__init__(data_type, global_step, idxs, yp, y, tensor_dict=tensor_dict)
        self.loss = loss
        self.correct = correct
        self.acc = sum(correct) / len(correct)
        self.dict['loss'] = loss
        self.dict['correct'] = correct
        self.dict['acc'] = self.acc
        # Summaries are tagged "<data_type>/loss" and "<data_type>/acc".
        loss_summary = tf.Summary(value=[tf.Summary.Value(tag='{}/loss'.format(data_type), simple_value=self.loss)])
        acc_summary = tf.Summary(value=[tf.Summary.Value(tag='{}/acc'.format(data_type), simple_value=self.acc)])
        self.summaries = [loss_summary, acc_summary]

    def __repr__(self):
        return "{} step {}: accuracy={}, loss={}".format(self.data_type, self.global_step, self.acc, self.loss)

    def __add__(self, other):
        """Merge two evaluations; the loss becomes the example-weighted mean.

        ``other == 0`` returns ``self`` so that ``sum()`` works.
        """
        if other == 0:
            return self
        assert self.data_type == other.data_type
        assert self.global_step == other.global_step
        new_idxs = self.idxs + other.idxs
        new_yp = self.yp + other.yp
        new_y = self.y + other.y
        new_correct = self.correct + other.correct
        new_loss = (self.loss * self.num_examples + other.loss * other.num_examples) / len(new_correct)
        # Must be bound even when no tensor values were recorded; otherwise the
        # constructor call below fails with UnboundLocalError.
        new_tensor_dict = None
        if self.tensor_dict is not None:
            new_tensor_dict = {key: np.concatenate((val, other.tensor_dict[key]), axis=0) for key, val in self.tensor_dict.items()}
        return AccuracyEvaluation(self.data_type, self.global_step, new_idxs, new_yp, new_y, new_correct, new_loss, tensor_dict=new_tensor_dict)
class Evaluator(object):
    """Runs a model on batches and wraps its predictions in ``Evaluation``s.

    :param config: run configuration (stored, not read by this class)
    :param model: object exposing ``global_step``, ``yp`` and ``get_feed_dict``
    :param tensor_dict: optional mapping of names to extra fetches; defaults
        to an empty dict
    """

    def __init__(self, config, model, tensor_dict=None):
        self.config = config
        self.model = model
        self.global_step = model.global_step
        self.yp = model.yp
        self.tensor_dict = tensor_dict if tensor_dict is not None else {}

    def get_evaluation(self, sess, batch):
        """Evaluate one ``(idxs, data_set)`` batch without supervision."""
        idxs, data_set = batch
        feed_dict = self.model.get_feed_dict(data_set, False, supervised=False)
        fetches = [self.global_step, self.yp, list(self.tensor_dict.values())]
        global_step, yp, vals = sess.run(fetches, feed_dict=feed_dict)
        # Keep only the rows that correspond to real examples of this batch.
        yp = yp[:data_set.num_examples]
        named_vals = dict(zip(self.tensor_dict.keys(), vals))
        return Evaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), tensor_dict=named_vals)

    def get_evaluation_from_batches(self, sess, batches):
        """Evaluate every batch and merge the results with ``sum()``."""
        return sum(self.get_evaluation(sess, batch) for batch in batches)
class LabeledEvaluator(Evaluator):
    """Evaluator that also records the labels fed to the model."""

    def __init__(self, config, model, tensor_dict=None):
        super(LabeledEvaluator, self).__init__(config, model, tensor_dict=tensor_dict)
        self.y = model.y

    def get_evaluation(self, sess, batch):
        """Evaluate one ``(idxs, data_set)`` batch into a ``LabeledEvaluation``."""
        idxs, data_set = batch
        feed_dict = self.model.get_feed_dict(data_set, False, supervised=False)
        fetches = [self.global_step, self.yp, list(self.tensor_dict.values())]
        global_step, yp, vals = sess.run(fetches, feed_dict=feed_dict)
        yp = yp[:data_set.num_examples]
        # Labels are taken from the feed dict entry keyed by the model's y.
        labels = feed_dict[self.y]
        named_vals = dict(zip(self.tensor_dict.keys(), vals))
        return LabeledEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), labels.tolist(),
                                 tensor_dict=named_vals)
class AccuracyEvaluator(LabeledEvaluator):
    """Evaluator that reports start-position accuracy and loss."""

    def __init__(self, config, model, tensor_dict=None):
        super(AccuracyEvaluator, self).__init__(config, model, tensor_dict=tensor_dict)
        self.loss = model.loss

    def get_evaluation(self, sess, batch):
        """Evaluate one ``(idxs, data_set)`` batch into an ``AccuracyEvaluation``."""
        idxs, data_set = batch
        assert isinstance(data_set, DataSet)
        feed_dict = self.model.get_feed_dict(data_set, False)
        fetches = [self.global_step, self.yp, self.loss, list(self.tensor_dict.values())]
        global_step, yp, loss, vals = sess.run(fetches, feed_dict=feed_dict)
        gold = data_set.data['y']
        yp = yp[:data_set.num_examples]
        # Dispatch through the class so subclasses can override compare().
        judge = self.__class__.compare
        correct = [judge(yi, ypi) for yi, ypi in zip(gold, yp)]
        named_vals = dict(zip(self.tensor_dict.keys(), vals))
        return AccuracyEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), gold, correct,
                                  float(loss), tensor_dict=named_vals)

    @staticmethod
    def compare(yi, ypi):
        """Return True if any gold start equals the argmax of ``ypi``."""
        return any(start == int(np.argmax(ypi)) for start, _stop in yi)
class AccuracyEvaluator2(AccuracyEvaluator):
    """Accuracy evaluator for two-level (outer index, inner index) start positions."""

    @staticmethod
    def compare(yi, ypi):
        """Return True if any gold start equals the predicted 2D start.

        The prediction is the row of ``ypi`` with the largest maximum, paired
        with the argmax inside that row.
        """
        def _predicted_start():
            row = int(np.argmax(np.max(ypi, 1)))
            col = int(np.argmax(ypi[row]))
            return row, col

        # The prediction is recomputed per gold span, as nothing is evaluated
        # when yi is empty.
        return any(_predicted_start() == tuple(start) for start, _stop in yi)
class ForwardEvaluation(Evaluation):
    """Unlabeled evaluation carrying stop predictions, loss and decoded answers.

    :param yp2: per-example stop predictions, aligned with ``yp``
    :param loss: scalar loss for these examples
    :param id2answer_dict: maps example ids to answers; also holds a
        ``'scores'`` sub-dict and, optionally, an ``'na'`` sub-dict
    """

    def __init__(self, data_type, global_step, idxs, yp, yp2, loss, id2answer_dict, tensor_dict=None):
        super(ForwardEvaluation, self).__init__(data_type, global_step, idxs, yp, tensor_dict=tensor_dict)
        self.yp2 = yp2
        self.loss = loss
        self.dict['loss'] = loss
        self.dict['yp2'] = yp2
        self.id2answer_dict = id2answer_dict

    def __add__(self, other):
        """Merge two evaluations; the loss becomes the example-weighted mean.

        ``other == 0`` returns ``self`` so that ``sum()`` works.
        """
        if other == 0:
            return self
        assert self.data_type == other.data_type
        assert self.global_step == other.global_step
        new_idxs = self.idxs + other.idxs
        new_yp = self.yp + other.yp
        new_yp2 = self.yp2 + other.yp2
        new_loss = (self.loss * self.num_examples + other.loss * other.num_examples) / len(new_yp)
        new_id2answer_dict = dict(list(self.id2answer_dict.items()) + list(other.id2answer_dict.items()))
        # The top-level merge above keeps only other's sub-dicts, so the nested
        # per-id dicts have to be merged explicitly.
        new_id2score_dict = dict(list(self.id2answer_dict['scores'].items()) + list(other.id2answer_dict['scores'].items()))
        new_id2answer_dict['scores'] = new_id2score_dict
        if 'na' in self.id2answer_dict:
            # Same treatment as F1Evaluation: without it the no-answer scores
            # of every batch but the last are lost.
            new_id2na_dict = dict(list(self.id2answer_dict['na'].items()) + list(other.id2answer_dict['na'].items()))
            new_id2answer_dict['na'] = new_id2na_dict
        # Must be bound even when no tensor values were recorded; otherwise the
        # constructor call below fails with UnboundLocalError.
        new_tensor_dict = None
        if self.tensor_dict is not None:
            new_tensor_dict = {key: np.concatenate((val, other.tensor_dict[key]), axis=0) for key, val in self.tensor_dict.items()}
        return ForwardEvaluation(self.data_type, self.global_step, new_idxs, new_yp, new_yp2, new_loss, new_id2answer_dict, tensor_dict=new_tensor_dict)

    def __repr__(self):
        return "{} step {}: loss={:.4f}".format(self.data_type, self.global_step, self.loss)
class F1Evaluation(AccuracyEvaluation):
    """Accuracy evaluation extended with stop predictions, F1 and answers.

    :param yp2: per-example stop predictions, aligned with ``yp``
    :param f1s: per-example F1 scores; ``f1`` is their mean
    :param id2answer_dict: maps example ids to answers; also holds a
        ``'scores'`` sub-dict and, optionally, an ``'na'`` sub-dict
    """

    def __init__(self, data_type, global_step, idxs, yp, yp2, y, correct, loss, f1s, id2answer_dict, tensor_dict=None):
        super(F1Evaluation, self).__init__(data_type, global_step, idxs, yp, y, correct, loss, tensor_dict=tensor_dict)
        self.yp2 = yp2
        self.f1s = f1s
        self.f1 = float(np.mean(f1s))
        self.id2answer_dict = id2answer_dict
        self.dict.update({'yp2': yp2, 'f1s': f1s, 'f1': self.f1})
        # Adds a "<data_type>/f1" summary next to the parent's loss/acc ones.
        self.summaries.append(
            tf.Summary(value=[tf.Summary.Value(tag='{}/f1'.format(data_type), simple_value=self.f1)]))

    def __add__(self, other):
        """Merge two evaluations; the loss becomes the example-weighted mean.

        ``other == 0`` returns ``self`` so that ``sum()`` works.

        NOTE(review): the merged result is built without ``tensor_dict``, so
        any recorded tensor values are dropped on merge -- confirm intended.
        """
        if other == 0:
            return self
        assert self.data_type == other.data_type
        assert self.global_step == other.global_step

        merged_correct = self.correct + other.correct
        merged_loss = (self.loss * self.num_examples + other.loss * other.num_examples) / len(merged_correct)

        # Entries from ``other`` win on key clashes; nested dicts are merged
        # separately because the top-level update replaces them wholesale.
        merged_answers = dict(self.id2answer_dict)
        merged_answers.update(other.id2answer_dict)
        merged_scores = dict(self.id2answer_dict['scores'])
        merged_scores.update(other.id2answer_dict['scores'])
        merged_answers['scores'] = merged_scores
        if 'na' in self.id2answer_dict:
            merged_na = dict(self.id2answer_dict['na'])
            merged_na.update(other.id2answer_dict['na'])
            merged_answers['na'] = merged_na

        merged = F1Evaluation(self.data_type, self.global_step,
                              self.idxs + other.idxs,
                              self.yp + other.yp,
                              self.yp2 + other.yp2,
                              self.y + other.y,
                              merged_correct, merged_loss,
                              self.f1s + other.f1s,
                              merged_answers)
        if 'wyp' in self.dict:
            merged.dict['wyp'] = self.dict['wyp'] + other.dict['wyp']
        return merged

    def __repr__(self):
        return "{} step {}: accuracy={:.4f}, f1={:.4f}, loss={:.4f}".format(self.data_type, self.global_step, self.acc, self.f1, self.loss)
class F1Evaluator(LabeledEvaluator):
    """Evaluator that scores predicted answer spans with exact match and F1.

    Subclasses can override ``_split_batch`` and ``_get_feed_dict`` to change
    how a batch is unpacked and fed (see ``MultiGPUF1Evaluator``).
    """

    def __init__(self, config, model, tensor_dict=None):
        super(F1Evaluator, self).__init__(config, model, tensor_dict=tensor_dict)
        self.yp2 = model.yp2
        self.wyp = model.wyp
        self.loss = model.loss
        if config.na:
            # Only available / fetched when the no-answer strategy is enabled.
            self.na = model.na_prob

    def get_evaluation(self, sess, batch):
        """Run the model on ``batch`` and return an ``F1Evaluation``.

        :param sess: session used to run the fetches
        :param batch: whatever ``_split_batch`` / ``_get_feed_dict`` accept;
            here a single ``(idxs, data_set)`` pair
        """
        idxs, data_set = self._split_batch(batch)
        assert isinstance(data_set, DataSet)
        feed_dict = self._get_feed_dict(batch)
        if self.config.na:
            global_step, yp, yp2, wyp, loss, na, vals = sess.run([self.global_step, self.yp, self.yp2, self.wyp, self.loss, self.na, list(self.tensor_dict.values())], feed_dict=feed_dict)
        else:
            global_step, yp, yp2, wyp, loss, vals = sess.run([self.global_step, self.yp, self.yp2, self.wyp, self.loss, list(self.tensor_dict.values())], feed_dict=feed_dict)
        y = data_set.data['y']
        if self.config.squash:
            # Re-express every gold position as (0, offset), where the offset
            # adds the lengths of all preceding inner lists of xi.
            new_y = []
            for xi, yi in zip(data_set.data['x'], y):
                new_yi = []
                for start, stop in yi:
                    start_offset = sum(map(len, xi[:start[0]]))
                    stop_offset = sum(map(len, xi[:stop[0]]))
                    new_start = 0, start_offset + start[1]
                    new_stop = 0, stop_offset + stop[1]
                    new_yi.append((new_start, new_stop))
                new_y.append(new_yi)
            y = new_y
        if self.config.single:
            # Force the first coordinate of every gold position to 0.
            new_y = []
            for yi in y:
                new_yi = []
                for start, stop in yi:
                    new_start = 0, start[1]
                    new_stop = 0, stop[1]
                    new_yi.append((new_start, new_stop))
                new_y.append(new_yi)
            y = new_y

        # Keep only the rows that correspond to real examples of this batch.
        yp, yp2, wyp = yp[:data_set.num_examples], yp2[:data_set.num_examples], wyp[:data_set.num_examples]
        if self.config.wy:
            spans, scores = zip(*[get_best_span_wy(wypi, self.config.th) for wypi in wyp])
        else:
            spans, scores = zip(*[get_best_span(ypi, yp2i) for ypi, yp2i in zip(yp, yp2)])

        def _get2(context, xi, span):
            # Spans pointing outside xi decode to the empty answer.
            if len(xi) <= span[0][0]:
                return ""
            if len(xi[span[0][0]]) <= span[1][1]:
                return ""
            return get_phrase(context, xi, span)

        id2answer_dict = {id_: _get2(context, xi, span)
                          for id_, xi, span, context in zip(data_set.data['ids'], data_set.data['x'], spans, data_set.data['p'])}
        id2score_dict = {id_: score for id_, score in zip(data_set.data['ids'], scores)}
        id2answer_dict['scores'] = id2score_dict
        if self.config.na:
            id2na_dict = {id_: float(each) for id_, each in zip(data_set.data['ids'], na)}
            id2answer_dict['na'] = id2na_dict
        # Dispatch through the class so subclasses can override the metrics.
        correct = [self.__class__.compare2(yi, span) for yi, span in zip(y, spans)]
        f1s = [self.__class__.span_f1(yi, span) for yi, span in zip(y, spans)]
        tensor_dict = dict(zip(self.tensor_dict.keys(), vals))
        e = F1Evaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), yp2.tolist(), y,
                         correct, float(loss), f1s, id2answer_dict, tensor_dict=tensor_dict)
        if self.config.wy:
            e.dict['wyp'] = wyp.tolist()
        return e

    def _split_batch(self, batch):
        """Return the ``(idxs, data_set)`` pair for ``batch`` (identity here)."""
        return batch

    def _get_feed_dict(self, batch):
        """Build the feed dict from the batch's data set (element 1)."""
        return self.model.get_feed_dict(batch[1], False)

    @staticmethod
    def compare(yi, ypi, yp2i):
        """Return True if the argmax start/stop of the raw outputs match a gold span.

        The stop is searched only at or after the predicted start, within the
        predicted start's row.
        """
        for start, stop in yi:
            aypi = argmax(ypi)
            mask = np.zeros(yp2i.shape)
            mask[aypi[0], aypi[1]:] = np.ones([yp2i.shape[1] - aypi[1]])
            if tuple(start) == aypi and (stop[0], stop[1]-1) == argmax(yp2i * mask):
                return True
        return False

    @staticmethod
    def compare2(yi, span):
        """Return True if ``span`` exactly equals one of the gold spans."""
        for start, stop in yi:
            if tuple(start) == span[0] and tuple(stop) == span[1]:
                return True
        return False

    @staticmethod
    def span_f1(yi, span):
        """Return the best F1 between ``span`` and any gold span.

        Only gold spans whose first start coordinate matches the prediction's
        are considered; the result is 0 when none does.
        """
        max_f1 = 0
        for start, stop in yi:
            if start[0] == span[0][0]:
                true_span = start[1], stop[1]
                pred_span = span[0][1], span[1][1]
                # Resolves to the module-level span_f1 helper, not this method.
                f1 = span_f1(true_span, pred_span)
                max_f1 = max(f1, max_f1)
        return max_f1
class MultiGPUF1Evaluator(F1Evaluator):
    """F1 evaluator over several model replicas, one batch per replica.

    Per-replica outputs are reshaped with ``padded_reshape`` to a common
    shape and concatenated along axis 0; the loss is averaged over replicas.

    NOTE(review): the concatenated tensor is stored as ``self.wy`` (built from
    ``model.wy``), while ``F1Evaluator.get_evaluation`` fetches ``self.wyp``,
    which still refers to the first replica only -- confirm intended.
    """

    def __init__(self, config, models, tensor_dict=None):
        super(MultiGPUF1Evaluator, self).__init__(config, models[0], tensor_dict=tensor_dict)
        self.models = models
        with tf.name_scope("eval_concat"):
            N, M, JX = config.batch_size, config.max_num_sents, config.max_sent_size

            def _concat_padded(tensors):
                return tf.concat(axis=0, values=[padded_reshape(tensor, [N, M, JX]) for tensor in tensors])

            self.yp = _concat_padded(replica.yp for replica in models)
            self.yp2 = _concat_padded(replica.yp2 for replica in models)
            self.wy = _concat_padded(replica.wy for replica in models)
            self.loss = tf.add_n([replica.loss for replica in models])/len(models)

    def _split_batch(self, batches):
        """Merge per-replica ``(idxs, data_set)`` pairs into a single pair."""
        per_replica_idxs, per_replica_sets = zip(*batches)
        merged_idxs = sum(per_replica_idxs, ())
        merged_set = sum(per_replica_sets, per_replica_sets[0].get_empty())
        return merged_idxs, merged_set

    def _get_feed_dict(self, batches):
        """Union of each replica's feed dict for its own batch."""
        combined = {}
        for replica, (_, data_set) in zip(self.models, batches):
            combined.update(replica.get_feed_dict(data_set, False))
        return combined
class ForwardEvaluator(Evaluator):
    """Evaluator that decodes answers and reports loss, without accuracy/F1."""

    def __init__(self, config, model, tensor_dict=None):
        super(ForwardEvaluator, self).__init__(config, model, tensor_dict=tensor_dict)
        self.yp2 = model.yp2
        self.loss = model.loss
        if config.na:
            # Only available / fetched when the no-answer strategy is enabled.
            self.na = model.na_prob

    def get_evaluation(self, sess, batch):
        """Run the model on one ``(idxs, data_set)`` batch.

        :return: a ``ForwardEvaluation`` with the decoded answers
        """
        idxs, data_set = batch
        assert isinstance(data_set, DataSet)
        feed_dict = self.model.get_feed_dict(data_set, False)
        if self.config.na:
            global_step, yp, yp2, loss, na, vals = sess.run([self.global_step, self.yp, self.yp2, self.loss, self.na, list(self.tensor_dict.values())], feed_dict=feed_dict)
        else:
            global_step, yp, yp2, loss, vals = sess.run([self.global_step, self.yp, self.yp2, self.loss, list(self.tensor_dict.values())], feed_dict=feed_dict)

        # Keep only the rows that correspond to real examples of this batch.
        yp, yp2 = yp[:data_set.num_examples], yp2[:data_set.num_examples]
        spans, scores = zip(*[get_best_span(ypi, yp2i) for ypi, yp2i in zip(yp, yp2)])

        def _get2(context, xi, span):
            # Spans pointing outside xi decode to the empty answer.
            if len(xi) <= span[0][0]:
                return ""
            if len(xi[span[0][0]]) <= span[1][1]:
                return ""
            return get_phrase(context, xi, span)

        id2answer_dict = {id_: _get2(context, xi, span)
                          for id_, xi, span, context in zip(data_set.data['ids'], data_set.data['x'], spans, data_set.data['p'])}
        id2score_dict = {id_: score for id_, score in zip(data_set.data['ids'], scores)}
        id2answer_dict['scores'] = id2score_dict
        if self.config.na:
            id2na_dict = {id_: float(each) for id_, each in zip(data_set.data['ids'], na)}
            id2answer_dict['na'] = id2na_dict
        tensor_dict = dict(zip(self.tensor_dict.keys(), vals))
        e = ForwardEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), yp2.tolist(), float(loss), id2answer_dict, tensor_dict=tensor_dict)
        # TODO : wy support
        return e

    @staticmethod
    def compare(yi, ypi, yp2i):
        """Return True if the argmax start/stop of the raw outputs match a gold span.

        The stop is searched only at or after the predicted start, within the
        predicted start's row.
        """
        for start, stop in yi:
            aypi = argmax(ypi)
            mask = np.zeros(yp2i.shape)
            mask[aypi[0], aypi[1]:] = np.ones([yp2i.shape[1] - aypi[1]])
            if tuple(start) == aypi and (stop[0], stop[1]-1) == argmax(yp2i * mask):
                return True
        return False

    @staticmethod
    def compare2(yi, span):
        """Return True if ``span`` exactly equals one of the gold spans."""
        for start, stop in yi:
            if tuple(start) == span[0] and tuple(stop) == span[1]:
                return True
        return False

    @staticmethod
    def span_f1(yi, span):
        """Return the best F1 between ``span`` and any gold span.

        Only gold spans whose first start coordinate matches the prediction's
        are considered; the result is 0 when none does.
        """
        max_f1 = 0
        for start, stop in yi:
            if start[0] == span[0][0]:
                true_span = start[1], stop[1]
                pred_span = span[0][1], span[1][1]
                # Resolves to the module-level span_f1 helper, not this method.
                f1 = span_f1(true_span, pred_span)
                max_f1 = max(f1, max_f1)
        return max_f1

View file

@ -0,0 +1,35 @@
import json
import argparse
def get_args():
    """Parse the command line: an answer-file path and an optional threshold."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("path")
    arg_parser.add_argument("-t", "--th", type=float, default=0.5)
    # TODO : put more args here
    parsed = arg_parser.parse_args()
    return parsed
def get_pr(args):
    """Print precision and recall of the answerable / no-answer decision.

    The answer file's ``'na'`` entry maps question ids to scores. A question
    counts as predicted answerable when its score is below ``args.th``, and as
    truly answerable when its id does not start with ``"neg"``.

    Precision (resp. recall) is reported as 0 when nothing was predicted
    answerable (resp. nothing is truly answerable) instead of raising
    ZeroDivisionError.

    :param args: namespace with ``path`` (JSON answer file) and ``th``
    """
    with open(args.path, 'r') as fh:
        answers = json.load(fh)
    na = answers['na']

    true_pos = false_pos = false_neg = 0
    for id_, score in na.items():
        is_negative = id_.startswith("neg")
        if score < args.th:
            if is_negative:
                false_pos += 1
            else:
                true_pos += 1
        elif score >= args.th and not is_negative:
            # The explicit >= keeps NaN scores out of every bucket.
            false_neg += 1

    predicted_pos = true_pos + false_pos
    actual_pos = true_pos + false_neg
    p = true_pos / predicted_pos if predicted_pos else 0.0
    r = true_pos / actual_pos if actual_pos else 0.0
    print("p={:.3f}, r={:.3f}".format(p, r))
def main():
    """Script entry point: parse the command line and print precision/recall."""
    get_pr(get_args())


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,79 @@
import gzip
import json
from json import encoder
import os
import tensorflow as tf
from basic.evaluator import Evaluation, F1Evaluation
from my.utils import short_floats
import pickle
class GraphHandler(object):
    """Handles checkpoint saving/loading, summaries and result dumps for a run.

    :param config: run configuration; reads ``max_to_keep``, ``save_dir`` and
        ``model_name`` here, and further options in the methods below
    :param model: model whose ``var_ema`` is used when loading averaged weights
    """

    def __init__(self, config, model):
        self.config = config
        self.model = model
        self.saver = tf.train.Saver(max_to_keep=config.max_to_keep)
        self.writer = None
        self.save_path = os.path.join(config.save_dir, config.model_name)

    def initialize(self, sess):
        """Initialize variables, optionally restore them, and open the summary writer.

        The writer is only created in 'train' mode.
        """
        sess.run(tf.global_variables_initializer())
        if self.config.load:
            self._load(sess)

        if self.config.mode == 'train':
            self.writer = tf.summary.FileWriter(self.config.log_dir, graph=tf.get_default_graph())

    def save(self, sess, global_step=None):
        """Write a checkpoint to ``self.save_path``.

        NOTE(review): a fresh Saver is built on every call instead of reusing
        ``self.saver`` -- confirm this is intended.
        """
        tf.train.Saver(max_to_keep=self.config.max_to_keep).save(sess, self.save_path, global_step=global_step)

    def _load(self, sess):
        """Restore variables from ``load_path``, ``load_step`` or the latest checkpoint."""
        config = self.config

        def _base_name(variable):
            return variable.name.split(":")[0]

        name_to_var = {_base_name(var): var for var in tf.global_variables()}
        if config.load_ema:
            # Map each trainable variable to its moving-average name instead
            # of its own name.
            ema = self.model.var_ema
            for var in tf.trainable_variables():
                del name_to_var[_base_name(var)]
                name_to_var[ema.average_name(var)] = var
        saver = tf.train.Saver(name_to_var, max_to_keep=config.max_to_keep)

        # Precedence: explicit path, then explicit step, then latest checkpoint.
        if config.load_path:
            save_path = config.load_path
        elif config.load_step > 0:
            checkpoint_name = "{}-{}".format(config.model_name, config.load_step)
            save_path = os.path.join(config.save_dir, checkpoint_name)
        else:
            save_dir = config.save_dir
            checkpoint = tf.train.get_checkpoint_state(save_dir)
            assert checkpoint is not None, "cannot load checkpoint at {}".format(save_dir)
            save_path = checkpoint.model_checkpoint_path
        print("Loading saved model from {}".format(save_path))
        saver.restore(sess, save_path)

    def add_summary(self, summary, global_step):
        """Forward one summary to the writer (requires ``initialize`` in train mode)."""
        self.writer.add_summary(summary, global_step)

    def add_summaries(self, summaries, global_step):
        """Forward each summary in ``summaries`` to the writer."""
        for each in summaries:
            self.add_summary(each, global_step)

    def dump_eval(self, e, precision=2, path=None):
        """Dump ``e.dict`` as a gzipped pickle or as JSON with shortened floats.

        The default file name is ``<data_type>-<global_step padded to 6>``
        plus ``.pklz`` or ``.json``, inside ``config.eval_dir``.
        """
        assert isinstance(e, Evaluation)
        stem = "{}-{}".format(e.data_type, str(e.global_step).zfill(6))
        if self.config.dump_pickle:
            path = path or os.path.join(self.config.eval_dir, stem + ".pklz")
            with gzip.open(path, 'wb', compresslevel=3) as fh:
                pickle.dump(e.dict, fh)
            return
        path = path or os.path.join(self.config.eval_dir, stem + ".json")
        with open(path, 'w') as fh:
            json.dump(short_floats(e.dict, precision), fh)

    def dump_answer(self, e, path=None):
        """Dump ``e.id2answer_dict`` as JSON, by default into ``config.answer_dir``."""
        assert isinstance(e, Evaluation)
        stem = "{}-{}".format(e.data_type, str(e.global_step).zfill(6))
        path = path or os.path.join(self.config.answer_dir, stem + ".json")
        with open(path, 'w') as fh:
            json.dump(e.id2answer_dict, fh)

View file

@ -0,0 +1,233 @@
import argparse
import json
import math
import os
import shutil
from pprint import pprint
import tensorflow as tf
from tqdm import tqdm
import numpy as np
from basic.evaluator import ForwardEvaluator, MultiGPUF1Evaluator
from basic.graph_handler import GraphHandler
from basic.model import get_multi_gpu_models
from basic.trainer import MultiGPUTrainer
from basic.read_data import read_data, get_squad_data_filter, update_config
from my.tensorflow import get_num_params
def main(config):
    """Prepare the output directories and run the routine selected by ``config.mode``.

    :raises ValueError: if ``config.mode`` is not 'train', 'test' or 'forward'
    """
    set_dirs(config)
    with tf.device(config.device):
        runners = {'train': _train, 'test': _test, 'forward': _forward}
        if config.mode not in runners:
            raise ValueError("invalid value for 'mode': {}".format(config.mode))
        runners[config.mode](config)
def set_dirs(config):
    """Derive the run's sub-directories from ``config.out_dir`` and create them.

    Sets ``config.save_dir``, ``config.log_dir``, ``config.eval_dir`` and
    ``config.answer_dir``. When ``config.load`` is false, an existing
    ``config.out_dir`` is deleted first so the run starts from scratch.

    :raises AssertionError: if ``config.load`` is false outside training
    """
    # create directories
    assert config.load or config.mode == 'train', "config.load must be True if not training"
    if not config.load and os.path.exists(config.out_dir):
        shutil.rmtree(config.out_dir)

    config.save_dir = os.path.join(config.out_dir, "save")
    config.log_dir = os.path.join(config.out_dir, "log")
    config.eval_dir = os.path.join(config.out_dir, "eval")
    config.answer_dir = os.path.join(config.out_dir, "answer")
    # exist_ok avoids the check-then-create race when several runs start at once.
    for directory in (config.out_dir, config.save_dir, config.log_dir,
                      config.answer_dir, config.eval_dir):
        os.makedirs(directory, exist_ok=True)
def _config_debug(config):
if config.debug:
config.num_steps = 2
config.eval_period = 1
config.log_period = 1
config.save_period = 1
config.val_num_batches = 2
config.test_num_batches = 2
def _train(config):
    """Train the model, periodically logging, checkpointing and evaluating.

    Reads the 'train' and 'dev' splits, builds the initial word-embedding
    matrix (stored as ``config.emb_mat``), creates one model per GPU and runs
    the training loop. A final checkpoint is written unless the last step
    already produced one.

    :param config: run configuration; it is modified in place.
    """
    data_filter = get_squad_data_filter(config)
    train_data = read_data(config, 'train', config.load, data_filter=data_filter)
    dev_data = read_data(config, 'dev', True, data_filter=data_filter)
    update_config(config, [train_data, dev_data])

    _config_debug(config)

    word2vec_dict = train_data.shared['lower_word2vec'] if config.lower_word else train_data.shared['word2vec']
    word2idx_dict = train_data.shared['word2idx']
    idx2vec_dict = {word2idx_dict[word]: vec for word, vec in word2vec_dict.items() if word in word2idx_dict}
    # Vocabulary rows without a pretrained vector are drawn from a zero-mean,
    # identity-covariance normal distribution.
    emb_mat = np.array([idx2vec_dict[idx] if idx in idx2vec_dict
                        else np.random.multivariate_normal(np.zeros(config.word_emb_size), np.eye(config.word_emb_size))
                        for idx in range(config.word_vocab_size)])
    config.emb_mat = emb_mat

    # construct model graph and variables (using default graph)
    # `__flags` only exists on flag-style config objects; the plain attribute
    # bag built from JSON has no such attribute, so fall back to its __dict__.
    flags = getattr(config, '__flags', None)
    pprint(vars(config) if flags is None else flags, indent=2)
    models = get_multi_gpu_models(config)
    model = models[0]
    print("num params: {}".format(get_num_params()))
    trainer = MultiGPUTrainer(config, models)
    evaluator = MultiGPUF1Evaluator(config, models, tensor_dict=model.tensor_dict if config.vis else None)
    graph_handler = GraphHandler(config, model)  # controls all tensors and variables in the graph, including loading /saving

    # Variables
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    graph_handler.initialize(sess)

    # Begin training
    num_steps = config.num_steps or int(math.ceil(train_data.num_examples / (config.batch_size * config.num_gpus))) * config.num_epochs
    global_step = 0
    for batches in tqdm(train_data.get_multi_batches(config.batch_size, config.num_gpus,
                                                     num_steps=num_steps, shuffle=True, cluster=config.cluster), total=num_steps):
        global_step = sess.run(model.global_step) + 1  # +1 because all calculations are done after step
        get_summary = global_step % config.log_period == 0
        loss, summary, train_op = trainer.step(sess, batches, get_summary=get_summary)
        if get_summary:
            graph_handler.add_summary(summary, global_step)

        # occasional saving
        if global_step % config.save_period == 0:
            graph_handler.save(sess, global_step=global_step)

        if not config.eval:
            continue
        # Occasional evaluation
        if global_step % config.eval_period == 0:
            # Kept in its own name so the training step count above is not clobbered.
            # The same number of batches (derived from the dev set) is used for both evaluations.
            num_eval_steps = math.ceil(dev_data.num_examples / (config.batch_size * config.num_gpus))
            if 0 < config.val_num_batches < num_eval_steps:
                num_eval_steps = config.val_num_batches
            e_train = evaluator.get_evaluation_from_batches(
                sess, tqdm(train_data.get_multi_batches(config.batch_size, config.num_gpus, num_steps=num_eval_steps), total=num_eval_steps)
            )
            graph_handler.add_summaries(e_train.summaries, global_step)
            e_dev = evaluator.get_evaluation_from_batches(
                sess, tqdm(dev_data.get_multi_batches(config.batch_size, config.num_gpus, num_steps=num_eval_steps), total=num_eval_steps))
            graph_handler.add_summaries(e_dev.summaries, global_step)

            if config.dump_eval:
                graph_handler.dump_eval(e_dev)
            if config.dump_answer:
                graph_handler.dump_answer(e_dev)
    # Make sure the final state is saved even if the last step was not a save step.
    if global_step % config.save_period != 0:
        graph_handler.save(sess, global_step=global_step)
def _test(config):
    """Evaluate a loaded model on the 'test' split and optionally dump the results.

    Evaluations of the individual multi-batches are accumulated with ``+``.
    When ``config.vis`` is set, each per-batch evaluation is additionally
    dumped into its own file under ``config.eval_dir``.

    :param config: run configuration; it is modified in place.
    """
    test_data = read_data(config, 'test', True)
    update_config(config, [test_data])

    _config_debug(config)

    if config.use_glove_for_unk:
        word2vec_dict = test_data.shared['lower_word2vec'] if config.lower_word else test_data.shared['word2vec']
        new_word2idx_dict = test_data.shared['new_word2idx']
        idx2vec_dict = {idx: word2vec_dict[word] for word, idx in new_word2idx_dict.items()}
        new_emb_mat = np.array([idx2vec_dict[idx] for idx in range(len(idx2vec_dict))], dtype='float32')
        config.new_emb_mat = new_emb_mat

    # `__flags` only exists on flag-style config objects; the plain attribute
    # bag built from JSON has no such attribute, so fall back to its __dict__.
    flags = getattr(config, '__flags', None)
    pprint(vars(config) if flags is None else flags, indent=2)
    models = get_multi_gpu_models(config)
    model = models[0]
    evaluator = MultiGPUF1Evaluator(config, models, tensor_dict=models[0].tensor_dict if config.vis else None)
    graph_handler = GraphHandler(config, model)

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    graph_handler.initialize(sess)
    num_steps = math.ceil(test_data.num_examples / (config.batch_size * config.num_gpus))
    if 0 < config.test_num_batches < num_steps:
        num_steps = config.test_num_batches

    e = None
    for multi_batch in tqdm(test_data.get_multi_batches(config.batch_size, config.num_gpus, num_steps=num_steps, cluster=config.cluster), total=num_steps):
        ei = evaluator.get_evaluation(sess, multi_batch)
        e = ei if e is None else e + ei
        if config.vis:
            eval_subdir = os.path.join(config.eval_dir, "{}-{}".format(ei.data_type, str(ei.global_step).zfill(6)))
            # exist_ok avoids the check-then-create race of exists() + mkdir()
            os.makedirs(eval_subdir, exist_ok=True)
            path = os.path.join(eval_subdir, str(ei.idxs[0]).zfill(8))
            graph_handler.dump_eval(ei, path=path)

    print(e)
    if config.dump_answer:
        print("dumping answer ...")
        graph_handler.dump_answer(e)
    if config.dump_eval:
        print("dumping eval ...")
        graph_handler.dump_eval(e)
def _forward(config):
    """Run a loaded model over the split named by ``config.forward_name``.

    Uses ``ForwardEvaluator`` on the first model only and optionally dumps the
    answers / evaluation to ``config.answer_path`` / ``config.eval_path``.

    :param config: run configuration; it is modified in place.
    :raises AssertionError: if ``config.load`` is falsy.
    """
    assert config.load
    test_data = read_data(config, config.forward_name, True)
    update_config(config, [test_data])

    _config_debug(config)

    if config.use_glove_for_unk:
        word2vec_dict = test_data.shared['lower_word2vec'] if config.lower_word else test_data.shared['word2vec']
        new_word2idx_dict = test_data.shared['new_word2idx']
        idx2vec_dict = {idx: word2vec_dict[word] for word, idx in new_word2idx_dict.items()}
        new_emb_mat = np.array([idx2vec_dict[idx] for idx in range(len(idx2vec_dict))], dtype='float32')
        config.new_emb_mat = new_emb_mat

    # `__flags` only exists on flag-style config objects; the plain attribute
    # bag built from JSON has no such attribute, so fall back to its __dict__.
    flags = getattr(config, '__flags', None)
    pprint(vars(config) if flags is None else flags, indent=2)
    models = get_multi_gpu_models(config)
    model = models[0]
    print("num params: {}".format(get_num_params()))
    evaluator = ForwardEvaluator(config, model)
    graph_handler = GraphHandler(config, model)  # controls all tensors and variables in the graph, including loading /saving

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    graph_handler.initialize(sess)

    num_batches = math.ceil(test_data.num_examples / config.batch_size)
    if 0 < config.test_num_batches < num_batches:
        num_batches = config.test_num_batches
    e = evaluator.get_evaluation_from_batches(sess, tqdm(test_data.get_batches(config.batch_size, num_batches=num_batches), total=num_batches))
    print(e)
    if config.dump_answer:
        print("dumping answer ...")
        graph_handler.dump_answer(e, path=config.answer_path)
    if config.dump_eval:
        print("dumping eval ...")
        graph_handler.dump_eval(e, path=config.eval_path)
def _get_args():
parser = argparse.ArgumentParser()
parser.add_argument("config_path")
return parser.parse_args()
class Config(object):
    """Plain attribute bag: every keyword argument becomes an instance attribute."""

    def __init__(self, **entries):
        # vars(self) is the instance __dict__, so this sets all attributes at once
        vars(self).update(entries)
def _run():
    """Script entry point: load the JSON config named on the command line and run ``main``."""
    config_path = _get_args().config_path
    with open(config_path, 'r') as config_file:
        settings = json.load(config_file)
    main(Config(**settings))
# Run the command-line entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    _run()

View file

@ -0,0 +1,535 @@
import random
import itertools
import numpy as np
import tensorflow as tf
from tensorflow.contrib.rnn import BasicLSTMCell
from basic.read_data import DataSet
from my.tensorflow import get_initializer
from my.tensorflow.nn import softsel, get_logits, highway_network, multi_conv1d
from my.tensorflow.rnn import bidirectional_dynamic_rnn
from my.tensorflow.rnn_cell import SwitchableDropoutWrapper, AttentionCell
def get_multi_gpu_models(config):
    """Build one ``Model`` per device and attach moving averages and summaries.

    The first pass creates ``config.num_gpus`` models, each under its own
    ``model_<i>`` name scope and device; from the second model on, variable
    reuse is switched on so the models share variables. The second pass runs
    under a separate ``loss_summary`` variable scope and, per model, builds the
    variable EMA (first model only), the loss EMA (train mode only) and the
    merged summary op of that model's scope.

    :param config: run configuration (``num_gpus``, ``device_type``, ``mode`` ...).
    :return: list of the constructed models, one per device.
    """
    # NOTE(review): the source this was recovered from had lost its indentation;
    # the nesting below (second pass outside the first `with`) is reconstructed
    # from the "different scope" comment - confirm against the upstream file.
    models = []
    with tf.variable_scope(tf.get_variable_scope()):
        for gpu_idx in range(config.num_gpus):
            with tf.name_scope("model_{}".format(gpu_idx)) as scope, tf.device("/{}:{}".format(config.device_type, gpu_idx)):
                if gpu_idx > 0:
                    tf.get_variable_scope().reuse_variables()
                model = Model(config, scope, rep=gpu_idx == 0)
                models.append(model)

    # update the summary in a different scope to avoid reuse issue
    with tf.variable_scope('loss_summary', reuse=False):
        for gpu_idx in range(config.num_gpus):
            with tf.name_scope("model_{}".format(gpu_idx)), tf.device("/{}:{}".format(config.device_type, gpu_idx)):
                model = models[gpu_idx]
                rep = gpu_idx == 0
                if rep:
                    model._build_var_ema()
                if config.mode == 'train':
                    model._build_ema()
                # Only the summaries collected under this model's own scope are merged.
                # (A preceding merge_all() whose result was immediately overwritten
                # has been dropped.)
                model.summary = tf.summary.merge(tf.get_collection("summaries", scope=model.scope))
    return models
class Model(object):
def __init__(self, config, scope, rep=True):
    """Create the placeholders and build the forward graph and the loss.

    :param config: run configuration; sizes such as ``batch_size``,
        ``max_word_size`` and ``word_emb_size`` are read from it.
    :param scope: name scope of this model; later used to select this
        model's entries from graph collections.
    :param rep: not used inside the constructor. The EMA / summary
        construction it used to gate is driven by ``get_multi_gpu_models``,
        which calls ``_build_var_ema`` / ``_build_ema`` and sets ``summary``.
    """
    self.scope = scope
    self.config = config
    self.global_step = tf.get_variable('global_step', shape=[], dtype='int32',
                                       initializer=tf.constant_initializer(0), trainable=False)

    # Define forward inputs here
    # Only the batch size and the word length are static; the remaining
    # placeholder dimensions are left as None.
    N, W = config.batch_size, config.max_word_size
    self.x = tf.placeholder('int32', [N, None, None], name='x')
    self.cx = tf.placeholder('int32', [N, None, None, W], name='cx')
    self.x_mask = tf.placeholder('bool', [N, None, None], name='x_mask')
    self.q = tf.placeholder('int32', [N, None], name='q')
    self.cq = tf.placeholder('int32', [N, None, W], name='cq')
    self.q_mask = tf.placeholder('bool', [N, None], name='q_mask')
    self.y = tf.placeholder('bool', [N, None, None], name='y')
    self.y2 = tf.placeholder('bool', [N, None, None], name='y2')
    self.wy = tf.placeholder('bool', [N, None, None], name='wy')
    self.is_train = tf.placeholder('bool', [], name='is_train')
    self.new_emb_mat = tf.placeholder('float', [None, config.word_emb_size], name='new_emb_mat')
    self.na = tf.placeholder('bool', [N], name='na')

    # Define misc
    self.tensor_dict = {}

    # Forward outputs / loss inputs
    self.logits = None
    self.yp = None
    self.var_list = None
    self.na_prob = None

    # Loss outputs
    self.loss = None

    self._build_forward()
    self._build_loss()
    self.var_ema = None
def _build_forward(self):
    """Build the forward graph and store its outputs on the instance.

    Stages, in order: character / word embedding, optional highway network,
    bidirectional LSTM encoding of question and context ("prepro"),
    attention plus further bidirectional LSTM layers and the two logit
    heads ("main").

    Sets ``self.logits``, ``self.logits2``, ``self.yp``, ``self.yp2``,
    ``self.wyp`` and several entries of ``self.tensor_dict``; when
    ``config.na`` is set it also sets ``self.concat_logits``,
    ``self.concat_logits2`` and ``self.na_prob``.
    """
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    # Override the configured maxima with the dynamic sizes of the fed tensors.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                # Fold the leading dimensions together before the convolution.
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                # Both settings are comma-separated integer lists in the config.
                filter_sizes = list(map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Same scope name "xx" with reuse on: question shares the context CNN.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    # Training initializes the matrix from config.emb_mat.
                    word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                if config.use_glove_for_unk:
                    # Extra rows fed at run time are appended below the trained ones.
                    word_emb_mat = tf.concat(axis=0, values=[word_emb_mat, self.new_emb_mat])

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # highway network
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
            # Reuse is switched on so the question goes through the same highway variables.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    # Four forward/backward LSTM cell pairs, each also available wrapped in
    # SwitchableDropoutWrapper (the d_* variants).
    cell_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell_fw = SwitchableDropoutWrapper(cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bw = SwitchableDropoutWrapper(cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell2_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell2_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell2_fw = SwitchableDropoutWrapper(cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell2_bw = SwitchableDropoutWrapper(cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell3_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell3_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell3_fw = SwitchableDropoutWrapper(cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell3_bw = SwitchableDropoutWrapper(cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell4_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell4_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell4_fw = SwitchableDropoutWrapper(cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell4_bw = SwitchableDropoutWrapper(cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    # Sequence lengths = number of True entries in the masks.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(axis=2, values=[fw_u, bw_u])
        # Note: the context encoder below uses the un-wrapped cells (cell_fw / cell_bw)
        # in both branches; only the scope name differs.
        if config.share_lstm_weights:
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    with tf.variable_scope("main"):
        if config.dynamic_att:
            # Attention is applied inside the recurrent cells instead of up front.
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d])
            q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ])
            first_cell_fw = AttentionCell(cell2_fw, u, mask=q_mask, mapper='sim',
                                          input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            first_cell_bw = AttentionCell(cell2_bw, u, mask=q_mask, mapper='sim',
                                          input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            second_cell_fw = AttentionCell(cell3_fw, u, mask=q_mask, mapper='sim',
                                           input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            second_cell_bw = AttentionCell(cell3_bw, u, mask=q_mask, mapper='sim',
                                           input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
        else:
            p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict)
            first_cell_fw = d_cell2_fw
            second_cell_fw = d_cell3_fw
            first_cell_bw = d_cell2_bw
            second_cell_bw = d_cell3_bw

        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(first_cell_fw, first_cell_bw, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(second_cell_fw, second_cell_bw, g0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])

        # First logit head (compared against self.y in _build_loss).
        logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob,
                            mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1')
        # Summary of g1 selected by the first logits, tiled back over every position.
        a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX]))
        a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1])

        (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(d_cell4_fw, d_cell4_bw, tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]),
                                                      x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
        g2 = tf.concat(axis=3, values=[fw_g2, bw_g2])
        # Second logit head (compared against self.y2 in _build_loss).
        logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob,
                             mask=self.x_mask,
                             is_train=self.is_train, func=config.answer_func, scope='logits2')

        flat_logits = tf.reshape(logits, [-1, M * JX])
        flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
        flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        flat_yp2 = tf.nn.softmax(flat_logits2)

        if config.na:
            # A learned scalar bias is prepended as an extra logit in column 0; after the
            # softmax, column 0 is read as na_prob and the rest replaces flat_yp / flat_yp2.
            na_bias = tf.get_variable("na_bias", shape=[], dtype='float')
            na_bias_tiled = tf.tile(tf.reshape(na_bias, [1, 1]), [N, 1])  # [N, 1]
            concat_flat_logits = tf.concat(axis=1, values=[na_bias_tiled, flat_logits])
            concat_flat_yp = tf.nn.softmax(concat_flat_logits)
            na_prob = tf.squeeze(tf.slice(concat_flat_yp, [0, 0], [-1, 1]), [1])
            flat_yp = tf.slice(concat_flat_yp, [0, 1], [-1, -1])

            concat_flat_logits2 = tf.concat(axis=1, values=[na_bias_tiled, flat_logits2])
            concat_flat_yp2 = tf.nn.softmax(concat_flat_logits2)
            na_prob2 = tf.squeeze(tf.slice(concat_flat_yp2, [0, 0], [-1, 1]), [1])  # [N]
            flat_yp2 = tf.slice(concat_flat_yp2, [0, 1], [-1, -1])

            self.concat_logits = concat_flat_logits
            self.concat_logits2 = concat_flat_logits2
            self.na_prob = na_prob * na_prob2

        yp = tf.reshape(flat_yp, [-1, M, JX])
        yp2 = tf.reshape(flat_yp2, [-1, M, JX])
        # Element-wise sigmoid of the second logits (see the config.wy loss).
        wyp = tf.nn.sigmoid(logits2)

        self.tensor_dict['g1'] = g1
        self.tensor_dict['g2'] = g2

        self.logits = flat_logits
        self.logits2 = flat_logits2
        self.yp = yp
        self.yp2 = yp2
        self.wyp = wyp
def _build_loss(self):
    """Build the loss tensor and register it for summaries and EMA.

    With ``config.wy`` the loss is a sigmoid cross-entropy of the second
    logits against ``self.wy`` in which positions inside ``x_mask`` but
    outside ``wy`` are down-weighted by ``num_pos / num_neg``. Otherwise it
    is the sum of two softmax cross-entropies (first logits vs ``self.y``,
    second logits vs ``self.y2``); with ``config.na`` the ``na`` flag is
    prepended to the labels and the concatenated logits are used.

    The individual terms are added to the 'losses' collection, and
    ``self.loss`` is the sum of that collection's entries within
    ``self.scope``.
    """
    config = self.config
    JX = tf.shape(self.x)[2]
    M = tf.shape(self.x)[1]

    # 1.0 for examples with at least one True entry in q_mask, 0.0 otherwise
    loss_mask = tf.reduce_max(tf.cast(self.q_mask, 'float'), 1)
    if config.wy:
        losses = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=tf.reshape(self.logits2, [-1, M, JX]), labels=tf.cast(self.wy, 'float'))  # [N, M, JX]
        num_pos = tf.reduce_sum(tf.cast(self.wy, 'float'))
        num_neg = tf.reduce_sum(tf.cast(self.x_mask, 'float')) - num_pos
        damp_ratio = num_pos / num_neg
        # weight 1 on positive positions, damp_ratio on the remaining in-mask positions
        dampened_losses = losses * (
            (tf.cast(self.x_mask, 'float') - tf.cast(self.wy, 'float')) * damp_ratio + tf.cast(self.wy, 'float'))
        new_losses = tf.reduce_sum(dampened_losses, [1, 2])
        ce_loss = tf.reduce_mean(loss_mask * new_losses)
        tf.add_to_collection('losses', ce_loss)
    else:
        if config.na:
            na = tf.reshape(self.na, [-1, 1])
            concat_y = tf.concat(axis=1, values=[na, tf.reshape(self.y, [-1, M * JX])])
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.concat_logits, labels=tf.cast(concat_y, 'float'))
            concat_y2 = tf.concat(axis=1, values=[na, tf.reshape(self.y2, [-1, M * JX])])
            losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=self.concat_logits2, labels=tf.cast(concat_y2, 'float'))
        else:
            losses = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits, labels=tf.cast(tf.reshape(self.y, [-1, M * JX]), 'float'))
            losses2 = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits2, labels=tf.cast(tf.reshape(self.y2, [-1, M * JX]), 'float'))
        ce_loss = tf.reduce_mean(loss_mask * losses)
        ce_loss2 = tf.reduce_mean(loss_mask * losses2)
        tf.add_to_collection('losses', ce_loss)
        tf.add_to_collection("losses", ce_loss2)

    # Only the loss terms registered under this model's scope are summed.
    self.loss = tf.add_n(tf.get_collection('losses', scope=self.scope), name='loss')
    tf.summary.scalar(self.loss.op.name, self.loss)
    tf.add_to_collection('ema/scalar', self.loss)
def _build_ema(self):
    """Track moving averages of this model's 'ema/scalar' and 'ema/vector' tensors.

    Creates ``self.ema`` with decay ``config.decay``, adds a scalar /
    histogram summary for each averaged tensor, and re-wraps ``self.loss``
    so that it carries a control dependency on the averaging op.
    """
    self.ema = tf.train.ExponentialMovingAverage(self.config.decay)
    averager = self.ema
    tracked = tf.get_collection("ema/scalar", scope=self.scope) + tf.get_collection("ema/vector", scope=self.scope)
    update_averages = averager.apply(tracked)

    # scalar tensors get scalar summaries, vector tensors get histograms
    summarizers = (("ema/scalar", tf.summary.scalar), ("ema/vector", tf.summary.histogram))
    for collection_name, summarize in summarizers:
        for tensor in tf.get_collection(collection_name, scope=self.scope):
            averaged = averager.average(tensor)
            summarize(averaged.op.name, averaged)

    with tf.control_dependencies([update_averages]):
        self.loss = tf.identity(self.loss)
def _build_var_ema(self):
    """Track moving averages of all trainable variables.

    Creates ``self.var_ema`` with decay ``config.var_decay`` and re-wraps
    ``self.loss`` so that it carries a control dependency on the averaging op.
    """
    self.var_ema = tf.train.ExponentialMovingAverage(self.config.var_decay)
    update_averages = self.var_ema.apply(tf.trainable_variables())
    with tf.control_dependencies([update_averages]):
        self.loss = tf.identity(self.loss)
def get_loss(self):
    """Return ``self.loss`` (assigned by ``_build_loss`` and re-wrapped by the EMA builders)."""
    return self.loss
def get_global_step(self):
    """Return the ``global_step`` variable created in ``__init__``."""
    return self.global_step
def get_var_list(self):
    """Return ``self.var_list``.

    Within this class the attribute is only ever initialized to ``None``;
    any other value has to be assigned from outside.
    """
    return self.var_list
def get_feed_dict(self, batch, is_train, supervised=True):
    """Turn a batch into the feed dict for this model's placeholders.

    Allocates zero-filled arrays for word ids, character ids and masks,
    registers them in the feed dict and then fills them in place from
    ``batch.data`` using the vocabularies in ``batch.shared``.

    :param batch: ``DataSet`` holding the examples of one batch.
    :param is_train: value fed to the ``is_train`` placeholder.
    :param supervised: when true, the label arrays ``y``, ``y2``, ``wy`` and
        ``na`` are built and fed as well.
    :return dict: mapping from placeholders to numpy arrays / values.

    Note: with ``config.single`` the entries of ``batch.data['x']`` and
    ``batch.data['cx']`` are replaced in place.
    """
    assert isinstance(batch, DataSet)
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, config.max_word_size
    feed_dict = {}

    if config.len_opt:
        # The bare string below is a no-op statement kept verbatim; it serves as a block comment.
        """
Note that this optimization results in variable GPU RAM usage (i.e. can cause OOM in the middle of training.)
First test without len_opt and make sure no OOM, and use len_opt
"""
        # Shrink JX / JQ to the longest sentence / question actually present (at least 1).
        if sum(len(sent) for para in batch.data['x'] for sent in para) == 0:
            new_JX = 1
        else:
            new_JX = max(len(sent) for para in batch.data['x'] for sent in para)
        JX = min(JX, new_JX)

        if sum(len(ques) for ques in batch.data['q']) == 0:
            new_JQ = 1
        else:
            new_JQ = max(len(ques) for ques in batch.data['q'])
        JQ = min(JQ, new_JQ)

    if config.cpu_opt:
        # Same idea for the number of sentences per paragraph.
        if sum(len(para) for para in batch.data['x']) == 0:
            new_M = 1
        else:
            new_M = max(len(para) for para in batch.data['x'])
        M = min(M, new_M)

    x = np.zeros([N, M, JX], dtype='int32')
    cx = np.zeros([N, M, JX, W], dtype='int32')
    x_mask = np.zeros([N, M, JX], dtype='bool')
    q = np.zeros([N, JQ], dtype='int32')
    cq = np.zeros([N, JQ, W], dtype='int32')
    q_mask = np.zeros([N, JQ], dtype='bool')

    # The arrays are registered now and filled in place further down.
    feed_dict[self.x] = x
    feed_dict[self.x_mask] = x_mask
    feed_dict[self.cx] = cx
    feed_dict[self.q] = q
    feed_dict[self.cq] = cq
    feed_dict[self.q_mask] = q_mask
    feed_dict[self.is_train] = is_train
    if config.use_glove_for_unk:
        feed_dict[self.new_emb_mat] = batch.shared['new_emb_mat']

    X = batch.data['x']
    CX = batch.data['cx']

    if supervised:
        y = np.zeros([N, M, JX], dtype='bool')
        y2 = np.zeros([N, M, JX], dtype='bool')
        wy = np.zeros([N, M, JX], dtype='bool')
        na = np.zeros([N], dtype='bool')
        feed_dict[self.y] = y
        feed_dict[self.y2] = y2
        feed_dict[self.wy] = wy
        feed_dict[self.na] = na

        for i, (xi, cxi, yi, nai) in enumerate(zip(X, CX, batch.data['y'], batch.data['na'])):
            if nai:
                # Flagged examples get no span labels at all.
                na[i] = nai
                continue
            # One (start, stop) pair is picked at random from the candidates in yi;
            # each index is a (sentence, word) pair.
            start_idx, stop_idx = random.choice(yi)
            j, k = start_idx
            j2, k2 = stop_idx
            if config.single:
                # Keep only the sentence containing the start index (mutates the batch data).
                X[i] = [xi[j]]
                CX[i] = [cxi[j]]
                j, j2 = 0, 0
            if config.squash:
                # Sentences are flattened into one, so word indices shift by the
                # total length of the preceding sentences.
                offset = sum(map(len, xi[:j]))
                j, k = 0, k + offset
                offset = sum(map(len, xi[:j2]))
                j2, k2 = 0, k2 + offset
            y[i, j, k] = True
            # the stop index is exclusive, so the last span word is k2 - 1
            y2[i, j2, k2-1] = True
            if j == j2:
                wy[i, j, k:k2] = True
            else:
                wy[i, j, k:len(batch.data['x'][i][j])] = True
                wy[i, j2, :k2] = True

    def _get_word(word):
        # Try several casings in the main vocabulary, then (optionally) in the
        # extra vocabulary, whose ids are offset by the main vocabulary size.
        # Falls back to index 1 when the word is not found.
        d = batch.shared['word2idx']
        for each in (word, word.lower(), word.capitalize(), word.upper()):
            if each in d:
                return d[each]
        if config.use_glove_for_unk:
            d2 = batch.shared['new_word2idx']
            for each in (word, word.lower(), word.capitalize(), word.upper()):
                if each in d2:
                    return d2[each] + len(d)
        return 1

    def _get_char(char):
        # Falls back to index 1 when the character is not in the vocabulary.
        d = batch.shared['char2idx']
        if char in d:
            return d[char]
        return 1

    # Context word ids and mask; anything beyond the configured maxima is dropped.
    for i, xi in enumerate(X):
        if self.config.squash:
            xi = [list(itertools.chain(*xi))]
        for j, xij in enumerate(xi):
            if j == config.max_num_sents:
                break
            for k, xijk in enumerate(xij):
                if k == config.max_sent_size:
                    break
                each = _get_word(xijk)
                assert isinstance(each, int), each
                x[i, j, k] = each
                x_mask[i, j, k] = True

    # Context character ids.
    for i, cxi in enumerate(CX):
        if self.config.squash:
            cxi = [list(itertools.chain(*cxi))]
        for j, cxij in enumerate(cxi):
            if j == config.max_num_sents:
                break
            for k, cxijk in enumerate(cxij):
                if k == config.max_sent_size:
                    break
                for l, cxijkl in enumerate(cxijk):
                    if l == config.max_word_size:
                        break
                    cx[i, j, k, l] = _get_char(cxijkl)

    # Question word ids and mask (no length cut-off is applied here).
    for i, qi in enumerate(batch.data['q']):
        for j, qij in enumerate(qi):
            q[i, j] = _get_word(qij)
            q_mask[i, j] = True

    # Question character ids, truncated to max_word_size characters per word.
    for i, cqi in enumerate(batch.data['cq']):
        for j, cqij in enumerate(cqi):
            for k, cqijk in enumerate(cqij):
                cq[i, j, k] = _get_char(cqijk)
                if k + 1 == config.max_word_size:
                    break

    if supervised:
        # Sanity check: every labelled position must lie inside the context mask.
        assert np.sum(~(x_mask | ~wy)) == 0

    return feed_dict
def bi_attention(config, is_train, h, u, h_mask=None, u_mask=None, scope=None, tensor_dict=None):
    """Compute attention between ``h`` and ``u`` in both directions.

    :param config: run configuration (``wd`` and ``logit_func`` are read).
    :param is_train: passed through to ``get_logits``.
    :param h: first input; per the shape comments below presumably [N, M, JX, d].
    :param u: second input, tiled over the M and JX dimensions of ``h``.
    :param h_mask: optional mask for ``h``; when None no mask is applied at all.
    :param u_mask: mask for ``u``; only used when ``h_mask`` is given.
    :param scope: variable scope name, defaults to "bi_attention".
    :param tensor_dict: optional dict that receives the attention weights
        ('a_u', 'a_h') and the global variables of the current variable scope.
    :return: tuple ``(u_a, h_a)``.
    """
    with tf.variable_scope(scope or "bi_attention"):
        sent_len = tf.shape(h)[2]
        num_sents = tf.shape(h)[1]
        ques_len = tf.shape(u)[1]
        # Tile both inputs so that every (h position, u position) pair is laid out explicitly.
        h_tiled = tf.tile(tf.expand_dims(h, 3), [1, 1, 1, ques_len, 1])
        u_tiled = tf.tile(tf.expand_dims(tf.expand_dims(u, 1), 1), [1, num_sents, sent_len, 1, 1])

        pair_mask = None
        if h_mask is not None:
            h_mask_tiled = tf.tile(tf.expand_dims(h_mask, 3), [1, 1, 1, ques_len])
            u_mask_tiled = tf.tile(tf.expand_dims(tf.expand_dims(u_mask, 1), 1), [1, num_sents, sent_len, 1])
            pair_mask = h_mask_tiled & u_mask_tiled

        pair_logits = get_logits([h_tiled, u_tiled], None, True, wd=config.wd, mask=pair_mask,
                                 is_train=is_train, func=config.logit_func, scope='u_logits')  # [N, M, JX, JQ]
        u_a = softsel(u_tiled, pair_logits)  # [N, M, JX, d]
        h_a = softsel(h, tf.reduce_max(pair_logits, 3))  # [N, M, d]
        h_a = tf.tile(tf.expand_dims(h_a, 2), [1, 1, sent_len, 1])

        if tensor_dict is not None:
            # Expose the attention weights and this scope's variables for inspection.
            tensor_dict['a_u'] = tf.nn.softmax(pair_logits)  # [N, M, JX, JQ]
            tensor_dict['a_h'] = tf.nn.softmax(tf.reduce_max(pair_logits, 3))
            scope_name = tf.get_variable_scope().name
            for variable in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope_name):
                tensor_dict[variable.name] = variable

        return u_a, h_a
def attention_layer(config, is_train, h, u, h_mask=None, u_mask=None, scope=None, tensor_dict=None):
    """Combine ``h`` with attended representations into one tensor.

    ``bi_attention`` is used when either ``config.q2c_att`` or
    ``config.c2q_att`` is set. Without ``c2q_att`` the attended ``u`` is
    replaced by the mean of ``u`` over its second axis, tiled over ``h``'s
    positions. With ``q2c_att`` the extra term ``h * h_a`` is appended.

    :return: concatenation along axis 3 of ``[h, u_a, h * u_a]`` plus,
        with ``q2c_att``, ``h * h_a``.
    """
    with tf.variable_scope(scope or "attention_layer"):
        sent_len = tf.shape(h)[2]
        num_sents = tf.shape(h)[1]
        ques_len = tf.shape(u)[1]  # not used below; kept so the same ops are created
        if config.q2c_att or config.c2q_att:
            u_a, h_a = bi_attention(config, is_train, h, u, h_mask=h_mask, u_mask=u_mask, tensor_dict=tensor_dict)
        if not config.c2q_att:
            mean_u = tf.reduce_mean(u, 1)
            u_a = tf.tile(tf.expand_dims(tf.expand_dims(mean_u, 1), 1), [1, num_sents, sent_len, 1])
        pieces = [h, u_a, h * u_a]
        if config.q2c_att:
            pieces.append(h * h_a)
        return tf.concat(axis=3, values=pieces)

View file

@ -0,0 +1,316 @@
import json
import os
import random
import itertools
import math
from collections import defaultdict
import numpy as np
from my.tensorflow import grouper
from my.utils import index
class Data(object):
    """Abstract item store; subclasses supply the actual access methods."""

    def get_size(self):
        """Return the number of items (to be provided by subclasses)."""
        raise NotImplementedError()

    def get_by_idxs(self, idxs):
        """
        Collect several items into one dict of lists, key by key.

        Each item is fetched with ``get_one`` and is expected to be a dict;
        its values are appended to the list stored under the same key.
        :param idxs: iterable of item indices
        :return dict: {'X': [,], 'Y', }
        """
        collected = defaultdict(list)
        for item in map(self.get_one, idxs):
            for field, value in item.items():
                collected[field].append(value)
        return collected

    def get_one(self, idx):
        """Return the item at ``idx`` (to be provided by subclasses)."""
        raise NotImplementedError()

    def get_empty(self):
        """Return an empty store of the same kind (to be provided by subclasses)."""
        raise NotImplementedError()

    def __add__(self, other):
        """Combine two stores (to be provided by subclasses)."""
        raise NotImplementedError()
class DataSet(object):
    """A collection of examples plus the subset of indices considered valid.

    ``data`` is either a dict of equally long lists (one list per field) or a
    ``Data`` object. ``valid_idxs`` selects the usable examples; it defaults
    to all of them. ``shared`` holds values referenced by the examples.
    """

    def __init__(self, data, data_type, shared=None, valid_idxs=None):
        self.data = data  # e.g. {'X': [0, 1, 2], 'Y': [2, 3, 4]}
        self.data_type = data_type
        self.shared = shared
        total_num_examples = self.get_data_size()
        self.valid_idxs = range(total_num_examples) if valid_idxs is None else valid_idxs
        self.num_examples = len(self.valid_idxs)

    def _sort_key(self, idx):
        """Return the length of the longest element of the shared 'x' entry referenced by example ``idx``."""
        rx = self.data['*x'][idx]
        x = self.shared['x'][rx[0]][rx[1]]
        return max(map(len, x))

    def get_data_size(self):
        """Return the total number of examples stored, valid or not."""
        if isinstance(self.data, dict):
            return len(next(iter(self.data.values())))
        elif isinstance(self.data, Data):
            return self.data.get_size()
        raise Exception()

    def get_by_idxs(self, idxs):
        """Return a dict of lists holding the fields of the examples at ``idxs``."""
        if isinstance(self.data, dict):
            out = defaultdict(list)
            for key, val in self.data.items():
                out[key].extend(val[idx] for idx in idxs)
            return out
        elif isinstance(self.data, Data):
            return self.data.get_by_idxs(idxs)
        raise Exception()

    def get_batches(self, batch_size, num_batches=None, shuffle=False, cluster=False):
        """
        Yield ``(batch_idxs, batch_data_set)`` pairs.

        Fields whose key starts with '*' are additionally resolved against
        ``self.shared`` and stored under the key without the '*'.

        :param batch_size: number of examples per batch
        :param num_batches: number of batches to yield; defaults to one epoch
        :param shuffle: shuffle the valid indices (once, before batching)
        :param cluster: cluster examples by their lengths; this might give performance boost (i.e. faster training).
        :return: generator of (tuple of indices, DataSet)
        """
        num_batches_per_epoch = int(math.ceil(self.num_examples / batch_size))
        if num_batches is None:
            num_batches = num_batches_per_epoch
        num_epochs = int(math.ceil(num_batches / num_batches_per_epoch))

        if shuffle:
            random_idxs = random.sample(self.valid_idxs, len(self.valid_idxs))
            if cluster:
                # Batches are formed from length-sorted indices, then the
                # batch order is re-sampled for every epoch.
                sorted_idxs = sorted(random_idxs, key=self._sort_key)
                sorted_grouped = lambda: list(grouper(sorted_idxs, batch_size))
                grouped = lambda: random.sample(sorted_grouped(), num_batches_per_epoch)
            else:
                random_grouped = lambda: list(grouper(random_idxs, batch_size))
                grouped = random_grouped
        else:
            raw_grouped = lambda: list(grouper(self.valid_idxs, batch_size))
            grouped = raw_grouped

        batch_idx_tuples = itertools.chain.from_iterable(grouped() for _ in range(num_epochs))
        for _ in range(num_batches):
            # None entries in a group are dropped (presumably grouper's padding of the last group).
            batch_idxs = tuple(i for i in next(batch_idx_tuples) if i is not None)
            batch_data = self.get_by_idxs(batch_idxs)
            shared_batch_data = {}
            for key, val in batch_data.items():
                if key.startswith('*'):
                    assert self.shared is not None
                    shared_key = key[1:]
                    shared_batch_data[shared_key] = [index(self.shared[shared_key], each) for each in val]
            batch_data.update(shared_batch_data)

            batch_ds = DataSet(batch_data, self.data_type, shared=self.shared)
            yield batch_idxs, batch_ds

    def get_multi_batches(self, batch_size, num_batches_per_step, num_steps=None, shuffle=False, cluster=False):
        """
        Yield, per step, a tuple of ``num_batches_per_step`` (idxs, DataSet) pairs.

        One large batch of ``batch_size * num_batches_per_step`` examples is
        drawn per step and then split into the per-step batches.
        """
        batch_size_per_step = batch_size * num_batches_per_step
        batches = self.get_batches(batch_size_per_step, num_batches=num_steps, shuffle=shuffle, cluster=cluster)
        multi_batches = (tuple(zip(grouper(idxs, batch_size, shorten=True, num_groups=num_batches_per_step),
                                   data_set.divide(num_batches_per_step))) for idxs, data_set in batches)
        return multi_batches

    def get_empty(self):
        """Return a ``DataSet`` with the same fields, type and shared data but no examples."""
        if isinstance(self.data, dict):
            data = {key: [] for key in self.data}
        elif isinstance(self.data, Data):
            data = self.data.get_empty()
        else:
            raise Exception()
        return DataSet(data, self.data_type, shared=self.shared)

    def __add__(self, other):
        """Return a new ``DataSet`` with ``other``'s examples appended to this one's.

        The shared data and the data type of ``self`` are kept.
        """
        if isinstance(self.data, dict):
            data = {key: val + other.data[key] for key, val in self.data.items()}
        elif isinstance(self.data, Data):
            data = self.data + other.data
        else:
            raise Exception()

        # other's examples sit after *all* of this set's examples in the combined
        # data, so its indices shift by the full data size - not by the number of
        # valid indices, which is smaller whenever this set has been filtered.
        offset = self.get_data_size()
        valid_idxs = list(self.valid_idxs) + [valid_idx + offset for valid_idx in other.valid_idxs]
        return DataSet(data, self.data_type, shared=self.shared, valid_idxs=valid_idxs)

    def divide(self, integer):
        """Split the valid examples into ``integer`` data sets and return them as a tuple."""
        batch_size = int(math.ceil(self.num_examples / integer))
        idxs_gen = grouper(self.valid_idxs, batch_size, shorten=True, num_groups=integer)
        data_gen = (self.get_by_idxs(idxs) for idxs in idxs_gen)
        ds_tuple = tuple(DataSet(data, self.data_type, shared=self.shared) for data in data_gen)
        return ds_tuple
def load_metadata(config, data_type):
    """Load ``metadata_<data_type>.json`` from ``config.data_dir`` onto ``config``.

    Every top-level key of the JSON object becomes an attribute of ``config``.

    :param config: object with a ``data_dir`` attribute; modified in place.
    :param data_type: name of the split, used to build the file name.
    :return dict: the parsed metadata.
    """
    file_name = "metadata_{}.json".format(data_type)
    with open(os.path.join(config.data_dir, file_name), 'r') as metadata_file:
        metadata = json.load(metadata_file)
    for name in metadata:
        setattr(config, name, metadata[name])
    return metadata
def read_data(config, data_type, ref, data_filter=None):
    """Load one data split, build or restore its vocabularies, and wrap it in a ``DataSet``.

    Reads ``data_<data_type>.json`` and ``shared_<data_type>.json`` from
    ``config.data_dir``. When ``ref`` is false, ``word2idx`` and ``char2idx``
    are built from the counters in the shared file and written to the
    vocabulary file (``config.shared_path`` or ``<config.out_dir>/shared.json``);
    when ``ref`` is true, the entries of that file are loaded into ``shared``
    instead.

    :param config: configuration object; reads ``data_dir``, ``shared_path``,
        ``out_dir``, ``lower_word``, ``finetune``, ``known_if_glove``,
        ``use_glove_for_unk``, ``word_count_th`` and ``char_count_th``
    :param data_type: split name used in the file names
    :param ref: if true, reuse the previously saved vocabulary file
    :param data_filter: optional callable ``(example_dict, shared)``; examples
        for which it returns a falsy value are left out of ``valid_idxs``
    :return: ``DataSet`` over the loaded data
    """
    data_path = os.path.join(config.data_dir, "data_{}.json".format(data_type))
    shared_path = os.path.join(config.data_dir, "shared_{}.json".format(data_type))
    with open(data_path, 'r') as fh:
        data = json.load(fh)
    with open(shared_path, 'r') as fh:
        shared = json.load(fh)

    # Every column of `data` is expected to have one entry per example.
    num_examples = len(next(iter(data.values())))
    if data_filter is None:
        valid_idxs = range(num_examples)
    else:
        keys = list(data.keys())
        valid_idxs = [idx for idx, vals in enumerate(zip(*data.values()))
                      if data_filter(dict(zip(keys, vals)), shared)]

    print("Loaded {}/{} examples from {}".format(len(valid_idxs), num_examples, data_type))

    # From here on `vocab_path` is the saved word2idx/char2idx file, not the split's shared file.
    vocab_path = config.shared_path or os.path.join(config.out_dir, "shared.json")
    if not ref:
        word2vec_dict = shared['lower_word2vec'] if config.lower_word else shared['word2vec']
        word_counter = shared['lower_word_counter'] if config.lower_word else shared['word_counter']
        char_counter = shared['char_counter']
        # Indices 0 and 1 are reserved for NULL and UNK, hence the "+ 2" offsets.
        if config.finetune:
            shared['word2idx'] = {word: idx + 2 for idx, word in
                                  enumerate(word for word, count in word_counter.items()
                                            if count > config.word_count_th or (config.known_if_glove and word in word2vec_dict))}
        else:
            assert config.known_if_glove
            assert config.use_glove_for_unk
            shared['word2idx'] = {word: idx + 2 for idx, word in
                                  enumerate(word for word, count in word_counter.items()
                                            if count > config.word_count_th and word not in word2vec_dict)}
        shared['char2idx'] = {char: idx + 2 for idx, char in
                              enumerate(char for char, count in char_counter.items()
                                        if count > config.char_count_th)}
        NULL = "-NULL-"
        UNK = "-UNK-"
        shared['word2idx'][NULL] = 0
        shared['word2idx'][UNK] = 1
        shared['char2idx'][NULL] = 0
        shared['char2idx'][UNK] = 1
        # Close the file deterministically so the vocabulary is flushed to disk.
        with open(vocab_path, 'w') as fh:
            json.dump({'word2idx': shared['word2idx'], 'char2idx': shared['char2idx']}, fh)
    else:
        with open(vocab_path, 'r') as fh:
            new_shared = json.load(fh)
        for key, val in new_shared.items():
            shared[key] = val

    if config.use_glove_for_unk:
        # Words that have a pretrained vector but no entry in word2idx get their
        # own index space and an embedding matrix whose rows follow that order.
        word2vec_dict = shared['lower_word2vec'] if config.lower_word else shared['word2vec']
        new_words = [word for word in word2vec_dict if word not in shared['word2idx']]
        shared['new_word2idx'] = {word: idx for idx, word in enumerate(new_words)}
        shared['new_emb_mat'] = np.array([word2vec_dict[word] for word in new_words], dtype='float32')

    data_set = DataSet(data, data_type, shared=shared, valid_idxs=valid_idxs)
    return data_set
def get_squad_data_filter(config):
    """Build a predicate that decides whether a data point is kept.

    :param config: configuration object; reads ``ques_size_th``, ``squash``,
        ``para_size_th``, ``single``, ``data_filter``, ``num_sents_th`` and
        ``sent_size_th``
    :return: callable ``data_filter(data_point, shared) -> bool``
    """
    def data_filter(data_point, shared):
        """Return True if ``data_point`` satisfies the configured size limits.

        :raises ValueError: if ``config.data_filter`` is not one of
            ``'max'``, ``'valid'`` or ``'semi'``
        """
        assert shared is not None
        rx, rcx, q, cq, y = (data_point[key] for key in ('*x', '*cx', 'q', 'cq', 'y'))
        x, cx = shared['x'], shared['cx']
        if len(q) > config.ques_size_th:
            return False

        # x filter
        xi = x[rx[0]][rx[1]]
        if config.squash:
            # With squashing, only the flattened end offset of each answer matters.
            for start, stop in y:
                stop_offset = sum(map(len, xi[:stop[0]]))
                if stop_offset + stop[1] > config.para_size_th:
                    return False
            return True

        if config.single:
            for start, stop in y:
                if start[0] != stop[0]:
                    return False

        if config.data_filter == 'max':
            for start, stop in y:
                if stop[0] >= config.num_sents_th:
                    return False
                if start[0] != stop[0]:
                    return False
                if stop[1] >= config.sent_size_th:
                    return False
        elif config.data_filter == 'valid':
            if len(xi) > config.num_sents_th:
                return False
            if any(len(xij) > config.sent_size_th for xij in xi):
                return False
        elif config.data_filter == 'semi':
            # Only the answer sentence needs to be valid.
            for start, stop in y:
                if stop[0] >= config.num_sents_th:
                    return False
                # The answer must lie within one sentence. The previous check
                # compared start[0] with itself and therefore never fired.
                if start[0] != stop[0]:
                    return False
                if len(xi[start[0]]) > config.sent_size_th:
                    return False
        else:
            raise ValueError("unknown data_filter: {!r}".format(config.data_filter))

        return True
    return data_filter
def update_config(config, data_sets):
    """Derive size limits and vocabulary sizes from the data and store them on ``config``.

    Scans the valid examples of every data set to find the largest paragraph,
    sentence, question and word sizes, clamps them by the configured
    thresholds, and records vocabulary and embedding sizes taken from the
    first data set's ``shared`` dict.

    :param config: configuration object, mutated in place
    :param data_sets: sequence of objects exposing ``data``, ``shared`` and
        ``valid_idxs``; must contain at least one element
    """
    config.max_num_sents = 0
    config.max_sent_size = 0
    config.max_ques_size = 0
    config.max_word_size = 0
    config.max_para_size = 0
    for data_set in data_sets:
        data = data_set.data
        shared = data_set.shared
        for idx in data_set.valid_idxs:
            rx = data['*x'][idx]
            q = data['q'][idx]
            sents = shared['x'][rx[0]][rx[1]]
            config.max_para_size = max(config.max_para_size, sum(map(len, sents)))
            config.max_num_sents = max(config.max_num_sents, len(sents))
            # default=0 keeps a paragraph with no sentences (or no words) from
            # raising "max() arg is an empty sequence".
            config.max_sent_size = max(config.max_sent_size, max(map(len, sents), default=0))
            config.max_word_size = max(config.max_word_size,
                                       max((len(word) for sent in sents for word in sent), default=0))
            if len(q) > 0:
                config.max_ques_size = max(config.max_ques_size, len(q))
                config.max_word_size = max(config.max_word_size, max(len(word) for word in q))

    # The sentence/paragraph thresholds only apply in training mode.
    if config.mode == 'train':
        config.max_num_sents = min(config.max_num_sents, config.num_sents_th)
        config.max_sent_size = min(config.max_sent_size, config.sent_size_th)
        config.max_para_size = min(config.max_para_size, config.para_size_th)

    config.max_word_size = min(config.max_word_size, config.word_size_th)

    config.char_vocab_size = len(data_sets[0].shared['char2idx'])
    config.word_emb_size = len(next(iter(data_sets[0].shared['word2vec'].values())))
    config.word_vocab_size = len(data_sets[0].shared['word2idx'])

    if config.single:
        config.max_num_sents = 1
    if config.squash:
        config.max_sent_size = config.max_para_size
        config.max_num_sents = 1

View file

@ -0,0 +1,29 @@
#!/usr/bin/env bash
# Usage: <script> SOURCE_PATH TARGET_PATH [debug]
# Preprocesses SOURCE_PATH, runs each listed saved model in forward mode (all in
# parallel), then combines the per-model outputs into TARGET_PATH.
source_path=$1
target_path=$2
inter_dir="inter_ensemble"
root_dir="save"
# Optional flags are kept in arrays so that "no flag" expands to zero arguments
# while every path can be quoted safely.
parg=()
marg=()
if [ "$3" = "debug" ]
then
    parg=(-d)
    marg=(--debug)
fi

# Preprocess data
python3 -m squad.prepro --mode single --single_path "$source_path" "${parg[@]}" --target_dir "$inter_dir" --glove_dir .

eargs=()
for num in 31 33 34 35 36 37 40 41 43 44 45 46; do
    load_path="$root_dir/$num/save"
    shared_path="$root_dir/$num/shared.json"
    eval_path="$inter_dir/eval-$num.pklz"
    eargs+=("$eval_path")
    python3 -m basic.cli --data_dir "$inter_dir" --eval_path "$eval_path" --nodump_answer --load_path "$load_path" --shared_path "$shared_path" "${marg[@]}" --eval_num_batches 0 --mode forward --batch_size 1 --len_opt --cluster --cpu_opt --load_ema &
done
# Block until every background model run has finished.
wait

# Ensemble
python3 -m basic.ensemble --data_path "$inter_dir/data_single.json" --shared_path "$inter_dir/shared_single.json" -o "$target_path" "${eargs[@]}"

View file

@ -0,0 +1,27 @@
#!/usr/bin/env bash
# Usage: <script> SOURCE_PATH TARGET_PATH [debug]
# Preprocesses SOURCE_PATH, runs one saved model in forward mode, then passes
# its output through the ensemble step to produce TARGET_PATH.
source_path=$1
target_path=$2
inter_dir="inter_single"
root_dir="save"
# Optional flags are kept in arrays so that "no flag" expands to zero arguments
# while every path can be quoted safely.
parg=()
marg=()
if [ "$3" = "debug" ]
then
    parg=(-d)
    marg=(--debug)
fi

# Preprocess data
python3 -m squad.prepro --mode single --single_path "$source_path" "${parg[@]}" --target_dir "$inter_dir" --glove_dir .

num=37
load_path="$root_dir/$num/save"
shared_path="$root_dir/$num/shared.json"
eval_path="$inter_dir/eval.pklz"
python3 -m basic.cli --data_dir "$inter_dir" --eval_path "$eval_path" --nodump_answer --load_path "$load_path" --shared_path "$shared_path" "${marg[@]}" --eval_num_batches 0 --mode forward --batch_size 1 --len_opt --cluster --cpu_opt --load_ema

# Ensemble (for single run, just one input)
python3 -m basic.ensemble --data_path "$inter_dir/data_single.json" --shared_path "$inter_dir/shared_single.json" -o "$target_path" "$eval_path"

View file

@ -0,0 +1,76 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>{{ title }}</title>
    <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.0/jquery.min.js"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/chroma-js/1.1.1/chroma.min.js"></script>
    <script>
        // Shade every ".att" cell from white to red according to its "color" attribute.
        $(document).ready(function(){
            $(".att").each(function() {
                // var val = parseFloat($(this).text());
                var val = parseFloat($(this).attr("color"));
                var scale = chroma.scale(['white', 'red']);
                var color = scale(val).hex();
                $(this).attr("bgcolor", color);
            });
        })
    </script>
    {# The stylesheet belongs inside <head>; it used to sit between </head> and <body>. #}
    <style>
        table, th, td {border: 1px solid black}
    </style>
</head>
<body>
<h2>{{ title }}</h2>
<table>
    <tr>
        <th>ID</th>
        <th>Question</th>
        <th>Answers</th>
        <th>Predicted</th>
        <th>Score</th>
        <th>Paragraph</th>
    </tr>
    {% for row in rows %}
    <tr>
        <td>{{ row.id }}</td>
        <td>
            {% for qj in row.ques %}
            {{ qj }}
            {% endfor %}
        </td>
        <td>
            {# List items need a list container to be valid HTML. #}
            <ul>
            {% for aa in row.a %}
                <li>{{ aa }}</li>
            {% endfor %}
            </ul>
        </td>
        <td>{{ row.ap }}</td>
        <td>{{ row.score }}</td>
        <td>
            <table>
                {% for xj, ypj, yp2j in zip(row.para, row.yp, row.yp2) %}
                <tr>
                    {% set rowloop = loop %}
                    {% for xjk, ypjk in zip(xj, ypj) %}
                    <td class="att" color="{{ ypjk }}">
                        {# NOTE(review): this indexes row.y as a (start, stop) pair of (sentence, word) positions; confirm the caller passes it in that shape. #}
                        {% if row.y[0][0] == rowloop.index0 and row.y[0][1] <= loop.index0 <= row.y[1][1] %}
                        <b>{{ xjk }}</b>
                        {% else %}
                        {{ xjk }}
                        {% endif %}
                    </td>
                    {% endfor %}
                </tr>
                <tr>
                    {% for xjk, yp2jk in zip(xj, yp2j) %}
                    <td class="att" color="{{ yp2jk }}">-</td>
                    {% endfor %}
                </tr>
                {% endfor %}
            </table>
        </td>
    </tr>
    {% endfor %}
</table>
</body>
</html>

View file

@ -0,0 +1,73 @@
import tensorflow as tf
from basic.model import Model
from my.tensorflow import average_gradients
class Trainer(object):
    """Builds an Adam training op for a single ``Model`` and runs training steps."""

    def __init__(self, config, model):
        """Create the optimizer, gradients and train op.

        :param config: configuration object; ``init_lr`` is the Adam learning rate
        :param model: ``Model`` providing loss, variable list, global step and summary
        """
        assert isinstance(model, Model)
        self.config, self.model = config, model
        self.opt = tf.train.AdamOptimizer(config.init_lr)
        self.loss = model.get_loss()
        self.var_list = model.get_var_list()
        self.global_step = model.get_global_step()
        self.summary = model.summary
        self.grads = self.opt.compute_gradients(self.loss, var_list=self.var_list)
        self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)

    def get_train_op(self):
        """Return the op that applies one gradient update."""
        return self.train_op

    def step(self, sess, batch, get_summary=False):
        """Run one training step.

        :param sess: ``tf.Session`` to run in
        :param batch: pair whose second element is the data set to feed
        :param get_summary: also fetch ``self.summary`` when true
        :return: ``(loss, summary, train_op)``; ``summary`` is None unless requested
        """
        assert isinstance(sess, tf.Session)
        _, data_set = batch
        feed_dict = self.model.get_feed_dict(data_set, True)
        if not get_summary:
            loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
            return loss, None, train_op
        fetches = [self.loss, self.summary, self.train_op]
        loss, summary, train_op = sess.run(fetches, feed_dict=feed_dict)
        return loss, summary, train_op
class MultiGPUTrainer(object):
    """Builds one Adam train op from the averaged gradients of several models."""

    def __init__(self, config, models):
        """Create per-model gradients, average them and build the train op.

        :param config: configuration object; reads ``init_lr`` and ``device_type``
        :param models: list of ``Model`` objects; the first one supplies the
            variable list, global step and summary
        """
        model = models[0]
        assert isinstance(model, Model)
        self.config = config
        self.model = model
        self.opt = tf.train.AdamOptimizer(config.init_lr)
        self.var_list = model.get_var_list()
        self.global_step = model.get_global_step()
        self.summary = model.summary
        self.models = models
        tower_losses = []
        tower_grads = []
        for gpu_idx, model in enumerate(models):
            scope_name = "grads_{}".format(gpu_idx)
            device_name = "/{}:{}".format(config.device_type, gpu_idx)
            with tf.name_scope(scope_name):
                with tf.device(device_name):
                    tower_loss = model.get_loss()
                    tower_grad = self.opt.compute_gradients(tower_loss, var_list=self.var_list)
                    tower_losses.append(tower_loss)
                    tower_grads.append(tower_grad)

        self.loss = tf.add_n(tower_losses)/len(tower_losses)
        self.grads = average_gradients(tower_grads)
        self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)

    def step(self, sess, batches, get_summary=False):
        """Run one training step, feeding one batch to each model.

        :param sess: ``tf.Session`` to run in
        :param batches: iterable of pairs whose second element is a data set;
            paired with ``self.models`` in order
        :param get_summary: also fetch ``self.summary`` when true
        :return: ``(loss, summary, train_op)``; ``summary`` is None unless requested
        """
        assert isinstance(sess, tf.Session)
        feed_dict = {}
        for batch, model in zip(batches, self.models):
            _, data_set = batch
            feed_dict.update(model.get_feed_dict(data_set, True))

        if not get_summary:
            loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
            return loss, None, train_op
        fetches = [self.loss, self.summary, self.train_op]
        loss, summary, train_op = sess.run(fetches, feed_dict=feed_dict)
        return loss, summary, train_op

View file

@ -0,0 +1,140 @@
import shutil
from collections import OrderedDict
import http.server
import socketserver
import argparse
import json
import os
import numpy as np
from tqdm import tqdm
import pickle
import gzip
from jinja2 import Environment, FileSystemLoader
from squad.utils import get_best_span, get_best_span_wy
def bool_(string):
    """Convert the literal text ``'True'`` or ``'False'`` to a bool.

    :raises Exception: for any other value
    """
    if string not in ('True', 'False'):
        raise Exception()
    return string == 'True'
def get_args():
    """Parse the visualizer's command-line options from ``sys.argv``.

    :return: ``argparse.Namespace`` holding the parsed options
    """
    parser = argparse.ArgumentParser()
    # (flag, type, default) for every value-taking option, in declaration order.
    option_specs = (
        ("--model_name", str, 'basic'),
        ("--data_type", str, 'dev'),
        ("--step", int, 5000),
        ("--template_name", str, "visualizer.html"),
        ("--num_per_page", int, 100),
        ("--data_dir", str, "data/squad"),
        ("--port", int, 8000),
        ("--host", str, "0.0.0.0"),
        ("--open", str, 'False'),
        ("--run_id", str, "0"),
    )
    for flag, kind, default in option_specs:
        parser.add_argument(flag, type=kind, default=default)
    parser.add_argument("-w", "--wy", action='store_true')
    return parser.parse_args()
def _decode(decoder, sent):
    """Look up every index of ``sent`` in ``decoder`` and join the results with spaces."""
    words = [decoder[idx] for idx in sent]
    return " ".join(words)
def accuracy2_visualizer(args):
    """Render evaluation results to paged HTML files and serve them over HTTP.

    Loads the pickled evaluation dump for the given model/run/step together
    with the matching data and shared JSON files, writes one HTML page per
    ``args.num_per_page`` examples into a fresh ``/tmp/list_results<N>``
    directory, changes into that directory and serves it until interrupted.

    :param args: parsed options as returned by ``get_args``
    """
    model_name = args.model_name
    data_type = args.data_type
    num_per_page = args.num_per_page
    data_dir = args.data_dir
    run_id = args.run_id.zfill(2)
    step = args.step

    eval_path = os.path.join("out", model_name, run_id, "eval", "{}-{}.pklz".format(data_type, str(step).zfill(6)))
    print("loading {}".format(eval_path))
    # NOTE: unpickling executes arbitrary code; only load dumps produced by trusted runs.
    with gzip.open(eval_path, 'r') as fh:
        eval_ = pickle.load(fh)

    # Pick the first /tmp/list_results<N> that does not exist yet. The loop only
    # ends on a free name, so there is never an old directory to remove.
    _id = 0
    html_dir = "/tmp/list_results%d" % _id
    while os.path.exists(html_dir):
        _id += 1
        html_dir = "/tmp/list_results%d" % _id
    os.mkdir(html_dir)

    cur_dir = os.path.dirname(os.path.realpath(__file__))
    templates_dir = os.path.join(cur_dir, 'templates')
    env = Environment(loader=FileSystemLoader(templates_dir))
    # Expose the builtins the template calls.
    env.globals.update(zip=zip, reversed=reversed)
    template = env.get_template(args.template_name)

    data_path = os.path.join(data_dir, "data_{}.json".format(data_type))
    shared_path = os.path.join(data_dir, "shared_{}.json".format(data_type))
    print("loading {}".format(data_path))
    with open(data_path, 'r') as fh:
        data = json.load(fh)
    print("loading {}".format(shared_path))
    with open(shared_path, 'r') as fh:
        shared = json.load(fh)

    rows = []
    for i, (idx, yi, ypi, yp2i, wypi) in tqdm(enumerate(zip(*[eval_[key] for key in ('idxs', 'y', 'yp', 'yp2', 'wyp')])), total=len(eval_['idxs'])):
        id_, q, rx, answers = (data[key][idx] for key in ('ids', 'q', '*x', 'answerss'))
        x = shared['x'][rx[0]][rx[1]]
        ques = [" ".join(q)]
        para = [[word for word in sent] for sent in x]
        span, score = get_best_span_wy(wypi, 0.5) if args.wy else get_best_span(ypi, yp2i)
        ap = get_segment(para, span)
        row = {
            'id': id_,
            'title': "Hello world!",
            'ques': ques,
            'para': para,
            'y': yi[0][0],
            'y2': yi[0][1],
            'yp': wypi if args.wy else ypi,
            'yp2': wypi if args.wy else yp2i,
            'a': answers,
            'ap': ap,
            'score': score
        }
        rows.append(row)

        # A page is named after the index of its first example ...
        if i % num_per_page == 0:
            html_path = os.path.join(html_dir, "%s.html" % str(i).zfill(8))

        # ... and written once it is full or the data is exhausted.
        if (i + 1) % num_per_page == 0 or (i + 1) == len(eval_['y']):
            var_dict = {'title': "Accuracy Visualization",
                        'rows': rows
                        }
            with open(html_path, "wb") as f:
                f.write(template.render(**var_dict).encode('UTF-8'))
            rows = []

    os.chdir(html_dir)
    port = args.port
    host = args.host

    # Overriding to suppress log message
    class MyHandler(http.server.SimpleHTTPRequestHandler):
        def log_message(self, format, *args):
            pass

    handler = MyHandler
    httpd = socketserver.TCPServer((host, port), handler)
    if args.open == 'True':
        os.system("open http://%s:%d" % (args.host, args.port))
    print("serving at %s:%d" % (host, port))
    httpd.serve_forever()
def get_segment(para, span):
    """Return the words of one sentence covered by ``span``, joined by spaces.

    :param para: list of sentences, each a list of words
    :param span: ``(start, stop)`` pair of ``(sentence, word)`` positions; the
        sentence is taken from ``start`` and the stop word index is exclusive
    """
    start, stop = span[0], span[1]
    sentence = para[start[0]]
    return " ".join(sentence[start[1]:stop[1]])
# Script entry point: parse the command-line options and start the visualizer.
if __name__ == "__main__":
    ARGS = get_args()
    accuracy2_visualizer(ARGS)

View file

View file

@ -0,0 +1,103 @@
import os
import tensorflow as tf
from basic_cnn.main import main as m
# Command-line flag definitions. The bracketed value at the end of each help
# string states the flag's default and is kept in sync with the actual default.
flags = tf.app.flags
flags.DEFINE_string("model_name", "basic_cnn", "Model name [basic_cnn]")
flags.DEFINE_string("data_dir", "data/cnn", "Data dir [data/cnn]")
# NOTE(review): the default below is a machine-specific absolute path; pass --root_dir explicitly.
flags.DEFINE_string("root_dir", "/Users/minjoons/data/cnn/questions", "root dir [~/data/cnn/questions]")
flags.DEFINE_string("run_id", "0", "Run ID [0]")
flags.DEFINE_string("out_base_dir", "out", "out base dir [out]")
flags.DEFINE_integer("batch_size", 60, "Batch size [60]")
flags.DEFINE_float("init_lr", 0.5, "Initial learning rate [0.5]")
flags.DEFINE_integer("num_epochs", 50, "Total number of epochs for training [50]")
flags.DEFINE_integer("num_steps", 20000, "Number of steps [20000]")
flags.DEFINE_integer("eval_num_batches", 100, "eval num batches [100]")
flags.DEFINE_integer("load_step", 0, "load step [0]")
flags.DEFINE_integer("early_stop", 4, "early stop [4]")
flags.DEFINE_string("mode", "test", "train | dev | test | forward [test]")
flags.DEFINE_boolean("load", True, "load saved data? [True]")
flags.DEFINE_boolean("progress", True, "Show progress? [True]")
flags.DEFINE_integer("log_period", 100, "Log period [100]")
flags.DEFINE_integer("eval_period", 1000, "Eval period [1000]")
flags.DEFINE_integer("save_period", 1000, "Save Period [1000]")
flags.DEFINE_float("decay", 0.9, "Exponential moving average decay [0.9]")
flags.DEFINE_boolean("draft", False, "Draft for quick testing? [False]")
flags.DEFINE_integer("hidden_size", 100, "Hidden size [100]")
flags.DEFINE_integer("char_out_size", 100, "Char out size [100]")
flags.DEFINE_float("input_keep_prob", 0.8, "Input keep prob [0.8]")
flags.DEFINE_integer("char_emb_size", 8, "Char emb size [8]")
flags.DEFINE_integer("char_filter_height", 5, "Char filter height [5]")
flags.DEFINE_float("wd", 0.0, "Weight decay [0.0]")
flags.DEFINE_bool("lower_word", True, "lower word [True]")
flags.DEFINE_bool("dump_eval", False, "dump eval? [False]")
flags.DEFINE_bool("dump_answer", True, "dump answer? [True]")
flags.DEFINE_string("model", "2", "config 1 |2 [2]")
flags.DEFINE_bool("squash", False, "squash the sentences into one? [False]")
flags.DEFINE_bool("single", False, "supervise only the answer sentence? [False]")
flags.DEFINE_integer("word_count_th", 10, "word count th [10]")
flags.DEFINE_integer("char_count_th", 50, "char count th [50]")
flags.DEFINE_integer("sent_size_th", 60, "sent size th [60]")
flags.DEFINE_integer("num_sents_th", 200, "num sents th [200]")
flags.DEFINE_integer("ques_size_th", 30, "ques size th [30]")
flags.DEFINE_integer("word_size_th", 16, "word size th [16]")
flags.DEFINE_integer("para_size_th", 256, "para size th [256]")
flags.DEFINE_bool("swap_memory", True, "swap memory? [True]")
flags.DEFINE_string("data_filter", "max", "max | valid | semi [max]")
flags.DEFINE_bool("finetune", False, "finetune? [False]")
flags.DEFINE_bool("feed_gt", False, "feed gt prev token during training [False]")
flags.DEFINE_bool("feed_hard", False, "feed hard argmax prev token during testing [False]")
flags.DEFINE_bool("use_glove_for_unk", True, "use glove for unk [True]")
flags.DEFINE_bool("known_if_glove", True, "consider as known if present in glove [True]")
flags.DEFINE_bool("eval", True, "eval? [True]")
flags.DEFINE_integer("highway_num_layers", 2, "highway num layers [2]")
flags.DEFINE_bool("use_word_emb", True, "use word embedding? [True]")
flags.DEFINE_string("forward_name", "single", "Forward name [single]")
flags.DEFINE_string("answer_path", "", "Answer path []")
flags.DEFINE_string("load_path", "", "Load path []")
flags.DEFINE_string("shared_path", "", "Shared path []")
flags.DEFINE_string("device", "/cpu:0", "default device [/cpu:0]")
flags.DEFINE_integer("num_gpus", 1, "num of gpus [1]")
flags.DEFINE_string("out_channel_dims", "100", "Out channel dims, separated by commas [100]")
flags.DEFINE_string("filter_heights", "5", "Filter heights, separated by commas [5]")
flags.DEFINE_bool("share_cnn_weights", True, "Share CNN weights [True]")
flags.DEFINE_bool("share_lstm_weights", True, "Share LSTM weights [True]")
flags.DEFINE_bool("two_prepro_layers", False, "Use two layers for preprocessing? [False]")
flags.DEFINE_bool("aug_att", False, "Augment attention layers with more features? [False]")
flags.DEFINE_integer("max_to_keep", 20, "Max recent saves to keep [20]")
flags.DEFINE_bool("vis", False, "output visualization numbers? [False]")
flags.DEFINE_bool("dump_pickle", True, "Dump pickle instead of json? [True]")
flags.DEFINE_float("keep_prob", 1.0, "keep prob [1.0]")
flags.DEFINE_string("prev_mode", "a", "prev mode gy | y | a [a]")
flags.DEFINE_string("logit_func", "tri_linear", "logit func [tri_linear]")
flags.DEFINE_bool("sh", False, "use superhighway [False]")
flags.DEFINE_string("answer_func", "linear", "answer logit func [linear]")
flags.DEFINE_bool("cluster", False, "Cluster data for faster training [False]")
flags.DEFINE_bool("len_opt", False, "Length optimization? [False]")
flags.DEFINE_string("sh_logit_func", "tri_linear", "sh logit func [tri_linear]")
flags.DEFINE_float("filter_ratio", 1.0, "filter ratio [1.0]")
flags.DEFINE_bool("bi", False, "bi-directional attention? [False]")
flags.DEFINE_integer("width", 5, "width around entity [5]")
def main(_):
    """Entry point for ``tf.app.run``: derive the output directory and start the run.

    The output directory is ``<out_base_dir>/<model_name>/<run_id>`` with the
    run id zero-padded to two characters. The positional argument is ignored.
    """
    config = flags.FLAGS
    run_dir_name = str(config.run_id).zfill(2)
    config.out_dir = os.path.join(config.out_base_dir, config.model_name, run_dir_name)
    m(config)
# Script entry point: hand control to TensorFlow's app runner.
if __name__ == "__main__":
    tf.app.run()

View file

@ -0,0 +1,494 @@
import itertools
from collections import defaultdict
import numpy as np
import tensorflow as tf
import os
from basic_cnn.read_data import DataSet
from my.nltk_utils import span_f1
from my.tensorflow import padded_reshape
from my.utils import argmax
class Evaluation(object):
    """Predictions for a set of examples, mergeable with ``+`` and ``sum()``.

    ``dict`` mirrors the main fields (plus any extra tensors) as plain Python
    values.
    """

    def __init__(self, data_type, global_step, idxs, yp, tensor_dict=None):
        """
        :param data_type: name of the data split
        :param global_step: training step the predictions were made at
        :param idxs: example indices, aligned with ``yp``
        :param yp: predictions, one entry per example
        :param tensor_dict: optional mapping of names to extra values; entries
            with a ``tolist`` method (e.g. numpy arrays) are converted to
            lists, anything else is stored as given
        """
        self.data_type = data_type
        self.global_step = global_step
        self.idxs = idxs
        self.yp = yp
        self.num_examples = len(yp)
        self.tensor_dict = None
        self.dict = {'data_type': data_type,
                     'global_step': global_step,
                     'yp': yp,
                     'idxs': idxs,
                     'num_examples': self.num_examples}
        if tensor_dict is not None:
            # __add__ hands back values that are already lists, so only call
            # tolist() where it exists instead of failing with AttributeError.
            self.tensor_dict = {key: val.tolist() if hasattr(val, 'tolist') else val
                                for key, val in tensor_dict.items()}
            for key, val in self.tensor_dict.items():
                self.dict[key] = val
        self.summaries = None

    def __repr__(self):
        return "{} step {}".format(self.data_type, self.global_step)

    def __add__(self, other):
        """Concatenate two evaluations of the same split and step.

        ``other == 0`` returns ``self`` unchanged so that ``sum()`` works.
        """
        if other == 0:
            return self
        assert self.data_type == other.data_type
        assert self.global_step == other.global_step
        new_yp = self.yp + other.yp
        new_idxs = self.idxs + other.idxs
        new_tensor_dict = None
        if self.tensor_dict is not None:
            new_tensor_dict = {key: val + other.tensor_dict[key] for key, val in self.tensor_dict.items()}
        return Evaluation(self.data_type, self.global_step, new_idxs, new_yp, tensor_dict=new_tensor_dict)

    def __radd__(self, other):
        return self.__add__(other)
class LabeledEvaluation(Evaluation):
    """``Evaluation`` that also carries gold labels and an id-to-answer mapping."""

    def __init__(self, data_type, global_step, idxs, yp, y, id2answer_dict, tensor_dict=None):
        """
        :param y: gold labels, aligned with ``yp``
        :param id2answer_dict: mapping from example id to answer; ``__add__``
            expects it to hold a nested ``'scores'`` mapping
        """
        super(LabeledEvaluation, self).__init__(data_type, global_step, idxs, yp, tensor_dict=tensor_dict)
        self.y = y
        self.dict['y'] = y
        self.id2answer_dict = id2answer_dict

    def __add__(self, other):
        """Concatenate two labeled evaluations of the same split and step.

        ``other == 0`` returns ``self`` unchanged so that ``sum()`` works.
        """
        if other == 0:
            return self
        assert self.data_type == other.data_type
        assert self.global_step == other.global_step
        new_yp = self.yp + other.yp
        new_y = self.y + other.y
        new_idxs = self.idxs + other.idxs
        new_id2answer_dict = dict(list(self.id2answer_dict.items()) + list(other.id2answer_dict.items()))
        # The nested score mapping has to be merged separately; the shallow
        # merge above would otherwise keep only other's scores.
        new_id2score_dict = dict(list(self.id2answer_dict['scores'].items()) + list(other.id2answer_dict['scores'].items()))
        new_id2answer_dict['scores'] = new_id2score_dict
        # Initialised up front: it used to be unbound when no tensors were attached.
        new_tensor_dict = None
        if self.tensor_dict is not None:
            new_tensor_dict = {key: np.concatenate((val, other.tensor_dict[key]), axis=0) for key, val in self.tensor_dict.items()}
        return LabeledEvaluation(self.data_type, self.global_step, new_idxs, new_yp, new_y, new_id2answer_dict, tensor_dict=new_tensor_dict)
class AccuracyEvaluation(LabeledEvaluation):
    """``LabeledEvaluation`` with per-example correctness, accuracy and loss."""

    def __init__(self, data_type, global_step, idxs, yp, y, id2answer_dict, correct, loss, tensor_dict=None):
        """
        :param correct: per-example correctness flags; must not be empty
        :param loss: mean loss over the examples
        """
        super(AccuracyEvaluation, self).__init__(data_type, global_step, idxs, yp, y, id2answer_dict, tensor_dict=tensor_dict)
        self.loss = loss
        self.correct = correct
        self.id2answer_dict = id2answer_dict
        self.acc = sum(correct) / len(correct)
        self.dict['loss'] = loss
        self.dict['correct'] = correct
        self.dict['acc'] = self.acc
        loss_value = tf.Summary.Value(tag='{}/loss'.format(data_type), simple_value=self.loss)
        acc_value = tf.Summary.Value(tag='{}/acc'.format(data_type), simple_value=self.acc)
        self.summaries = [tf.Summary(value=[loss_value]), tf.Summary(value=[acc_value])]

    def __repr__(self):
        template = "{} step {}: accuracy={}={}/{}, loss={}"
        return template.format(self.data_type, self.global_step, self.acc,
                               sum(self.correct), self.num_examples, self.loss)

    def __add__(self, other):
        """Merge two evaluations; the loss becomes the example-weighted mean.

        ``other == 0`` returns ``self`` unchanged so that ``sum()`` works.
        """
        if other == 0:
            return self
        assert self.data_type == other.data_type
        assert self.global_step == other.global_step
        merged_idxs = self.idxs + other.idxs
        merged_yp = self.yp + other.yp
        merged_y = self.y + other.y
        merged_correct = self.correct + other.correct
        weighted_loss = self.loss * self.num_examples + other.loss * other.num_examples
        merged_loss = weighted_loss / len(merged_correct)
        merged_answers = dict(self.id2answer_dict)
        merged_answers.update(other.id2answer_dict)
        merged_scores = dict(self.id2answer_dict['scores'])
        merged_scores.update(other.id2answer_dict['scores'])
        merged_answers['scores'] = merged_scores
        merged_tensors = None
        if self.tensor_dict is not None:
            merged_tensors = {}
            for key, val in self.tensor_dict.items():
                merged_tensors[key] = np.concatenate((val, other.tensor_dict[key]), axis=0)
        return AccuracyEvaluation(self.data_type, self.global_step, merged_idxs, merged_yp, merged_y,
                                  merged_answers, merged_correct, merged_loss, tensor_dict=merged_tensors)
class Evaluator(object):
    """Runs a model on batches and wraps the predictions in ``Evaluation`` objects."""

    def __init__(self, config, model, tensor_dict=None):
        """
        :param config: configuration object, stored as-is
        :param model: model exposing ``global_step``, ``yp`` and ``get_feed_dict``
        :param tensor_dict: optional mapping of names to extra tensors to fetch
        """
        self.config = config
        self.model = model
        self.global_step = model.global_step
        self.yp = model.yp
        self.tensor_dict = tensor_dict if tensor_dict is not None else {}

    def get_evaluation(self, sess, batch):
        """Evaluate one ``(idxs, data_set)`` batch and return an ``Evaluation``."""
        idxs, data_set = batch
        feed_dict = self.model.get_feed_dict(data_set, False, supervised=False)
        fetches = [self.global_step, self.yp, list(self.tensor_dict.values())]
        global_step, yp, vals = sess.run(fetches, feed_dict=feed_dict)
        # Keep only the rows that belong to real examples of this batch.
        predictions = yp[:data_set.num_examples]
        fetched = dict(zip(self.tensor_dict.keys(), vals))
        return Evaluation(data_set.data_type, int(global_step), idxs, predictions.tolist(), tensor_dict=fetched)

    def get_evaluation_from_batches(self, sess, batches):
        """Evaluate every batch and merge the results with ``sum``."""
        return sum(self.get_evaluation(sess, batch) for batch in batches)
class LabeledEvaluator(Evaluator):
    """``Evaluator`` that also records the labels that were fed to the model."""

    def __init__(self, config, model, tensor_dict=None):
        super(LabeledEvaluator, self).__init__(config, model, tensor_dict=tensor_dict)
        self.y = model.y

    def get_evaluation(self, sess, batch):
        """Evaluate one ``(idxs, data_set)`` batch and return a ``LabeledEvaluation``."""
        idxs, data_set = batch
        feed_dict = self.model.get_feed_dict(data_set, False, supervised=False)
        global_step, yp, vals = sess.run([self.global_step, self.yp, list(self.tensor_dict.values())], feed_dict=feed_dict)
        yp = yp[:data_set.num_examples]
        # NOTE(review): the feed dict is built with supervised=False; confirm
        # that it still contains the label placeholder looked up here.
        y = feed_dict[self.y]
        tensor_dict = dict(zip(self.tensor_dict.keys(), vals))
        # LabeledEvaluation requires an id2answer_dict and its __add__ reads the
        # 'scores' entry; no answers are decoded at this level, so pass an
        # empty mapping instead of omitting the argument (which raised TypeError).
        id2answer_dict = {'scores': {}}
        e = LabeledEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), y.tolist(), id2answer_dict, tensor_dict=tensor_dict)
        return e
class AccuracyEvaluator(LabeledEvaluator):
    """Evaluator that scores predictions with ``compare`` and reports accuracy and loss."""

    def __init__(self, config, model, tensor_dict=None):
        super(AccuracyEvaluator, self).__init__(config, model, tensor_dict=tensor_dict)
        self.loss = model.loss

    def get_evaluation(self, sess, batch):
        """Evaluate one batch and return an ``AccuracyEvaluation``."""
        idxs, data_set = self._split_batch(batch)
        assert isinstance(data_set, DataSet)
        feed_dict = self._get_feed_dict(batch)
        y = data_set.data['y']
        fetches = [self.global_step, self.yp, self.loss, list(self.tensor_dict.values())]
        global_step, yp, loss, vals = sess.run(fetches, feed_dict=feed_dict)
        # Keep only the rows that belong to real examples of this batch.
        yp = yp[:data_set.num_examples]
        compare = self.__class__.compare
        results = [compare(data_set.get_one(idx), ypi) for idx, ypi in zip(data_set.valid_idxs, yp)]
        correct, probs, preds = zip(*results)
        tensor_dict = dict(zip(self.tensor_dict.keys(), vals))
        ids = data_set.data['ids']
        id2answer_dict = dict(zip(ids, preds))
        id2answer_dict['scores'] = dict(zip(ids, probs))
        return AccuracyEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), y,
                                  id2answer_dict, correct, float(loss), tensor_dict=tensor_dict)

    @staticmethod
    def compare(data, ypi):
        """Return ``(is_correct, max_probability, answer_text)`` for one example.

        The prediction is correct when the argmax of ``ypi`` equals the start
        of any gold ``(start, stop)`` pair in ``data['y']``. The answer text
        is always a single space.
        """
        prob = float(np.max(ypi))
        hit = any(start == int(np.argmax(ypi)) for start, _ in data['y'])
        return hit, prob, " "

    def _split_batch(self, batch):
        # Hook for subclasses; a single batch is already an (idxs, data_set) pair.
        return batch

    def _get_feed_dict(self, batch):
        # Hook for subclasses; feeds the batch's data set to the model.
        return self.model.get_feed_dict(batch[1], False)
class CNNAccuracyEvaluator(AccuracyEvaluator):
    """Accuracy evaluator whose answers are tokens starting with ``@``."""

    @staticmethod
    def compare(data, ypi):
        """Return ``(is_correct, score, predicted_token)`` for one example.

        Sums the values of ``ypi`` over all positions of each ``@`` token in
        ``data['x'][0]`` and predicts the token with the largest total.
        """
        # ypi: [N, M, JX] numbers
        gold = data['y'][0]  # entity
        words = data['x'][0]  # [N, M, JX] words
        totals = defaultdict(int)
        for score_row, word_row in zip(ypi, words):
            for score, word in zip(score_row, word_row):
                if not word.startswith("@"):
                    continue
                totals[word] += score
        pred = max(totals, key=totals.get)
        prob = totals[pred]
        assert pred.startswith("@")
        assert gold.startswith("@")
        return pred == gold, prob, pred
class AccuracyEvaluator2(AccuracyEvaluator):
    """Accuracy evaluator that matches a two-level ``(row, column)`` start position."""

    @staticmethod
    def compare(yi, ypi):
        """Return True if the argmax position of ``ypi`` equals any gold start in ``yi``.

        NOTE(review): unlike ``AccuracyEvaluator.compare`` this takes the gold
        spans directly and returns a bare bool; confirm how callers use it.
        """
        def _predicted_start():
            row = int(np.argmax(np.max(ypi, 1)))
            col = int(np.argmax(ypi[row]))
            return row, col

        return any(tuple(start) == _predicted_start() for start, _ in yi)
class ForwardEvaluation(Evaluation):
    """``Evaluation`` for forward-only runs: start/end predictions, loss and answers."""

    def __init__(self, data_type, global_step, idxs, yp, yp2, loss, id2answer_dict, tensor_dict=None):
        """
        :param yp2: second set of predictions, aligned with ``yp``
        :param loss: mean loss over the examples
        :param id2answer_dict: mapping from example id to answer
        """
        super(ForwardEvaluation, self).__init__(data_type, global_step, idxs, yp, tensor_dict=tensor_dict)
        self.yp2 = yp2
        self.loss = loss
        self.dict['loss'] = loss
        self.dict['yp2'] = yp2
        self.id2answer_dict = id2answer_dict

    def __add__(self, other):
        """Merge two evaluations; the loss becomes the example-weighted mean.

        ``other == 0`` returns ``self`` unchanged so that ``sum()`` works.
        """
        if other == 0:
            return self
        assert self.data_type == other.data_type
        assert self.global_step == other.global_step
        new_idxs = self.idxs + other.idxs
        new_yp = self.yp + other.yp
        new_yp2 = self.yp2 + other.yp2
        new_loss = (self.loss * self.num_examples + other.loss * other.num_examples) / len(new_yp)
        new_id2answer_dict = dict(list(self.id2answer_dict.items()) + list(other.id2answer_dict.items()))
        # Initialised up front: it used to be unbound when no tensors were attached.
        new_tensor_dict = None
        if self.tensor_dict is not None:
            new_tensor_dict = {key: np.concatenate((val, other.tensor_dict[key]), axis=0) for key, val in self.tensor_dict.items()}
        return ForwardEvaluation(self.data_type, self.global_step, new_idxs, new_yp, new_yp2, new_loss, new_id2answer_dict, tensor_dict=new_tensor_dict)

    def __repr__(self):
        return "{} step {}: loss={:.4f}".format(self.data_type, self.global_step, self.loss)
class F1Evaluation(AccuracyEvaluation):
    """``AccuracyEvaluation`` extended with end predictions and per-example F1 scores."""

    def __init__(self, data_type, global_step, idxs, yp, yp2, y, correct, loss, f1s, id2answer_dict, tensor_dict=None):
        """
        :param yp2: second set of predictions, aligned with ``yp``
        :param f1s: per-example F1 scores
        :param id2answer_dict: mapping from example id to answer
        """
        # AccuracyEvaluation expects id2answer_dict between y and correct;
        # leaving it out shifted the arguments and raised TypeError.
        super(F1Evaluation, self).__init__(data_type, global_step, idxs, yp, y, id2answer_dict, correct, loss, tensor_dict=tensor_dict)
        self.yp2 = yp2
        self.f1s = f1s
        self.f1 = float(np.mean(f1s))
        self.dict['yp2'] = yp2
        self.dict['f1s'] = f1s
        self.dict['f1'] = self.f1
        self.id2answer_dict = id2answer_dict
        f1_summary = tf.Summary(value=[tf.Summary.Value(tag='{}/f1'.format(data_type), simple_value=self.f1)])
        self.summaries.append(f1_summary)

    def __add__(self, other):
        """Merge two evaluations; the loss becomes the example-weighted mean.

        ``other == 0`` returns ``self`` unchanged so that ``sum()`` works.
        The merged result does not carry a ``tensor_dict``.
        """
        if other == 0:
            return self
        assert self.data_type == other.data_type
        assert self.global_step == other.global_step
        new_idxs = self.idxs + other.idxs
        new_yp = self.yp + other.yp
        new_yp2 = self.yp2 + other.yp2
        new_y = self.y + other.y
        new_correct = self.correct + other.correct
        new_f1s = self.f1s + other.f1s
        new_loss = (self.loss * self.num_examples + other.loss * other.num_examples) / len(new_correct)
        new_id2answer_dict = dict(list(self.id2answer_dict.items()) + list(other.id2answer_dict.items()))
        return F1Evaluation(self.data_type, self.global_step, new_idxs, new_yp, new_yp2, new_y, new_correct, new_loss, new_f1s, new_id2answer_dict)

    def __repr__(self):
        return "{} step {}: accuracy={:.4f}, f1={:.4f}, loss={:.4f}".format(self.data_type, self.global_step, self.acc, self.f1, self.loss)
class F1Evaluator(LabeledEvaluator):
    """Evaluator that decodes answer spans and reports exact match, F1 and loss."""

    def __init__(self, config, model, tensor_dict=None):
        super(F1Evaluator, self).__init__(config, model, tensor_dict=tensor_dict)
        self.yp2 = model.yp2
        self.loss = model.loss

    def get_evaluation(self, sess, batch):
        """Evaluate one batch and return an ``F1Evaluation``.

        Gold spans are remapped first when ``config.squash`` (flattened to a
        single sentence) or ``config.single`` (sentence index forced to 0) is set.
        """
        idxs, data_set = self._split_batch(batch)
        assert isinstance(data_set, DataSet)
        feed_dict = self._get_feed_dict(batch)
        global_step, yp, yp2, loss, vals = sess.run([self.global_step, self.yp, self.yp2, self.loss, list(self.tensor_dict.values())], feed_dict=feed_dict)
        y = data_set.data['y']
        if self.config.squash:
            # Convert (sentence, word) positions into offsets within the flattened paragraph.
            new_y = []
            for xi, yi in zip(data_set.data['x'], y):
                new_yi = []
                for start, stop in yi:
                    start_offset = sum(map(len, xi[:start[0]]))
                    stop_offset = sum(map(len, xi[:stop[0]]))
                    new_start = 0, start_offset + start[1]
                    new_stop = 0, stop_offset + stop[1]
                    new_yi.append((new_start, new_stop))
                new_y.append(new_yi)
            y = new_y
        if self.config.single:
            new_y = []
            for yi in y:
                new_yi = []
                for start, stop in yi:
                    new_start = 0, start[1]
                    new_stop = 0, stop[1]
                    new_yi.append((new_start, new_stop))
                new_y.append(new_yi)
            y = new_y

        # Keep only the rows that belong to real examples of this batch.
        yp, yp2 = yp[:data_set.num_examples], yp2[:data_set.num_examples]
        # NOTE(review): get_best_span is not among this file's visible imports;
        # confirm it is brought into scope elsewhere.
        spans = [get_best_span(ypi, yp2i) for ypi, yp2i in zip(yp, yp2)]

        def _get(xi, span):
            """Return the predicted words, or [""] if the span falls outside the text."""
            if len(xi) <= span[0][0]:
                return [""]
            # The stop index is exclusive (it is used as a slice end below), so
            # a span ending on the sentence's last word has stop == len and is
            # still valid; only stop > len is out of range.
            if len(xi[span[0][0]]) < span[1][1]:
                return [""]
            return xi[span[0][0]][span[0][1]:span[1][1]]

        id2answer_dict = {id_: " ".join(_get(xi, span))
                          for id_, xi, span in zip(data_set.data['ids'], data_set.data['x'], spans)}
        correct = [self.__class__.compare2(yi, span) for yi, span in zip(y, spans)]
        f1s = [self.__class__.span_f1(yi, span) for yi, span in zip(y, spans)]
        tensor_dict = dict(zip(self.tensor_dict.keys(), vals))
        e = F1Evaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), yp2.tolist(), y,
                         correct, float(loss), f1s, id2answer_dict, tensor_dict=tensor_dict)
        return e

    def _split_batch(self, batch):
        # Hook for subclasses; a single batch is already an (idxs, data_set) pair.
        return batch

    def _get_feed_dict(self, batch):
        # Hook for subclasses; feeds the batch's data set to the model.
        return self.model.get_feed_dict(batch[1], False)

    @staticmethod
    def compare(yi, ypi, yp2i):
        """Return True if the argmax start/end of the raw scores match a gold span.

        The end position is searched only from the predicted start onwards
        within the predicted start's row.
        """
        for start, stop in yi:
            aypi = argmax(ypi)
            mask = np.zeros(yp2i.shape)
            mask[aypi[0], aypi[1]:] = np.ones([yp2i.shape[1] - aypi[1]])
            if tuple(start) == aypi and (stop[0], stop[1]-1) == argmax(yp2i * mask):
                return True
        return False

    @staticmethod
    def compare2(yi, span):
        """Return True if ``span`` equals any gold ``(start, stop)`` pair in ``yi``."""
        for start, stop in yi:
            if tuple(start) == span[0] and tuple(stop) == span[1]:
                return True
        return False

    @staticmethod
    def span_f1(yi, span):
        """Return the best F1 between ``span`` and the gold spans starting in the same sentence.

        Gold spans that start in a different sentence contribute nothing; the
        result is 0 when none match.
        """
        max_f1 = 0
        for start, stop in yi:
            if start[0] == span[0][0]:
                true_span = start[1], stop[1]
                pred_span = span[0][1], span[1][1]
                # Inside this function the name resolves to the module-level
                # span_f1 helper, not to this static method.
                f1 = span_f1(true_span, pred_span)
                max_f1 = max(f1, max_f1)
        return max_f1
class MultiGPUF1Evaluator(F1Evaluator):
    """F1 evaluator that runs several model replicas in one ``sess.run``.

    The replicas' ``yp`` / ``yp2`` outputs are padded to a common shape and
    concatenated along the batch axis; the loss is the mean of their losses.
    """

    def __init__(self, config, models, tensor_dict=None):
        super(MultiGPUF1Evaluator, self).__init__(config, models[0], tensor_dict=tensor_dict)
        self.models = models
        with tf.name_scope("eval_concat"):
            dims = (config.batch_size, config.max_num_sents, config.max_sent_size)
            padded_yp = [padded_reshape(replica.yp, list(dims)) for replica in models]
            self.yp = tf.concat(axis=0, values=padded_yp)
            padded_yp2 = [padded_reshape(replica.yp2, list(dims)) for replica in models]
            self.yp2 = tf.concat(axis=0, values=padded_yp2)
            replica_losses = [replica.loss for replica in models]
            self.loss = tf.add_n(replica_losses)/len(models)

    def _split_batch(self, batches):
        """Merge the per-replica ``(idxs, data_set)`` pairs into a single pair."""
        idxs_list, data_sets = zip(*batches)
        idxs = ()
        for part in idxs_list:
            idxs = idxs + part
        data_set = data_sets[0].get_empty()
        for part in data_sets:
            data_set = data_set + part
        return idxs, data_set

    def _get_feed_dict(self, batches):
        """Union of every replica's evaluation-mode feed dict for its own batch."""
        combined = {}
        for replica, pair in zip(self.models, batches):
            combined.update(replica.get_feed_dict(pair[1], False))
        return combined
class MultiGPUCNNAccuracyEvaluator(CNNAccuracyEvaluator):
    """Accuracy evaluator that runs several model replicas in one ``sess.run``.

    The replicas' ``yp`` outputs are padded to a common shape and concatenated
    along the batch axis; the loss is the mean of their losses.
    """

    def __init__(self, config, models, tensor_dict=None):
        super(MultiGPUCNNAccuracyEvaluator, self).__init__(config, models[0], tensor_dict=tensor_dict)
        self.models = models
        with tf.name_scope("eval_concat"):
            dims = (config.batch_size, config.max_num_sents, config.max_sent_size)
            padded_yp = [padded_reshape(replica.yp, list(dims)) for replica in models]
            self.yp = tf.concat(axis=0, values=padded_yp)
            replica_losses = [replica.loss for replica in models]
            self.loss = tf.add_n(replica_losses)/len(models)

    def _split_batch(self, batches):
        """Merge the per-replica ``(idxs, data_set)`` pairs into a single pair."""
        idxs_list, data_sets = zip(*batches)
        idxs = ()
        for part in idxs_list:
            idxs = idxs + part
        data_set = data_sets[0].get_empty()
        for part in data_sets:
            data_set = data_set + part
        return idxs, data_set

    def _get_feed_dict(self, batches):
        """Union of every replica's evaluation-mode feed dict for its own batch."""
        combined = {}
        for replica, pair in zip(self.models, batches):
            combined.update(replica.get_feed_dict(pair[1], False))
        return combined
class ForwardEvaluator(Evaluator):
    """Evaluator for unlabeled data: predicts answer spans without scoring them.

    NOTE(review): ``__init__`` reads ``model.yp2``; confirm the model passed in
    actually defines that attribute.
    """

    def __init__(self, config, model, tensor_dict=None):
        super(ForwardEvaluator, self).__init__(config, model, tensor_dict=tensor_dict)
        self.yp2 = model.yp2
        self.loss = model.loss

    def get_evaluation(self, sess, batch):
        """Run the model on ``batch`` and package the result as a ``ForwardEvaluation``.

        :param sess: session used to run the graph
        :param batch: ``(idxs, data_set)`` pair
        :return: ``ForwardEvaluation`` with predictions, loss and answer strings
        """
        idxs, data_set = batch
        assert isinstance(data_set, DataSet)
        feed_dict = self.model.get_feed_dict(data_set, False)
        global_step, yp, yp2, loss, vals = sess.run(
            [self.global_step, self.yp, self.yp2, self.loss, list(self.tensor_dict.values())],
            feed_dict=feed_dict)

        # Drop the rows that only exist to pad the batch to a fixed size.
        yp, yp2 = yp[:data_set.num_examples], yp2[:data_set.num_examples]
        spans = [get_best_span(ypi, yp2i) for ypi, yp2i in zip(yp, yp2)]

        def _get(xi, span):
            """Return the words of ``xi`` covered by ``span``, or ``[""]`` if out of range."""
            if len(xi) <= span[0][0]:
                return [""]
            # span[1][1] is an exclusive stop index (get_best_span adds 1), so a
            # span ending on the sentence's last word has stop == len(sentence)
            # and is valid; only a stop beyond the sentence is out of range.
            if len(xi[span[0][0]]) < span[1][1]:
                return [""]
            return xi[span[0][0]][span[0][1]:span[1][1]]

        id2answer_dict = {id_: " ".join(_get(xi, span))
                          for id_, xi, span in zip(data_set.data['ids'], data_set.data['x'], spans)}
        tensor_dict = dict(zip(self.tensor_dict.keys(), vals))
        e = ForwardEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), yp2.tolist(),
                              float(loss), id2answer_dict, tensor_dict=tensor_dict)
        return e

    @staticmethod
    def compare(yi, ypi, yp2i):
        """Return True if the argmax start, and the argmax stop at or after it
        in the same sentence, match any gold span in ``yi``."""
        for start, stop in yi:
            aypi = argmax(ypi)
            # Only stop positions in the predicted start's sentence, at or
            # after the start word, are allowed to win the argmax.
            mask = np.zeros(yp2i.shape)
            mask[aypi[0], aypi[1]:] = np.ones([yp2i.shape[1] - aypi[1]])
            if tuple(start) == aypi and (stop[0], stop[1]-1) == argmax(yp2i * mask):
                return True
        return False

    @staticmethod
    def compare2(yi, span):
        """Return True if ``span`` exactly equals one of the gold spans in ``yi``."""
        for start, stop in yi:
            if tuple(start) == span[0] and tuple(stop) == span[1]:
                return True
        return False

    @staticmethod
    def span_f1(yi, span):
        """Return the best F1 of ``span`` against the gold spans in the same sentence.

        Gold spans that start in a different sentence are ignored; the result
        is 0 when none share the predicted sentence.
        """
        max_f1 = 0
        for start, stop in yi:
            if start[0] == span[0][0]:
                true_span = start[1], stop[1]
                pred_span = span[0][1], span[1][1]
                # Resolves to the module-level span_f1 helper, not this method.
                f1 = span_f1(true_span, pred_span)
                max_f1 = max(f1, max_f1)
        return max_f1
def get_best_span(ypi, yp2i):
    """Pick the highest-scoring span that lies within a single sentence.

    For every sentence and every stop position ``j`` the candidate start is
    the position with the largest start score among ``0..j`` (earliest wins
    ties); the candidate's score is ``start_score * stop_score``.

    :param ypi: per-sentence sequences of start scores
    :param yp2i: per-sentence sequences of stop scores
    :return: ``((sent, start), (sent, stop))`` with ``stop`` exclusive; when no
        product is positive the result is ``((0, 0), (0, 2))``
    """
    top_score = 0
    top_sent = 0
    top_start, top_end = 0, 1
    for sent_idx, (start_scores, stop_scores) in enumerate(zip(ypi, yp2i)):
        start_at = 0  # running argmax of start_scores[0..end_at]
        for end_at, start_score in enumerate(start_scores):
            if start_scores[start_at] < start_score:
                start_at = end_at
            candidate = start_scores[start_at] * stop_scores[end_at]
            if candidate > top_score:
                top_score = candidate
                top_sent = sent_idx
                top_start, top_end = start_at, end_at
    return (top_sent, top_start), (top_sent, top_end + 1)
def get_span_score_pairs(ypi, yp2i):
    """List every within-sentence span together with its score.

    :param ypi: per-sentence sequences of start scores
    :param yp2i: per-sentence sequences of stop scores
    :return: list of ``(((sent, start), (sent, stop)), score)`` with ``stop``
        exclusive and ``score = start_score * stop_score``, ordered by
        sentence, then start, then stop
    """
    return [
        (((sent_idx, begin), (sent_idx, last + 1)), start_scores[begin] * stop_scores[last])
        for sent_idx, (start_scores, stop_scores) in enumerate(zip(ypi, yp2i))
        for begin in range(len(start_scores))
        for last in range(begin, len(stop_scores))
    ]

View file

@ -0,0 +1,70 @@
import gzip
import json
from json import encoder
import os
import tensorflow as tf
from basic_cnn.evaluator import Evaluation, F1Evaluation
from my.utils import short_floats
import pickle
class GraphHandler(object):
    """Owns checkpoint saving/loading, the summary writer and evaluation dumps."""

    def __init__(self, config):
        self.config = config
        self.saver = tf.train.Saver(max_to_keep=config.max_to_keep)
        self.writer = None  # created by initialize() in 'train' mode only
        self.save_path = os.path.join(config.save_dir, config.model_name)

    def initialize(self, sess):
        """Restore variables (``config.load``) or initialize them; open the
        summary writer when ``config.mode`` is ``'train'``."""
        cfg = self.config
        if not cfg.load:
            sess.run(tf.global_variables_initializer())
        else:
            self._load(sess)
        if cfg.mode == 'train':
            self.writer = tf.summary.FileWriter(cfg.log_dir, graph=tf.get_default_graph())

    def save(self, sess, global_step=None):
        """Write a checkpoint to ``self.save_path``."""
        self.saver.save(sess, self.save_path, global_step=global_step)

    def _load(self, sess):
        """Restore variables from, in priority order: ``config.load_path``, the
        checkpoint for ``config.load_step``, or the latest one in ``save_dir``."""
        cfg = self.config
        if cfg.load_path:
            ckpt_path = cfg.load_path
        elif cfg.load_step > 0:
            ckpt_name = "{}-{}".format(cfg.model_name, cfg.load_step)
            ckpt_path = os.path.join(cfg.save_dir, ckpt_name)
        else:
            ckpt_dir = cfg.save_dir
            state = tf.train.get_checkpoint_state(ckpt_dir)
            assert state is not None, "cannot load checkpoint at {}".format(ckpt_dir)
            ckpt_path = state.model_checkpoint_path
        print("Loading saved model from {}".format(ckpt_path))
        self.saver.restore(sess, ckpt_path)

    def add_summary(self, summary, global_step):
        """Forward one summary to the writer."""
        self.writer.add_summary(summary, global_step)

    def add_summaries(self, summaries, global_step):
        """Forward each summary, in order, to ``add_summary``."""
        for item in summaries:
            self.add_summary(item, global_step)

    def dump_eval(self, e, precision=2, path=None):
        """Write ``e.dict`` as gzipped pickle (``config.dump_pickle``) or as JSON
        with floats shortened to ``precision``.

        The default file is ``<eval_dir>/<data_type>-<zero-padded step>`` with a
        ``.pklz`` or ``.json`` suffix; ``path`` overrides it.
        """
        assert isinstance(e, Evaluation)
        if self.config.dump_pickle:
            if not path:
                file_name = "{}-{}.pklz".format(e.data_type, str(e.global_step).zfill(6))
                path = os.path.join(self.config.eval_dir, file_name)
            with gzip.open(path, 'wb', compresslevel=3) as fh:
                pickle.dump(e.dict, fh)
            return
        if not path:
            file_name = "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6))
            path = os.path.join(self.config.eval_dir, file_name)
        with open(path, 'w') as fh:
            json.dump(short_floats(e.dict, precision), fh)

    def dump_answer(self, e, path=None):
        """Write ``e.id2answer_dict`` as JSON (default location is under ``answer_dir``)."""
        assert isinstance(e, Evaluation)
        if not path:
            file_name = "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6))
            path = os.path.join(self.config.answer_dir, file_name)
        with open(path, 'w') as fh:
            json.dump(e.id2answer_dict, fh)

View file

@ -0,0 +1,238 @@
import argparse
import json
import math
import os
import shutil
from pprint import pprint
import tensorflow as tf
from tqdm import tqdm
import numpy as np
from basic_cnn.evaluator import F1Evaluator, Evaluator, ForwardEvaluator, MultiGPUF1Evaluator, CNNAccuracyEvaluator, \
MultiGPUCNNAccuracyEvaluator
from basic_cnn.graph_handler import GraphHandler
from basic_cnn.model import Model, get_multi_gpu_models
from basic_cnn.trainer import Trainer, MultiGPUTrainer
from basic_cnn.read_data import read_data, get_cnn_data_filter, update_config
def main(config):
    """Prepare output directories and run the routine chosen by ``config.mode``.

    ``'train'`` runs training, ``'test'`` and ``'dev'`` run evaluation and
    ``'forward'`` runs prediction only, all under ``tf.device(config.device)``.

    :raises ValueError: if ``config.mode`` is none of the above
    """
    set_dirs(config)
    with tf.device(config.device):
        mode = config.mode
        if mode == 'train':
            runner = _train
        elif mode == 'test' or mode == 'dev':
            runner = _test
        elif mode == 'forward':
            runner = _forward
        else:
            raise ValueError("invalid value for 'mode': {}".format(mode))
        runner(config)
def _config_draft(config):
    """Shrink the run to a quick smoke test when ``config.draft`` is truthy.

    Sets ``num_steps`` to 2 and every period / evaluation batch count to 1;
    leaves ``config`` untouched otherwise.
    """
    if not config.draft:
        return
    config.num_steps = 2
    for field in ('eval_period', 'log_period', 'save_period', 'eval_num_batches'):
        setattr(config, field, 1)
def _train(config):
    """Train the multi-GPU model, with periodic summaries, checkpoints and evaluation.

    Reads the train/dev data, builds the embedding matrix (pre-trained vectors
    where available, random normal otherwise), constructs one model replica
    per GPU and then runs the training loop.

    :param config: run configuration; ``emb_mat`` is stored on it as a side effect
    """
    # load_metadata(config, 'train')  # this updates the config file according to metadata file
    data_filter = get_cnn_data_filter(config)
    train_data = read_data(config, 'train', config.load, data_filter=data_filter)
    dev_data = read_data(config, 'dev', True, data_filter=data_filter)
    # test_data = read_data(config, 'test', True, data_filter=data_filter)
    update_config(config, [train_data, dev_data])

    _config_draft(config)

    word2vec_dict = train_data.shared['lower_word2vec'] if config.lower_word else train_data.shared['word2vec']
    word2idx_dict = train_data.shared['word2idx']
    idx2vec_dict = {word2idx_dict[word]: vec for word, vec in word2vec_dict.items() if word in word2idx_dict}
    print("{}/{} unique words have corresponding glove vectors.".format(len(idx2vec_dict), len(word2idx_dict)))
    # Indices without a pre-trained vector get a sample from a standard normal.
    emb_mat = np.array([idx2vec_dict[idx] if idx in idx2vec_dict
                        else np.random.multivariate_normal(np.zeros(config.word_emb_size), np.eye(config.word_emb_size))
                        for idx in range(config.word_vocab_size)])
    config.emb_mat = emb_mat

    # construct model graph and variables (using default graph)
    # A flags-style config exposes ``__flags``; the plain Config object built
    # from JSON does not, so fall back to its attribute dict instead of
    # failing with AttributeError.
    flags = getattr(config, '__flags', None)
    pprint(flags if flags is not None else vars(config), indent=2)
    # model = Model(config)
    models = get_multi_gpu_models(config)
    model = models[0]
    trainer = MultiGPUTrainer(config, models)
    evaluator = MultiGPUCNNAccuracyEvaluator(config, models, tensor_dict=model.tensor_dict if config.vis else None)
    graph_handler = GraphHandler(config)  # controls all tensors and variables in the graph, including loading /saving

    # Variables
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    graph_handler.initialize(sess)

    # begin training
    print(train_data.num_examples)
    num_steps = config.num_steps or int(math.ceil(train_data.num_examples / (config.batch_size * config.num_gpus))) * config.num_epochs
    global_step = 0
    for batches in tqdm(train_data.get_multi_batches(config.batch_size, config.num_gpus,
                                                     num_steps=num_steps, shuffle=True, cluster=config.cluster), total=num_steps):
        global_step = sess.run(model.global_step) + 1  # +1 because all calculations are done after step
        get_summary = global_step % config.log_period == 0
        loss, summary, train_op = trainer.step(sess, batches, get_summary=get_summary)
        if get_summary:
            graph_handler.add_summary(summary, global_step)

        # occasional saving
        if global_step % config.save_period == 0:
            graph_handler.save(sess, global_step=global_step)

        if not config.eval:
            continue
        # Occasional evaluation
        if global_step % config.eval_period == 0:
            # Separate name so the training step count above is not clobbered.
            eval_steps = math.ceil(dev_data.num_examples / (config.batch_size * config.num_gpus))
            if 0 < config.eval_num_batches < eval_steps:
                eval_steps = config.eval_num_batches
            e_train = evaluator.get_evaluation_from_batches(
                sess, tqdm(train_data.get_multi_batches(config.batch_size, config.num_gpus, num_steps=eval_steps), total=eval_steps)
            )
            graph_handler.add_summaries(e_train.summaries, global_step)
            e_dev = evaluator.get_evaluation_from_batches(
                sess, tqdm(dev_data.get_multi_batches(config.batch_size, config.num_gpus, num_steps=eval_steps), total=eval_steps))
            graph_handler.add_summaries(e_dev.summaries, global_step)

            if config.dump_eval:
                graph_handler.dump_eval(e_dev)
            if config.dump_answer:
                graph_handler.dump_answer(e_dev)
    # Make sure the final step is checkpointed even if it is off-period.
    if global_step % config.save_period != 0:
        graph_handler.save(sess, global_step=global_step)
def _test(config):
    """Evaluate a saved model on the split named by ``config.mode``.

    Requires ``config.load``. Per-batch evaluations are summed into one
    overall evaluation, which is printed and optionally dumped; with
    ``config.vis`` every batch evaluation is also dumped on its own.
    """
    assert config.load
    test_data = read_data(config, config.mode, True)
    update_config(config, [test_data])

    _config_draft(config)

    if config.use_glove_for_unk:
        word2vec_dict = test_data.shared['lower_word2vec'] if config.lower_word else test_data.shared['word2vec']
        new_word2idx_dict = test_data.shared['new_word2idx']
        idx2vec_dict = {idx: word2vec_dict[word] for word, idx in new_word2idx_dict.items()}
        # print("{}/{} unique words have corresponding glove vectors.".format(len(idx2vec_dict), len(word2idx_dict)))
        new_emb_mat = np.array([idx2vec_dict[idx] for idx in range(len(idx2vec_dict))], dtype='float32')
        config.new_emb_mat = new_emb_mat

    # A flags-style config exposes ``__flags``; the plain Config object built
    # from JSON does not, so fall back to its attribute dict instead of
    # failing with AttributeError.
    flags = getattr(config, '__flags', None)
    pprint(flags if flags is not None else vars(config), indent=2)
    models = get_multi_gpu_models(config)
    evaluator = MultiGPUCNNAccuracyEvaluator(config, models, tensor_dict=models[0].tensor_dict if config.vis else None)
    graph_handler = GraphHandler(config)  # controls all tensors and variables in the graph, including loading /saving

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    graph_handler.initialize(sess)
    num_steps = math.ceil(test_data.num_examples / (config.batch_size * config.num_gpus))
    if 0 < config.eval_num_batches < num_steps:
        num_steps = config.eval_num_batches

    e = None
    for multi_batch in tqdm(test_data.get_multi_batches(config.batch_size, config.num_gpus, num_steps=num_steps, cluster=config.cluster), total=num_steps):
        ei = evaluator.get_evaluation(sess, multi_batch)
        e = ei if e is None else e + ei
        if config.vis:
            # One dump per batch, named after the batch's first example index.
            eval_subdir = os.path.join(config.eval_dir, "{}-{}".format(ei.data_type, str(ei.global_step).zfill(6)))
            if not os.path.exists(eval_subdir):
                os.mkdir(eval_subdir)
            path = os.path.join(eval_subdir, str(ei.idxs[0]).zfill(8))
            graph_handler.dump_eval(ei, path=path)

    print(e)
    if config.dump_answer:
        print("dumping answer ...")
        graph_handler.dump_answer(e)
    if config.dump_eval:
        print("dumping eval ...")
        graph_handler.dump_eval(e)
def _forward(config):
    """Run a saved model forward on ``config.forward_name`` data and dump its answers.

    Requires ``config.load``. The answer file goes to ``config.answer_path``.
    """
    assert config.load
    test_data = read_data(config, config.forward_name, True)
    update_config(config, [test_data])

    _config_draft(config)

    if config.use_glove_for_unk:
        word2vec_dict = test_data.shared['lower_word2vec'] if config.lower_word else test_data.shared['word2vec']
        new_word2idx_dict = test_data.shared['new_word2idx']
        idx2vec_dict = {idx: word2vec_dict[word] for word, idx in new_word2idx_dict.items()}
        # print("{}/{} unique words have corresponding glove vectors.".format(len(idx2vec_dict), len(word2idx_dict)))
        new_emb_mat = np.array([idx2vec_dict[idx] for idx in range(len(idx2vec_dict))], dtype='float32')
        config.new_emb_mat = new_emb_mat

    # A flags-style config exposes ``__flags``; the plain Config object built
    # from JSON does not, so fall back to its attribute dict instead of
    # failing with AttributeError.
    flags = getattr(config, '__flags', None)
    pprint(flags if flags is not None else vars(config), indent=2)
    models = get_multi_gpu_models(config)
    model = models[0]
    # NOTE(review): ForwardEvaluator reads model.yp2 -- confirm this model defines it.
    evaluator = ForwardEvaluator(config, model)
    graph_handler = GraphHandler(config)  # controls all tensors and variables in the graph, including loading /saving

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    graph_handler.initialize(sess)

    num_batches = math.ceil(test_data.num_examples / config.batch_size)
    if 0 < config.eval_num_batches < num_batches:
        num_batches = config.eval_num_batches
    e = evaluator.get_evaluation_from_batches(sess, tqdm(test_data.get_batches(config.batch_size, num_batches=num_batches), total=num_batches))
    print(e)
    if config.dump_answer:
        print("dumping answer ...")
        graph_handler.dump_answer(e, path=config.answer_path)
    if config.dump_eval:
        print("dumping eval ...")
        graph_handler.dump_eval(e)
def set_dirs(config):
    """Create the output directory tree and record its paths on ``config``.

    Unless ``config.load`` is set, an existing ``config.out_dir`` is deleted
    first. Sets ``save_dir``, ``log_dir``, ``eval_dir`` and ``answer_dir`` on
    ``config`` and creates any of them that is missing.
    """
    # create directories
    start_fresh = not config.load
    if start_fresh and os.path.exists(config.out_dir):
        shutil.rmtree(config.out_dir)

    layout = (('save_dir', "save"), ('log_dir', "log"), ('eval_dir', "eval"), ('answer_dir', "answer"))
    for attr, leaf in layout:
        setattr(config, attr, os.path.join(config.out_dir, leaf))

    if not os.path.exists(config.out_dir):
        os.makedirs(config.out_dir)
    for directory in (config.save_dir, config.log_dir, config.answer_dir, config.eval_dir):
        if not os.path.exists(directory):
            os.mkdir(directory)
def _get_args():
    """Parse the command line, which takes one positional ``config_path`` argument."""
    cli = argparse.ArgumentParser()
    cli.add_argument("config_path")
    parsed = cli.parse_args()
    return parsed
class Config(object):
    """Plain attribute bag: every keyword argument becomes an instance attribute."""

    def __init__(self, **entries):
        vars(self).update(entries)
def _run():
    """Load the JSON config named on the command line and hand it to ``main``."""
    config_path = _get_args().config_path
    with open(config_path, 'r') as fh:
        settings = json.load(fh)
        main(Config(**settings))
# Script entry point: run only when this file is executed directly, not on import.
if __name__ == "__main__":
    _run()

View file

@ -0,0 +1,375 @@
import random
import itertools
import numpy as np
import tensorflow as tf
from tensorflow.python.ops.rnn_cell import BasicLSTMCell, GRUCell
from basic_cnn.read_data import DataSet
from basic_cnn.superhighway import SHCell
from my.tensorflow import exp_mask, get_initializer, VERY_SMALL_NUMBER
from my.tensorflow.nn import linear, double_linear_logits, linear_logits, softsel, dropout, get_logits, softmax, \
highway_network, multi_conv1d
from my.tensorflow.rnn import bidirectional_dynamic_rnn, dynamic_rnn
from my.tensorflow.rnn_cell import SwitchableDropoutWrapper, AttentionCell
def bi_attention(config, is_train, h, u, h_mask=None, u_mask=None, scope=None, tensor_dict=None):
    """
    Compute attention between the context ``h`` and the query ``u``.

    u_a:
        each position of h attending on u: ``softsel`` of the tiled query
        under ``u_logits``, the pairwise logits between h and u
    h_a (only when ``config.bi`` is set, otherwise None):
        ``softsel`` of h under the max of ``u_logits`` over the query axis,
        then tiled along the word axis

    Shapes are the ones given by the original annotations; M and JQ are read
    from config, JX is taken from ``tf.shape(h)[2]``.

    :param config: read for sizes, ``wd``, ``logit_func`` and ``bi``
    :param is_train: passed through to ``get_logits``
    :param h: [N, M, JX, d]
    :param u: [N, JQ, d]
    :param h_mask: [N, M, JX]; when None no mask is applied
    :param u_mask: [N, B] per the original annotation (presumably B == JQ --
        TODO confirm); appears to be required whenever h_mask is given
    :param scope: variable scope name, defaults to "bi_attention"
    :param tensor_dict: optional dict; when given, the softmax of ``u_logits``
        is stored under 'a_u'
    :return: ``(u_a, h_a)``; u_a is annotated [N, M, JX, d]
    """
    with tf.variable_scope(scope or "bi_attention"):
        # NOTE(review): N and d are unpacked but not used below.
        N, M, JX, JQ, d = config.batch_size, config.max_num_sents, config.max_sent_size, config.max_ques_size, config.hidden_size
        # Use the actual sentence length of this batch instead of the configured maximum.
        JX = tf.shape(h)[2]
        # Tile both inputs so every (context word, query word) pair lines up.
        h_aug = tf.tile(tf.expand_dims(h, 3), [1, 1, 1, JQ, 1])
        u_aug = tf.tile(tf.expand_dims(tf.expand_dims(u, 1), 1), [1, M, JX, 1, 1])
        if h_mask is None:
            and_mask = None
        else:
            # A pair is valid only if both its context word and its query word are valid.
            h_mask_aug = tf.tile(tf.expand_dims(h_mask, 3), [1, 1, 1, JQ])
            u_mask_aug = tf.tile(tf.expand_dims(tf.expand_dims(u_mask, 1), 1), [1, M, JX, 1])
            and_mask = h_mask_aug & u_mask_aug
        u_logits = get_logits([h_aug, u_aug], None, True, wd=config.wd, mask=and_mask,
                              is_train=is_train, func=config.logit_func, scope='u_logits')  # [N, M, JX, JQ]
        u_a = softsel(u_aug, u_logits)  # [N, M, JX, d]
        if tensor_dict is not None:
            # a_h = tf.nn.softmax(h_logits)  # [N, M, JX]
            a_u = tf.nn.softmax(u_logits)  # [N, M, JX, JQ]
            # tensor_dict['a_h'] = a_h
            tensor_dict['a_u'] = a_u
        if config.bi:
            h_a = softsel(h, tf.reduce_max(u_logits, 3))  # [N, M, d]
            # Repeat the per-sentence summary at every word position.
            h_a = tf.tile(tf.expand_dims(h_a, 2), [1, 1, JX, 1])
        else:
            h_a = None
        return u_a, h_a
def attention_layer(config, is_train, h, u, h_mask=None, u_mask=None, scope=None, tensor_dict=None):
    """Concatenate ``h`` with its attention features along axis 3.

    The result stacks ``h``, ``u_a`` and ``h * u_a``, plus ``h * h_a`` when
    ``config.bi`` is set, where ``u_a`` / ``h_a`` come from ``bi_attention``.
    """
    with tf.variable_scope(scope or "attention_layer"):
        u_a, h_a = bi_attention(config, is_train, h, u, h_mask=h_mask, u_mask=u_mask, tensor_dict=tensor_dict)
        features = [h, u_a, h * u_a]
        if config.bi:
            features.append(h * h_a)
        p0 = tf.concat(axis=3, values=features)
    return p0
class Model(object):
    """One model replica: placeholders, forward pass, loss and summaries.

    Constructing an instance builds everything into the current TensorFlow
    graph; ``get_feed_dict`` turns a ``DataSet`` batch into placeholder values.
    """

    def __init__(self, config, scope):
        """Create the placeholders and build forward pass, loss and (train mode only) EMA ops.

        :param config: configuration object read for sizes, flags and ``mode``
        :param scope: scope string used to filter the 'losses', 'summaries'
            and 'ema/*' collections
        """
        self.scope = scope
        self.config = config
        self.global_step = tf.get_variable('global_step', shape=[], dtype='int32',
                                           initializer=tf.constant_initializer(0), trainable=False)

        # Define forward inputs here
        N, M, JX, JQ, VW, VC, W = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.max_word_size
        # The third dimension of x / cx / x_mask is left dynamic (None).
        self.x = tf.placeholder('int32', [N, M, None], name='x')
        self.cx = tf.placeholder('int32', [N, M, None, W], name='cx')
        self.x_mask = tf.placeholder('bool', [N, M, None], name='x_mask')
        self.q = tf.placeholder('int32', [N, JQ], name='q')
        self.cq = tf.placeholder('int32', [N, JQ, W], name='cq')
        self.q_mask = tf.placeholder('bool', [N, JQ], name='q_mask')
        self.y = tf.placeholder('bool', [N, M, JX], name='y')
        self.is_train = tf.placeholder('bool', [], name='is_train')
        self.new_emb_mat = tf.placeholder('float', [None, config.word_emb_size], name='new_emb_mat')

        # Define misc
        self.tensor_dict = {}

        # Forward outputs / loss inputs
        self.logits = None
        self.yp = None
        self.var_list = None

        # Loss outputs
        self.loss = None

        self._build_forward()
        self._build_loss()
        if config.mode == 'train':
            self._build_ema()

        # NOTE(review): the merge_all() result is overwritten by the
        # scope-filtered merge on the next line.
        self.summary = tf.summary.merge_all()
        self.summary = tf.summary.merge(tf.get_collection("summaries", scope=self.scope))

    def _build_forward(self):
        """Build the forward pass and set ``self.logits`` and ``self.yp``.

        Stages: character CNN embeddings (plus word embeddings when
        ``config.use_word_emb``), highway network, bidirectional LSTM encoders
        for query and context, attention layer, two more bidirectional LSTM
        layers, then logits and a softmax over all M * JX positions.
        """
        config = self.config
        N, M, JX, JQ, VW, VC, d, W = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
            config.max_word_size
        # Use the actual sentence length of the fed batch instead of the configured maximum.
        JX = tf.shape(self.x)[2]
        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

        with tf.variable_scope("emb"):
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                # Both config values are comma-separated integer lists.
                filter_sizes = list(map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Reuse the context CNN weights for the query.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

            if config.use_word_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    if config.mode == 'train':
                        # Only training initializes from config.emb_mat.
                        word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat))
                    else:
                        word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                    if config.use_glove_for_unk:
                        # Extra rows fed at run time are appended after the VW vocabulary rows.
                        word_emb_mat = tf.concat(axis=0, values=[word_emb_mat, self.new_emb_mat])

                with tf.name_scope("word"):
                    Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                    Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                    self.tensor_dict['x'] = Ax
                    self.tensor_dict['q'] = Aq
                xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]

        # highway network
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
            # Query and context share the highway weights.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
        self.tensor_dict['xx'] = xx
        self.tensor_dict['qq'] = qq

        cell = BasicLSTMCell(d, state_is_tuple=True)
        d_cell = SwitchableDropoutWrapper(cell, self.is_train, input_keep_prob=config.input_keep_prob)
        # Sequence lengths are the number of True entries in the masks.
        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

        with tf.variable_scope("prepro"):
            (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(d_cell, d_cell, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
            u = tf.concat(axis=2, values=[fw_u, bw_u])
            if config.two_prepro_layers:
                (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(d_cell, d_cell, u, q_len, dtype='float', scope='u2')  # [N, J, d], [N, d]
                u = tf.concat(axis=2, values=[fw_u, bw_u])
            if config.share_lstm_weights:
                # Context encoder reuses the query encoder's 'u1' / 'u2' scopes.
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
                if config.two_prepro_layers:
                    (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, h, x_len, dtype='float', scope='u2')  # [N, M, JX, 2d]
                    h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
            else:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
                if config.two_prepro_layers:
                    (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, h, x_len, dtype='float', scope='h2')  # [N, M, JX, 2d]
                    h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
            self.tensor_dict['u'] = u
            self.tensor_dict['h'] = h

        with tf.variable_scope("main"):
            p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict)
            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(d_cell, d_cell, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
            g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
            # p1 = attention_layer(config, self.is_train, g0, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p1")
            (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(d_cell, d_cell, g0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
            g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])
            # logits = u_logits(config, self.is_train, g1, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="logits")
            # [N, M, JX]
            logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1')
            # NOTE(review): a1i is not used anywhere below.
            a1i = softsel(tf.reshape(g1, [N, M*JX, 2*d]), tf.reshape(logits, [N, M*JX]))

            if config.feed_gt:
                # In training, replace the logits with the log of the gold labels.
                logy = tf.log(tf.cast(self.y, 'float') + VERY_SMALL_NUMBER)
                logits = tf.cond(self.is_train, lambda: logy, lambda: logits)
            if config.feed_hard:
                # Outside training, replace the logits with a one-hot at their argmax.
                hard_yp = tf.argmax(tf.reshape(logits, [N, M*JX]), 1)
                hard_logits = tf.reshape(tf.one_hot(hard_yp, M*JX), [N, M, JX])  # [N, M, JX]
                logits = tf.cond(self.is_train, lambda: logits, lambda: hard_logits)

            # Softmax over every word position of every sentence at once.
            flat_logits = tf.reshape(logits, [-1, M * JX])
            flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
            yp = tf.reshape(flat_yp, [-1, M, JX])

            self.tensor_dict['g1'] = g1

            self.logits = flat_logits
            self.yp = yp

    def _build_loss(self):
        """Build the loss and set ``self.loss``.

        Per example, the loss is the negative log of the probability mass that
        ``self.yp`` puts on positions marked True in ``self.y``, zeroed for
        examples whose question mask is all False, then averaged and summed
        with everything else in this scope's 'losses' collection.
        """
        config = self.config
        # NOTE(review): these sizes, and JX below, are not used in this method.
        N, M, JX, JQ, VW, VC = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size
        JX = tf.shape(self.x)[2]
        # 1.0 for examples with at least one valid question word, else 0.0.
        loss_mask = tf.reduce_max(tf.cast(self.q_mask, 'float'), 1)
        losses = -tf.log(tf.reduce_sum(self.yp * tf.cast(self.y, 'float'), [1, 2]) + VERY_SMALL_NUMBER)
        ce_loss = tf.reduce_mean(loss_mask * losses)
        tf.add_to_collection('losses', ce_loss)
        self.loss = tf.add_n(tf.get_collection('losses', scope=self.scope), name='loss')
        tf.summary.scalar(self.loss.op.name, self.loss)
        tf.add_to_collection('ema/scalar', self.loss)

    def _build_ema(self):
        """Track moving averages of the 'ema/scalar' and 'ema/histogram'
        collections and make ``self.loss`` depend on the update op."""
        ema = tf.train.ExponentialMovingAverage(self.config.decay)
        ema_op = ema.apply(tf.get_collection("ema/scalar", scope=self.scope) + tf.get_collection("ema/histogram", scope=self.scope))
        for var in tf.get_collection("ema/scalar", scope=self.scope):
            ema_var = ema.average(var)
            tf.summary.scalar(ema_var.op.name, ema_var)
        for var in tf.get_collection("ema/histogram", scope=self.scope):
            ema_var = ema.average(var)
            tf.summary.histogram(ema_var.op.name, ema_var)

        # Evaluating the loss now also runs the moving-average update.
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)

    def get_loss(self):
        """Return the loss tensor."""
        return self.loss

    def get_global_step(self):
        """Return the global step variable."""
        return self.global_step

    def get_var_list(self):
        """Return ``self.var_list``, which this class only ever sets to None."""
        return self.var_list

    def get_feed_dict(self, batch, is_train, supervised=True):
        """Convert a ``DataSet`` batch into a feed dict for this model's placeholders.

        :param batch: ``DataSet`` whose ``data`` holds 'x', 'cx', 'q', 'cq'
            (and 'y' when ``supervised``) and whose ``shared`` holds the
            vocabularies
        :param is_train: value fed to the ``is_train`` placeholder
        :param supervised: when true, also build and feed the label array
        :return: dict mapping placeholders to zero-padded numpy arrays
        """
        assert isinstance(batch, DataSet)
        config = self.config
        N, M, JX, JQ, VW, VC, d, W = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, config.max_word_size
        feed_dict = {}

        if config.len_opt:
            """
            Note that this optimization results in variable GPU RAM usage (i.e. can cause OOM in the middle of training.)
            First test without len_opt and make sure no OOM, and use len_opt
            """
            # Shrink JX to the largest len(para) in this batch (at least 1).
            if sum(len(para) for para in batch.data['x']) == 0:
                new_JX = 1
            else:
                new_JX = max(len(para) for para in batch.data['x'])
            JX = min(JX, new_JX)
        # print(JX)

        # Everything starts zero / False, so unfilled positions act as padding.
        x = np.zeros([N, M, JX], dtype='int32')
        cx = np.zeros([N, M, JX, W], dtype='int32')
        x_mask = np.zeros([N, M, JX], dtype='bool')
        q = np.zeros([N, JQ], dtype='int32')
        cq = np.zeros([N, JQ, W], dtype='int32')
        q_mask = np.zeros([N, JQ], dtype='bool')

        # The arrays are registered now and filled in place by the loops below.
        feed_dict[self.x] = x
        feed_dict[self.x_mask] = x_mask
        feed_dict[self.cx] = cx
        feed_dict[self.q] = q
        feed_dict[self.cq] = cq
        feed_dict[self.q_mask] = q_mask
        feed_dict[self.is_train] = is_train
        if config.use_glove_for_unk:
            feed_dict[self.new_emb_mat] = batch.shared['new_emb_mat']

        X = batch.data['x']
        CX = batch.data['cx']

        def _get_word(word):
            """Map a word to its index: 2 for words starting with "@", 1 if unknown."""
            if word.startswith("@"):
                return 2
            d = batch.shared['word2idx']
            # Try progressively different casings before giving up.
            for each in (word, word.lower(), word.capitalize(), word.upper()):
                if each in d:
                    return d[each]
            if config.use_glove_for_unk:
                d2 = batch.shared['new_word2idx']
                for each in (word, word.lower(), word.capitalize(), word.upper()):
                    if each in d2:
                        # Extra-vocabulary indices are offset past the base vocabulary.
                        return d2[each] + len(d)
            return 1

        def _get_char(char):
            """Map a character to its index, 1 if unknown."""
            d = batch.shared['char2idx']
            if char in d:
                return d[char]
            return 1

        if supervised:
            # NOTE(review): this array is int32 while the y placeholder is bool.
            y = np.zeros([N, M, JX], dtype='int32')
            feed_dict[self.y] = y

            for i, (xi, yi) in enumerate(zip(batch.data['x'], batch.data['y'])):
                count = 0
                # Mark every position whose word equals the answer string.
                for j, xij in enumerate(xi):
                    for k, xijk in enumerate(xij):
                        if xijk == yi:
                            y[i, j, k] = True
                            count += 1
                # The answer must occur at least once in the text.
                assert count > 0

        for i, xi in enumerate(X):
            for j, xij in enumerate(xi):
                for k, xijk in enumerate(xij):
                    each = _get_word(xijk)
                    x[i, j, k] = each
                    x_mask[i, j, k] = True

        for i, cxi in enumerate(CX):
            for j, cxij in enumerate(cxi):
                for k, cxijk in enumerate(cxij):
                    for l, cxijkl in enumerate(cxijk):
                        cx[i, j, k, l] = _get_char(cxijkl)
                        # Characters beyond max_word_size are dropped.
                        if l + 1 == config.max_word_size:
                            break

        for i, qi in enumerate(batch.data['q']):
            for j, qij in enumerate(qi):
                q[i, j] = _get_word(qij)
                q_mask[i, j] = True

        for i, cqi in enumerate(batch.data['cq']):
            for j, cqij in enumerate(cqi):
                for k, cqijk in enumerate(cqij):
                    cq[i, j, k] = _get_char(cqijk)
                    # Characters beyond max_word_size are dropped.
                    if k + 1 == config.max_word_size:
                        break

        return feed_dict
def get_multi_gpu_models(config):
    """Build ``config.num_gpus`` model replicas, one per GPU device.

    Replica ``i`` is created under name scope ``model_i`` on ``/gpu:i``;
    variable reuse is switched on after each replica is built.

    :return: list of ``Model`` instances in GPU order
    """
    replicas = []
    for gpu_idx in range(config.num_gpus):
        scope_name = "model_{}".format(gpu_idx)
        device_name = "/gpu:{}".format(gpu_idx)
        with tf.name_scope(scope_name) as scope, tf.device(device_name):
            replica = Model(config, scope)
            tf.get_variable_scope().reuse_variables()
            replicas.append(replica)
    return replicas

View file

@ -0,0 +1,294 @@
import json
import os
import random
import itertools
import math
from collections import defaultdict
import numpy as np
from cnn_dm.prepro import para2sents
from my.tensorflow import grouper
from my.utils import index
class Data(object):
    """Abstract container of examples that are fetched one at a time by index.

    Subclasses implement ``get_size``, ``get_one``, ``get_empty`` and ``__add__``.
    """

    def get_size(self):
        raise NotImplementedError()

    def get_by_idxs(self, idxs):
        """
        Efficient way to obtain a batch of items from filesystem
        :param idxs: indices of the examples to fetch
        :return dict: maps each field name to the list of that field's values,
            one per index, e.g. {'X': [...], 'Y': [...]}
        """
        batch = defaultdict(list)
        for idx in idxs:
            example = self.get_one(idx)
            for field, value in example.items():
                batch[field].append(value)
        return batch

    def get_one(self, idx):
        raise NotImplementedError()

    def get_empty(self):
        raise NotImplementedError()

    def __add__(self, other):
        raise NotImplementedError()
class MyData(Data):
    """``Data`` backed by one text file per example under ``root_dir``."""

    def __init__(self, config, root_dir, file_names):
        self.root_dir = root_dir
        self.file_names = file_names
        self.config = config

    def get_one(self, idx):
        """Read and parse the example file at position ``idx``.

        The file holds url, paragraph, question and answer, each followed by
        one skipped line, then one candidate per remaining line.

        :return: dict with keys 'x', 'cx', 'q', 'cq', 'y', 'c' and 'ids'
        """
        file_name = self.file_names[idx]
        path = os.path.join(self.root_dir, file_name)
        with open(path, 'r') as fh:
            fh.readline()  # url line, not used
            fh.readline()
            para = fh.readline().strip()
            fh.readline()
            ques = fh.readline().strip()
            fh.readline()
            answer = fh.readline().strip()
            fh.readline()
            cands = [line.strip() for line in fh]
        # Keep only the part of each candidate before the first ':'.
        cand_ents = [cand.split(":")[0] for cand in cands]
        sents = para2sents(para, self.config.width)
        ques_words = ques.split(" ")
        return {
            'x': sents,
            'cx': [[list(word) for word in sent] for sent in sents],
            'q': ques_words,
            'cq': [list(word) for word in ques_words],
            'y': answer,
            'c': cand_ents,
            'ids': file_name,
        }

    def get_empty(self):
        """Return a ``MyData`` with the same config and root but no files."""
        return MyData(self.config, self.root_dir, [])

    def __add__(self, other):
        """Return a ``MyData`` over this object's files followed by ``other``'s."""
        combined = self.file_names + other.file_names
        return MyData(self.config, self.root_dir, combined)

    def get_size(self):
        """Return the number of example files."""
        return len(self.file_names)
class DataSet(object):
    """
    Wrapper around either a column dict ({'X': [...], 'Y': [...]}) or a Data object that
    adds batching, concatenation and splitting.
    """

    def __init__(self, data, data_type, shared=None, valid_idxs=None):
        self.data = data  # e.g. {'X': [0, 1, 2], 'Y': [2, 3, 4]}
        self.data_type = data_type
        self.shared = shared
        size = self.get_data_size()
        if valid_idxs is None:
            valid_idxs = range(size)
        self.valid_idxs = valid_idxs
        self.num_examples = size

    def _sort_key(self, idx):
        # Clustering key: length of the longest element of the shared 'x' entry
        # referenced by this example's '*x' pointer.
        rx = self.data['*x'][idx]
        x = self.shared['x'][rx[0]][rx[1]]
        return max(len(each) for each in x)

    def get_data_size(self):
        if isinstance(self.data, dict):
            first_column = next(iter(self.data.values()))
            return len(first_column)
        if isinstance(self.data, Data):
            return self.data.get_size()
        raise Exception()

    def get_by_idxs(self, idxs):
        if isinstance(self.data, dict):
            gathered = defaultdict(list)
            for key, column in self.data.items():
                gathered[key] += [column[idx] for idx in idxs]
            return gathered
        if isinstance(self.data, Data):
            return self.data.get_by_idxs(idxs)
        raise Exception()

    def get_one(self, idx):
        if isinstance(self.data, dict):
            return {key: [column[idx]] for key, column in self.data.items()}
        if isinstance(self.data, Data):
            return self.data.get_one(idx)
        # Any other kind of data yields None.
        return None

    def get_batches(self, batch_size, num_batches=None, shuffle=False, cluster=False):
        """
        Generate (batch_idxs, batch_data_set) pairs.
        :param batch_size: number of examples per batch
        :param num_batches: total number of batches to yield; defaults to one epoch
        :param shuffle: draw one random permutation of valid_idxs before grouping
        :param cluster: (with shuffle only) sort the permutation by _sort_key so that each batch
        holds examples of similar length, then shuffle the order of the batches every epoch
        :return: generator of (tuple of indices, DataSet)
        """
        per_epoch = int(math.ceil(self.num_examples / batch_size))
        if num_batches is None:
            num_batches = per_epoch
        num_epochs = int(math.ceil(num_batches / per_epoch))

        if shuffle:
            shuffled = random.sample(self.valid_idxs, len(self.valid_idxs))
            if cluster:
                by_length = sorted(shuffled, key=self._sort_key)

                def grouped():
                    return random.sample(list(grouper(by_length, batch_size)), per_epoch)
            else:
                def grouped():
                    return list(grouper(shuffled, batch_size))
        else:
            def grouped():
                return list(grouper(self.valid_idxs, batch_size))

        # grouped() is invoked lazily, once per epoch, as the chain is consumed.
        idx_groups = itertools.chain.from_iterable(grouped() for _ in range(num_epochs))
        for _ in range(num_batches):
            # grouper pads the last group with None; drop the padding.
            batch_idxs = tuple(i for i in next(idx_groups) if i is not None)
            batch_data = self.get_by_idxs(batch_idxs)
            resolved = {}
            for key, val in batch_data.items():
                if not key.startswith('*'):
                    continue
                # '*name' columns are pointers into shared['name']; resolve them.
                assert self.shared is not None
                shared_key = key[1:]
                resolved[shared_key] = [index(self.shared[shared_key], each) for each in val]
            batch_data.update(resolved)
            yield batch_idxs, DataSet(batch_data, self.data_type, shared=self.shared)

    def get_multi_batches(self, batch_size, num_batches_per_step, num_steps=None, shuffle=False, cluster=False):
        """
        Like get_batches, but every step yields a tuple of `num_batches_per_step`
        (idxs, DataSet) pairs obtained by splitting one large batch.
        """
        step_size = batch_size * num_batches_per_step
        batches = self.get_batches(step_size, num_batches=num_steps, shuffle=shuffle, cluster=cluster)
        return (
            tuple(zip(grouper(idxs, batch_size, shorten=True, num_groups=num_batches_per_step),
                      data_set.divide(num_batches_per_step)))
            for idxs, data_set in batches
        )

    def get_empty(self):
        if isinstance(self.data, dict):
            empty = {key: [] for key in self.data}
        elif isinstance(self.data, Data):
            empty = self.data.get_empty()
        else:
            raise Exception()
        return DataSet(empty, self.data_type, shared=self.shared)

    def __add__(self, other):
        if isinstance(self.data, dict):
            merged = {key: val + other.data[key] for key, val in self.data.items()}
        elif isinstance(self.data, Data):
            merged = self.data + other.data
        else:
            raise Exception()
        own = list(self.valid_idxs)
        # Indices of `other` are shifted past this data set's examples.
        shifted = [idx + self.num_examples for idx in other.valid_idxs]
        return DataSet(merged, self.data_type, shared=self.shared, valid_idxs=own + shifted)

    def divide(self, integer):
        """Split the valid examples into `integer` DataSets and return them as a tuple."""
        part_size = int(math.ceil(self.num_examples / integer))
        parts = grouper(self.valid_idxs, part_size, shorten=True, num_groups=integer)
        return tuple(DataSet(self.get_by_idxs(idxs), self.data_type, shared=self.shared)
                     for idxs in parts)
class MyDataSet(DataSet):
    """
    DataSet variant that records, in shared['max_num_sents'], the length of the 'x' field of
    its last example, and that keeps the given example order when clustering.
    """

    def __init__(self, data, data_type, shared=None, valid_idxs=None):
        super().__init__(data, data_type, shared=shared, valid_idxs=valid_idxs)
        last_example = self.get_one(self.num_examples - 1)
        shared['max_num_sents'] = len(last_example['x'])

    def _sort_key(self, idx):
        # The index itself is the sort key.
        return idx
def read_data(config, data_type, ref, data_filter=None):
    """
    Load the preprocessed `shared_<data_type>.json` and wrap the listed files in a MyDataSet.
    :param config: configuration object; reads data_dir, filter_ratio, shared_path, out_dir,
    lower_word, finetune, word_count_th, char_count_th, known_if_glove, use_glove_for_unk, root_dir
    :param data_type: split name used in the file and directory names
    :param ref: if false, build word2idx / char2idx from the counters and save them to the
    vocabulary file; if true, load them from that file instead
    :param data_filter: unused here
    :return MyDataSet:
    """
    shared_path = os.path.join(config.data_dir, "shared_{}.json".format(data_type))
    with open(shared_path, 'r') as fh:
        shared = json.load(fh)

    # Optionally keep only the leading fraction of the 'sorted' file list.
    paths = shared['sorted']
    if config.filter_ratio < 1.0:
        stop = int(round(len(paths) * config.filter_ratio))
        paths = paths[:stop]
    num_examples = len(paths)
    valid_idxs = range(num_examples)

    print("Loaded {}/{} examples from {}".format(len(valid_idxs), num_examples, data_type))

    # From here on shared_path refers to the vocabulary file, not the preprocessed data.
    shared_path = config.shared_path or os.path.join(config.out_dir, "shared.json")
    if not ref:
        word2vec_dict = shared['lower_word2vec'] if config.lower_word else shared['word2vec']
        word_counter = shared['lower_word_counter'] if config.lower_word else shared['word_counter']
        char_counter = shared['char_counter']
        # Word indices start at 3 and char indices at 2; the lower ones are reserved below.
        if config.finetune:
            shared['word2idx'] = {word: idx + 3 for idx, word in
                                  enumerate(word for word, count in word_counter.items()
                                            if count > config.word_count_th or (config.known_if_glove and word in word2vec_dict))}
        else:
            assert config.known_if_glove
            assert config.use_glove_for_unk
            shared['word2idx'] = {word: idx + 3 for idx, word in
                                  enumerate(word for word, count in word_counter.items()
                                            if count > config.word_count_th and word not in word2vec_dict)}
        shared['char2idx'] = {char: idx + 2 for idx, char in
                              enumerate(char for char, count in char_counter.items()
                                        if count > config.char_count_th)}
        NULL = "-NULL-"
        UNK = "-UNK-"
        ENT = "-ENT-"
        shared['word2idx'][NULL] = 0
        shared['word2idx'][UNK] = 1
        shared['word2idx'][ENT] = 2
        shared['char2idx'][NULL] = 0
        shared['char2idx'][UNK] = 1

        # Close the file explicitly so the vocabulary is flushed to disk.
        with open(shared_path, 'w') as fh:
            json.dump({'word2idx': shared['word2idx'], 'char2idx': shared['char2idx']}, fh)
    else:
        with open(shared_path, 'r') as fh:
            new_shared = json.load(fh)
        for key, val in new_shared.items():
            shared[key] = val

    if config.use_glove_for_unk:
        # Words that have a vector but no entry in word2idx get their own contiguous
        # index space, with a matching embedding matrix.
        word2vec_dict = shared['lower_word2vec'] if config.lower_word else shared['word2vec']
        new_word2idx_dict = {word: idx for idx, word in
                             enumerate(word for word in word2vec_dict.keys() if word not in shared['word2idx'])}
        shared['new_word2idx'] = new_word2idx_dict
        idx2vec_dict = {idx: word2vec_dict[word] for word, idx in new_word2idx_dict.items()}
        new_emb_mat = np.array([idx2vec_dict[idx] for idx in range(len(idx2vec_dict))], dtype='float32')
        shared['new_emb_mat'] = new_emb_mat

    data = MyData(config, os.path.join(config.root_dir, data_type), paths)
    data_set = MyDataSet(data, data_type, shared=shared, valid_idxs=valid_idxs)
    return data_set
def get_cnn_data_filter(config):
    """
    Placeholder filter: always returns True, regardless of `config`.
    :param config: unused
    :return bool: True
    """
    return True
def update_config(config, data_sets):
    """
    Write size limits and vocabulary sizes derived from the data sets onto `config`.
    The max_* fields become the maximum over all data sets' `shared` statistics (at least 0);
    max_word_size is additionally capped at config.word_size_th. Vocabulary and embedding
    sizes are taken from the first data set only.
    :param config: object mutated in place; must provide `word_size_th`
    :param data_sets: non-empty sequence of objects with a `shared` dict
    :return: None
    """
    for attr in ('max_num_sents', 'max_sent_size', 'max_ques_size', 'max_word_size'):
        setattr(config, attr, 0)

    for data_set in data_sets:
        stats = data_set.shared
        for attr in ('max_sent_size', 'max_ques_size', 'max_word_size', 'max_num_sents'):
            setattr(config, attr, max(getattr(config, attr), stats[attr]))

    config.max_word_size = min(config.max_word_size, config.word_size_th)

    first_shared = data_sets[0].shared
    config.char_vocab_size = len(first_shared['char2idx'])
    any_vector = next(iter(first_shared['word2vec'].values()))
    config.word_emb_size = len(any_vector)
    config.word_vocab_size = len(first_shared['word2idx'])

View file

@ -0,0 +1,47 @@
import tensorflow as tf
from tensorflow.python.ops.rnn_cell import RNNCell
from my.tensorflow.nn import linear
class SHCell(RNNCell):
    """
    Super-Highway Cell

    The input is split in two halves (h, u) along axis 1. A sigmoid gate `a` computed from
    h, u and the previous state blends the state with h; the cell emits the previous state
    as its output.
    """

    def __init__(self, input_size, logit_func='tri_linear', scalar=False):
        self._logit_func = logit_func
        self._scalar = scalar
        # State and output share the same size.
        self._state_size = input_size
        self._output_size = input_size

    @property
    def state_size(self):
        return self._state_size

    @property
    def output_size(self):
        return self._output_size

    def __call__(self, inputs, state, scope=None):
        with tf.variable_scope(scope or "SHCell"):
            # A scalar gate has width 1; otherwise there is one gate value per state unit.
            gate_size = 1 if self._scalar else self._state_size
            h, u = tf.split(axis=1, num_or_size_splits=2, value=inputs)
            mode = self._logit_func
            if mode == 'double':
                hidden = tf.tanh(linear([h, u, state], gate_size, True))
                a = tf.nn.sigmoid(linear(hidden, self._state_size, True))
            else:
                if mode == 'mul_linear':
                    gate_inputs = [h * u, state * u]
                elif mode == 'linear':
                    gate_inputs = [h, u, state]
                elif mode == 'tri_linear':
                    gate_inputs = [h, u, state, h * u, state * u]
                else:
                    raise Exception()
                a = tf.nn.sigmoid(linear(gate_inputs, gate_size, True))
            new_state = a * state + (1 - a) * h
            # The output is the state from before this update.
            return state, new_state

View file

@ -0,0 +1,76 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{{ title }}</title>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.0/jquery.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/chroma-js/1.1.1/chroma.min.js"></script>
<script>
$(document).ready(function(){
$(".att").each(function() {
// var val = parseFloat($(this).text());
var val = parseFloat($(this).attr("color"));
var scale = chroma.scale(['white', 'red']);
var color = scale(val).hex();
$(this).attr("bgcolor", color);
});
})
</script>
<style>
table, th, td {border: 1px solid black}
</style>
</head>
<body>
<h2>{{ title }}</h2>
<table>
<tr>
<th>ID</th>
<th>Question</th>
<th>Answers</th>
<th>Predicted</th>
<th>Score</th>
<th>Paragraph</th>
</tr>
{% for row in rows %}
<tr>
<td>{{ row.id }}</td>
<td>
{% for qj in row.ques %}
{{ qj }}
{% endfor %}
</td>
<td>
{% for aa in row.a %}
<li>{{ aa }}</li>
{% endfor %}
</td>
<td>{{ row.ap }}</td>
<td>{{ row.score }}</td>
<td>
<table>
{% for xj, ypj, yp2j in zip(row.para, row.yp, row.yp2) %}
<tr>
{% set rowloop = loop %}
{% for xjk, ypjk in zip(xj, ypj) %}
<td class="att" color="{{ ypjk }}">
{% if row.y[0][0] == rowloop.index0 and row.y[0][1] <= loop.index0 <= row.y[1][1] %}
<b>{{ xjk }}</b>
{% else %}
{{ xjk }}
{% endif %}
</td>
{% endfor %}
</tr>
<tr>
{% for xjk, yp2jk in zip(xj, yp2j) %}
<td class="att" color="{{ yp2jk }}">-</td>
{% endfor %}
</tr>
{% endfor %}
</table>
</td>
</tr>
{% endfor %}
</table>
</body>
</html>

View file

@ -0,0 +1,73 @@
import tensorflow as tf
from basic_cnn.model import Model
from my.tensorflow import average_gradients
class Trainer(object):
    """
    Single-model trainer: an Adadelta optimizer over the model's loss and variable list.
    """

    def __init__(self, config, model):
        assert isinstance(model, Model)
        self.config = config
        self.model = model
        self.opt = tf.train.AdadeltaOptimizer(config.init_lr)
        self.loss = model.get_loss()
        self.var_list = model.get_var_list()
        self.global_step = model.get_global_step()
        self.summary = model.summary
        self.grads = self.opt.compute_gradients(self.loss, var_list=self.var_list)
        self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)

    def get_train_op(self):
        return self.train_op

    def step(self, sess, batch, get_summary=False):
        """
        Run one optimization step.
        :param sess: tf.Session
        :param batch: pair whose second element is the data set fed to the model
        :param get_summary: also evaluate the model summary
        :return: (loss, summary or None, result of running train_op)
        """
        assert isinstance(sess, tf.Session)
        _, ds = batch
        feed_dict = self.model.get_feed_dict(ds, True)
        if not get_summary:
            loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
            return loss, None, train_op
        fetches = [self.loss, self.summary, self.train_op]
        loss, summary, train_op = sess.run(fetches, feed_dict=feed_dict)
        return loss, summary, train_op
class MultiGPUTrainer(object):
    """
    Trainer over several models (one per GPU): per-model gradients are averaged and applied
    once, and the reported loss is the mean of the per-model losses.
    """

    def __init__(self, config, models):
        first = models[0]
        assert isinstance(first, Model)
        self.config = config
        self.model = first
        self.opt = tf.train.AdadeltaOptimizer(config.init_lr)
        # Variable list, global step and summary are taken from the first model.
        self.var_list = first.get_var_list()
        self.global_step = first.get_global_step()
        self.summary = first.summary
        self.models = models

        tower_losses = []
        tower_grads = []
        for gpu_idx, tower in enumerate(models):
            scope_name = "grads_{}".format(gpu_idx)
            device_name = "/gpu:{}".format(gpu_idx)
            with tf.name_scope(scope_name), tf.device(device_name):
                tower_loss = tower.get_loss()
                tower_grad = self.opt.compute_gradients(tower_loss, var_list=self.var_list)
                tower_losses.append(tower_loss)
                tower_grads.append(tower_grad)

        self.loss = tf.add_n(tower_losses)/len(tower_losses)
        self.grads = average_gradients(tower_grads)
        self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)

    def step(self, sess, batches, get_summary=False):
        """
        Run one optimization step over one batch per model.
        :param sess: tf.Session
        :param batches: iterable of pairs, matched in order with the models; the second element
        of each pair is the data set fed to the corresponding model
        :param get_summary: also evaluate the summary
        :return: (loss, summary or None, result of running train_op)
        """
        assert isinstance(sess, tf.Session)
        feed_dict = {}
        for (_, ds), tower in zip(batches, self.models):
            feed_dict.update(tower.get_feed_dict(ds, True))

        if not get_summary:
            loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
            return loss, None, train_op
        fetches = [self.loss, self.summary, self.train_op]
        loss, summary, train_op = sess.run(fetches, feed_dict=feed_dict)
        return loss, summary, train_op

View file

@ -0,0 +1,137 @@
import shutil
from collections import OrderedDict
import http.server
import socketserver
import argparse
import json
import os
import numpy as np
from tqdm import tqdm
from jinja2 import Environment, FileSystemLoader
from basic_cnn.evaluator import get_span_score_pairs, get_best_span
def bool_(string):
    """
    Convert the exact strings 'True' / 'False' to the corresponding bool.
    :param string: value to convert
    :return bool:
    :raises Exception: for any other value
    """
    for literal, value in (('True', True), ('False', False)):
        if string == literal:
            return value
    raise Exception()
def get_args():
    """
    Parse the visualizer's command-line options.
    :return: argparse namespace with model_name, data_type, step, template_name, num_per_page,
    data_dir, port, host, open and run_id
    """
    options = (
        ("--model_name", str, 'basic'),
        ("--data_type", str, 'dev'),
        ("--step", int, 5000),
        ("--template_name", str, "visualizer.html"),
        ("--num_per_page", int, 100),
        ("--data_dir", str, "data/squad"),
        ("--port", int, 8000),
        ("--host", str, "0.0.0.0"),
        # --open stays a string; it is compared against 'True' by the caller.
        ("--open", str, 'False'),
        ("--run_id", str, "0"),
    )
    parser = argparse.ArgumentParser()
    for flag, type_, default in options:
        parser.add_argument(flag, type=type_, default=default)
    return parser.parse_args()
def _decode(decoder, sent):
return " ".join(decoder[idx] for idx in sent)
def accuracy2_visualizer(args):
    """
    Render evaluation results as paginated HTML files and serve them over HTTP.
    Reads out/<model_name>/<run_id>/eval/<data_type>-<step>.json plus the data and shared JSON
    files in args.data_dir, writes one page per `num_per_page` rows into a fresh
    /tmp/list_results<N> directory, then blocks serving that directory.
    :param args: namespace produced by get_args
    :return: does not return (serve_forever)
    """
    model_name = args.model_name
    data_type = args.data_type
    num_per_page = args.num_per_page
    data_dir = args.data_dir
    run_id = args.run_id.zfill(2)
    step = args.step

    eval_path = os.path.join("out", model_name, run_id, "eval", "{}-{}.json".format(data_type, str(step).zfill(6)))
    print("loading {}".format(eval_path))
    with open(eval_path, 'r') as fh:
        eval_ = json.load(fh)

    # Pick the first /tmp/list_results<N> that does not exist yet; the loop guarantees the
    # directory is absent, so no cleanup is needed before creating it.
    _id = 0
    html_dir = "/tmp/list_results%d" % _id
    while os.path.exists(html_dir):
        _id += 1
        html_dir = "/tmp/list_results%d" % _id
    os.mkdir(html_dir)

    cur_dir = os.path.dirname(os.path.realpath(__file__))
    templates_dir = os.path.join(cur_dir, 'templates')
    env = Environment(loader=FileSystemLoader(templates_dir))
    # The template calls zip/reversed, which Jinja does not expose by default.
    env.globals.update(zip=zip, reversed=reversed)
    template = env.get_template(args.template_name)

    data_path = os.path.join(data_dir, "data_{}.json".format(data_type))
    shared_path = os.path.join(data_dir, "shared_{}.json".format(data_type))
    print("loading {}".format(data_path))
    with open(data_path, 'r') as fh:
        data = json.load(fh)
    print("loading {}".format(shared_path))
    with open(shared_path, 'r') as fh:
        shared = json.load(fh)

    rows = []
    for i, (idx, yi, ypi, yp2i) in tqdm(enumerate(zip(*[eval_[key] for key in ('idxs', 'y', 'yp', 'yp2')])), total=len(eval_['idxs'])):
        id_, q, rx, answers = (data[key][idx] for key in ('ids', 'q', '*x', 'answerss'))
        x = shared['x'][rx[0]][rx[1]]
        ques = [" ".join(q)]
        para = [[word for word in sent] for sent in x]
        span = get_best_span(ypi, yp2i)
        ap = get_segment(para, span)
        score = "{:.3f}".format(ypi[span[0][0]][span[0][1]] * yp2i[span[1][0]][span[1][1]-1])

        row = {
            'id': id_,
            'title': "Hello world!",
            'ques': ques,
            'para': para,
            'y': yi[0][0],
            'y2': yi[0][1],
            'yp': ypi,
            'yp2': yp2i,
            'a': answers,
            'ap': ap,
            'score': score
        }
        rows.append(row)

        # A page is named after the index of its first row ...
        if i % num_per_page == 0:
            html_path = os.path.join(html_dir, "%s.html" % str(i).zfill(8))

        # ... and flushed once it is full or the results are exhausted.
        if (i + 1) % num_per_page == 0 or (i + 1) == len(eval_['y']):
            var_dict = {'title': "Accuracy Visualization",
                        'rows': rows
                        }
            with open(html_path, "wb") as f:
                f.write(template.render(**var_dict).encode('UTF-8'))
            rows = []

    os.chdir(html_dir)
    port = args.port
    host = args.host

    # Overriding to suppress log message
    class MyHandler(http.server.SimpleHTTPRequestHandler):
        def log_message(self, format, *args):
            pass
    handler = MyHandler
    httpd = socketserver.TCPServer((host, port), handler)
    if args.open == 'True':
        os.system("open http://%s:%d" % (args.host, args.port))
    print("serving at %s:%d" % (host, port))
    httpd.serve_forever()
def get_segment(para, span):
    """
    Join the words of one sentence that fall inside `span`.
    :param para: list of sentences, each a list of words
    :param span: ((sent_idx, start), (_, stop)); only the first sentence index is used and
    `stop` is exclusive
    :return str: the selected words separated by single spaces
    """
    sent_idx, start = span[0][0], span[0][1]
    stop = span[1][1]
    return " ".join(para[sent_idx][start:stop])
# Script entry point: parse the command line, then build and serve the visualization.
if __name__ == "__main__":
    ARGS = get_args()
    accuracy2_visualizer(ARGS)

View file

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,38 @@
import json
import os
import sys
# Usage: <script> ROOT_DIR ANSWER_JSON
# Scores the predictions in ANSWER_JSON (file name -> predicted answer) against the gold
# answers stored in the *.question files of ROOT_DIR and prints "accuracy = correct / total".
root_dir = sys.argv[1]
answer_path = sys.argv[2]
file_names = os.listdir(root_dir)

num_correct = 0
num_wrong = 0

with open(answer_path, 'r') as fh:
    id2answer_dict = json.load(fh)

for file_name in file_names:
    if file_name.endswith(".question"):
        with open(os.path.join(root_dir, file_name), 'r') as fh:
            # The gold answer is the 7th line of a question file.
            head = [fh.readline() for _ in range(8)]
        answer = head[6].strip()
        # A missing prediction counts as wrong.
        if file_name in id2answer_dict and id2answer_dict[file_name] == answer:
            num_correct += 1
        else:
            num_wrong += 1

total = num_correct + num_wrong
acc = float(num_correct) / total
print("{} = {} / {}".format(acc, num_correct, total))

View file

@ -0,0 +1,185 @@
import argparse
import json
import os
# data: q, cq, (dq), (pq), y, *x, *cx
# shared: x, cx, (dx), (px), word_counter, char_counter, word2vec
# no metadata
from collections import Counter
from tqdm import tqdm
from my.utils import process_tokens
from squad.utils import get_word_span, process_tokens
def bool_(arg):
    """
    Convert the exact strings 'True' / 'False' to the corresponding bool.
    :param arg: value to convert
    :return bool:
    :raises Exception: carrying `arg`, for any other value
    """
    for literal, value in (('True', True), ('False', False)):
        if arg == literal:
            return value
    raise Exception(arg)
def main():
    """Parse the command line and run the preprocessing."""
    prepro(get_args())
def get_args():
    """
    Parse the preprocessing command-line options.
    Source and GloVe directories default to locations under the user's home directory.
    :return: argparse namespace
    """
    home = os.path.expanduser("~")
    parser = argparse.ArgumentParser()

    # Locations
    parser.add_argument("--source_dir", default=os.path.join(home, "data", "cnn", 'questions'))
    parser.add_argument("--target_dir", default="data/cnn")
    parser.add_argument("--glove_dir", default=os.path.join(home, "data", "glove"))

    # GloVe selection
    parser.add_argument("--glove_corpus", default='6B')
    parser.add_argument("--glove_vec_size", default=100, type=int)

    # Processing switches and thresholds
    parser.add_argument("--debug", default=False, type=bool_)
    parser.add_argument("--num_sents_th", default=200, type=int)
    parser.add_argument("--ques_size_th", default=30, type=int)
    parser.add_argument("--width", default=5, type=int)
    # TODO : put more args here
    return parser.parse_args()
def prepro(args):
    """Preprocess the 'train', 'dev' and 'test' splits, in that order."""
    for mode in ('train', 'dev', 'test'):
        prepro_each(args, mode)
def para2sents(para, width):
    """
    Turn para into double array of words (wordss)
    Every word starting with "@" yields one "sentence": that word together with up to
    `width` neighboring words on each side (clipped at the paragraph boundaries).
    :param para: paragraph string; words are separated by single spaces
    :param width: number of neighbors kept on each side
    :return list: list of word lists, one per "@" word, in order of appearance
    """
    words = para.split(" ")
    num_words = len(words)
    return [words[max(i - width, 0):min(i + width + 1, num_words)]
            for i, word in enumerate(words)
            if word.startswith("@")]
def get_word2vec(args, word_counter):
    """
    Load the GloVe vectors of the words present in `word_counter`.
    For each GloVe word the first of these spellings found in `word_counter` receives the
    vector: as-is, capitalized, lower-cased, upper-cased.
    :param args: provides glove_dir, glove_corpus and glove_vec_size
    :param word_counter: container of known words (only membership is tested)
    :return dict: word -> list of floats
    """
    file_name = "glove.{}.{}d.txt".format(args.glove_corpus, args.glove_vec_size)
    glove_path = os.path.join(args.glove_dir, file_name)
    # Line counts per corpus, used only as the progress-bar total.
    sizes = {'6B': int(4e5), '42B': int(1.9e6), '840B': int(2.2e6), '2B': int(1.2e6)}
    total = sizes[args.glove_corpus]
    word2vec_dict = {}
    with open(glove_path, 'r', encoding='utf-8') as fh:
        for line in tqdm(fh, total=total):
            fields = line.lstrip().rstrip().split(" ")
            word = fields[0]
            vector = [float(each) for each in fields[1:]]
            for variant in (word, word.capitalize(), word.lower(), word.upper()):
                if variant in word_counter:
                    word2vec_dict[variant] = vector
                    break

    print("{}/{} of word vocab have corresponding vectors in {}".format(len(word2vec_dict), len(word_counter), glove_path))
    return word2vec_dict
def prepro_each(args, mode):
    """
    Scan the *.question files of one split, collect vocabulary statistics and write
    shared_<mode>.json into args.target_dir.
    :param args: provides source_dir, target_dir, debug, width, num_sents_th, ques_size_th and
    the glove_* options used by get_word2vec
    :param mode: split name; sub-directory of args.source_dir and suffix of the output file
    :return: None
    """
    source_dir = os.path.join(args.source_dir, mode)
    word_counter = Counter()
    lower_word_counter = Counter()
    ent_counter = Counter()
    char_counter = Counter()
    max_sent_size = 0
    max_word_size = 0
    max_ques_size = 0
    max_num_sents = 0

    file_names = list(os.listdir(source_dir))
    if args.debug:
        # Debug mode only looks at the first 1000 directory entries.
        file_names = file_names[:1000]
    lens = []

    out_file_names = []
    for file_name in tqdm(file_names, total=len(file_names)):
        if file_name.endswith(".question"):
            with open(os.path.join(source_dir, file_name), 'r') as fh:
                # The file is read as eight leading lines (1st, 3rd, 5th and 7th are kept as
                # url, paragraph, question and answer) followed by one candidate per line.
                url = fh.readline().strip()
                _ = fh.readline()
                para = fh.readline().strip()
                _ = fh.readline()
                ques = fh.readline().strip()
                _ = fh.readline()
                answer = fh.readline().strip()
                _ = fh.readline()
                cands = list(line.strip() for line in fh)
                cand_ents = list(cand.split(":")[0] for cand in cands)
                sents = para2sents(para, args.width)
                ques_words = ques.split(" ")

                # Filtering
                if len(sents) > args.num_sents_th or len(ques_words) > args.ques_size_th:
                    continue

                # NOTE(review): max() over an empty `sents` raises ValueError, i.e. a paragraph
                # without any "@" word is not supported here - confirm inputs always have one.
                max_sent_size = max(max(map(len, sents)), max_sent_size)
                max_ques_size = max(len(ques_words), max_ques_size)
                max_word_size = max(max(len(word) for sent in sents for word in sent), max_word_size)
                max_num_sents = max(len(sents), max_num_sents)

                # Words starting with "@" are counted as entities and as words; all other
                # words also feed the lower-cased word counter and the character counter.
                for word in ques_words:
                    if word.startswith("@"):
                        ent_counter[word] += 1
                        word_counter[word] += 1
                    else:
                        word_counter[word] += 1
                        lower_word_counter[word.lower()] += 1
                        for c in word:
                            char_counter[c] += 1
                for sent in sents:
                    for word in sent:
                        if word.startswith("@"):
                            ent_counter[word] += 1
                            word_counter[word] += 1
                        else:
                            word_counter[word] += 1
                            lower_word_counter[word.lower()] += 1
                            for c in word:
                                char_counter[c] += 1

                out_file_names.append(file_name)
                lens.append(len(sents))
    num_examples = len(out_file_names)

    assert len(out_file_names) == len(lens)
    # Order the kept files by their number of sentences, ascending.
    sorted_file_names, lens = zip(*sorted(zip(out_file_names, lens), key=lambda each: each[1]))
    assert lens[-1] == max_num_sents

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dit = get_word2vec(args, lower_word_counter)

    shared = {'word_counter': word_counter, 'ent_counter': ent_counter, 'char_counter': char_counter,
              'lower_word_counter': lower_word_counter,
              'max_num_sents': max_num_sents, 'max_sent_size': max_sent_size, 'max_word_size': max_word_size,
              'max_ques_size': max_ques_size,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dit, 'sorted': sorted_file_names,
              'num_examples': num_examples}

    print("max num sents: {}".format(max_num_sents))
    print("max ques size: {}".format(max_ques_size))

    if not os.path.exists(args.target_dir):
        os.makedirs(args.target_dir)
    shared_path = os.path.join(args.target_dir, "shared_{}.json".format(mode))
    with open(shared_path, 'w') as fh:
        json.dump(shared, fh)
# Script entry point.
if __name__ == "__main__":
    main()

25
tensorflow/SQuAD/download.sh Executable file
View file

@ -0,0 +1,25 @@
#!/usr/bin/env bash
# Downloads SQuAD v1.1, GloVe 6B and the NLTK punkt tokenizer into $HOME.
# Directories are created with `mkdir -p` so the script can be re-run, and all paths are
# quoted so a $HOME containing spaces works.

DATA_DIR="$HOME/data"
mkdir -p "$DATA_DIR"

# Download SQuAD
SQUAD_DIR="$DATA_DIR/squad"
mkdir -p "$SQUAD_DIR"
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O "$SQUAD_DIR/train-v1.1.json"
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O "$SQUAD_DIR/dev-v1.1.json"

# Download CNN and DailyMail
# Download at: http://cs.nyu.edu/~kcho/DMQA/

# Download GloVe
GLOVE_DIR="$DATA_DIR/glove"
mkdir -p "$GLOVE_DIR"
wget http://nlp.stanford.edu/data/glove.6B.zip -O "$GLOVE_DIR/glove.6B.zip"
unzip "$GLOVE_DIR/glove.6B.zip" -d "$GLOVE_DIR"

# Download NLTK (for tokenizer)
# Make sure that nltk is installed!
python3 -m nltk.downloader -d "$HOME/nltk_data" punkt

View file

View file

@ -0,0 +1,55 @@
import logging
import requests
import nltk
import json
import networkx as nx
import time
class CoreNLPInterface(object):
    """
    Thin HTTP client for a server reachable at http://<url>:<port>/<type>, where <type> is one
    of "doc", "sent", "dep" or "const".
    """

    def __init__(self, url, port):
        self._url = url
        self._port = port

    def get(self, type_, in_, num_max_requests=100):
        """
        POST `in_` to the endpoint `type_`, retrying on failure.
        :param type_: endpoint name appended to the base URL
        :param in_: text to send; encoded as UTF-8
        :param num_max_requests: maximum number of attempts, one second apart
        :return: the decoded response body, or None if the server answered 'error' or every
        attempt failed
        """
        in_ = in_.encode("utf-8")
        url = "http://{}:{}/{}".format(self._url, self._port, type_)
        out = None
        for _ in range(num_max_requests):
            try:
                r = requests.post(url, data=in_)
                out = r.content.decode('utf-8')
                if out == 'error':
                    out = None
                # Any response, including 'error', ends the retry loop.
                break
            except Exception:
                # Best-effort retry on any ordinary failure. Unlike a bare `except:`, this
                # lets KeyboardInterrupt / SystemExit abort the (up to ~100 s) retry loop.
                time.sleep(1)
        return out

    def split_doc(self, doc):
        """Return the JSON-decoded "doc" response for `doc`, or None on failure."""
        out = self.get("doc", doc)
        return out if out is None else json.loads(out)

    def split_sent(self, sent):
        """Return the JSON-decoded "sent" response for `sent`, or None on failure."""
        out = self.get("sent", sent)
        return out if out is None else json.loads(out)

    def get_dep(self, sent):
        """Return the JSON-decoded "dep" response for `sent`, or None on failure."""
        out = self.get("dep", sent)
        return out if out is None else json.loads(out)

    def get_const(self, sent):
        """Return the raw "const" response string for `sent`, or None on failure."""
        out = self.get("const", sent)
        return out

    def get_const_tree(self, sent):
        """Return the "const" response parsed into an nltk Tree, or None on failure."""
        out = self.get_const(sent)
        return out if out is None else nltk.tree.Tree.fromstring(out)

    @staticmethod
    def dep2tree(dep):
        """
        Build a directed graph from 5-tuples (dependent, i, governor, j, label): one edge
        governor -> dependent per tuple, carrying `label` as an edge attribute.
        """
        tree = nx.DiGraph()
        # Loop names differ from the `dep` parameter so the argument is not shadowed.
        for dependent, _i, governor, _j, label in dep:
            tree.add_edge(governor, dependent, label=label)
        return tree

View file

@ -0,0 +1,129 @@
import nltk
import numpy as np
def _set_span(t, i):
if isinstance(t[0], str):
t.span = (i, i+len(t))
else:
first = True
for c in t:
cur_span = _set_span(c, i)
i = cur_span[1]
if first:
min_ = cur_span[0]
first = False
max_ = cur_span[1]
t.span = (min_, max_)
return t.span
def set_span(t):
    """
    Assign spans to every node of `t`, starting at offset 0.
    On any failure the tree is printed and the process exits.
    :param t: nltk.tree.Tree
    :return tuple: span of the root
    """
    assert isinstance(t, nltk.tree.Tree)
    try:
        root_span = _set_span(t, 0)
    except:
        print(t)
        exit()
    return root_span
def tree_contains_span(tree, span):
    """
    Assumes that tree span has been set with set_span
    Returns true if any subtree of tree has exactly the given span
    :param tree: object whose subtrees() yields nodes carrying a `span` attribute
    :param span: span to look for
    :return bool:
    """
    known_spans = {subtree.span for subtree in tree.subtrees()}
    return span in known_spans
def span_len(span):
    """Return the length of a (start, stop) span, i.e. stop - start."""
    start, stop = span[0], span[1]
    return stop - start
def span_overlap(s1, s2):
    """
    Intersect two (start, stop) spans.
    :return: (start, stop) of the common part, or None if the spans share nothing
    """
    start = max(s1[0], s2[0])
    stop = min(s1[1], s2[1])
    if not stop > start:
        return None
    return start, stop
def span_prec(true_span, pred_span):
    """Fraction of the predicted span that overlaps the true span (0 if they are disjoint)."""
    common = span_overlap(true_span, pred_span)
    return 0 if common is None else span_len(common) / span_len(pred_span)
def span_recall(true_span, pred_span):
    """Fraction of the true span that overlaps the predicted span (0 if they are disjoint)."""
    common = span_overlap(true_span, pred_span)
    return 0 if common is None else span_len(common) / span_len(true_span)
def span_f1(true_span, pred_span):
    """Harmonic mean of span precision and recall; 0.0 when either of them is zero."""
    precision = span_prec(true_span, pred_span)
    recall = span_recall(true_span, pred_span)
    if precision and recall:
        return 2 * precision * recall / (precision + recall)
    return 0.0
def find_max_f1_span(tree, span):
    """Return the span of the subtree of `tree` whose span has the highest F1 against `span`."""
    best_subtree = find_max_f1_subtree(tree, span)
    return best_subtree.span
def find_max_f1_subtree(tree, span):
    """
    Return the subtree of `tree` whose span has the highest F1 against `span`.
    On ties the first such subtree in subtrees() order wins.
    """
    return max(tree.subtrees(), key=lambda subtree: span_f1(span, subtree.span))
def tree2matrix(tree, node2num, row_size=None, col_size=None, dtype='int32'):
    """
    Encode a tree as a (row, col) matrix of node numbers plus a boolean mask.
    A subtree is placed at row `height - 2` and column `span[0]`.
    :param tree: nltk tree; spans are (re)assigned via set_span
    :param node2num: callable mapping a subtree to the number stored in the matrix
    :param row_size: number of rows; defaults to tree.height() - 1
    :param col_size: number of columns; defaults to the number of leaves
    :param dtype: dtype of the returned matrix
    :return: (matrix of shape [row_size, col_size], mask of shape [row_size, col_size, col_size])
    """
    set_span(tree)
    D = tree.height() - 1
    B = len(tree.leaves())
    row_size = row_size or D
    col_size = col_size or B
    matrix = np.zeros([row_size, col_size], dtype=dtype)
    mask = np.zeros([row_size, col_size, col_size], dtype='bool')

    for subtree in tree.subtrees():
        row = subtree.height() - 2
        col = subtree.span[0]
        matrix[row, col] = node2num(subtree)
        # Mark, for this cell, the start column of every node below it.
        for subsub in subtree.subtrees():
            if isinstance(subsub, nltk.tree.Tree):
                mask[row, col, subsub.span[0]] = True
                if not isinstance(subsub[0], nltk.tree.Tree):
                    # subsub's first child is not a Tree: also mark its own column on the
                    # diagonal of every row below `row`.
                    c = subsub.span[0]
                    for r in range(row):
                        mask[r, c, c] = True
            else:
                mask[row, col, col] = True

    return matrix, mask
def load_compressed_tree(s):
    """
    Parse a bracketed tree string and collapse chains of single-child nodes.
    A node whose only child is itself a tree is replaced by that (recursively compressed)
    child; nodes with several children have their tree children compressed in place.
    :param s: bracketed tree string accepted by nltk.tree.Tree.fromstring
    :return: the compressed tree
    """

    def compress_tree(tree):
        assert not isinstance(tree, str)
        if len(tree) == 1:
            only_child = tree[0]
            if isinstance(only_child, nltk.tree.Tree):
                return compress_tree(only_child)
            return tree
        for i, child in enumerate(tree):
            tree[i] = compress_tree(child) if isinstance(child, nltk.tree.Tree) else child
        return tree

    parsed = nltk.tree.Tree.fromstring(s)
    return compress_tree(parsed)

View file

@ -0,0 +1 @@
from my.tensorflow.general import *

View file

@ -0,0 +1,177 @@
from itertools import zip_longest
import itertools
import tensorflow as tf
from functools import reduce
from operator import mul
import numpy as np
# Extreme magnitudes shared by the helpers in this module.
VERY_BIG_NUMBER = 1e30
VERY_SMALL_NUMBER = 1e-30
VERY_POSITIVE_NUMBER = VERY_BIG_NUMBER
# Added by exp_mask to positions where the mask is false.
VERY_NEGATIVE_NUMBER = -VERY_BIG_NUMBER
def get_initializer(matrix):
    """
    Wrap a fixed value in an initializer-style callable.
    Args:
      matrix: value to return
    Returns:
      A callable that ignores all of its arguments and returns `matrix`.
    """
    def _initializer(shape, dtype=None, partition_info=None, **kwargs):
        return matrix

    return _initializer
def variable_on_cpu(name, shape, initializer):
    """Create (or fetch) a variable while pinned to '/cpu:0'.

    Args:
      name: name of the variable
      shape: list of ints
      initializer: initializer for Variable
    Returns:
      Variable Tensor
    """
    with tf.device('/cpu:0'):
        return tf.get_variable(name, shape, initializer=initializer)
def variable_with_weight_decay(name, shape, stddev, wd):
    """Create a truncated-normal initialized variable, optionally with weight decay.

    When `wd` is truthy, `wd` times the L2 loss of the variable is added to the
    'losses' collection.

    Args:
      name: name of the variable
      shape: list of ints
      stddev: standard deviation of a truncated Gaussian
      wd: L2 weight decay factor. If None (or otherwise falsy), no weight decay is added
          for this Variable.
    Returns:
      Variable Tensor
    """
    initializer = tf.truncated_normal_initializer(stddev=stddev)
    var = variable_on_cpu(name, shape, initializer)
    if not wd:
        return var
    penalty = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
    tf.add_to_collection('losses', penalty)
    return var
def average_gradients(tower_grads):
    """Average the gradient of each variable over all towers.

    Args:
      tower_grads: list with one entry per tower; each entry is a list of
        (gradient, variable) tuples. The k-th tuples of all towers are averaged together.
    Returns:
      List of (gradient, variable) pairs, one per position, where the gradient is the mean
      over towers and the variable is the one listed by the first tower.
    """
    averaged = []
    # zip(*...) regroups the per-tower lists into per-variable tuples:
    # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
    for grad_and_vars in zip(*tower_grads):
        stacked = []
        for g, var in grad_and_vars:
            assert g is not None, var.name
            # Leading axis of size 1 = the tower axis that is averaged away below.
            stacked.append(tf.expand_dims(g, 0))
        mean_grad = tf.reduce_mean(tf.concat(axis=0, values=stacked), 0)
        first_tower_var = grad_and_vars[0][1]
        averaged.append((mean_grad, first_tower_var))
    return averaged
def mask(val, mask, name=None):
    """Multiply `val` by `mask` cast to float; the op is named 'mask' unless `name` is given."""
    op_name = 'mask' if name is None else name
    float_mask = tf.cast(mask, 'float')
    return tf.multiply(val, float_mask, name=op_name)
def exp_mask(val, mask, name=None):
    """Add a very negative number to the elements of val where mask is false.

    For example, [-3, -2, 10], [True, True, False] -> [-3, -2, 10 + VERY_NEGATIVE_NUMBER].
    Typically, this effectively masks in exponential space (e.g. softmax)

    Args:
      val: values to be masked
      mask: masking boolean tensor
      name: name for output tensor; defaults to "exp_mask"
    Returns:
      val plus (1 - mask) * VERY_NEGATIVE_NUMBER
    """
    op_name = "exp_mask" if name is None else name
    penalty = (1 - tf.cast(mask, 'float')) * VERY_NEGATIVE_NUMBER
    return tf.add(val, penalty, name=op_name)
def flatten(tensor, keep):
    """
    Collapse all but the last `keep` dimensions of `tensor` into a single leading dimension.
    Statically known dimensions are used where available; unknown ones fall back to the
    dynamic shape.
    """
    static_shape = tensor.get_shape().as_list()
    rank = len(static_shape)
    split_at = rank - keep

    def dim(i):
        return static_shape[i] or tf.shape(tensor)[i]

    left = reduce(mul, [dim(i) for i in range(split_at)])
    out_shape = [left] + [dim(i) for i in range(split_at, rank)]
    return tf.reshape(tensor, out_shape)
def reconstruct(tensor, ref, keep):
    """
    Reshape `tensor` so that its leading dimensions are those of `ref` (all but ref's last
    `keep`) followed by tensor's own last `keep` dimensions.
    Statically known dimensions are used where available; unknown ones fall back to the
    dynamic shape.
    """
    ref_shape = ref.get_shape().as_list()
    tensor_shape = tensor.get_shape().as_list()
    ref_stop = len(ref_shape) - keep
    tensor_start = len(tensor_shape) - keep

    target_shape = []
    for i in range(ref_stop):
        target_shape.append(ref_shape[i] or tf.shape(ref)[i])
    for i in range(tensor_start, len(tensor_shape)):
        target_shape.append(tensor_shape[i] or tf.shape(tensor)[i])
    return tf.reshape(tensor, target_shape)
def add_wd(wd, scope=None):
    """Register an L2 weight-decay term for every trainable variable in ``scope``.

    Each term is ``l2_loss(var) * wd`` and is added to the ``'losses'``
    collection.

    Args:
        wd: weight-decay coefficient.
        scope: scope used to filter trainable variables; when falsy, the
            name of the current variable scope is used.
    """
    if not scope:
        scope = tf.get_variable_scope().name
    trainables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
    with tf.name_scope("weight_decay"):
        for variable in trainables:
            term_name = "{}/wd".format(variable.op.name)
            term = tf.multiply(tf.nn.l2_loss(variable), wd, name=term_name)
            tf.add_to_collection('losses', term)
def grouper(iterable, n, fillvalue=None, shorten=False, num_groups=None):
    """Split ``iterable`` into consecutive tuples of length ``n``.

    Args:
        iterable: items to group.
        n: group size.
        fillvalue: value used to pad the last, incomplete group.
        shorten: when true, drop ``None`` entries from every group
            (requires ``fillvalue`` to be ``None``); the result is then a
            generator instead of a list.
        num_groups: when given, pad the result with all-``fillvalue``
            groups until it holds at least this many groups.

    Returns:
        A list of tuples, or a generator of tuples when ``shorten`` is true.
    """
    shared_iter = iter(iterable)
    # n references to one iterator: zip_longest pulls n items per group.
    groups = list(zip_longest(*([shared_iter] * n), fillvalue=fillvalue))

    if num_groups is not None:
        assert isinstance(num_groups, int)
        filler = (fillvalue, ) * n
        shortfall = num_groups - len(groups)
        if shortfall > 0:
            groups = groups + [filler] * shortfall

    if not shorten:
        return groups
    assert fillvalue is None
    return (tuple(item for item in group if item is not None) for group in groups)
def padded_reshape(tensor, shape, mode='CONSTANT', name=None):
    """Pad ``tensor`` at the end of each axis until it reaches ``shape``.

    Args:
        tensor: tensor to pad.
        shape: target size for each axis.
        mode: padding mode forwarded to ``tf.pad``.
        name: op name forwarded to ``tf.pad``.

    Returns:
        The padded tensor.
    """
    current = lambda axis: tf.shape(tensor)[axis]
    paddings = [[0, target - current(axis)] for axis, target in enumerate(shape)]
    return tf.pad(tensor, paddings, mode=mode, name=name)
def get_num_params():
    """Return the total number of scalar entries across all trainable variables."""
    return sum(
        reduce(mul, [dim.value for dim in variable.get_shape()], 1)
        for variable in tf.trainable_variables()
    )

View file

@ -0,0 +1,180 @@
from tensorflow.python.ops.rnn_cell_impl import _linear
from tensorflow.python.util import nest
import tensorflow as tf
from my.tensorflow import flatten, reconstruct, add_wd, exp_mask
def linear(args, output_size, bias, bias_start=0.0, scope=None, squeeze=False, wd=0.0, input_keep_prob=1.0,
           is_train=None):
    """Apply a linear map over the last axis of one or more tensors.

    Every argument is flattened to 2-D, optionally passed through dropout
    (train time only), fed to ``_linear`` and reshaped back to the leading
    dimensions of the first argument.

    Args:
        args: a tensor or a non-empty sequence of tensors.
        output_size: size of the output's last axis.
        bias: whether ``_linear`` adds a bias term.
        bias_start: constant used to initialise the bias.
        scope: variable scope name; defaults to ``'Linear'``.
        squeeze: when true, squeeze the last axis of the result.
        wd: weight-decay coefficient; when truthy ``add_wd(wd)`` is called.
        input_keep_prob: keep probability of the input dropout.
        is_train: boolean tensor selecting dropout; required when
            ``input_keep_prob < 1.0``.

    Returns:
        The projected tensor.

    Raises:
        ValueError: if ``args`` is ``None`` or an empty sequence.
    """
    missing = args is None or (nest.is_sequence(args) and not args)
    if missing:
        raise ValueError("`args` must be specified")
    if not nest.is_sequence(args):
        args = [args]

    flat_args = [flatten(arg, 1) for arg in args]
    if input_keep_prob < 1.0:
        assert is_train is not None

        def maybe_drop(tensor):
            return tf.cond(is_train, lambda: tf.nn.dropout(tensor, input_keep_prob), lambda: tensor)

        flat_args = [maybe_drop(arg) for arg in flat_args]

    with tf.variable_scope(scope or 'Linear'):
        flat_out = _linear(flat_args, output_size, bias,
                           bias_initializer=tf.constant_initializer(bias_start))

    out = reconstruct(flat_out, args[0], 1)
    if squeeze:
        last_axis = len(args[0].get_shape().as_list()) - 1
        out = tf.squeeze(out, [last_axis])
    if wd:
        add_wd(wd)
    return out
def dropout(x, keep_prob, is_train, noise_shape=None, seed=None, name=None):
    """Apply dropout to ``x`` only when ``is_train`` evaluates to true.

    Args:
        x: input tensor.
        keep_prob: keep probability; when not below 1.0, ``x`` is returned
            unchanged and no ops are added.
        is_train: boolean tensor choosing the dropped or the original tensor.
        noise_shape: forwarded to ``tf.nn.dropout``.
        seed: forwarded to ``tf.nn.dropout``.
        name: name scope; defaults to ``"dropout"``.

    Returns:
        ``x`` itself, or a ``tf.cond`` between the dropped tensor and ``x``.
    """
    with tf.name_scope(name or "dropout"):
        if not (keep_prob < 1.0):
            return x
        dropped = tf.nn.dropout(x, keep_prob, noise_shape=noise_shape, seed=seed)
        return tf.cond(is_train, lambda: dropped, lambda: x)
def softmax(logits, mask=None, scope=None):
    """Softmax over the last axis of ``logits``, with optional masking.

    Args:
        logits: tensor of scores.
        mask: optional mask applied through ``exp_mask`` before the softmax.
        scope: name scope; defaults to ``"Softmax"``.

    Returns:
        Tensor with the same shape as ``logits``.
    """
    with tf.name_scope(scope or "Softmax"):
        scores = logits if mask is None else exp_mask(logits, mask)
        flat_probs = tf.nn.softmax(flatten(scores, 1))
        return reconstruct(flat_probs, scores, 1)
def softsel(target, logits, mask=None, scope=None):
    """Softmax-weighted sum of ``target`` along its second-to-last axis.

    :param target: [ ..., J, d] dtype=float
    :param logits: [ ..., J], dtype=float
    :param mask: [ ..., J], dtype=bool
    :param scope: name scope; defaults to "Softsel"
    :return: [..., d], dtype=float
    """
    with tf.name_scope(scope or "Softsel"):
        weights = softmax(logits, mask=mask)
        sum_axis = len(target.get_shape().as_list()) - 2
        weighted = tf.expand_dims(weights, -1) * target
        return tf.reduce_sum(weighted, sum_axis)
def double_linear_logits(args, size, bias, bias_start=0.0, scope=None, mask=None, wd=0.0, input_keep_prob=1.0, is_train=None):
    """Compute logits with two stacked linear layers (tanh in between).

    The first layer (scope ``'first'``) projects ``args`` to ``size`` units
    and applies ``tanh``; the second (scope ``'second'``) projects to one
    unit and squeezes it away.

    Args:
        args: tensor or sequence of tensors fed to the first layer.
        size: width of the hidden layer.
        bias: whether both layers use a bias.
        bias_start: bias initial value for both layers.
        scope: variable scope; defaults to ``"Double_Linear_Logits"``.
        mask: optional mask applied to the logits through ``exp_mask``.
        wd: weight-decay coefficient forwarded to ``linear``.
        input_keep_prob: dropout keep probability forwarded to ``linear``.
        is_train: boolean tensor forwarded to ``linear``.

    Returns:
        The (optionally masked) logits.
    """
    shared = dict(bias_start=bias_start, wd=wd, input_keep_prob=input_keep_prob, is_train=is_train)
    with tf.variable_scope(scope or "Double_Linear_Logits"):
        hidden = tf.tanh(linear(args, size, bias, scope='first', **shared))
        logits = linear(hidden, 1, bias, squeeze=True, scope='second', **shared)
        if mask is None:
            return logits
        return exp_mask(logits, mask)
def linear_logits(args, bias, bias_start=0.0, scope=None, mask=None, wd=0.0, input_keep_prob=1.0, is_train=None):
    """Compute logits with a single linear layer projecting to one unit.

    Args:
        args: tensor or sequence of tensors fed to ``linear``.
        bias: whether the layer uses a bias.
        bias_start: bias initial value.
        scope: variable scope; defaults to ``"Linear_Logits"``.
        mask: optional mask applied to the logits through ``exp_mask``.
        wd: weight-decay coefficient forwarded to ``linear``.
        input_keep_prob: dropout keep probability forwarded to ``linear``.
        is_train: boolean tensor forwarded to ``linear``.

    Returns:
        The (optionally masked) logits, with the unit axis squeezed away.
    """
    with tf.variable_scope(scope or "Linear_Logits"):
        raw = linear(args, 1, bias, bias_start=bias_start, squeeze=True, scope='first',
                     wd=wd, input_keep_prob=input_keep_prob, is_train=is_train)
        if mask is None:
            return raw
        return exp_mask(raw, mask)
def sum_logits(args, mask=None, name=None):
    """Compute logits by summing each argument over its last axis.

    Args:
        args: a tensor or a non-empty sequence of tensors; the rank of the
            first one determines the axis that is summed.
        mask: optional mask applied to the logits through ``exp_mask``.
        name: name scope; defaults to ``"sum_logits"``.

    Returns:
        The (optionally masked) sum of the per-argument reductions.

    Raises:
        ValueError: if ``args`` is ``None`` or an empty sequence.
    """
    with tf.name_scope(name or "sum_logits"):
        if args is None or (nest.is_sequence(args) and not args):
            raise ValueError("`args` must be specified")
        tensors = args if nest.is_sequence(args) else [args]
        last_axis = len(tensors[0].get_shape()) - 1
        logits = sum(tf.reduce_sum(tensor, last_axis) for tensor in tensors)
        if mask is None:
            return logits
        return exp_mask(logits, mask)
def get_logits(args, size, bias, bias_start=0.0, scope=None, mask=None, wd=0.0, input_keep_prob=1.0, is_train=None, func=None):
    """Dispatch to one of the logit functions selected by ``func``.

    Supported values of ``func``:

    * ``'sum'`` (default): ``sum_logits`` over ``args``.
    * ``'linear'``: ``linear_logits`` over ``args``.
    * ``'double'``: ``double_linear_logits`` over ``args`` with hidden ``size``.
    * ``'dot'``: ``sum_logits`` of ``args[0] * args[1]``.
    * ``'mul_linear'``: ``linear_logits`` of ``args[0] * args[1]``.
    * ``'proj'``: project ``args[0]`` to the last dimension of ``args[1]``
      with a bias-free ``linear`` and sum the product with ``args[1]``.
    * ``'tri_linear'``: ``linear_logits`` of ``[args[0], args[1], args[0] * args[1]]``.

    The two-argument modes assert ``len(args) == 2``.

    Args:
        args: tensor(s) the logits are computed from.
        size: hidden size, used only by ``'double'``.
        bias: whether the linear layers use a bias.
        bias_start: bias initial value.
        scope: scope/name forwarded to the selected function.
        mask: optional mask forwarded to the selected function.
        wd: weight-decay coefficient.
        input_keep_prob: dropout keep probability.
        is_train: boolean tensor controlling dropout.
        func: name of the logit function; ``None`` means ``'sum'``.

    Returns:
        The logits tensor produced by the selected function.

    Raises:
        ValueError: if ``func`` is not one of the supported names.
    """
    if func is None:
        func = "sum"
    if func == 'sum':
        return sum_logits(args, mask=mask, name=scope)
    elif func == 'linear':
        return linear_logits(args, bias, bias_start=bias_start, scope=scope, mask=mask, wd=wd, input_keep_prob=input_keep_prob,
                             is_train=is_train)
    elif func == 'double':
        return double_linear_logits(args, size, bias, bias_start=bias_start, scope=scope, mask=mask, wd=wd, input_keep_prob=input_keep_prob,
                                    is_train=is_train)
    elif func == 'dot':
        assert len(args) == 2
        arg = args[0] * args[1]
        return sum_logits([arg], mask=mask, name=scope)
    elif func == 'mul_linear':
        assert len(args) == 2
        arg = args[0] * args[1]
        return linear_logits([arg], bias, bias_start=bias_start, scope=scope, mask=mask, wd=wd, input_keep_prob=input_keep_prob,
                             is_train=is_train)
    elif func == 'proj':
        assert len(args) == 2
        d = args[1].get_shape()[-1]
        proj = linear([args[0]], d, False, bias_start=bias_start, scope=scope, wd=wd, input_keep_prob=input_keep_prob,
                      is_train=is_train)
        return sum_logits([proj * args[1]], mask=mask)
    elif func == 'tri_linear':
        assert len(args) == 2
        new_arg = args[0] * args[1]
        return linear_logits([args[0], args[1], new_arg], bias, bias_start=bias_start, scope=scope, mask=mask, wd=wd, input_keep_prob=input_keep_prob,
                             is_train=is_train)
    else:
        # ValueError is still an Exception, so existing handlers keep working,
        # but the message now says which value was rejected.
        raise ValueError("unknown logit function: {!r}".format(func))
def highway_layer(arg, bias, bias_start=0.0, scope=None, wd=0.0, input_keep_prob=1.0, is_train=None):
    """One highway layer: ``gate * relu(trans(arg)) + (1 - gate) * arg``.

    ``trans`` and ``gate`` are separate ``linear`` projections (scopes
    ``'trans'`` and ``'gate'``) to the size of ``arg``'s last axis; the
    gate goes through a sigmoid.

    Args:
        arg: input tensor.
        bias: whether the projections use a bias.
        bias_start: bias initial value.
        scope: variable scope; defaults to ``"highway_layer"``.
        wd: weight-decay coefficient forwarded to ``linear``.
        input_keep_prob: dropout keep probability forwarded to ``linear``.
        is_train: boolean tensor forwarded to ``linear``.

    Returns:
        The layer output.
    """
    with tf.variable_scope(scope or "highway_layer"):
        width = arg.get_shape()[-1]

        def project(name):
            return linear([arg], width, bias, bias_start=bias_start, scope=name, wd=wd,
                          input_keep_prob=input_keep_prob, is_train=is_train)

        transformed = tf.nn.relu(project('trans'))
        gate = tf.nn.sigmoid(project('gate'))
        return gate * transformed + (1 - gate) * arg
def highway_network(arg, num_layers, bias, bias_start=0.0, scope=None, wd=0.0, input_keep_prob=1.0, is_train=None):
    """Stack ``num_layers`` highway layers, each in scope ``layer_<idx>``.

    Args:
        arg: input tensor of the first layer.
        num_layers: number of layers to stack.
        bias: whether the layers use a bias.
        bias_start: bias initial value.
        scope: variable scope; defaults to ``"highway_network"``.
        wd: weight-decay coefficient forwarded to every layer.
        input_keep_prob: dropout keep probability forwarded to every layer.
        is_train: boolean tensor forwarded to every layer.

    Returns:
        Output of the last layer, or ``None`` when ``num_layers`` is 0.
    """
    with tf.variable_scope(scope or "highway_network"):
        layer_input = arg
        output = None
        for layer_idx in range(num_layers):
            layer_scope = "layer_{}".format(layer_idx)
            output = highway_layer(layer_input, bias, bias_start=bias_start, scope=layer_scope, wd=wd,
                                   input_keep_prob=input_keep_prob, is_train=is_train)
            layer_input = output
        return output
def conv1d(in_, filter_size, height, padding, is_train=None, keep_prob=1.0, scope=None):
    """Convolve along axis 2 of ``in_`` and max-pool over that axis.

    Creates a ``[1, height, num_channels, filter_size]`` filter and a
    ``[filter_size]`` bias, optionally applies dropout to the input, runs
    ``tf.nn.conv2d`` with unit strides, then takes ``relu`` and the maximum
    over axis 2.

    Args:
        in_: input tensor; its last axis gives the channel count.
        filter_size: number of output channels.
        height: filter extent along axis 2.
        padding: padding mode forwarded to ``tf.nn.conv2d``.
        is_train: boolean tensor enabling dropout; dropout is skipped when
            ``None``.
        keep_prob: dropout keep probability.
        scope: variable scope; defaults to ``"conv1d"``.

    Returns:
        The pooled feature tensor.
    """
    with tf.variable_scope(scope or "conv1d"):
        num_channels = in_.get_shape()[-1]
        kernel = tf.get_variable("filter", shape=[1, height, num_channels, filter_size], dtype='float')
        offset = tf.get_variable("bias", shape=[filter_size], dtype='float')
        use_dropout = is_train is not None and keep_prob < 1.0
        if use_dropout:
            in_ = dropout(in_, keep_prob, is_train)
        conv = tf.nn.conv2d(in_, kernel, [1, 1, 1, 1], padding) + offset  # [N*M, JX, W/filter_stride, d]
        return tf.reduce_max(tf.nn.relu(conv), 2)  # [-1, JX, d]
def multi_conv1d(in_, filter_sizes, heights, padding, is_train=None, keep_prob=1.0, scope=None):
    """Run several ``conv1d`` filters and concatenate their outputs on axis 2.

    Args:
        in_: input tensor shared by every convolution.
        filter_sizes: output channel count per convolution; entries equal
            to 0 are skipped.
        heights: filter height per convolution (same length as
            ``filter_sizes``); also used in the scope name ``conv1d_<height>``.
        padding: padding mode forwarded to ``conv1d``.
        is_train: boolean tensor forwarded to ``conv1d``.
        keep_prob: dropout keep probability forwarded to ``conv1d``.
        scope: variable scope; defaults to ``"multi_conv1d"``.

    Returns:
        The concatenated feature tensor.
    """
    with tf.variable_scope(scope or "multi_conv1d"):
        assert len(filter_sizes) == len(heights)
        features = [
            conv1d(in_, filter_size, height, padding, is_train=is_train, keep_prob=keep_prob,
                   scope="conv1d_{}".format(height))
            for filter_size, height in zip(filter_sizes, heights)
            if filter_size != 0
        ]
        return tf.concat(axis=2, values=features)

View file

@ -0,0 +1,81 @@
import tensorflow as tf
from tensorflow.python.ops.rnn import dynamic_rnn as _dynamic_rnn, \
bidirectional_dynamic_rnn as _bidirectional_dynamic_rnn
from my.tensorflow import flatten, reconstruct
def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
                dtype=None, parallel_iterations=None, swap_memory=False,
                time_major=False, scope=None):
    """``dynamic_rnn`` for inputs with arbitrary leading batch dimensions.

    ``inputs`` is flattened to ``[-1, J, d]`` (and ``sequence_length`` to
    1-D int64) before calling TensorFlow's ``dynamic_rnn``; the outputs are
    reshaped back to the leading dimensions of ``inputs``.

    Returns:
        ``(outputs, final_state)``; ``final_state`` is returned as produced
        for the flattened batch.
    """
    assert not time_major  # TODO : to be implemented later!
    flat_inputs = flatten(inputs, 2)  # [-1, J, d]
    if sequence_length is None:
        flat_len = None
    else:
        flat_len = tf.cast(flatten(sequence_length, 0), 'int64')

    rnn_kwargs = dict(sequence_length=flat_len,
                      initial_state=initial_state, dtype=dtype,
                      parallel_iterations=parallel_iterations, swap_memory=swap_memory,
                      time_major=time_major, scope=scope)
    flat_outputs, final_state = _dynamic_rnn(cell, flat_inputs, **rnn_kwargs)
    return reconstruct(flat_outputs, inputs, 2), final_state
def bw_dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
                   dtype=None, parallel_iterations=None, swap_memory=False,
                   time_major=False, scope=None):
    """Run ``cell`` backwards in time over ``inputs``.

    ``inputs`` is flattened to ``[-1, J, d]``, reversed along the time axis
    (respecting per-example lengths when ``sequence_length`` is given), fed
    to TensorFlow's ``dynamic_rnn``, and the outputs are reversed back and
    reshaped to the leading dimensions of ``inputs``.

    Returns:
        ``(outputs, final_state)``; ``final_state`` is returned as produced
        for the flattened batch.
    """
    assert not time_major  # TODO : to be implemented later!

    flat_inputs = flatten(inputs, 2)  # [-1, J, d]
    flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64')

    def reverse_time(tensor):
        if flat_len is None:
            # tf.reverse expects a list of axes, not a bare integer.
            return tf.reverse(tensor, [1])
        # Use the flattened lengths: the tensor's batch axis is flattened too,
        # so the original (possibly multi-dimensional) lengths would not match.
        return tf.reverse_sequence(tensor, flat_len, 1)

    flat_outputs, final_state = _dynamic_rnn(cell, reverse_time(flat_inputs), sequence_length=flat_len,
                                             initial_state=initial_state, dtype=dtype,
                                             parallel_iterations=parallel_iterations, swap_memory=swap_memory,
                                             time_major=time_major, scope=scope)
    flat_outputs = reverse_time(flat_outputs)

    outputs = reconstruct(flat_outputs, inputs, 2)
    return outputs, final_state
def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
                              initial_state_fw=None, initial_state_bw=None,
                              dtype=None, parallel_iterations=None,
                              swap_memory=False, time_major=False, scope=None):
    """``bidirectional_dynamic_rnn`` for inputs with arbitrary leading dimensions.

    ``inputs`` is flattened to ``[-1, J, d]`` (and ``sequence_length`` to
    1-D int64), run through TensorFlow's ``bidirectional_dynamic_rnn``, and
    both output tensors are reshaped back to the leading dimensions of
    ``inputs``.

    Returns:
        ``((fw_outputs, bw_outputs), final_state)``.
    """
    assert not time_major

    flat_inputs = flatten(inputs, 2)  # [-1, J, d]
    if sequence_length is None:
        flat_len = None
    else:
        flat_len = tf.cast(flatten(sequence_length, 0), 'int64')

    flat_pair, final_state = _bidirectional_dynamic_rnn(
        cell_fw, cell_bw, flat_inputs, sequence_length=flat_len,
        initial_state_fw=initial_state_fw, initial_state_bw=initial_state_bw,
        dtype=dtype, parallel_iterations=parallel_iterations, swap_memory=swap_memory,
        time_major=time_major, scope=scope)
    flat_fw_outputs, flat_bw_outputs = flat_pair

    fw_outputs = reconstruct(flat_fw_outputs, inputs, 2)
    bw_outputs = reconstruct(flat_bw_outputs, inputs, 2)
    # FIXME : final state is not reshaped!
    return (fw_outputs, bw_outputs), final_state
def bidirectional_rnn(cell_fw, cell_bw, inputs,
                      initial_state_fw=None, initial_state_bw=None,
                      dtype=None, sequence_length=None, scope=None):
    """Bidirectional RNN over inputs with arbitrary leading dimensions.

    ``inputs`` is flattened to ``[-1, J, d]`` (and ``sequence_length`` to
    1-D int64), run through ``tf.nn.bidirectional_dynamic_rnn``, and both
    output tensors are reshaped back to the leading dimensions of ``inputs``.

    Returns:
        ``((fw_outputs, bw_outputs), final_state)``.
    """
    flat_inputs = flatten(inputs, 2)  # [-1, J, d]
    if sequence_length is None:
        flat_len = None
    else:
        flat_len = tf.cast(flatten(sequence_length, 0), 'int64')

    flat_pair, final_state = tf.nn.bidirectional_dynamic_rnn(
        cell_fw, cell_bw, flat_inputs, sequence_length=flat_len,
        initial_state_fw=initial_state_fw, initial_state_bw=initial_state_bw,
        dtype=dtype, scope=scope)
    flat_fw_outputs, flat_bw_outputs = flat_pair

    fw_outputs = reconstruct(flat_fw_outputs, inputs, 2)
    bw_outputs = reconstruct(flat_bw_outputs, inputs, 2)
    # FIXME : final state is not reshaped!
    return (fw_outputs, bw_outputs), final_state

View file

@ -0,0 +1,223 @@
import tensorflow as tf
from tensorflow.contrib.rnn import DropoutWrapper, RNNCell, LSTMStateTuple
from my.tensorflow import exp_mask, flatten
from my.tensorflow.nn import linear, softsel, double_linear_logits
class SwitchableDropoutWrapper(DropoutWrapper):
    """``DropoutWrapper`` whose dropout is switched by a boolean tensor.

    Both the dropout path and the plain cell are built; ``is_train``
    selects between them with ``tf.cond``.
    """

    def __init__(self, cell, is_train, input_keep_prob=1.0, output_keep_prob=1.0,
                 seed=None):
        super(SwitchableDropoutWrapper, self).__init__(
            cell, input_keep_prob=input_keep_prob, output_keep_prob=output_keep_prob, seed=seed)
        self.is_train = is_train

    def __call__(self, inputs, state, scope=None):
        def switch(train_value, eval_value):
            # Pick the dropout result in training, the plain result otherwise.
            return tf.cond(self.is_train, lambda: train_value, lambda: eval_value)

        dropped_out, dropped_state = super(SwitchableDropoutWrapper, self).__call__(inputs, state, scope=scope)
        # The second call to the wrapped cell must share the variables
        # created by the first one.
        tf.get_variable_scope().reuse_variables()
        plain_out, plain_state = self._cell(inputs, state, scope)

        outputs = switch(dropped_out, plain_out)
        if isinstance(state, tuple):
            # Rebuild the same tuple type (e.g. an LSTM state tuple) field by field.
            merged = [switch(do_part, plain_part)
                      for do_part, plain_part in zip(dropped_state, plain_state)]
            new_state = state.__class__(*merged)
        else:
            new_state = switch(dropped_state, plain_state)
        return outputs, new_state
class TreeRNNCell(RNNCell):
    """RNN cell whose previous state is a masked reduction over sibling states.

    The input row carries the real input followed by a mask; the mask
    selects which of the ``B`` states are reduced (via ``reduce_func``)
    into the state passed to the wrapped cell.
    """

    def __init__(self, cell, input_size, reduce_func):
        self._cell = cell
        self._input_size = input_size
        self._reduce_func = reduce_func

    @property
    def state_size(self):
        return self._cell.state_size

    @property
    def output_size(self):
        return self._cell.output_size

    def __call__(self, inputs, state, scope=None):
        """
        :param inputs: [N*B, I + B]
        :param state: [N*B, d]
        :param scope: variable scope; defaults to the class name
        :return: [N*B, d]
        """
        with tf.variable_scope(scope or self.__class__.__name__):
            d = self.state_size
            in_size = self._input_size
            x = tf.slice(inputs, [0, 0], [-1, in_size])  # [N*B, I]
            flat_mask = tf.slice(inputs, [0, in_size], [-1, -1])  # [N*B, B]
            B = tf.shape(flat_mask)[1]

            grouped_state = tf.reshape(state, [-1, B, d])  # [N, B, d]
            grouped_state = tf.expand_dims(grouped_state, 1)  # [N, 1, B, d]
            pair_mask = tf.expand_dims(tf.reshape(flat_mask, [-1, B, B]), -1)
            pair_mask = tf.tile(pair_mask, [1, 1, 1, d])  # [N, B, B, d]

            reduced = self._reduce_func(exp_mask(grouped_state, pair_mask), 2)  # [N, B, d]
            reduced = tf.reshape(reduced, [-1, d])  # [N*B, d]
            return self._cell(x, reduced)
class NoOpCell(RNNCell):
    """RNN cell that ignores its input and passes the state through unchanged."""

    def __init__(self, num_units):
        self._num_units = num_units

    @property
    def state_size(self):
        return self._num_units

    @property
    def output_size(self):
        return self._num_units

    def __call__(self, inputs, state, scope=None):
        # The state doubles as the output; ``inputs`` is not used.
        output = new_state = state
        return output, new_state
class MatchCell(RNNCell):
    """RNN cell that attends over a question packed into each input row.

    Each input row holds the step input, a question mask and the flattened
    question vectors; the attended question summary is concatenated with
    the step input and fed to the wrapped cell.
    """

    def __init__(self, cell, input_size, q_len):
        self._cell = cell
        self._input_size = input_size
        # FIXME : This won't be needed with good shape guessing
        self._q_len = q_len

    @property
    def state_size(self):
        return self._cell.state_size

    @property
    def output_size(self):
        return self._cell.output_size

    def __call__(self, inputs, state, scope=None):
        """
        :param inputs: [N, d + JQ + JQ * d]
        :param state: [N, d]
        :param scope: variable scope; defaults to the class name
        :return: result of the wrapped cell on the attended input
        """
        d_in = self._input_size
        q_len = self._q_len

        def tile_over_question(tensor):
            # [N, d] -> [N, JQ, d]
            return tf.tile(tf.expand_dims(tensor, 1), [1, q_len, 1])

        with tf.variable_scope(scope or self.__class__.__name__):
            _, h_prev = state
            x = tf.slice(inputs, [0, 0], [-1, d_in])
            q_mask = tf.slice(inputs, [0, d_in], [-1, q_len])  # [N, JQ]
            qs = tf.slice(inputs, [0, d_in + q_len], [-1, -1])
            qs = tf.reshape(qs, [-1, q_len, d_in])  # [N, JQ, d]

            x_tiled = tile_over_question(x)  # [N, JQ, d]
            h_prev_tiled = tile_over_question(h_prev)  # [N, JQ, d]
            features = tf.tanh(linear([qs, x_tiled, h_prev_tiled], d_in, True, scope='f'))  # [N, JQ, d]
            scores = linear(features, 1, True, squeeze=True, scope='a')
            attention = tf.nn.softmax(exp_mask(scores, q_mask))  # [N, JQ]
            q_summary = tf.reduce_sum(qs * tf.expand_dims(attention, -1), 1)
            cell_input = tf.concat(axis=1, values=[x, q_summary])  # [N, 2d]
            return self._cell(cell_input, state)
class AttentionCell(RNNCell):
    """RNN cell that attends over a memory before each step of the wrapped cell."""

    def __init__(self, cell, memory, mask=None, controller=None, mapper=None, input_keep_prob=1.0, is_train=None):
        """
        Early fusion attention cell: uses the (inputs, state) to control the current attention.

        :param cell: wrapped RNN cell
        :param memory: [N, M, m]
        :param mask: optional mask over the memory slots; may be None
        :param controller: (inputs, prev_state, memory) -> memory_logits;
            defaults to the linear controller with bias
        :param mapper: (inputs, state, sel_mem) -> (new_inputs, new_state);
            None selects the concat mapper, 'sim' the similarity mapper
        :param input_keep_prob: NOTE(review): accepted but not forwarded to
            the default controller - confirm whether that is intended
        :param is_train: boolean tensor forwarded to the default controller
        """
        self._cell = cell
        self._memory = memory
        self._mask = mask
        self._flat_memory = flatten(memory, 2)
        # ``mask`` is optional; flattening None would raise, and softsel
        # already accepts mask=None.
        self._flat_mask = None if mask is None else flatten(mask, 1)
        if controller is None:
            controller = AttentionCell.get_linear_controller(True, is_train=is_train)
        self._controller = controller
        if mapper is None:
            mapper = AttentionCell.get_concat_mapper()
        elif mapper == 'sim':
            mapper = AttentionCell.get_sim_mapper()
        self._mapper = mapper

    @property
    def state_size(self):
        return self._cell.state_size

    @property
    def output_size(self):
        return self._cell.output_size

    def __call__(self, inputs, state, scope=None):
        with tf.variable_scope(scope or "AttentionCell"):
            memory_logits = self._controller(inputs, state, self._flat_memory)
            sel_mem = softsel(self._flat_memory, memory_logits, mask=self._flat_mask)  # [N, m]
            new_inputs, new_state = self._mapper(inputs, state, sel_mem)
            # Honour the state returned by the mapper; the built-in mappers
            # return ``state`` unchanged, so their behaviour is unaffected.
            return self._cell(new_inputs, new_state)

    @staticmethod
    def _controller_input(inputs, state, memory):
        """Tile inputs and state(s) over the memory axis and concat with memory on axis 2."""
        rank = len(memory.get_shape())
        memory_size = tf.shape(memory)[rank - 2]

        def tile(tensor):
            return tf.tile(tf.expand_dims(tensor, 1), [1, memory_size, 1])

        states = state if isinstance(state, tuple) else (state,)
        tiled_states = [tile(each) for each in states]
        # [N, M, d]
        return tf.concat([tile(inputs)] + tiled_states + [memory], axis=2)

    @staticmethod
    def get_double_linear_controller(size, bias, input_keep_prob=1.0, is_train=None):
        def double_linear_controller(inputs, state, memory):
            """
            :param inputs: [N, i]
            :param state: [N, d]
            :param memory: [N, M, m]
            :return: [N, M]
            """
            in_ = AttentionCell._controller_input(inputs, state, memory)
            return double_linear_logits(in_, size, bias, input_keep_prob=input_keep_prob,
                                        is_train=is_train)
        return double_linear_controller

    @staticmethod
    def get_linear_controller(bias, input_keep_prob=1.0, is_train=None):
        def linear_controller(inputs, state, memory):
            """
            :param inputs: [N, i]
            :param state: [N, d]
            :param memory: [N, M, m]
            :return: [N, M]
            """
            in_ = AttentionCell._controller_input(inputs, state, memory)
            return linear(in_, 1, bias, squeeze=True, input_keep_prob=input_keep_prob, is_train=is_train)
        return linear_controller

    @staticmethod
    def get_concat_mapper():
        def concat_mapper(inputs, state, sel_mem):
            """
            :param inputs: [N, i]
            :param state: [N, d]
            :param sel_mem: [N, m]
            :return: (new_inputs, new_state) tuple
            """
            return tf.concat(axis=1, values=[inputs, sel_mem]), state
        return concat_mapper

    @staticmethod
    def get_sim_mapper():
        def sim_mapper(inputs, state, sel_mem):
            """
            Assume that inputs and sel_mem are the same size
            :param inputs: [N, i]
            :param state: [N, d]
            :param sel_mem: [N, i]
            :return: (new_inputs, new_state) tuple
            """
            return tf.concat(axis=1, values=[inputs, sel_mem, inputs * sel_mem, tf.abs(inputs - sel_mem)]), state
        return sim_mapper

View file

@ -0,0 +1,58 @@
import json
from collections import deque
import numpy as np
from tqdm import tqdm
def mytqdm(list_, desc="", show=True):
    """Wrap ``list_`` in a described tqdm progress bar, unless ``show`` is false.

    Args:
        list_: iterable to wrap.
        desc: description set on the progress bar.
        show: when false, ``list_`` is returned untouched.

    Returns:
        The tqdm wrapper, or ``list_`` itself.
    """
    if not show:
        return list_
    progress = tqdm(list_)
    progress.set_description(desc)
    return progress
def json_pretty_dump(obj, fh):
    """Write ``obj`` to ``fh`` as JSON with sorted keys and a 2-space indent."""
    pretty_options = {'sort_keys': True, 'indent': 2, 'separators': (',', ': ')}
    return json.dump(obj, fh, **pretty_options)
def index(l, i):
    """Look up a nested element of ``l`` following the index path ``i``.

    ``index(l, (a, b, c))`` returns ``l[a][b][c]``. ``i`` must be non-empty.
    """
    current = l[i[0]]
    for key in i[1:]:
        current = current[key]
    return current
def fill(l, shape, dtype=None):
    """Copy a (possibly ragged) nested list into a zero-initialised array.

    Positions that ``l`` does not cover stay 0, so shorter sub-lists are
    effectively zero-padded.

    Args:
        l: nested list whose nesting depth matches the number of
            dimensions of ``shape``.
        shape: shape of the output array (an int or a tuple, as accepted
            by ``np.zeros``).
        dtype: dtype forwarded to ``np.zeros``.

    Returns:
        The filled ``numpy`` array.

    Raises:
        IndexError: if ``l`` has more entries along some axis than ``shape``.
    """
    out = np.zeros(shape, dtype=dtype)
    # Descend until one index per output dimension has been collected.
    # (Comparing against ``shape`` itself raised TypeError on Python 3.)
    depth = out.ndim
    pending = deque([((), l)])
    while pending:
        indices, cur = pending.pop()
        if len(indices) < depth:
            for i, sub in enumerate(cur):
                pending.appendleft((indices + (i,), sub))
        else:
            out[indices] = cur
    return out
def short_floats(o, precision):
    """Return a copy of ``o`` whose floats print with ``precision`` significant digits.

    Floats are replaced by a ``float`` subclass whose ``repr`` uses the
    ``%.<precision>g`` format. Dicts are rebuilt as dicts; lists and tuples
    are both rebuilt as tuples. Anything else is returned unchanged.
    """
    template = '%.{}g'.format(precision)

    class ShortFloat(float):
        def __repr__(self):
            return template % self

    def convert(obj):
        if isinstance(obj, float):
            return ShortFloat(obj)
        if isinstance(obj, dict):
            return {key: convert(value) for key, value in obj.items()}
        if isinstance(obj, (list, tuple)):
            return tuple(convert(item) for item in obj)
        return obj

    return convert(o)
def argmax(x):
    """Return the multi-dimensional index of the largest entry of array ``x``."""
    flat_position = x.argmax()
    return np.unravel_index(flat_position, x.shape)

View file

@ -0,0 +1,50 @@
import argparse
import os
import shutil
from zipfile import ZipFile
from tqdm import tqdm
def get_args():
    """Parse the command line: one or more ``paths`` and ``-o/--out`` (default ``save.zip``)."""
    cli = argparse.ArgumentParser()
    cli.add_argument('paths', nargs='+')
    cli.add_argument('-o', '--out', default='save.zip')
    return cli.parse_args()
def zip_save(args):
    """Collect checkpoints into ``./save/<model_name>/`` and zip the result.

    For each checkpoint path such as ``out/basic/30/save/basic-18000`` the
    model directory is two levels up (``out/basic/30``) and its last
    component (``30``) names the target folder. The checkpoint is copied to
    ``<target>/save``, the model's ``shared.json`` to ``<target>/shared.json``
    and the original checkpoint path is recorded in ``<target>/readme.txt``.
    Finally the external ``zip`` program archives ``./save`` into ``args.out``.

    Args:
        args: parsed arguments with ``paths`` (checkpoint paths) and ``out``
            (archive name).
    """
    # Local import: the archive step is the only place that spawns a process.
    import subprocess

    temp_dir = "."
    save_dir = os.path.join(temp_dir, "save")
    os.makedirs(save_dir, exist_ok=True)
    for save_source_path in tqdm(args.paths):
        model_dir = os.path.dirname(os.path.dirname(save_source_path))  # "out/basic/30"
        model_name = os.path.basename(model_dir)  # "30"
        cur_dir = os.path.join(save_dir, model_name)
        os.makedirs(cur_dir, exist_ok=True)

        save_target_path = os.path.join(cur_dir, "save")
        shared_target_path = os.path.join(cur_dir, "shared.json")
        readme_path = os.path.join(cur_dir, "readme.txt")
        shared_source_path = os.path.join(model_dir, "shared.json")

        shutil.copy(save_source_path, save_target_path)
        shutil.copy(shared_source_path, shared_target_path)
        with open(readme_path, 'w') as fh:
            fh.write(save_source_path)

    # Pass arguments as a list so paths containing spaces or shell
    # metacharacters are handed to ``zip`` verbatim.
    subprocess.run(["zip", args.out, "-r", save_dir])
def main():
    """Script entry point: parse the command line and build the archive."""
    zip_save(get_args())
if __name__ == "__main__":
main()

View file

@ -0,0 +1,3 @@
nltk
tqdm
jinja2

View file

@ -0,0 +1 @@
python3 -m basic.cli --mode train --noload --len_opt --cluster

View file

View file

@ -0,0 +1,157 @@
import json
import sys
from tqdm import tqdm
from my.corenlp_interface import CoreNLPInterface
# Command line: <input json> <output json> <CoreNLP host/url> <CoreNLP port>
in_path = sys.argv[1]
out_path = sys.argv[2]
url = sys.argv[3]
port = int(sys.argv[4])
# Close the input file as soon as it is parsed instead of leaking the handle.
with open(in_path, 'r') as in_fh:
    data = json.load(in_fh)
h = CoreNLPInterface(url, port)
def find_all(a_str, sub):
    """Yield the start index of every non-overlapping occurrence of ``sub`` in ``a_str``.

    An empty ``sub`` matches at every position ``0..len(a_str)`` (the same
    positions ``str.find`` reports) instead of looping forever.
    """
    # Always advance by at least one character so an empty ``sub`` terminates.
    step = max(len(sub), 1)  # use step = 1 to find overlapping matches
    start = 0
    while True:
        start = a_str.find(sub, start)
        if start == -1:
            return
        yield start
        start += step
def to_hex(s):
    """Return the code points of ``s`` as space-separated hex literals."""
    return " ".join(hex(ord(char)) for char in s)
def handle_nobreak(cand, text):
    """Reconcile two strings that may differ only in no-break spaces (U+00A0).

    Returns ``cand`` when the strings are equal or when ``cand`` equals
    ``text`` after replacing its own no-break spaces; returns ``text`` when
    ``cand`` equals ``text`` with no-break spaces replaced. Any other
    difference raises an exception showing both strings and their code points.
    """
    nbsp = u'\u00A0'
    if cand == text or cand.replace(nbsp, ' ') == text:
        return cand
    if cand == text.replace(nbsp, ' '):
        return text
    raise Exception("{} '{}' {} '{}'".format(cand, to_hex(cand), text, to_hex(text)))
# Resolve unicode complications: drop newlines, turn no-break spaces into
# plain spaces, then make every answer's text and start offset consistent
# with the cleaned context.
wrong_loc_count = 0
loc_diffs = []
for article in data['data']:
    for para in article['paragraphs']:
        context = para['context'].replace(u'\u000A', '').replace(u'\u00A0', ' ')
        para['context'] = context
        for qa in para['qas']:
            for answer in qa['answers']:
                text = answer['text'].replace(u'\u00A0', ' ')
                answer['text'] = text
                answer_start = answer['answer_start']
                stripped = text.lstrip()
                if context[answer_start:answer_start + len(text)] == text:
                    # Offset is right; only leading whitespace may need trimming.
                    if stripped != text:
                        answer_start += len(text) - len(stripped)
                        text = stripped
                        answer['text'] = text
                else:
                    # Offset is wrong: relocate the answer inside the context.
                    wrong_loc_count += 1
                    text = stripped
                    answer['text'] = text
                    starts = list(find_all(context, text))
                    if not starts:
                        raise Exception()
                    if len(starts) == 1:
                        answer_start = starts[0]
                    else:
                        # Several candidates: take the one closest to the stated offset.
                        new_answer_start = min(starts, key=lambda s: abs(s - answer_start))
                        loc_diffs.append(abs(new_answer_start - answer_start))
                        answer_start = new_answer_start
                answer['answer_start'] = answer_start
                answer_stop = answer_start + len(text)
                answer['answer_stop'] = answer_stop
                assert para['context'][answer_start:answer_stop] == answer['text'], "{} {}".format(
                    para['context'][answer_start:answer_stop], answer['text'])
print(wrong_loc_count, loc_diffs)
# Annotate every paragraph and question with CoreNLP output and map each
# answer's character span onto (sentence index, word index) positions.
mismatch_count = 0
dep_fail_count = 0
no_answer_count = 0

size = sum(len(article['paragraphs']) for article in data['data'])
pbar = tqdm(range(size))

for ai, article in enumerate(data['data']):
    for pi, para in enumerate(article['paragraphs']):
        context = para['context']
        sents = h.split_doc(context)
        words = h.split_sent(context)

        # Character offset of each sentence, searching left to right.
        sent_starts = []
        ref_idx = 0
        for sent in sents:
            new_idx = context.find(sent, ref_idx)
            sent_starts.append(new_idx)
            ref_idx = new_idx + len(sent)

        para['sents'] = sents
        para['words'] = words
        para['sent_starts'] = sent_starts
        consts = [h.get_const(sent) for sent in sents]
        para['consts'] = consts
        deps = [h.get_dep(sent) for sent in sents]
        para['deps'] = deps

        for qa in para['qas']:
            question = qa['question']
            qa['const'] = h.get_const(question)
            qa['dep'] = h.get_dep(question)
            qa['words'] = h.split_sent(question)

            for answer in qa['answers']:
                answer_start = answer['answer_start']
                text = answer['text']
                answer_stop = answer_start + len(text)
                word_idxs = []
                answer_words = []
                for sent_idx, (sent, sent_start, dep) in enumerate(zip(sents, sent_starts, deps)):
                    if dep is None:
                        print("dep parse failed at {} {} {}".format(ai, pi, sent_idx))
                        dep_fail_count += 1
                        continue
                    nodes, edges = dep
                    # NOTE(review): this rebinding of ``words`` is not read again below.
                    words = [node[0] for node in nodes]
                    for word_idx, (word, _, _, start, _) in enumerate(nodes):
                        global_start = sent_start + start
                        global_stop = global_start + len(word)
                        starts_inside = answer_start <= global_start < answer_stop
                        ends_inside = answer_start < global_stop <= answer_stop
                        if starts_inside or ends_inside:
                            word_idxs.append((sent_idx, word_idx))
                            answer_words.append(word)

                if word_idxs:
                    last_sent_idx, last_word_idx = word_idxs[-1]
                    answer['answer_word_start'] = word_idxs[0]
                    answer['answer_word_stop'] = last_sent_idx, last_word_idx + 1
                    if not text.startswith(answer_words[0]):
                        print("'{}' '{}'".format(text, ' '.join(answer_words)))
                        mismatch_count += 1
                else:
                    answer['answer_word_start'] = None
                    answer['answer_word_stop'] = None
                    no_answer_count += 1
        # One tick per paragraph, matching ``size`` above.
        pbar.update(1)
pbar.close()
print(mismatch_count, dep_fail_count, no_answer_count)
print("saving...")
# Use a context manager so the output is flushed and closed deterministically.
with open(out_path, 'w') as out_fh:
    json.dump(data, out_fh)

View file

@ -0,0 +1,271 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import json\n",
"\n",
"aug_data_path = \"/Users/minjoons/data/squad/dev-v1.0-aug.json\"\n",
"aug_data = json.load(open(aug_data_path, 'r'))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(['Denver', 'Broncos'], 'Denver Broncos')\n",
"(['Denver', 'Broncos'], 'Denver Broncos')\n",
"(['Denver', 'Broncos'], 'Denver Broncos ')\n",
"(['Carolina', 'Panthers'], 'Carolina Panthers')\n"
]
}
],
"source": [
"def compare_answers():\n",
" for article in aug_data['data']:\n",
" for para in article['paragraphs']:\n",
" deps = para['deps']\n",
" nodess = []\n",
" for dep in deps:\n",
" nodes, edges = dep\n",
" if dep is not None:\n",
" nodess.append(nodes)\n",
" else:\n",
" nodess.append([])\n",
" wordss = [[node[0] for node in nodes] for nodes in nodess]\n",
" for qa in para['qas']:\n",
" for answer in qa['answers']:\n",
" text = answer['text']\n",
" word_start = answer['answer_word_start']\n",
" word_stop = answer['answer_word_stop']\n",
" answer_words = wordss[word_start[0]][word_start[1]:word_stop[1]]\n",
" yield answer_words, text\n",
"\n",
"ca = compare_answers()\n",
"print(next(ca))\n",
"print(next(ca))\n",
"print(next(ca))\n",
"print(next(ca))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"8\n"
]
}
],
"source": [
"def counter():\n",
" count = 0\n",
" for article in aug_data['data']:\n",
" for para in article['paragraphs']:\n",
" deps = para['deps']\n",
" nodess = []\n",
" for dep in deps:\n",
" if dep is None:\n",
" count += 1\n",
" print(count)\n",
"counter()\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n"
]
}
],
"source": [
"def bad_node_counter():\n",
" count = 0\n",
" for article in aug_data['data']:\n",
" for para in article['paragraphs']:\n",
" sents = para['sents']\n",
" deps = para['deps']\n",
" nodess = []\n",
" for dep in deps:\n",
" if dep is not None:\n",
" nodes, edges = dep\n",
" for node in nodes:\n",
" if len(node) != 5:\n",
" count += 1\n",
" print(count)\n",
"bad_node_counter() "
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"7\n"
]
}
],
"source": [
"def noanswer_counter():\n",
" count = 0\n",
" for article in aug_data['data']:\n",
" for para in article['paragraphs']:\n",
" deps = para['deps']\n",
" nodess = []\n",
" for dep in deps:\n",
" if dep is not None:\n",
" nodes, edges = dep\n",
" nodess.append(nodes)\n",
" else:\n",
" nodess.append([])\n",
" wordss = [[node[0] for node in nodes] for nodes in nodess]\n",
" for qa in para['qas']:\n",
" for answer in qa['answers']:\n",
" text = answer['text']\n",
" word_start = answer['answer_word_start']\n",
" word_stop = answer['answer_word_stop']\n",
" if word_start is None:\n",
" count += 1\n",
" print(count)\n",
"noanswer_counter()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10600\n"
]
}
],
"source": [
"print(sum(len(para['qas']) for a in aug_data['data'] for para in a['paragraphs']))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10348\n"
]
}
],
"source": [
"import nltk\n",
"\n",
"def _set_span(t, i):\n",
" if isinstance(t[0], str):\n",
" t.span = (i, i+len(t))\n",
" else:\n",
" first = True\n",
" for c in t:\n",
" cur_span = _set_span(c, i)\n",
" i = cur_span[1]\n",
" if first:\n",
" min_ = cur_span[0]\n",
" first = False\n",
" max_ = cur_span[1]\n",
" t.span = (min_, max_)\n",
" return t.span\n",
"\n",
"\n",
"def set_span(t):\n",
" assert isinstance(t, nltk.tree.Tree)\n",
" try:\n",
" return _set_span(t, 0)\n",
" except:\n",
" print(t)\n",
" exit()\n",
"\n",
"def same_span_counter():\n",
" count = 0\n",
" for article in aug_data['data']:\n",
" for para in article['paragraphs']:\n",
" consts = para['consts']\n",
" for const in consts:\n",
" tree = nltk.tree.Tree.fromstring(const)\n",
" set_span(tree)\n",
" if len(list(tree.subtrees())) > len(set(t.span for t in tree.subtrees())):\n",
" count += 1\n",
" print(count)\n",
"same_span_counter()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View file

@ -0,0 +1,314 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import json\n",
"\n",
"aug_data_path = \"/Users/minjoons/data/squad/train-v1.0-aug.json\"\n",
"aug_data = json.load(open(aug_data_path, 'r'))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(['Saint', 'Bernadette', 'Soubirous'], 'Saint Bernadette Soubirous')\n",
"(['a', 'copper', 'statue', 'of', 'Christ'], 'a copper statue of Christ')\n",
"(['the', 'Main', 'Building'], 'the Main Building')\n",
"(['a', 'Marian', 'place', 'of', 'prayer', 'and', 'reflection'], 'a Marian place of prayer and reflection')\n"
]
}
],
"source": [
"def compare_answers():\n",
" for article in aug_data['data']:\n",
" for para in article['paragraphs']:\n",
" deps = para['deps']\n",
" nodess = []\n",
" for dep in deps:\n",
" nodes, edges = dep\n",
" if dep is not None:\n",
" nodess.append(nodes)\n",
" else:\n",
" nodess.append([])\n",
" wordss = [[node[0] for node in nodes] for nodes in nodess]\n",
" for qa in para['qas']:\n",
" for answer in qa['answers']:\n",
" text = answer['text']\n",
" word_start = answer['answer_word_start']\n",
" word_stop = answer['answer_word_stop']\n",
" answer_words = wordss[word_start[0]][word_start[1]:word_stop[1]]\n",
" yield answer_words, text\n",
"\n",
"ca = compare_answers()\n",
"print(next(ca))\n",
"print(next(ca))\n",
"print(next(ca))\n",
"print(next(ca))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"q: k\n",
"q: j\n",
"q: n\n",
"q: b\n",
"q: v\n",
"x: .\n",
"x: :208\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"q: dd\n",
"q: dd\n",
"q: dd\n",
"q: dd\n",
"q: d\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: :411\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: :40\n",
"x: .\n",
"x: *\n",
"x: :14\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: :131\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"53 10\n"
]
}
],
"source": [
"def nodep_counter():\n",
" x_count = 0\n",
" q_count = 0\n",
" for article in aug_data['data']:\n",
" for para in article['paragraphs']:\n",
" deps = para['deps']\n",
" nodess = []\n",
" for sent, dep in zip(para['sents'], deps):\n",
" if dep is None:\n",
" print(\"x:\", sent)\n",
" x_count += 1\n",
" for qa in para['qas']:\n",
" if qa['dep'] is None:\n",
" print(\"q:\", qa['question'])\n",
" q_count += 1\n",
" print(x_count, q_count)\n",
"nodep_counter()\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n"
]
}
],
"source": [
"def bad_node_counter():\n",
" count = 0\n",
" for article in aug_data['data']:\n",
" for para in article['paragraphs']:\n",
" sents = para['sents']\n",
" deps = para['deps']\n",
" nodess = []\n",
" for dep in deps:\n",
" if dep is not None:\n",
" nodes, edges = dep\n",
" for node in nodes:\n",
" if len(node) != 5:\n",
" count += 1\n",
" print(count)\n",
"bad_node_counter() "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"36\n"
]
}
],
"source": [
"def noanswer_counter():\n",
" count = 0\n",
" for article in aug_data['data']:\n",
" for para in article['paragraphs']:\n",
" deps = para['deps']\n",
" nodess = []\n",
" for dep in deps:\n",
" if dep is not None:\n",
" nodes, edges = dep\n",
" nodess.append(nodes)\n",
" else:\n",
" nodess.append([])\n",
" wordss = [[node[0] for node in nodes] for nodes in nodess]\n",
" for qa in para['qas']:\n",
" for answer in qa['answers']:\n",
" text = answer['text']\n",
" word_start = answer['answer_word_start']\n",
" word_stop = answer['answer_word_stop']\n",
" if word_start is None:\n",
" count += 1\n",
" print(count)\n",
"noanswer_counter()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"106\n"
]
}
],
"source": [
"def mult_sent_answer_counter():\n",
" count = 0\n",
" for article in aug_data['data']:\n",
" for para in article['paragraphs']:\n",
" for qa in para['qas']:\n",
" for answer in qa['answers']:\n",
" text = answer['text']\n",
" word_start = answer['answer_word_start']\n",
" word_stop = answer['answer_word_stop']\n",
" if word_start is not None and word_start[0] != word_stop[0]:\n",
" count += 1\n",
" print(count)\n",
"mult_sent_answer_counter()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View file

@ -0,0 +1,94 @@
""" Official evaluation script for v1.1 of the SQuAD dataset. """
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys
def normalize_answer(s):
    """Canonicalise an answer string before it is compared.

    Applied in order: lowercase, drop ASCII punctuation, blank out the
    standalone articles "a", "an" and "the", collapse whitespace runs.
    """
    punctuation = set(string.punctuation)
    lowered = s.lower()
    unpunctuated = ''.join(ch for ch in lowered if ch not in punctuation)
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', unpunctuated)
    return ' '.join(without_articles.split())
def f1_score(prediction, ground_truth):
    """Return the token-level F1 between a prediction and one reference.

    Both strings are normalised with ``normalize_answer`` and split on
    whitespace; the overlap is the multiset intersection of the tokens.
    Returns the integer 0 when no token is shared.
    """
    pred_tokens = normalize_answer(prediction).split()
    truth_tokens = normalize_answer(ground_truth).split()
    overlap = Counter(pred_tokens) & Counter(truth_tokens)
    num_same = sum(overlap.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_tokens)
    recall = 1.0 * num_same / len(truth_tokens)
    return (2 * precision * recall) / (precision + recall)
def exact_match_score(prediction, ground_truth):
    """Return True when both strings are equal after ``normalize_answer``."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Score ``prediction`` against every reference and keep the best score.

    ``metric_fn`` is called as ``metric_fn(prediction, ground_truth)``.
    ``max`` raises ValueError when ``ground_truths`` is empty.
    """
    return max([metric_fn(prediction, truth) for truth in ground_truths])
def evaluate(dataset, predictions):
    """Compute exact-match and F1 percentages over a SQuAD-format dataset.

    :param dataset: iterable of articles, each with 'paragraphs' -> 'qas'
    :param predictions: mapping from question id to predicted answer text
    :return: {'exact_match': ..., 'f1': ...}, both scaled to 0-100

    A question whose id is missing from ``predictions`` is reported on
    stderr and still counted in the denominator, so it scores 0.
    """
    f1_sum = em_sum = question_count = 0
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                question_count += 1
                qid = qa['id']
                if qid not in predictions:
                    print('Unanswered question ' + qid +
                          ' will receive score 0.', file=sys.stderr)
                    continue
                ground_truths = [answer['text'] for answer in qa['answers']]
                prediction = predictions[qid]
                em_sum += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1_sum += metric_max_over_ground_truths(
                    f1_score, prediction, ground_truths)
    return {'exact_match': 100.0 * em_sum / question_count,
            'f1': 100.0 * f1_sum / question_count}
if __name__ == '__main__':
    # Command-line entry point: score a prediction file against a dataset
    # file and print the metrics as a JSON object on stdout.
    expected_version = '1.1'
    parser = argparse.ArgumentParser(
        description='Evaluation for SQuAD ' + expected_version)
    parser.add_argument('dataset_file', help='Dataset file')
    parser.add_argument('prediction_file', help='Prediction File')
    args = parser.parse_args()
    with open(args.dataset_file) as dataset_file:
        dataset_json = json.load(dataset_file)
    found_version = dataset_json['version']
    # A version mismatch is only reported; evaluation still proceeds.
    if found_version != expected_version:
        print('Evaluation expects v-' + expected_version +
              ', but got dataset with v-' + found_version,
              file=sys.stderr)
    dataset = dataset_json['data']
    with open(args.prediction_file) as prediction_file:
        predictions = json.load(prediction_file)
    print(json.dumps(evaluate(dataset, predictions)))

View file

@ -0,0 +1,94 @@
""" Official evaluation script for v1.1 of the SQuAD dataset. [Changed name for external importing]"""
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys
def normalize_answer(s):
    """Reduce an answer string to a canonical form for scoring.

    The text is lowercased, stripped of ASCII punctuation, has the whole
    words "a", "an" and "the" replaced by a space, and finally has its
    whitespace squeezed to single spaces.
    """
    excluded = set(string.punctuation)
    text = s.lower()
    text = ''.join(ch for ch in text if ch not in excluded)
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    return ' '.join(text.split())
def f1_score(prediction, ground_truth):
    """Token-overlap F1 of ``prediction`` against a single reference answer.

    Tokens come from ``normalize_answer(...).split()``; shared tokens are
    counted with multiplicity. The integer 0 is returned when nothing
    overlaps.
    """
    predicted = normalize_answer(prediction).split()
    expected = normalize_answer(ground_truth).split()
    shared = Counter(predicted) & Counter(expected)
    num_same = sum(shared.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(predicted)
    recall = 1.0 * num_same / len(expected)
    return (2 * precision * recall) / (precision + recall)
def exact_match_score(prediction, ground_truth):
    """Tell whether the two strings coincide once normalised."""
    lhs = normalize_answer(prediction)
    rhs = normalize_answer(ground_truth)
    return lhs == rhs
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Return the highest ``metric_fn(prediction, truth)`` over all truths.

    An empty ``ground_truths`` makes ``max`` raise ValueError.
    """
    scores = [metric_fn(prediction, truth) for truth in ground_truths]
    return max(scores)
def evaluate(dataset, predictions):
    """Score predictions on a SQuAD-format dataset.

    :param dataset: iterable of articles, each with 'paragraphs' -> 'qas'
    :param predictions: mapping from question id to predicted answer text
    :return: dict with 'exact_match' and 'f1', both as percentages

    Every question counts towards the total; one without a prediction is
    announced on stderr and contributes nothing to either sum.
    """
    f1_total = em_total = num_questions = 0
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                num_questions += 1
                question_id = qa['id']
                if question_id not in predictions:
                    print('Unanswered question ' + question_id +
                          ' will receive score 0.', file=sys.stderr)
                    continue
                references = [answer['text'] for answer in qa['answers']]
                predicted = predictions[question_id]
                em_total += metric_max_over_ground_truths(
                    exact_match_score, predicted, references)
                f1_total += metric_max_over_ground_truths(
                    f1_score, predicted, references)
    return {'exact_match': 100.0 * em_total / num_questions,
            'f1': 100.0 * f1_total / num_questions}
if __name__ == '__main__':
    # Script mode: read the dataset and prediction JSON files named on the
    # command line and print the evaluation result as JSON.
    expected_version = '1.1'
    parser = argparse.ArgumentParser(
        description='Evaluation for SQuAD ' + expected_version)
    parser.add_argument('dataset_file', help='Dataset file')
    parser.add_argument('prediction_file', help='Prediction File')
    args = parser.parse_args()
    with open(args.dataset_file) as dataset_file:
        dataset_json = json.load(dataset_file)
    actual_version = dataset_json['version']
    # Only warn on a version mismatch; scoring is attempted regardless.
    if actual_version != expected_version:
        print('Evaluation expects v-' + expected_version +
              ', but got dataset with v-' + actual_version,
              file=sys.stderr)
    dataset = dataset_json['data']
    with open(args.prediction_file) as prediction_file:
        predictions = json.load(prediction_file)
    print(json.dumps(evaluate(dataset, predictions)))

View file

@ -0,0 +1,50 @@
import argparse
import json
import os
# data: q, cq, (dq), (pq), y, *x, *cx
# shared: x, cx, (dx), (px), word_counter, char_counter, word2vec
# no metadata
import random
from collections import Counter
from tqdm import tqdm
from squad.utils import get_word_span, get_word_idx, process_tokens
def main():
    """Parse the command line and run the negative-question augmentation."""
    neg_squad(get_args())
def get_args():
    """Parse the command-line arguments of the negative-sampling script.

    Positional: ``source_path`` (input JSON) and ``target_path`` (output
    JSON). Options: ``-d/--debug`` flag and ``-r/--aug_ratio`` (int,
    default 1).

    :return: the ``argparse.Namespace`` produced from ``sys.argv``
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_path")
    parser.add_argument("target_path")
    parser.add_argument('-d', "--debug", action='store_true')
    parser.add_argument('-r', "--aug_ratio", default=1, type=int)
    # TODO : put more args here
    return parser.parse_args()
def neg_squad(args):
    """Append answerless ("negative") questions to every paragraph.

    For each paragraph, up to ``args.aug_ratio`` other paragraphs of the
    same article are sampled at random. Every question of a sampled
    paragraph is copied onto the current one with an empty answer list and
    its id prefixed by "neg_". The result is written to
    ``args.target_path`` as JSON.

    :param args: object with ``source_path``, ``target_path``, ``aug_ratio``
    """
    with open(args.source_path, 'r') as fp:
        raw = fp.read()
    # Two independent parses: `squad` receives the new questions while
    # `ref_squad` stays untouched, so appended negatives are never sampled.
    squad = json.loads(raw)
    ref_squad = json.loads(raw)

    for ai, article in enumerate(ref_squad['data']):
        paragraphs = article['paragraphs']
        for pi in range(len(paragraphs)):
            cands = list(range(pi)) + list(range(pi+1, len(paragraphs)))
            # random.sample raises ValueError when asked for more items than
            # exist (e.g. a single-paragraph article), so cap the request.
            samples = random.sample(cands, min(args.aug_ratio, len(cands)))
            target_qas = squad['data'][ai]['paragraphs'][pi]['qas']
            for sample in samples:
                for ques in paragraphs[sample]['qas']:
                    new_ques = {'question': ques['question'], 'answers': [], 'answer_start': 0, 'id': "neg_" + ques['id']}
                    target_qas.append(new_ques)

    with open(args.target_path, 'w') as fp:
        json.dump(squad, fp)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,241 @@
import argparse
import json
import os
# data: q, cq, (dq), (pq), y, *x, *cx
# shared: x, cx, (dx), (px), word_counter, char_counter, word2vec
# no metadata
from collections import Counter
from tqdm import tqdm
from squad.utils import get_word_span, get_word_idx, process_tokens
def main():
    """Parse the command line and run the preprocessing pipeline."""
    prepro(get_args())
def get_args():
    """Parse the command-line arguments of the SQuAD preprocessing script.

    Default source and GloVe directories live under the user's home
    directory (``~/data/squad`` and ``~/data/glove``); the default target
    directory is the relative path ``data/squad``.

    :return: the ``argparse.Namespace`` produced from ``sys.argv``
    """
    parser = argparse.ArgumentParser()
    home = os.path.expanduser("~")
    source_dir = os.path.join(home, "data", "squad")
    target_dir = "data/squad"
    glove_dir = os.path.join(home, "data", "glove")
    parser.add_argument('-s', "--source_dir", default=source_dir)
    parser.add_argument('-t', "--target_dir", default=target_dir)
    parser.add_argument("--train_name", default='train-v1.1.json')
    # create_all() reads args.dev_name, so it must be declared here.
    parser.add_argument("--dev_name", default='dev-v1.1.json')
    parser.add_argument('-d', "--debug", action='store_true')
    # The ratio is a fraction in [0, 1]; type=int rejected values like 0.8.
    parser.add_argument("--train_ratio", default=0.9, type=float)
    parser.add_argument("--glove_corpus", default="6B")
    parser.add_argument("--glove_dir", default=glove_dir)
    parser.add_argument("--glove_vec_size", default=100, type=int)
    parser.add_argument("--mode", default="full", type=str)
    parser.add_argument("--single_path", default="", type=str)
    parser.add_argument("--tokenizer", default="PTB", type=str)
    parser.add_argument("--url", default="vision-server2.corp.ai2", type=str)
    parser.add_argument("--port", default=8000, type=int)
    parser.add_argument("--split", action='store_true')
    parser.add_argument("--suffix", default="")
    # TODO : put more args here
    return parser.parse_args()
def create_all(args):
    """Write ``all-v1.1.json``: the train articles followed by the dev ones.

    Does nothing when the output file already exists in
    ``args.source_dir``.

    :param args: object with ``source_dir`` and ``train_name``; ``dev_name``
        is optional and defaults to 'dev-v1.1.json'
    """
    out_path = os.path.join(args.source_dir, "all-v1.1.json")
    if os.path.exists(out_path):
        return
    train_path = os.path.join(args.source_dir, args.train_name)
    with open(train_path, 'r') as fh:
        train_data = json.load(fh)
    # The parser may not define --dev_name; fall back instead of raising
    # AttributeError.
    dev_name = getattr(args, 'dev_name', 'dev-v1.1.json')
    dev_path = os.path.join(args.source_dir, dev_name)
    with open(dev_path, 'r') as fh:
        dev_data = json.load(fh)
    train_data['data'].extend(dev_data['data'])
    print("dumping all data ...")
    # Close the file explicitly so the dump is flushed before it is re-read.
    with open(out_path, 'w') as fh:
        json.dump(train_data, fh)
def prepro(args):
    """Create the target directory and build the splits for ``args.mode``.

    'full'   -> train from 'train', dev and test both from 'dev'
    'all'    -> builds the combined file, then dev/test from 'dev' with a
                0.0-0.0 article range and train from 'all'
    'single' -> one file given by ``args.single_path``, saved as 'single'
    other    -> 'train' split at ``args.train_ratio`` into train/dev, test
                from 'dev'
    """
    if not os.path.exists(args.target_dir):
        os.makedirs(args.target_dir)

    mode = args.mode
    if mode == 'full':
        for data_type, out_name in (('train', 'train'), ('dev', 'dev'), ('dev', 'test')):
            prepro_each(args, data_type, out_name=out_name)
    elif mode == 'all':
        create_all(args)
        for out_name in ('dev', 'test'):
            prepro_each(args, 'dev', 0.0, 0.0, out_name=out_name)
        prepro_each(args, 'all', out_name='train')
    elif mode == 'single':
        assert len(args.single_path) > 0
        prepro_each(args, "NULL", out_name="single", in_path=args.single_path)
    else:
        ratio = args.train_ratio
        prepro_each(args, 'train', 0.0, ratio, out_name='train')
        prepro_each(args, 'train', ratio, 1.0, out_name='dev')
        prepro_each(args, 'dev', out_name='test')
def save(args, data, shared, data_type):
    """Dump ``data`` and ``shared`` as JSON files under ``args.target_dir``.

    The files are named ``data_<data_type>.json`` and
    ``shared_<data_type>.json``.
    """
    data_path = os.path.join(args.target_dir, "data_{}.json".format(data_type))
    shared_path = os.path.join(args.target_dir, "shared_{}.json".format(data_type))
    # Context managers guarantee the files are flushed and closed.
    with open(data_path, 'w') as fh:
        json.dump(data, fh)
    with open(shared_path, 'w') as fh:
        json.dump(shared, fh)
def get_word2vec(args, word_counter):
    """Collect GloVe vectors for the words present in ``word_counter``.

    Reads ``glove.<corpus>.<size>d.txt`` from ``args.glove_dir``. Each GloVe
    word is tried as-is, then capitalised, lowercased and uppercased; the
    first variant found in ``word_counter`` becomes the key for its vector.

    :return: dict mapping vocabulary word -> list of floats
    """
    glove_name = "glove.{}.{}d.txt".format(args.glove_corpus, args.glove_vec_size)
    glove_path = os.path.join(args.glove_dir, glove_name)
    sizes = {'6B': int(4e5), '42B': int(1.9e6), '840B': int(2.2e6), '2B': int(1.2e6)}
    # Only used as the progress-bar length.
    total = sizes[args.glove_corpus]
    word2vec_dict = {}
    with open(glove_path, 'r', encoding='utf-8') as fh:
        for line in tqdm(fh, total=total):
            fields = line.strip().split(" ")
            word = fields[0]
            vector = [float(value) for value in fields[1:]]
            for variant in (word, word.capitalize(), word.lower(), word.upper()):
                if variant in word_counter:
                    word2vec_dict[variant] = vector
                    break

    print("{}/{} of word vocab have corresponding vectors in {}".format(len(word2vec_dict), len(word_counter), glove_path))
    return word2vec_dict
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    """Tokenise one SQuAD JSON file and save it as ``data``/``shared`` JSON.

    :param args: parsed options (tokenizer, split, source_dir, suffix,
        debug, plus whatever ``get_word2vec`` and ``save`` read)
    :param data_type: prefix of the source file name, used when ``in_path``
        is not given
    :param start_ratio: fraction of the article list where processing starts
    :param stop_ratio: fraction of the article list where processing stops
    :param out_name: suffix of the output files passed to ``save``
    :param in_path: explicit source path overriding the derived one
    :raises ValueError: if ``args.tokenizer`` is neither 'PTB' nor 'Stanford'

    ``data`` holds one entry per question, ``shared`` one entry per
    paragraph; questions point at their paragraph through ``[ai, pi]``.
    """
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise ValueError("unknown tokenizer: {!r}".format(args.tokenizer))

    if not args.split:
        # Without --split the whole paragraph is treated as one sentence.
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir, "{}-{}v1.1.json".format(data_type, args.suffix))
    with open(source_path, 'r') as fh:
        source_data = json.load(fh)

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    na = []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            # Context words/chars are weighted by the number of questions
            # asked about this paragraph.
            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                qi = process_tokens(qi)
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1]-1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    # Character offsets of the answer inside its first and
                    # last word.
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                if len(qa['answers']) == 0:
                    # Answerless question: store a placeholder span and flag it.
                    yi.append([(0, 0), (0, 1)])
                    cyi.append([0, 1])
                    na.append(True)
                else:
                    na.append(False)

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

            # NOTE(review): debug mode stops after the first paragraph of
            # each article - confirm this nesting level is the intended one.
            if args.debug:
                break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
            'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx, 'na': na}
    shared = {'x': x, 'cx': cx, 'p': p,
              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}

    print("saving ...")
    save(args, data, shared, out_name)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,183 @@
import argparse
import json
import os
# data: q, cq, (dq), (pq), y, *x, *cx
# shared: x, cx, (dx), (px), word_counter, char_counter, word2vec
# no metadata
from collections import Counter
import nltk
from tqdm import tqdm
from my.nltk_utils import load_compressed_tree
def bool_(arg):
    """Convert the literal strings 'True' / 'False' to a bool.

    Intended as an argparse ``type=`` callable.

    :raises ValueError: for any other input; argparse turns a ValueError
        into a regular usage error instead of a traceback
    """
    if arg == 'True':
        return True
    if arg == 'False':
        return False
    raise ValueError("expected 'True' or 'False', got {!r}".format(arg))
def main():
    """Parse the command line and preprocess the augmented SQuAD data."""
    prepro(get_args())
def get_args():
    """Parse the command-line arguments for augmented-SQuAD preprocessing.

    Default source and GloVe directories live under the user's home
    directory; boolean options take the literal strings 'True'/'False'.

    :return: the ``argparse.Namespace`` produced from ``sys.argv``
    """
    parser = argparse.ArgumentParser()
    home = os.path.expanduser("~")
    source_dir = os.path.join(home, "data", "squad")
    target_dir = "data/squad"
    glove_dir = os.path.join(home, "data", "glove")
    parser.add_argument("--source_dir", default=source_dir)
    parser.add_argument("--target_dir", default=target_dir)
    parser.add_argument("--debug", default=False, type=bool_)
    # The ratio is a fraction in [0, 1]; type=int rejected values like 0.8.
    parser.add_argument("--train_ratio", default=0.9, type=float)
    parser.add_argument("--glove_corpus", default="6B")
    parser.add_argument("--glove_dir", default=glove_dir)
    parser.add_argument("--glove_vec_size", default=100, type=int)
    parser.add_argument("--full_train", default=False, type=bool_)
    # TODO : put more args here
    return parser.parse_args()
def prepro(args):
    """Build and save the train, dev and test splits.

    With ``args.full_train`` the train and dev splits come from the 'train'
    and 'dev' files; otherwise the 'train' file is divided at
    ``args.train_ratio``. The test split always comes from 'dev'.
    """
    if not os.path.exists(args.target_dir):
        os.makedirs(args.target_dir)

    if args.full_train:
        train_split = prepro_each(args, 'train')
        dev_split = prepro_each(args, 'dev')
    else:
        train_split = prepro_each(args, 'train', 0.0, args.train_ratio)
        dev_split = prepro_each(args, 'train', args.train_ratio, 1.0)
    test_split = prepro_each(args, 'dev')

    print("saving ...")
    for data_type, (data, shared) in (('train', train_split), ('dev', dev_split), ('test', test_split)):
        save(args, data, shared, data_type)
def save(args, data, shared, data_type):
    """Write ``data`` and ``shared`` to JSON files in ``args.target_dir``.

    Output names are ``data_<data_type>.json`` and
    ``shared_<data_type>.json``.
    """
    data_path = os.path.join(args.target_dir, "data_{}.json".format(data_type))
    shared_path = os.path.join(args.target_dir, "shared_{}.json".format(data_type))
    # Close each file deterministically instead of leaking the handle.
    with open(data_path, 'w') as fh:
        json.dump(data, fh)
    with open(shared_path, 'w') as fh:
        json.dump(shared, fh)
def get_word2vec(args, word_counter):
    """Collect GloVe vectors for the words present in ``word_counter``.

    Reads ``glove.<corpus>.<size>d.txt`` from ``args.glove_dir``. A GloVe
    word is matched as-is, then capitalised, lowercased and uppercased; the
    first form found in ``word_counter`` is used as the key.

    :return: dict mapping vocabulary word -> list of floats
    """
    glove_path = os.path.join(args.glove_dir, "glove.{}.{}d.txt".format(args.glove_corpus, args.glove_vec_size))
    sizes = {'6B': int(4e5), '42B': int(1.9e6), '840B': int(2.2e6), '2B': int(1.2e6)}
    # Only used as the progress-bar length.
    total = sizes[args.glove_corpus]
    word2vec_dict = {}
    # Decode explicitly as UTF-8 so reading does not depend on the locale's
    # default encoding.
    with open(glove_path, 'r', encoding='utf-8') as fh:
        for line in tqdm(fh, total=total):
            array = line.lstrip().rstrip().split(" ")
            word = array[0]
            vector = list(map(float, array[1:]))
            if word in word_counter:
                word2vec_dict[word] = vector
            elif word.capitalize() in word_counter:
                word2vec_dict[word.capitalize()] = vector
            elif word.lower() in word_counter:
                word2vec_dict[word.lower()] = vector
            elif word.upper() in word_counter:
                word2vec_dict[word.upper()] = vector

    print("{}/{} of word vocab have corresponding vectors in {}".format(len(word2vec_dict), len(word_counter), glove_path))
    return word2vec_dict
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0):
    """Convert one augmented SQuAD file into ``data`` / ``shared`` dicts.

    Reads ``<data_type>-v1.0-aug.json`` from ``args.source_dir``. Words come
    from the first field of each dependency node; sentences or questions
    whose parse is None become empty word lists.

    :param start_ratio: fraction of the article list where processing starts
    :param stop_ratio: fraction of the article list where processing stops
    :return: ``(data, shared)`` - per-question entries and per-paragraph
        entries; questions point at their paragraph through ``[ai, pi]``
    """
    source_path = os.path.join(args.source_dir, "{}-v1.0-aug.json".format(data_type))
    with open(source_path, 'r') as fh:
        source_data = json.load(fh)

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    x, cx, tx, stx = [], [], [], []
    answerss = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    pos_counter = Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp, txp, stxp = [], [], [], []
        x.append(xp)
        cx.append(cxp)
        tx.append(txp)
        stx.append(stxp)
        for pi, para in enumerate(article['paragraphs']):
            xi = []
            for dep in para['deps']:
                if dep is None:
                    xi.append([])
                else:
                    xi.append([node[0] for node in dep[0]])
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            txp.append(para['consts'])
            stxp.append([str(load_compressed_tree(s)) for s in para['consts']])
            # Count the label of every subtree of the constituency parses.
            trees = map(nltk.tree.Tree.fromstring, para['consts'])
            for tree in trees:
                for subtree in tree.subtrees():
                    pos_counter[subtree.label()] += 1

            # Context words/chars are weighted by the number of questions
            # asked about this paragraph.
            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                dep = qa['dep']
                qi = [] if dep is None else [node[0] for node in dep[0]]
                cqi = [list(qij) for qij in qi]
                yi = []
                answers = []
                for answer in qa['answers']:
                    answers.append(answer['text'])
                    # Missing word positions fall back to a placeholder span.
                    yi0 = answer['answer_word_start'] or [0, 0]
                    yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    yi.append([yi0, yi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

            # NOTE(review): debug mode stops after the first paragraph of
            # each article - confirm this nesting level is the intended one.
            if args.debug:
                break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, '*tx': rx, '*stx': rx,
            'idxs': idxs, 'ids': ids, 'answerss': answerss}
    shared = {'x': x, 'cx': cx, 'tx': tx, 'stx': stx,
              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict, 'pos_counter': pos_counter}

    return data, shared
if __name__ == "__main__":
main()

View file

@ -0,0 +1,146 @@
import re
import numpy as np
def get_2d_spans(text, tokenss):
    """Locate every token in ``text`` as a ``(start, stop)`` character span.

    Tokens are searched left to right, each one starting where the previous
    match ended. The result mirrors the nesting of ``tokenss`` (one list of
    spans per token list).

    :raises Exception: if a token cannot be found from the current position
        (the offending tokens and position are printed first)
    """
    spanss = []
    cursor = 0
    for tokens in tokenss:
        spans = []
        for token in tokens:
            found = text.find(token, cursor)
            if found < 0:
                print(tokens)
                print("{} {} {}".format(token, cursor, text))
                raise Exception()
            stop = found + len(token)
            spans.append((found, stop))
            cursor = stop
        spanss.append(spans)
    return spanss
def get_word_span(context, wordss, start, stop):
    """Map a character range onto the words it overlaps.

    :param context: text the words were tokenised from
    :param wordss: list of sentences, each a list of words
    :param start: first character offset of the range
    :param stop: character offset just past the range
    :return: ``((sent, word), (sent, word + 1))`` for the first and last
        overlapping words
    """
    spanss = get_2d_spans(context, wordss)
    idxs = [(sent_idx, word_idx)
            for sent_idx, spans in enumerate(spanss)
            for word_idx, span in enumerate(spans)
            if stop > span[0] and start < span[1]]

    assert len(idxs) > 0, "{} {} {} {}".format(context, spanss, start, stop)
    last_sent, last_word = idxs[-1]
    return idxs[0], (last_sent, last_word + 1)
def get_phrase(context, wordss, span):
    """
    Obtain phrase as substring of context given a word-level span
    :param context: text the words were tokenised from
    :param wordss: list of sentences, each a list of words
    :param span: (start, stop) pair; each is [sent_idx, word_idx] and the
        word at stop is excluded
    :return: the substring of context covering the selected words
    """
    start, stop = span
    first_word = get_flat_idx(wordss, start)
    last_word = get_flat_idx(wordss, stop) - 1
    flat_words = [word for words in wordss for word in words]
    cursor = 0
    char_start, char_stop = None, None
    for position, word in enumerate(flat_words):
        cursor = context.find(word, cursor)
        assert cursor >= 0
        if position == first_word:
            char_start = cursor
        cursor += len(word)
        if position == last_word:
            char_stop = cursor
    assert char_start is not None
    assert char_stop is not None
    return context[char_start:char_stop]
def get_flat_idx(wordss, idx):
    """Turn a ``(sent_idx, word_idx)`` pair into an index over all words."""
    words_before = sum(len(words) for words in wordss[:idx[0]])
    return words_before + idx[1]
def get_word_idx(context, wordss, idx):
    """Return the character offset at which word ``idx`` starts in context.

    ``idx`` is a ``(sent_idx, word_idx)`` pair into ``wordss``.
    """
    spanss = get_2d_spans(context, wordss)
    word_span = spanss[idx[0]][idx[1]]
    return word_span[0]
def process_tokens(temp_tokens):
    """Split tokens further on dashes, slashes, quotes and similar marks.

    Each delimiter is kept as its own token. Because the split keeps the
    pieces around a delimiter, a token that starts or ends with one also
    yields an empty string at that position.

    :param temp_tokens: iterable of token strings
    :return: flat list of the resulting pieces
    """
    # \u2013 is en-dash. Used for number to number
    delimiters = ("-", "\u2212", "\u2014", "\u2013", "/", "~", '"', "'", "\u201C", "\u2019", "\u201D", "\u2018", "\u00B0")
    # Build the pattern once rather than once per token.
    splitter = re.compile("([{}])".format("".join(delimiters)))
    tokens = []
    for token in temp_tokens:
        tokens.extend(splitter.split(token))
    return tokens
def get_best_span(ypi, yp2i):
    """Find the in-sentence span maximising start score times stop score.

    :param ypi: per-sentence sequences of start scores
    :param yp2i: per-sentence sequences of stop scores
    :return: ``(((sent, start), (sent, stop + 1)), score)``; when no product
        exceeds 0 the result is ``(((0, 0), (0, 2)), 0.0)``
    """
    best_score = 0
    best_span = (0, 1)
    best_sent = 0
    for sent_idx, (start_scores, stop_scores) in enumerate(zip(ypi, yp2i)):
        # Index of the highest start score seen so far in this sentence, so
        # that a span never starts after it stops.
        best_start = 0
        for j in range(len(start_scores)):
            if start_scores[best_start] < start_scores[j]:
                best_start = j
            candidate = start_scores[best_start] * stop_scores[j]
            if candidate > best_score:
                best_span = (best_start, j)
                best_sent = sent_idx
                best_score = candidate
    return ((best_sent, best_span[0]), (best_sent, best_span[1] + 1)), float(best_score)
def get_best_span_wy(wypi, th):
    """Return the best-scoring run of consecutive words scoring >= ``th``.

    The threshold is first capped at the largest value in ``wypi``. Runs
    never cross a sentence boundary; each is scored by the mean of its
    values.

    :return: ``((start, stop), score)`` with ``start``/``stop`` as
        ``(sent_idx, word_idx)`` pairs and ``stop`` exclusive
    """
    spans, span_scores = [], []
    run_start = None
    run_total = 0
    run_length = 0
    th = min(th, np.max(wypi))
    for f, sent_scores in enumerate(wypi):
        for j, value in enumerate(sent_scores):
            if value >= th:
                if run_start is None:
                    run_start = f, j
                run_total += value
                run_length += 1
            elif run_start is not None:
                # First value below the threshold closes the open run.
                spans.append((run_start, (f, j)))
                span_scores.append(run_total/run_length)
                run_total = 0
                run_length = 0
                run_start = None
        if run_start is not None:
            # A run still open at the end of the sentence stops there.
            spans.append((run_start, (f, j+1)))
            span_scores.append(run_total/run_length)
            run_total = 0
            run_length = 0
            run_start = None

    return max(zip(spans, span_scores), key=lambda pair: pair[1])
def get_span_score_pairs(ypi, yp2i):
    """Enumerate every in-sentence span together with its score.

    For sentence ``f`` and positions ``j <= k`` the span is
    ``((f, j), (f, k + 1))`` and its score is ``ypi[f][j] * yp2i[f][k]``.

    :return: list of ``(span, score)`` tuples
    """
    return [
        (((f, j), (f, k+1)), start_scores[j] * stop_scores[k])
        for f, (start_scores, stop_scores) in enumerate(zip(ypi, yp2i))
        for j in range(len(start_scores))
        for k in range(j, len(stop_scores))
    ]

View file

View file

@ -0,0 +1,57 @@
import os
from pprint import pprint
import tensorflow as tf
from tree.main import main as m
# Command-line flags for the tree model. The bracketed value in each help
# string repeats the flag's default.
flags = tf.app.flags

flags.DEFINE_string("model_name", "tree", "Model name [tree]")
flags.DEFINE_string("data_dir", "data/squad", "Data dir [data/squad]")
flags.DEFINE_integer("run_id", 0, "Run ID [0]")

flags.DEFINE_integer("batch_size", 128, "Batch size [128]")
flags.DEFINE_float("init_lr", 0.5, "Initial learning rate [0.5]")
flags.DEFINE_integer("num_epochs", 50, "Total number of epochs for training [50]")
flags.DEFINE_integer("num_steps", 0, "Number of steps [0]")
flags.DEFINE_integer("eval_num_batches", 100, "eval num batches [100]")
flags.DEFINE_integer("load_step", 0, "load step [0]")
flags.DEFINE_integer("early_stop", 4, "early stop [4]")

flags.DEFINE_string("mode", "test", "train | test | forward [test]")
flags.DEFINE_boolean("load", True, "load saved data? [True]")
flags.DEFINE_boolean("progress", True, "Show progress? [True]")
flags.DEFINE_integer("log_period", 100, "Log period [100]")
flags.DEFINE_integer("eval_period", 1000, "Eval period [1000]")
flags.DEFINE_integer("save_period", 1000, "Save Period [1000]")
flags.DEFINE_float("decay", 0.9, "Exponential moving average decay [0.9]")

flags.DEFINE_boolean("draft", False, "Draft for quick testing? [False]")

flags.DEFINE_integer("hidden_size", 32, "Hidden size [32]")
flags.DEFINE_float("input_keep_prob", 0.5, "Input keep prob [0.5]")
flags.DEFINE_integer("char_emb_size", 8, "Char emb size [8]")
flags.DEFINE_integer("char_filter_height", 5, "Char filter height [5]")
# Help text previously advertised 0.001 while the default is 0.0001.
flags.DEFINE_float("wd", 0.0001, "Weight decay [0.0001]")
flags.DEFINE_bool("lower_word", True, "lower word [True]")
flags.DEFINE_bool("dump_eval", True, "dump eval? [True]")

flags.DEFINE_integer("word_count_th", 100, "word count th [100]")
flags.DEFINE_integer("char_count_th", 500, "char count th [500]")
flags.DEFINE_integer("sent_size_th", 64, "sent size th [64]")
flags.DEFINE_integer("num_sents_th", 8, "num sents th [8]")
flags.DEFINE_integer("ques_size_th", 64, "ques size th [64]")
flags.DEFINE_integer("word_size_th", 16, "word size th [16]")
flags.DEFINE_integer("tree_height_th", 16, "tree height th [16]")
def main(_):
    """Derive the run's output directory and hand the flags to the model.

    The directory is ``out/<model_name>/<run_id>`` with the run id
    zero-padded to two digits. The positional argument is ignored.
    """
    config = flags.FLAGS
    run_tag = str(config.run_id).zfill(2)
    config.out_dir = os.path.join("out", config.model_name, run_tag)

    m(config)
if __name__ == "__main__":
tf.app.run()

View file

@ -0,0 +1,197 @@
import numpy as np
import tensorflow as tf
from tree.read_data import DataSet
from my.nltk_utils import span_f1
class Evaluation(object):
    """Predictions collected for one data split at one global step.

    Two evaluations of the same split and step can be concatenated with
    ``+``; adding 0 returns the evaluation itself, which lets the builtin
    ``sum`` combine a list of them.
    """

    def __init__(self, data_type, global_step, idxs, yp):
        self.data_type = data_type
        self.global_step = global_step
        self.idxs = idxs
        self.yp = yp
        self.num_examples = len(yp)
        # Plain-dict view of the same fields.
        self.dict = {
            'data_type': data_type,
            'global_step': global_step,
            'yp': yp,
            'idxs': idxs,
            'num_examples': self.num_examples,
        }
        self.summaries = None

    def __repr__(self):
        return "{} step {}".format(self.data_type, self.global_step)

    def __add__(self, other):
        # 0 is the start value used by sum().
        if other == 0:
            return self
        assert self.data_type == other.data_type
        assert self.global_step == other.global_step
        merged_yp = self.yp + other.yp
        merged_idxs = self.idxs + other.idxs
        return Evaluation(self.data_type, self.global_step, merged_idxs, merged_yp)

    def __radd__(self, other):
        return self.__add__(other)
class LabeledEvaluation(Evaluation):
    """An ``Evaluation`` that also carries the ground-truth labels ``y``."""

    def __init__(self, data_type, global_step, idxs, yp, y):
        super(LabeledEvaluation, self).__init__(data_type, global_step, idxs, yp)
        self.y = y
        self.dict['y'] = y

    def __add__(self, other):
        # Adding 0 is the identity so that ``sum`` works.
        if other == 0:
            return self
        assert self.data_type == other.data_type
        assert self.global_step == other.global_step
        merged_yp = self.yp + other.yp
        merged_y = self.y + other.y
        merged_idxs = self.idxs + other.idxs
        return LabeledEvaluation(self.data_type, self.global_step,
                                 merged_idxs, merged_yp, merged_y)
class AccuracyEvaluation(LabeledEvaluation):
    """A labeled evaluation with per-example correctness, accuracy and loss.

    Two ``tf.Summary`` protos (tags ``dev/loss`` and ``dev/acc``) are stored
    in ``self.summaries``.
    """

    def __init__(self, data_type, global_step, idxs, yp, y, correct, loss):
        super(AccuracyEvaluation, self).__init__(data_type, global_step, idxs, yp, y)
        self.loss = loss
        self.correct = correct
        self.acc = sum(correct) / len(correct)
        self.dict.update(loss=loss, correct=correct, acc=self.acc)
        self.summaries = [
            tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
            for tag, value in (('dev/loss', self.loss), ('dev/acc', self.acc))
        ]

    def __repr__(self):
        template = "{} step {}: accuracy={}, loss={}"
        return template.format(self.data_type, self.global_step, self.acc, self.loss)

    def __add__(self, other):
        # Adding 0 is the identity so that ``sum`` works.
        if other == 0:
            return self
        assert self.data_type == other.data_type
        assert self.global_step == other.global_step
        merged_idxs = self.idxs + other.idxs
        merged_yp = self.yp + other.yp
        merged_y = self.y + other.y
        merged_correct = self.correct + other.correct
        # Combine the two losses weighted by the number of examples on each side.
        weighted_total = self.loss * self.num_examples + other.loss * other.num_examples
        merged_loss = weighted_total / len(merged_correct)
        return AccuracyEvaluation(self.data_type, self.global_step, merged_idxs,
                                  merged_yp, merged_y, merged_correct, merged_loss)
class Evaluator(object):
    """Runs a model on batches and collects its predictions (no labels)."""

    def __init__(self, config, model):
        self.config = config
        self.model = model

    def get_evaluation(self, sess, batch):
        """Run one ``(idxs, data_set)`` batch and return an ``Evaluation``."""
        idxs, data_set = batch
        feed_dict = self.model.get_feed_dict(data_set, False, supervised=False)
        fetches = [self.model.global_step, self.model.yp]
        global_step, yp = sess.run(fetches, feed_dict=feed_dict)
        # Keep only the first num_examples prediction rows.
        yp = yp[:data_set.num_examples]
        return Evaluation(data_set.data_type, int(global_step), idxs, yp.tolist())

    def get_evaluation_from_batches(self, sess, batches):
        """Evaluate every batch and add the results together, starting from 0."""
        total = 0
        for batch in batches:
            total = total + self.get_evaluation(sess, batch)
        return total
class LabeledEvaluator(Evaluator):
    """Evaluator that returns the gold labels alongside the predictions."""

    def get_evaluation(self, sess, batch):
        """Run one ``(idxs, data_set)`` batch and return a ``LabeledEvaluation``.

        The feed dict is built in supervised mode: the labels are read back
        from ``feed_dict[self.model.y]``, and the model only adds that entry
        when ``supervised`` is true. Requesting an unsupervised feed dict
        here therefore failed with ``KeyError``.
        """
        idxs, data_set = batch
        feed_dict = self.model.get_feed_dict(data_set, False)
        global_step, yp = sess.run([self.model.global_step, self.model.yp], feed_dict=feed_dict)
        # Keep only the first num_examples prediction rows.
        yp = yp[:data_set.num_examples]
        y = feed_dict[self.model.y]
        e = LabeledEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), y.tolist())
        return e
class AccuracyEvaluator(LabeledEvaluator):
    """Evaluator that also fetches the loss and scores each prediction."""

    def get_evaluation(self, sess, batch):
        """Run one ``(idxs, data_set)`` batch and return an ``AccuracyEvaluation``."""
        idxs, data_set = batch
        assert isinstance(data_set, DataSet)
        feed_dict = self.model.get_feed_dict(data_set, False)
        fetches = [self.model.global_step, self.model.yp, self.model.loss]
        global_step, yp, loss = sess.run(fetches, feed_dict=feed_dict)
        y = feed_dict[self.model.y]
        yp = yp[:data_set.num_examples]
        # Look the comparison up on the concrete class so subclasses can override it.
        compare = self.__class__.compare
        correct = [compare(gold, pred) for gold, pred in zip(y, yp)]
        return AccuracyEvaluation(data_set.data_type, int(global_step), idxs,
                                  yp.tolist(), y.tolist(), correct, float(loss))

    @staticmethod
    def compare(yi, ypi):
        """Return True when label and prediction have the same argmax."""
        gold_idx = int(np.argmax(yi))
        pred_idx = int(np.argmax(ypi))
        return gold_idx == pred_idx
class AccuracyEvaluator2(AccuracyEvaluator):
    """Accuracy evaluator that flattens label and prediction before comparing."""

    @staticmethod
    def compare(yi, ypi):
        """Return True when the flattened label and prediction share their argmax."""
        gold_idx = int(np.argmax(yi.flatten()))
        pred_idx = int(np.argmax(ypi.flatten()))
        return gold_idx == pred_idx
class TempEvaluation(AccuracyEvaluation):
    """Accuracy evaluation extended with a second label/prediction pair and F1 scores.

    A ``dev/f1`` summary is appended to the summaries created by the parent.
    """

    def __init__(self, data_type, global_step, idxs, yp, yp2, y, y2, correct, loss, f1s):
        super(TempEvaluation, self).__init__(data_type, global_step, idxs, yp, y, correct, loss)
        self.y2 = y2
        self.yp2 = yp2
        self.f1s = f1s
        self.f1 = float(np.mean(f1s))
        self.dict.update(y2=y2, yp2=yp2, f1s=f1s, f1=self.f1)
        self.summaries.append(
            tf.Summary(value=[tf.Summary.Value(tag='dev/f1', simple_value=self.f1)]))

    def __add__(self, other):
        # Adding 0 is the identity so that ``sum`` works.
        if other == 0:
            return self
        assert self.data_type == other.data_type
        assert self.global_step == other.global_step
        merged_idxs = self.idxs + other.idxs
        merged_yp = self.yp + other.yp
        merged_yp2 = self.yp2 + other.yp2
        merged_y = self.y + other.y
        merged_y2 = self.y2 + other.y2
        merged_correct = self.correct + other.correct
        merged_f1s = self.f1s + other.f1s
        # Combine the two losses weighted by the number of examples on each side.
        weighted_total = self.loss * self.num_examples + other.loss * other.num_examples
        merged_loss = weighted_total / len(merged_correct)
        return TempEvaluation(self.data_type, self.global_step, merged_idxs, merged_yp,
                              merged_yp2, merged_y, merged_y2, merged_correct,
                              merged_loss, merged_f1s)
class TempEvaluator(LabeledEvaluator):
    """Evaluator for models exposing two label/prediction pairs (``y``/``yp`` and ``y2``/``yp2``)."""

    def get_evaluation(self, sess, batch):
        """Run one ``(idxs, data_set)`` batch and return a ``TempEvaluation``."""
        idxs, data_set = batch
        assert isinstance(data_set, DataSet)
        feed_dict = self.model.get_feed_dict(data_set, False)
        fetches = [self.model.global_step, self.model.yp, self.model.yp2, self.model.loss]
        global_step, yp, yp2, loss = sess.run(fetches, feed_dict=feed_dict)
        y = feed_dict[self.model.y]
        y2 = feed_dict[self.model.y2]
        count = data_set.num_examples
        yp, yp2 = yp[:count], yp2[:count]
        # Look the scoring functions up on the concrete class so subclasses can override them.
        cls = self.__class__
        rows = list(zip(y, y2, yp, yp2))
        correct = [cls.compare(*row) for row in rows]
        f1s = [cls.span_f1(*row) for row in rows]
        return TempEvaluation(data_set.data_type, int(global_step), idxs,
                              yp.tolist(), yp2.tolist(), y.tolist(), y2.tolist(),
                              correct, float(loss), f1s)

    @staticmethod
    def compare(yi, y2i, ypi, yp2i):
        """Return True when both argmax positions match between labels and predictions."""
        gold_first = int(np.argmax(yi.flatten()))
        pred_first = int(np.argmax(ypi.flatten()))
        gold_second = int(np.argmax(y2i.flatten()))
        pred_second = int(np.argmax(yp2i.flatten()))
        return gold_first == pred_first and gold_second == pred_second

    @staticmethod
    def span_f1(yi, y2i, ypi, yp2i):
        """Score the predicted span against the true span with the module-level ``span_f1``."""
        # Each span is (argmax of the first array, argmax of the second array + 1).
        true_span = (np.argmax(yi.flatten()), np.argmax(y2i.flatten())+1)
        pred_span = (np.argmax(ypi.flatten()), np.argmax(yp2i.flatten())+1)
        return span_f1(true_span, pred_span)

View file

@ -0,0 +1,54 @@
import json
from json import encoder
import os
import tensorflow as tf
from tree.evaluator import Evaluation
from my.utils import short_floats
class GraphHandler(object):
    """Handles checkpoint saving/loading, summary writing and evaluation dumps."""

    def __init__(self, config):
        self.config = config
        self.saver = tf.train.Saver()
        self.writer = None
        self.save_path = os.path.join(config.save_dir, config.model_name)

    def initialize(self, sess):
        """Restore variables (``config.load``) or initialize them; open a writer in train mode."""
        cfg = self.config
        if cfg.load:
            self._load(sess)
        else:
            sess.run(tf.global_variables_initializer())

        if cfg.mode == 'train':
            self.writer = tf.summary.FileWriter(cfg.log_dir, graph=tf.get_default_graph())

    def save(self, sess, global_step=None):
        """Write a checkpoint to ``self.save_path``."""
        self.saver.save(sess, self.save_path, global_step=global_step)

    def _load(self, sess):
        """Restore from ``config.load_step`` if positive, else from the latest checkpoint."""
        cfg = self.config
        if cfg.load_step > 0:
            checkpoint_name = "{}-{}".format(cfg.model_name, cfg.load_step)
            restore_from = os.path.join(cfg.save_dir, checkpoint_name)
        else:
            checkpoint_dir = cfg.save_dir
            state = tf.train.get_checkpoint_state(checkpoint_dir)
            assert state is not None, "cannot load checkpoint at {}".format(checkpoint_dir)
            restore_from = state.model_checkpoint_path
        print("Loading saved model from {}".format(restore_from))
        self.saver.restore(sess, restore_from)

    def add_summary(self, summary, global_step):
        """Forward one summary to the writer."""
        self.writer.add_summary(summary, global_step)

    def add_summaries(self, summaries, global_step):
        """Forward each summary in ``summaries`` to the writer, in order."""
        for item in summaries:
            self.add_summary(item, global_step)

    def dump_eval(self, e, precision=2):
        """Write ``e.dict``, passed through ``short_floats``, as JSON into ``config.eval_dir``."""
        assert isinstance(e, Evaluation)
        file_name = "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6))
        target = os.path.join(self.config.eval_dir, file_name)
        with open(target, 'w') as fh:
            json.dump(short_floats(e.dict, precision), fh)

View file

@ -0,0 +1,187 @@
import argparse
import json
import math
import os
import shutil
from pprint import pprint
import tensorflow as tf
from tqdm import tqdm
import numpy as np
from tree.evaluator import AccuracyEvaluator2, Evaluator
from tree.graph_handler import GraphHandler
from tree.model import Model
from tree.trainer import Trainer
from tree.read_data import load_metadata, read_data, get_squad_data_filter, update_config
def main(config):
    """Prepare the output directories, then run the routine selected by ``config.mode``.

    Raises:
        ValueError: if ``config.mode`` is not 'train', 'test' or 'forward'.
    """
    set_dirs(config)
    runners = {'train': _train, 'test': _test, 'forward': _forward}
    if config.mode not in runners:
        raise ValueError("invalid value for 'mode': {}".format(config.mode))
    runners[config.mode](config)
def _config_draft(config):
if config.draft:
config.num_steps = 10
config.eval_period = 10
config.log_period = 1
config.save_period = 10
config.eval_num_batches = 1
def _train(config):
    """Train the model, periodically saving checkpoints and evaluating on the dev split.

    Reads the 'train' and 'dev' data, builds the word embedding matrix, constructs
    the model/trainer/evaluator, then runs the training loop. The loop stops early
    once dev accuracy has failed to improve ``config.early_stop`` evaluations in a row.
    """
    # load_metadata(config, 'train')  # this updates the config file according to metadata file

    data_filter = get_squad_data_filter(config)
    train_data = read_data(config, 'train', config.load, data_filter=data_filter)
    dev_data = read_data(config, 'dev', True, data_filter=data_filter)
    update_config(config, [train_data, dev_data])

    _config_draft(config)

    # Map vocabulary indices to their pretrained vectors where one is available.
    word2vec_dict = train_data.shared['lower_word2vec'] if config.lower_word else train_data.shared['word2vec']
    word2idx_dict = train_data.shared['word2idx']
    idx2vec_dict = {word2idx_dict[word]: vec for word, vec in word2vec_dict.items() if word in word2idx_dict}
    print("{}/{} unique words have corresponding glove vectors.".format(len(idx2vec_dict), len(word2idx_dict)))
    # Indices without a pretrained vector get a sample from a standard multivariate normal.
    emb_mat = np.array([idx2vec_dict[idx] if idx in idx2vec_dict
                        else np.random.multivariate_normal(np.zeros(config.word_emb_size), np.eye(config.word_emb_size))
                        for idx in range(config.word_vocab_size)])
    config.emb_mat = emb_mat

    # construct model graph and variables (using default graph)
    pprint(config.__flags, indent=2)
    model = Model(config)
    trainer = Trainer(config, model)
    evaluator = AccuracyEvaluator2(config, model)
    graph_handler = GraphHandler(config)  # controls all tensors and variables in the graph, including loading /saving

    # Variables
    sess = tf.Session()
    graph_handler.initialize(sess)

    # begin training
    # An explicit num_steps wins; otherwise derive the step count from num_epochs.
    num_steps = config.num_steps or int(config.num_epochs * train_data.num_examples / config.batch_size)
    max_acc = 0
    noupdate_count = 0
    global_step = 0
    for _, batch in tqdm(train_data.get_batches(config.batch_size, num_batches=num_steps, shuffle=True), total=num_steps):
        global_step = sess.run(model.global_step) + 1  # +1 because all calculations are done after step
        get_summary = global_step % config.log_period == 0
        loss, summary, train_op = trainer.step(sess, batch, get_summary=get_summary)
        if get_summary:
            graph_handler.add_summary(summary, global_step)

        # Occasional evaluation and saving
        if global_step % config.save_period == 0:
            graph_handler.save(sess, global_step=global_step)
        if global_step % config.eval_period == 0:
            num_batches = math.ceil(dev_data.num_examples / config.batch_size)
            # A positive eval_num_batches caps how many dev batches are evaluated.
            if 0 < config.eval_num_batches < num_batches:
                num_batches = config.eval_num_batches
            e = evaluator.get_evaluation_from_batches(
                sess, tqdm(dev_data.get_batches(config.batch_size, num_batches=num_batches), total=num_batches))
            graph_handler.add_summaries(e.summaries, global_step)
            if e.acc > max_acc:
                max_acc = e.acc
                noupdate_count = 0
            else:
                noupdate_count += 1
                # NOTE(review): breaking here skips dump_eval for this final evaluation.
                if noupdate_count == config.early_stop:
                    break
            if config.dump_eval:
                graph_handler.dump_eval(e)
    # Save the final state unless the last step already wrote a checkpoint.
    if global_step % config.save_period != 0:
        graph_handler.save(sess, global_step=global_step)
def _test(config):
    """Evaluate the model on the 'test' split, print the result and optionally dump it."""
    test_data = read_data(config, 'test', True)
    update_config(config, [test_data])

    _config_draft(config)

    pprint(config.__flags, indent=2)
    model = Model(config)
    evaluator = AccuracyEvaluator2(config, model)
    # The graph handler takes care of loading and saving the graph's variables.
    graph_handler = GraphHandler(config)

    sess = tf.Session()
    graph_handler.initialize(sess)

    batch_count = math.ceil(test_data.num_examples / config.batch_size)
    # A positive eval_num_batches caps how many batches are evaluated.
    if 0 < config.eval_num_batches < batch_count:
        batch_count = config.eval_num_batches
    batch_iter = test_data.get_batches(config.batch_size, num_batches=batch_count)
    evaluation = evaluator.get_evaluation_from_batches(sess, tqdm(batch_iter, total=batch_count))
    print(evaluation)
    if config.dump_eval:
        graph_handler.dump_eval(evaluation)
def _forward(config):
    """Run the model on the 'forward' split without labels, print the result and optionally dump it."""
    forward_data = read_data(config, 'forward', True)
    # NOTE(review): unlike _train/_test, update_config is not called here, so the
    # max_* and vocabulary-size settings the model reads must already be set on config.

    _config_draft(config)

    # The attribute is ``__flags`` (as in _train and _test); ``__flag`` raised AttributeError.
    pprint(config.__flags, indent=2)
    model = Model(config)
    evaluator = Evaluator(config, model)
    graph_handler = GraphHandler(config)  # controls all tensors and variables in the graph, including loading /saving

    sess = tf.Session()
    graph_handler.initialize(sess)

    num_batches = math.ceil(forward_data.num_examples / config.batch_size)
    # A positive eval_num_batches caps how many batches are evaluated.
    if 0 < config.eval_num_batches < num_batches:
        num_batches = config.eval_num_batches
    e = evaluator.get_evaluation_from_batches(sess, tqdm(forward_data.get_batches(config.batch_size, num_batches=num_batches), total=num_batches))
    print(e)
    if config.dump_eval:
        graph_handler.dump_eval(e)
def set_dirs(config):
    """Set ``save_dir``, ``log_dir`` and ``eval_dir`` on ``config`` and create them.

    When ``config.load`` is false, any existing ``config.out_dir`` is removed
    first so the run starts from an empty directory. Every directory is then
    created if missing, so calling this again on an existing layout is safe.
    """
    # A fresh (non-loading) run discards earlier output.
    if not config.load and os.path.exists(config.out_dir):
        shutil.rmtree(config.out_dir)

    config.save_dir = os.path.join(config.out_dir, "save")
    config.log_dir = os.path.join(config.out_dir, "log")
    config.eval_dir = os.path.join(config.out_dir, "eval")

    # Each directory is created independently of the others. Previously the
    # existence test for log_dir guarded the creation of eval_dir, so log_dir
    # was never made and a rerun could fail on the existing eval_dir.
    for path in (config.out_dir, config.save_dir, config.log_dir, config.eval_dir):
        os.makedirs(path, exist_ok=True)
def _get_args():
parser = argparse.ArgumentParser()
parser.add_argument("config_path")
return parser.parse_args()
class Config(object):
    """Plain attribute bag: every keyword argument becomes an instance attribute."""

    def __init__(self, **entries):
        vars(self).update(entries)
def _run():
    """Load the JSON config named on the command line and run ``main`` with it."""
    config_path = _get_args().config_path
    with open(config_path, 'r') as fh:
        entries = json.load(fh)
    main(Config(**entries))


if __name__ == "__main__":
    _run()

View file

@ -0,0 +1,248 @@
import nltk
import numpy as np
import tensorflow as tf
from tensorflow.python.ops.rnn_cell import BasicLSTMCell
from my.nltk_utils import tree2matrix, find_max_f1_subtree, load_compressed_tree, set_span
from tree.read_data import DataSet
from my.tensorflow import exp_mask, get_initializer
from my.tensorflow.nn import linear
from my.tensorflow.rnn import bidirectional_dynamic_rnn, dynamic_rnn
from my.tensorflow.rnn_cell import SwitchableDropoutWrapper, NoOpCell, TreeRNNCell
class Model(object):
    """TensorFlow graph of the tree model: placeholders, forward pass, loss and EMA summaries.

    Sizes are read from ``config`` (see the tuple unpacking in each method):
    N batch size, M max sentences, JX max sentence size, JQ max question size,
    VW / VC word / char vocabulary sizes, W max word size, H max tree height.
    """

    def __init__(self, config):
        """Create the placeholders, then build the forward graph, loss, EMA op and merged summary."""
        self.config = config
        self.global_step = tf.get_variable('global_step', shape=[], dtype='int32',
                                           initializer=tf.constant_initializer(0), trainable=False)

        # Define forward inputs here
        N, M, JX, JQ, VW, VC, W, H = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.max_word_size, config.max_tree_height
        self.x = tf.placeholder('int32', [None, M, JX], name='x')
        self.cx = tf.placeholder('int32', [None, M, JX, W], name='cx')
        self.q = tf.placeholder('int32', [None, JQ], name='q')
        self.cq = tf.placeholder('int32', [None, JQ, W], name='cq')
        self.tx = tf.placeholder('int32', [None, M, H, JX], name='tx')
        self.tx_edge_mask = tf.placeholder('bool', [None, M, H, JX, JX], name='tx_edge_mask')
        self.y = tf.placeholder('bool', [None, M, H, JX], name='y')
        self.is_train = tf.placeholder('bool', [], name='is_train')

        # Define misc

        # Forward outputs / loss inputs
        self.logits = None
        self.yp = None
        # NOTE(review): var_list is never assigned anywhere in this class, so get_var_list returns None.
        self.var_list = None

        # Loss outputs
        self.loss = None

        self._build_forward()
        self._build_loss()

        self.ema_op = self._get_ema_op()
        self.summary = tf.summary.merge_all()

    def _build_forward(self):
        """Build the forward pass and set ``self.logits`` and ``self.yp``."""
        config = self.config
        N, M, JX, JQ, VW, VC, d, dc, W = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
            config.char_emb_size, config.max_word_size
        H = config.max_tree_height
        # Index 0 is treated as padding, so positive entries mark real positions.
        x_mask = self.x > 0
        q_mask = self.q > 0
        tx_mask = self.tx > 0  # [N, M, H, JX]

        # Character-level embedding: convolve over the characters of each word, then max-pool.
        with tf.variable_scope("char_emb"):
            char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
            Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
            Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]

            filter = tf.get_variable("filter", shape=[1, config.char_filter_height, dc, d], dtype='float')
            bias = tf.get_variable("bias", shape=[d], dtype='float')
            strides = [1, 1, 1, 1]
            Acx = tf.reshape(Acx, [-1, JX, W, dc])
            Acq = tf.reshape(Acq, [-1, JQ, W, dc])
            xxc = tf.nn.conv2d(Acx, filter, strides, "VALID") + bias  # [N*M, JX, W/filter_stride, d]
            qqc = tf.nn.conv2d(Acq, filter, strides, "VALID") + bias  # [N, JQ, W/filter_stride, d]
            xxc = tf.reshape(tf.reduce_max(tf.nn.relu(xxc), 2), [-1, M, JX, d])
            qqc = tf.reshape(tf.reduce_max(tf.nn.relu(qqc), 2), [-1, JQ, d])

        with tf.variable_scope("word_emb"):
            # Only training initializes the matrix from config.emb_mat; other modes just declare it.
            if config.mode == 'train':
                word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, config.word_emb_size], initializer=get_initializer(config.emb_mat))
            else:
                word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, config.word_emb_size], dtype='float')
            Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
            Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
            # Ax = linear([Ax], d, False, scope='Ax_reshape')
            # Aq = linear([Aq], d, False, scope='Aq_reshape')

        # Concatenate character-level and word-level features along the last axis.
        xx = tf.concat(axis=3, values=[xxc, Ax])  # [N, M, JX, 2d]
        qq = tf.concat(axis=2, values=[qqc, Aq])  # [N, JQ, 2d]
        D = d + config.word_emb_size

        with tf.variable_scope("pos_emb"):
            pos_emb_mat = tf.get_variable("pos_emb_mat", shape=[config.pos_vocab_size, d], dtype='float')
            Atx = tf.nn.embedding_lookup(pos_emb_mat, self.tx)  # [N, M, H, JX, d]

        cell = BasicLSTMCell(D, state_is_tuple=True)
        cell = SwitchableDropoutWrapper(cell, self.is_train, input_keep_prob=config.input_keep_prob)
        x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(q_mask, 'int32'), 1)  # [N]

        with tf.variable_scope("rnn"):
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len, dtype='float', scope='start')  # [N, M, JX, 2d]
            # Reuse the same RNN variables for the question.
            tf.get_variable_scope().reuse_variables()
            (fw_us, bw_us), (_, (fw_u, bw_u)) = bidirectional_dynamic_rnn(cell, cell, qq, q_len, dtype='float', scope='start')  # [N, J, d], [N, d]
            # Average the forward and backward directions.
            u = (fw_u + bw_u) / 2.0
            h = (fw_h + bw_h) / 2.0

        with tf.variable_scope("h"):
            no_op_cell = NoOpCell(D)
            tree_rnn_cell = TreeRNNCell(no_op_cell, d, tf.reduce_max)
            initial_state = tf.reshape(h, [N*M*JX, D])  # [N*M*JX, D]
            inputs = tf.concat(axis=4, values=[Atx, tf.cast(self.tx_edge_mask, 'float')])  # [N, M, H, JX, d+JX]
            # Swap the H and JX axes so the RNN steps over tree height.
            inputs = tf.reshape(tf.transpose(inputs, [0, 1, 3, 2, 4]), [N*M*JX, H, d + JX])  # [N*M*JX, H, d+JX]
            length = tf.reshape(tf.reduce_sum(tf.cast(tx_mask, 'int32'), 2), [N*M*JX])
            # length = tf.reshape(tf.reduce_sum(tf.cast(tf.transpose(tx_mask, [0, 1, 3, 2]), 'float'), 3), [-1])
            h, _ = dynamic_rnn(tree_rnn_cell, inputs, length, initial_state=initial_state)  # [N*M*JX, H, D]
            h = tf.transpose(tf.reshape(h, [N, M, JX, H, D]), [0, 1, 3, 2, 4])  # [N, M, H, JX, D]

        u = tf.expand_dims(tf.expand_dims(tf.expand_dims(u, 1), 1), 1)  # [N, 1, 1, 1, D]
        dot = linear(h * u, 1, True, squeeze=True, scope='dot')  # [N, M, H, JX]
        # self.logits = tf.reshape(dot, [N, M * H * JX])
        self.logits = tf.reshape(exp_mask(dot, tx_mask), [N, M * H * JX])  # [N, M*H*JX]
        self.yp = tf.reshape(tf.nn.softmax(self.logits), [N, M, H, JX])

    def _build_loss(self):
        """Add the softmax cross-entropy loss and set ``self.loss`` to the sum of the 'losses' collection."""
        config = self.config
        N, M, JX, JQ, VW, VC = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size
        H = config.max_tree_height
        ce_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits, labels=tf.cast(tf.reshape(self.y, [N, M * H * JX]), 'float')))
        tf.add_to_collection('losses', ce_loss)
        self.loss = tf.add_n(tf.get_collection('losses'), name='loss')
        tf.summary.scalar(self.loss.op.name, self.loss)
        # Registering the loss here makes _get_ema_op track its moving average.
        tf.add_to_collection('ema/scalar', self.loss)

    def _get_ema_op(self):
        """Return the op that updates moving averages of the 'ema/scalar' and 'ema/histogram' collections.

        A summary is also registered for each averaged value.
        """
        ema = tf.train.ExponentialMovingAverage(self.config.decay)
        ema_op = ema.apply(tf.get_collection("ema/scalar") + tf.get_collection("ema/histogram"))
        for var in tf.get_collection("ema/scalar"):
            ema_var = ema.average(var)
            tf.summary.scalar(ema_var.op.name, ema_var)
        for var in tf.get_collection("ema/histogram"):
            ema_var = ema.average(var)
            tf.summary.histogram(ema_var.op.name, ema_var)
        return ema_op

    def get_loss(self):
        """Return the loss tensor."""
        return self.loss

    def get_global_step(self):
        """Return the global step variable."""
        return self.global_step

    def get_var_list(self):
        """Return ``self.var_list``."""
        return self.var_list

    def get_feed_dict(self, batch, is_train, supervised=True):
        """Convert a ``DataSet`` batch into a feed dict of zero-padded arrays.

        Args:
            batch: the ``DataSet`` to encode.
            is_train: value fed to the ``is_train`` placeholder.
            supervised: when true, the label array for ``self.y`` is built and fed too.

        Returns:
            dict mapping placeholders to numpy arrays (and ``is_train`` to its flag).
        """
        assert isinstance(batch, DataSet)
        config = self.config
        N, M, JX, JQ, VW, VC, d, W, H = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, config.max_word_size, \
            config.max_tree_height
        feed_dict = {}

        # Arrays are allocated at full batch size; rows beyond the batch stay zero.
        x = np.zeros([N, M, JX], dtype='int32')
        cx = np.zeros([N, M, JX, W], dtype='int32')
        q = np.zeros([N, JQ], dtype='int32')
        cq = np.zeros([N, JQ, W], dtype='int32')
        tx = np.zeros([N, M, H, JX], dtype='int32')
        tx_edge_mask = np.zeros([N, M, H, JX, JX], dtype='bool')

        feed_dict[self.x] = x
        feed_dict[self.cx] = cx
        feed_dict[self.q] = q
        feed_dict[self.cq] = cq
        feed_dict[self.tx] = tx
        feed_dict[self.tx_edge_mask] = tx_edge_mask
        feed_dict[self.is_train] = is_train

        # The lookup helpers below return 1 when the item is not in the vocabulary.
        def _get_word(word):
            d = batch.shared['word2idx']
            # Try the word as given, then a few case variants.
            for each in (word, word.lower(), word.capitalize(), word.upper()):
                if each in d:
                    return d[each]
            return 1

        def _get_char(char):
            d = batch.shared['char2idx']
            if char in d:
                return d[char]
            return 1

        def _get_pos(tree):
            d = batch.shared['pos2idx']
            if tree.label() in d:
                return d[tree.label()]
            return 1

        for i, xi in enumerate(batch.data['x']):
            for j, xij in enumerate(xi):
                for k, xijk in enumerate(xij):
                    x[i, j, k] = _get_word(xijk)

        for i, cxi in enumerate(batch.data['cx']):
            for j, cxij in enumerate(cxi):
                for k, cxijk in enumerate(cxij):
                    for l, cxijkl in enumerate(cxijk):
                        cx[i, j, k, l] = _get_char(cxijkl)
                        # Characters beyond max_word_size are dropped.
                        if l + 1 == config.max_word_size:
                            break

        for i, qi in enumerate(batch.data['q']):
            for j, qij in enumerate(qi):
                q[i, j] = _get_word(qij)

        for i, cqi in enumerate(batch.data['cq']):
            for j, cqij in enumerate(cqi):
                for k, cqijk in enumerate(cqij):
                    cq[i, j, k] = _get_char(cqijk)
                    # Characters beyond max_word_size are dropped.
                    if k + 1 == config.max_word_size:
                        break

        for i, txi in enumerate(batch.data['stx']):
            for j, txij in enumerate(txi):
                txij_mat, txij_mask = tree2matrix(nltk.tree.Tree.fromstring(txij), _get_pos, row_size=H, col_size=JX)
                tx[i, j, :, :], tx_edge_mask[i, j, :, :, :] = txij_mat, txij_mask

        if supervised:
            y = np.zeros([N, M, H, JX], dtype='bool')
            feed_dict[self.y] = y
            for i, yi in enumerate(batch.data['y']):
                start_idx, stop_idx = yi
                sent_idx = start_idx[0]
                # If the stop index is in another sentence, the span runs to the
                # length of batch.data['x'][sent_idx].
                if start_idx[0] == stop_idx[0]:
                    span = [start_idx[1], stop_idx[1]]
                else:
                    span = [start_idx[1], len(batch.data['x'][sent_idx])]
                tree = nltk.tree.Tree.fromstring(batch.data['stx'][i][sent_idx])
                set_span(tree)
                best_subtree = find_max_f1_subtree(tree, span)

                # Mark the cell(s) whose subtree equals the best-matching subtree.
                def _get_y(t):
                    return t == best_subtree

                yij, _ = tree2matrix(tree, _get_y, H, JX, dtype='bool')
                y[i, sent_idx, :, :] = yij

        return feed_dict

View file

@ -0,0 +1,159 @@
import json
import os
import random
import itertools
import math
import nltk
from my.nltk_utils import load_compressed_tree
from my.utils import index
class DataSet(object):
    """Column-oriented examples plus optional shared data, with batch iteration.

    ``data`` maps a column name to a list with one entry per example,
    e.g. ``{'X': [0, 1, 2], 'Y': [2, 3, 4]}``. ``valid_idxs`` selects the
    usable examples and defaults to all of them.
    """

    def __init__(self, data, data_type, shared=None, valid_idxs=None):
        # The example count is the length of the first column.
        first_column = next(iter(data.values()))
        total_num_examples = len(first_column)
        self.data = data
        self.data_type = data_type
        self.shared = shared
        if valid_idxs is None:
            self.valid_idxs = range(total_num_examples)
        else:
            self.valid_idxs = valid_idxs
        self.num_examples = len(self.valid_idxs)

    def get_batches(self, batch_size, num_batches=None, shuffle=False):
        """Yield ``(batch_idxs, DataSet)`` pairs of up to ``batch_size`` examples.

        ``num_batches`` defaults to one pass over the valid examples; larger
        values continue into further passes. With ``shuffle`` each pass is an
        independent random permutation.
        """
        batches_per_epoch = int(math.ceil(self.num_examples / batch_size))
        if num_batches is None:
            num_batches = batches_per_epoch
        epochs_needed = int(math.ceil(num_batches / batches_per_epoch))

        def _index_stream():
            # Lazily produce the example indices, one pass at a time.
            for _ in range(epochs_needed):
                if shuffle:
                    yield from random.sample(self.valid_idxs, len(self.valid_idxs))
                else:
                    yield from self.valid_idxs

        idx_iter = _index_stream()
        for _ in range(num_batches):
            batch_idxs = tuple(itertools.islice(idx_iter, batch_size))
            batch_data = {}
            for key, column in self.data.items():
                if not key.startswith('*'):
                    batch_data[key] = [column[idx] for idx in batch_idxs]
                    continue
                # A '*'-prefixed column holds references that are resolved
                # against self.shared[<name without '*'>] via ``index``.
                assert self.shared is not None
                shared_key = key[1:]
                batch_data[shared_key] = [index(self.shared[shared_key], column[idx])
                                          for idx in batch_idxs]
            yield batch_idxs, DataSet(batch_data, self.data_type, shared=self.shared)
class SquadDataSet(DataSet):
    """``DataSet`` subclass that currently adds nothing beyond its parent."""

    def __init__(self, data, data_type, shared=None, valid_idxs=None):
        super().__init__(data, data_type, valid_idxs=valid_idxs, shared=shared)
def load_metadata(config, data_type):
    """Load ``metadata_<data_type>.json`` from ``config.data_dir`` and copy its entries onto ``config``.

    Returns the loaded metadata dict.
    """
    file_name = "metadata_{}.json".format(data_type)
    with open(os.path.join(config.data_dir, file_name), 'r') as fh:
        metadata = json.load(fh)
    for name in metadata:
        setattr(config, name, metadata[name])
    return metadata
def read_data(config, data_type, ref, data_filter=None):
    """Load one data split and its shared data, and build or reload the vocabularies.

    Args:
        config: supplies ``data_dir``, ``out_dir`` and, when ``ref`` is false,
            ``lower_word``, ``word_count_th`` and ``char_count_th``.
        data_type: split name used in the ``data_<type>.json`` and
            ``shared_<type>.json`` file names.
        ref: if false, build the word/char/pos vocabularies from the counters
            in the shared data and write them to ``<out_dir>/shared.json``;
            if true, read that file and merge its entries into the shared data.
        data_filter: optional ``f(example_dict, shared) -> bool``; examples it
            rejects are left out of ``valid_idxs``.

    Returns:
        A ``DataSet`` over the loaded data restricted to the valid indices.
    """
    data_path = os.path.join(config.data_dir, "data_{}.json".format(data_type))
    shared_path = os.path.join(config.data_dir, "shared_{}.json".format(data_type))
    with open(data_path, 'r') as fh:
        data = json.load(fh)
    with open(shared_path, 'r') as fh:
        shared = json.load(fh)

    num_examples = len(next(iter(data.values())))
    if data_filter is None:
        valid_idxs = range(num_examples)
    else:
        # Rebuild each example as a {column: value} dict and ask the filter about it.
        keys = list(data.keys())
        mask = [data_filter(dict(zip(keys, vals)), shared) for vals in zip(*data.values())]
        valid_idxs = [idx for idx, keep in enumerate(mask) if keep]

    print("Loaded {}/{} examples from {}".format(len(valid_idxs), num_examples, data_type))

    vocab_path = os.path.join(config.out_dir, "shared.json")
    if not ref:
        word_counter = shared['lower_word_counter'] if config.lower_word else shared['word_counter']
        char_counter = shared['char_counter']
        pos_counter = shared['pos_counter']
        # Indices 0 and 1 are reserved for NULL and UNK, so real entries start at 2.
        shared['word2idx'] = {word: idx + 2 for idx, word in
                              enumerate(word for word, count in word_counter.items()
                                        if count > config.word_count_th)}
        shared['char2idx'] = {char: idx + 2 for idx, char in
                              enumerate(char for char, count in char_counter.items()
                                        if count > config.char_count_th)}
        shared['pos2idx'] = {pos: idx + 2 for idx, pos in enumerate(pos_counter.keys())}
        NULL = "-NULL-"
        UNK = "-UNK-"
        for vocab_name in ('word2idx', 'char2idx', 'pos2idx'):
            shared[vocab_name][NULL] = 0
            shared[vocab_name][UNK] = 1
        # Use a context manager so the file is flushed and closed; the bare
        # ``open`` call used before left the handle open.
        with open(vocab_path, 'w') as fh:
            json.dump({'word2idx': shared['word2idx'], 'char2idx': shared['char2idx'],
                       'pos2idx': shared['pos2idx']}, fh)
    else:
        with open(vocab_path, 'r') as fh:
            new_shared = json.load(fh)
        shared.update(new_shared)

    data_set = DataSet(data, data_type, shared=shared, valid_idxs=valid_idxs)
    return data_set
def get_squad_data_filter(config):
    """Return a predicate that rejects examples exceeding the size thresholds in ``config``."""

    def data_filter(data_point, shared):
        """Return True when the example is within every ``*_th`` limit that is checked."""
        assert shared is not None
        rx, rcx, q, cq, y = [data_point[name] for name in ('*x', '*cx', 'q', 'cq', 'y')]
        x, cx, stx = [shared[name] for name in ('x', 'cx', 'stx')]

        # Question length.
        if len(q) > config.ques_size_th:
            return False

        # Number of sentences, then the size of each sentence.
        sents = x[rx[0]][rx[1]]
        if len(sents) > config.num_sents_th:
            return False
        for sent in sents:
            if len(sent) > config.sent_size_th:
                return False

        # Height of every parse tree.
        tree_strings = stx[rx[0]][rx[1]]
        too_tall = any(nltk.tree.Tree.fromstring(s).height() > config.tree_height_th
                       for s in tree_strings)
        return not too_tall

    return data_filter
def update_config(config, data_sets):
    """Set the ``max_*`` sizes and vocabulary sizes on ``config`` from the given data sets.

    The maxima are taken over the valid examples of all data sets; the
    vocabulary and embedding sizes come from the first data set's shared data.
    """
    for attr in ('max_num_sents', 'max_sent_size', 'max_ques_size',
                 'max_word_size', 'max_tree_height'):
        setattr(config, attr, 0)

    for data_set in data_sets:
        data, shared = data_set.data, data_set.shared
        for idx in data_set.valid_idxs:
            rx = data['*x'][idx]
            q = data['q'][idx]
            sents = shared['x'][rx[0]][rx[1]]
            tree_strings = shared['stx'][rx[0]][rx[1]]

            tallest = max(nltk.tree.Tree.fromstring(s).height() for s in tree_strings)
            config.max_tree_height = max(config.max_tree_height, tallest)
            config.max_num_sents = max(config.max_num_sents, len(sents))
            longest_sent = max(len(sent) for sent in sents)
            config.max_sent_size = max(config.max_sent_size, longest_sent)
            longest_word = max(len(word) for sent in sents for word in sent)
            config.max_word_size = max(config.max_word_size, longest_word)
            if len(q) > 0:
                config.max_ques_size = max(config.max_ques_size, len(q))
                longest_q_word = max(len(word) for word in q)
                config.max_word_size = max(config.max_word_size, longest_q_word)

    # The word size is capped by the configured threshold.
    config.max_word_size = min(config.max_word_size, config.word_size_th)

    config.char_vocab_size = len(data_sets[0].shared['char2idx'])
    first_vector = next(iter(data_sets[0].shared['word2vec'].values()))
    config.word_emb_size = len(first_vector)
    config.word_vocab_size = len(data_sets[0].shared['word2idx'])
    config.pos_vocab_size = len(data_sets[0].shared['pos2idx'])

View file

@ -0,0 +1,67 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{{ title }}</title>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.0/jquery.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/chroma-js/1.1.1/chroma.min.js"></script>
<script>
    // Once the DOM is ready, shade every ".att" cell according to its "color" attribute.
    $(document).ready(function(){
        $(".att").each(function() {
            // var val = parseFloat($(this).text());
            var val = parseFloat($(this).attr("color"));
            // Map the value onto a white-to-red colour scale.
            var scale = chroma.scale(['white', 'red']);
            var color = scale(val).hex();
            $(this).attr("bgcolor", color);
        });
    })
</script>
</head>
<style>
table, th, td {border: 1px solid black}
</style>
<body>
<h2>{{ title }}</h2>
<table>
<tr>
<th>ID</th>
<th>Question</th>
<th>Answer</th>
<th>Paragraph</th>
</tr>
{% for row in rows %}
<tr>
<td>{{ row.id }}</td>
<td>
{% for qj in row.ques %}
{{ qj }}
{% endfor %}
</td>
<td>{{ row.a }}</td>
<td>
<table>
{% for xj, yj, y2j, ypj, yp2j in zip(row.para, row.y, row.y2, row.yp, row.yp2) %}
<tr>
{% for xjk, yjk, y2jk, ypjk in zip(xj, yj, y2j, ypj) %}
<td class="att" color="{{ ypjk }}">
{% if yjk or y2jk %}
<b>{{ xjk }}</b>
{% else %}
{{ xjk }}
{% endif %}
</td>
{% endfor %}
</tr>
<tr>
{% for xjk, yp2jk in zip(xj, yp2j) %}
<td class="att" color="{{ yp2jk }}">-</td>
{% endfor %}
</tr>
{% endfor %}
</table>
</td>
</tr>
{% endfor %}
</table>
</body>
</html>

View file

@ -0,0 +1,294 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import nltk\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(S (PRP I) (VP (VBP am) (NNP Sam)) (. .))\n",
"(PRP I)\n",
"(VP (VBP am) (NNP Sam))\n",
"(VBP am)\n",
"(NNP Sam)\n",
"(. .)\n",
"(S (PRP I) (VP (VBP am) (NNP Sam)) (. .))\n"
]
}
],
"source": [
"string = \"(ROOT(S(NP (PRP I))(VP (VBP am)(NP (NNP Sam)))(. .)))\"\n",
"tree = nltk.tree.Tree.fromstring(string)\n",
"\n",
"def load_compressed_tree(s):\n",
"\n",
" def compress_tree(tree):\n",
" if len(tree) == 1:\n",
" if isinstance(tree[0], nltk.tree.Tree):\n",
" return compress_tree(tree[0])\n",
" else:\n",
" return tree\n",
" else:\n",
" for i, t in enumerate(tree):\n",
" tree[i] = compress_tree(t)\n",
" return tree\n",
"\n",
" return compress_tree(nltk.tree.Tree.fromstring(s))\n",
"tree = load_compressed_tree(string)\n",
"for t in tree.subtrees():\n",
" print(t)\n",
" \n",
"print(str(tree))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(ROOT I am Sam .)\n"
]
}
],
"source": [
"print(tree.flatten())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['ROOT', 'S', 'NP', 'PRP', 'VP', 'VBP', 'NP', 'NNP', '.']\n"
]
}
],
"source": [
"print(list(t.label() for t in tree.subtrees()))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import json\n",
"d = json.load(open(\"data/squad/shared_dev.json\", 'r'))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"73"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(d['pos_counter'])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'#': 6,\n",
" '$': 80,\n",
" \"''\": 1291,\n",
" ',': 14136,\n",
" '-LRB-': 1926,\n",
" '-RRB-': 1925,\n",
" '.': 9505,\n",
" ':': 1455,\n",
" 'ADJP': 3426,\n",
" 'ADVP': 4936,\n",
" 'CC': 9300,\n",
" 'CD': 6216,\n",
" 'CONJP': 191,\n",
" 'DT': 26286,\n",
" 'EX': 288,\n",
" 'FRAG': 107,\n",
" 'FW': 96,\n",
" 'IN': 32564,\n",
" 'INTJ': 12,\n",
" 'JJ': 21452,\n",
" 'JJR': 563,\n",
" 'JJS': 569,\n",
" 'LS': 7,\n",
" 'LST': 1,\n",
" 'MD': 1051,\n",
" 'NAC': 19,\n",
" 'NN': 34750,\n",
" 'NNP': 28392,\n",
" 'NNPS': 1400,\n",
" 'NNS': 16716,\n",
" 'NP': 91636,\n",
" 'NP-TMP': 236,\n",
" 'NX': 108,\n",
" 'PDT': 89,\n",
" 'POS': 1451,\n",
" 'PP': 33278,\n",
" 'PRN': 2085,\n",
" 'PRP': 2320,\n",
" 'PRP$': 1959,\n",
" 'PRT': 450,\n",
" 'QP': 838,\n",
" 'RB': 7611,\n",
" 'RBR': 301,\n",
" 'RBS': 252,\n",
" 'ROOT': 9587,\n",
" 'RP': 454,\n",
" 'RRC': 19,\n",
" 'S': 21557,\n",
" 'SBAR': 5009,\n",
" 'SBARQ': 6,\n",
" 'SINV': 135,\n",
" 'SQ': 5,\n",
" 'SYM': 17,\n",
" 'TO': 5167,\n",
" 'UCP': 143,\n",
" 'UH': 15,\n",
" 'VB': 4197,\n",
" 'VBD': 8377,\n",
" 'VBG': 3570,\n",
" 'VBN': 7218,\n",
" 'VBP': 2897,\n",
" 'VBZ': 4146,\n",
" 'VP': 33696,\n",
" 'WDT': 1368,\n",
" 'WHADJP': 5,\n",
" 'WHADVP': 439,\n",
" 'WHNP': 1927,\n",
" 'WHPP': 153,\n",
" 'WP': 482,\n",
" 'WP$': 50,\n",
" 'WRB': 442,\n",
" 'X': 23,\n",
" '``': 1269}"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d['pos_counter']"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[False False False False]\n",
" [False True False False]\n",
" [False False False False]]\n",
"[[0 2 2 0]\n",
" [2 2 0 2]\n",
" [2 0 0 0]]\n"
]
}
],
"source": [
"from my.nltk_utils import tree2matrix, load_compressed_tree, find_max_f1_subtree, set_span\n",
"string = \"(ROOT(S(NP (PRP I))(VP (VBP am)(NP (NNP Sam)))(. .)))\"\n",
"tree = load_compressed_tree(string)\n",
"span = (1, 3)\n",
"set_span(tree)\n",
"subtree = find_max_f1_subtree(tree, span)\n",
"f = lambda t: t == subtree\n",
"g = lambda t: 1 if isinstance(t, str) else 2\n",
"a, b = tree2matrix(tree, f, dtype='bool')\n",
"c, d = tree2matrix(tree, g, dtype='int32')\n",
"print(a)\n",
"print(c)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View file

@ -0,0 +1,36 @@
import tensorflow as tf
from tree.model import Model
class Trainer(object):
    """Couple a ``Model`` with an Adagrad optimizer and run training steps."""

    def __init__(self, config, model):
        """Build the gradient, update and train ops for ``model``.

        ``config.init_lr`` is handed to ``tf.train.AdagradOptimizer``; the
        loss, variable list, global step, EMA op and summary op are all taken
        from ``model``.
        """
        assert isinstance(model, Model)
        self.config = config
        self.model = model
        self.opt = tf.train.AdagradOptimizer(config.init_lr)
        self.loss = model.get_loss()
        self.var_list = model.get_var_list()
        self.global_step = model.get_global_step()
        self.ema_op = model.ema_op
        self.summary = model.summary
        self.grads = self.opt.compute_gradients(self.loss, var_list=self.var_list)
        apply_grads_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)

        # The train op groups the EMA op and is created under a control
        # dependency on the gradient update.
        with tf.control_dependencies([apply_grads_op]):
            self.train_op = tf.group(self.ema_op)

    def get_train_op(self):
        """Return the op created in ``__init__`` that performs one update."""
        return self.train_op

    def step(self, sess, batch, get_summary=False):
        """Run one training step on ``batch``.

        Returns ``(loss, summary, train_op)`` as produced by ``sess.run``;
        ``summary`` is ``None`` unless ``get_summary`` is true.
        """
        assert isinstance(sess, tf.Session)
        # NOTE(review): the second argument is presumably the "is training"
        # flag -- confirm against Model.get_feed_dict.
        feed_dict = self.model.get_feed_dict(batch, True)
        fetches = [self.loss, self.train_op]
        if get_summary:
            # Fetch the summary between the loss and the train op.
            fetches.insert(1, self.summary)
            loss, summary, train_op = sess.run(fetches, feed_dict=feed_dict)
        else:
            loss, train_op = sess.run(fetches, feed_dict=feed_dict)
            summary = None
        return loss, summary, train_op

View file

@ -0,0 +1,122 @@
import shutil
from collections import OrderedDict
import http.server
import socketserver
import argparse
import json
import os
import numpy as np
from tqdm import tqdm
from jinja2 import Environment, FileSystemLoader
def bool_(string):
    """Convert the literal strings ``'True'`` / ``'False'`` to a bool.

    Args:
        string: the text to convert; only the exact spellings ``'True'`` and
            ``'False'`` are accepted.

    Returns:
        ``True`` or ``False``.

    Raises:
        ValueError: if ``string`` is anything else.  ``ValueError`` is a
            subclass of ``Exception``, so callers that caught the previous
            bare ``Exception`` keep working, and ``argparse`` reports it as a
            normal usage error when this function is used as ``type=``.
    """
    if string == 'True':
        return True
    if string == 'False':
        return False
    raise ValueError("expected 'True' or 'False', got {!r}".format(string))
def get_args():
    """Parse the visualizer's command-line options from ``sys.argv``."""
    # (flag, type, default) for every option, in registration order.
    option_specs = (
        ("--model_name", str, 'basic'),
        ("--data_type", str, 'dev'),
        ("--step", int, 5000),
        ("--template_name", str, "visualizer.html"),
        ("--num_per_page", int, 100),
        ("--data_dir", str, "data/squad"),
        ("--port", int, 8000),
        ("--host", str, "0.0.0.0"),
        ("--open", str, 'False'),
        ("--run_id", str, "0"),
    )
    parser = argparse.ArgumentParser()
    for flag, kind, default in option_specs:
        parser.add_argument(flag, type=kind, default=default)
    return parser.parse_args()
def _decode(decoder, sent):
return " ".join(decoder[idx] for idx in sent)
def _load_json(path):
    """Parse the JSON file at ``path`` and close the file handle afterwards."""
    with open(path, 'r') as fh:
        return json.load(fh)


def accuracy2_visualizer(args):
    """Render evaluation results as paged HTML tables and serve them over HTTP.

    Reads ``out/<model_name>/<run_id>/eval/<data_type>-<step>.json`` plus
    ``data_<data_type>.json`` and ``shared_<data_type>.json`` from
    ``args.data_dir``, renders ``args.num_per_page`` rows per page with the
    Jinja2 template ``args.template_name`` into a fresh
    ``/tmp/list_resultsN`` directory, changes the working directory to it and
    serves it on ``args.host:args.port``.

    The call blocks in ``serve_forever()``.

    Args:
        args: the parsed command-line namespace (see ``get_args``).
    """
    data_type = args.data_type
    num_per_page = args.num_per_page
    data_dir = args.data_dir
    run_id = args.run_id.zfill(2)

    eval_name = "{}-{}.json".format(data_type, str(args.step).zfill(6))
    eval_path = os.path.join("out", args.model_name, run_id, "eval", eval_name)
    eval_ = _load_json(eval_path)

    # Use the first /tmp/list_resultsN that does not exist yet.  The loop only
    # exits on a free name, so there is never an old directory to remove.
    _id = 0
    html_dir = "/tmp/list_results%d" % _id
    while os.path.exists(html_dir):
        _id += 1
        html_dir = "/tmp/list_results%d" % _id
    os.mkdir(html_dir)

    cur_dir = os.path.dirname(os.path.realpath(__file__))
    templates_dir = os.path.join(cur_dir, 'templates')
    env = Environment(loader=FileSystemLoader(templates_dir))
    # Make zip() and reversed() callable from inside the template.
    env.globals.update(zip=zip, reversed=reversed)
    template = env.get_template(args.template_name)

    data = _load_json(os.path.join(data_dir, "data_{}.json".format(data_type)))
    shared = _load_json(os.path.join(data_dir, "shared_{}.json".format(data_type)))

    def write_page(path, page_rows):
        """Render ``page_rows`` with the template and write the page to ``path``."""
        var_dict = {'title': "Accuracy Visualization", 'rows': page_rows}
        with open(path, "wb") as f:
            f.write(template.render(**var_dict).encode('UTF-8'))

    rows = []
    html_path = None
    for i, (idx, yi, ypi) in enumerate(zip(eval_['idxs'], eval_['y'], eval_['yp'])):
        if i % num_per_page == 0:
            # First row of a page: the page file is named after this index.
            html_path = os.path.join(html_dir, "%s.html" % str(i).zfill(8))

        id_, q, rx = (data[key][idx] for key in ('ids', 'q', '*x'))
        x = shared['x'][rx[0]][rx[1]]
        rows.append({
            'id': id_,
            'title': "Hello world!",
            'ques': [" ".join(q)],
            'para': [list(sent) for sent in x],
            'y': yi,
            'y2': yi,
            'yp': ypi,
            'yp2': ypi,
            'a': ""
        })

        if (i + 1) % num_per_page == 0:
            write_page(html_path, rows)
            rows = []

    # Flush the last, partially filled page.  Doing this after the loop keeps
    # the final rows even when the 'idxs'/'y'/'yp' lists differ in length.
    if rows:
        write_page(html_path, rows)

    os.chdir(html_dir)
    port = args.port
    host = args.host

    # Overriding to suppress log message
    class MyHandler(http.server.SimpleHTTPRequestHandler):
        def log_message(self, format, *args):
            pass

    handler = MyHandler
    httpd = socketserver.TCPServer((host, port), handler)
    if args.open == 'True':
        os.system("open http://%s:%d" % (args.host, args.port))
    print("serving at %s:%d" % (host, port))
    httpd.serve_forever()
# Script entry point: parse the command-line options, then build the HTML
# pages and start serving them.
if __name__ == "__main__":
    ARGS = get_args()
    accuracy2_visualizer(ARGS)

View file

@ -0,0 +1,244 @@
import numpy as np
from collections import Counter
import string
import re
import argparse
import os
import json
import nltk
from matplotlib_venn import venn2
from matplotlib import pyplot as plt
class Question:
    """One dataset question with the answers and scores of two models."""

    def __init__(self, id, question_text, ground_truth, model_names):
        """Store the normalized question, its head n-grams and empty score slots."""
        self.id = id
        self.question_text = self.normalize_answer(question_text)
        tokens = nltk.word_tokenize(self.question_text)
        # Entry k holds the first k tokens joined by spaces, for k = 0, 1, 2.
        self.question_head_ngram = [' '.join(tokens[:size]) for size in range(3)]
        self.question_tokens = tokens
        self.ground_truth = ground_truth
        self.model_names = model_names
        # One exact-match / F1 slot per model.
        self.em = np.zeros(2)
        self.f1 = np.zeros(2)
        self.answer_text = []

    def add_answers(self, answer_model_1, answer_model_2):
        """Record both models' answers and score them right away."""
        self.answer_text.extend((answer_model_1, answer_model_2))
        self.eval()

    def eval(self):
        """Fill ``em`` and ``f1`` with each model's best score over the ground truths."""
        best_over_truths = self.metric_max_over_ground_truths
        for model_idx in (0, 1):
            answer = self.answer_text[model_idx]
            self.em[model_idx] = best_over_truths(self.exact_match_score, answer, self.ground_truth)
            self.f1[model_idx] = best_over_truths(self.f1_score, answer, self.ground_truth)

    def normalize_answer(self, s):
        """Lower text and remove punctuation, articles and extra whitespace."""
        punctuation = set(string.punctuation)
        lowered = s.lower()
        without_punct = ''.join([ch for ch in lowered if ch not in punctuation])
        without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punct)
        return ' '.join(without_articles.split())

    def f1_score(self, prediction, ground_truth):
        """Return the token-level F1 of the two normalized strings (0 when no token is shared)."""
        predicted = self.normalize_answer(prediction).split()
        expected = self.normalize_answer(ground_truth).split()
        overlap = sum((Counter(predicted) & Counter(expected)).values())
        if overlap == 0:
            return 0
        precision = 1.0 * overlap / len(predicted)
        recall = 1.0 * overlap / len(expected)
        return (2 * precision * recall) / (precision + recall)

    def exact_match_score(self, prediction, ground_truth):
        """Return whether the two strings are equal after normalization."""
        return self.normalize_answer(prediction) == self.normalize_answer(ground_truth)

    def metric_max_over_ground_truths(self, metric_fn, prediction, ground_truths):
        """Return the best ``metric_fn(prediction, truth)`` over all ``ground_truths``."""
        return max([metric_fn(prediction, truth) for truth in ground_truths])
def safe_dict_access(in_dict, in_key, default_string='some junk string'):
    """Return ``in_dict[in_key]``, or ``default_string`` when the key is absent."""
    return in_dict.get(in_key, default_string)
def aggregate_metrics(questions):
    """Print each model's exact-match and F1 score as a percentage of all questions."""
    total = len(questions)
    scored = list(questions.values())
    exact_match = np.zeros(2)
    f1_scores = np.zeros(2)
    for slot in range(2):
        exact_match[slot] = 100 * np.sum(np.array([q.em[slot] for q in scored])) / total
        f1_scores[slot] = 100 * np.sum(np.array([q.f1[slot] for q in scored])) / total

    # Every question carries the same model names, so read them off the first.
    model_names = scored[0].model_names
    print('\nAggregate Scores:')
    for slot in range(2):
        name = model_names[slot]
        print('Model {0} EM = {1:.2f}'.format(name, exact_match[slot]))
        print('Model {0} F1 = {1:.2f}'.format(name, f1_scores[slot]))
def venn_diagram(questions, output_dir):
    """Print and plot how the two models' exact-match answers overlap.

    Saves ``venn_diagram.png`` into ``output_dir`` and returns five lists of
    question ids: correct for model 1, correct for model 2, correct for both,
    correct for model 1 only, and correct for model 2 only.
    """
    first_correct = [qid for qid in questions if questions[qid].em[0] == 1]
    second_correct = [qid for qid in questions if questions[qid].em[1] == 1]
    model_names = questions[list(questions.keys())[0]].model_names
    first_name = model_names[0]
    second_name = model_names[1]

    print('\nVenn diagram')

    both_correct = list(set(first_correct).intersection(set(second_correct)))
    only_first = list(set(first_correct) - set(second_correct))
    only_second = list(set(second_correct) - set(first_correct))

    print('{0} answers correctly = {1}'.format(first_name, len(first_correct)))
    print('{0} answers correctly = {1}'.format(second_name, len(second_correct)))
    print('Both answer correctly = {1}'.format(first_name, len(both_correct)))
    print('{0} correct & {1} incorrect = {2}'.format(first_name, second_name, len(only_first)))
    print('{0} correct & {1} incorrect = {2}'.format(second_name, first_name, len(only_second)))

    plt.clf()
    region_sizes = (len(only_first), len(only_second), len(both_correct))
    region_labels = ('{0} correct'.format(first_name), '{0} correct'.format(second_name), 'Both correct')
    venn2(
        subsets=region_sizes,
        set_labels=region_labels,
        set_colors=('r', 'b'),
        alpha=0.3,
        normalize_to=1
    )
    plt.savefig(os.path.join(output_dir, 'venn_diagram.png'))
    plt.close()
    return first_correct, second_correct, both_correct, only_first, only_second
def get_head_ngrams(questions, num_grams):
    """Return entry ``num_grams`` of every question's ``question_head_ngram`` list."""
    return [entry.question_head_ngram[num_grams] for entry in questions.values()]
def get_head_ngram_frequencies(questions, head_ngrams, num_grams):
    """Count how many ``questions`` start with each n-gram in ``head_ngrams``.

    Every n-gram in ``head_ngrams`` gets an entry (0 when no question has it);
    a question whose head n-gram is not in ``head_ngrams`` raises ``KeyError``.
    """
    frequencies = dict.fromkeys(head_ngrams, 0)
    for entry in questions.values():
        frequencies[entry.question_head_ngram[num_grams]] += 1
    return frequencies
def get_head_ngram_statistics(questions, correct_model1, correct_model2, correct_model1_and_model2, correct_model1_and_not_model2, correct_model2_and_not_model1, output_dir, num_grams=2, top_count=25):
    """Plot, per frequent head n-gram, the share of questions each model answered correctly.

    The ``top_count`` most frequent head n-grams are drawn as paired
    horizontal bars (one bar per model) and the figure is saved as
    ``ngram_stats_<num_grams>.png`` inside ``output_dir``.

    Args:
        questions: mapping of question id to question object.
        correct_model1: ids answered correctly by model 1.
        correct_model2: ids answered correctly by model 2.
        correct_model1_and_model2: accepted for interface compatibility; not plotted.
        correct_model1_and_not_model2: accepted for interface compatibility; not plotted.
        correct_model2_and_not_model1: accepted for interface compatibility; not plotted.
        output_dir: directory the PNG is written to.
        num_grams: index into each question's ``question_head_ngram`` list.
        top_count: maximum number of n-grams to plot.
    """
    head_ngrams = get_head_ngrams(questions, num_grams)

    # Head n-gram frequencies over all questions and over each model's
    # correctly answered subset.
    hnf_all = get_head_ngram_frequencies(questions, head_ngrams, num_grams)
    hnf_correct_model1 = get_head_ngram_frequencies({qid: questions[qid] for qid in correct_model1}, head_ngrams, num_grams)
    hnf_correct_model2 = get_head_ngram_frequencies({qid: questions[qid] for qid in correct_model2}, head_ngrams, num_grams)

    sorted_ngrams_all = sorted(hnf_all.items(), key=lambda x: x[1], reverse=True)
    top_ngrams = [ngram for ngram, _ in sorted_ngrams_all[0:top_count]]

    counts_total = [hnf_all[x] for x in top_ngrams]
    counts_model1 = [hnf_correct_model1[x] for x in top_ngrams]
    counts_model2 = [hnf_correct_model2[x] for x in top_ngrams]

    top_ngrams_with_counts = ['{0} ({1})'.format(ngram, count) for ngram, count in zip(top_ngrams, counts_total)]

    plt.clf()
    fig, ax = plt.subplots(figsize=(6, 10))

    # One bar per n-gram actually found.  There may be fewer than top_count,
    # and barh needs positions and widths of equal length.
    bar_positions = [top_count - x for x in range(len(top_ngrams))]
    shifted_positions = [y + 0.4 for y in bar_positions]

    counts_model1_percent = 100 * np.array(counts_model1) / np.array(counts_total)
    plt.barh(bar_positions, counts_model1_percent, height=0.4, alpha=0.5, color='#EE3224', label=top_ngrams)
    counts_model2_percent = 100 * np.array(counts_model2) / np.array(counts_total)
    plt.barh(shifted_positions, counts_model2_percent, height=0.4, alpha=0.5, color='#2432EE', label=top_ngrams)

    ax.set_yticks(shifted_positions)
    ax.set_yticklabels(top_ngrams_with_counts)
    ax.set_ylim([0.5, top_count+1])
    ax.set_xlim([0, 100])
    plt.subplots_adjust(left=0.28, right=0.9, top=0.9, bottom=0.1)
    plt.xlabel('Percentage of questions with correct answers')
    plt.ylabel('Top N-grams')
    plt.savefig(os.path.join(output_dir, 'ngram_stats_{0}.png'.format(num_grams)))
    plt.close()
def read_json(filename):
    """Return the parsed contents of the JSON file ``filename``."""
    with open(filename) as handle:
        return json.loads(handle.read())
def compare_models(dataset_file, predictions_m1_file, predictions_m2_file, output_dir, name_m1='Model 1', name_m2='Model 2'):
    """Score two prediction files against a dataset and write comparison plots.

    Prints the aggregate exact-match / F1 scores, then saves a Venn diagram
    and the head unigram and bigram statistics into ``output_dir``.

    Args:
        dataset_file: JSON file whose ``'data'`` entry lists articles, each
            with ``'paragraphs'`` containing ``'qas'`` entries.
        predictions_m1_file: JSON file mapping question id to model 1's answer.
        predictions_m2_file: JSON file mapping question id to model 2's answer.
        output_dir: directory the plots are written to.
        name_m1: display name for model 1.
        name_m2: display name for model 2.
    """
    dataset = read_json(dataset_file)['data']
    predictions_m1 = read_json(predictions_m1_file)
    predictions_m2 = read_json(predictions_m2_file)

    # Read in data
    total = 0
    questions = {}
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                current_question = Question(
                    id=qa['id'],
                    question_text=qa['question'],
                    ground_truth=[answer['text'] for answer in qa['answers']],
                    model_names=[name_m1, name_m2])
                # A question missing from a prediction file is scored against
                # safe_dict_access's placeholder string.
                current_question.add_answers(
                    answer_model_1=safe_dict_access(predictions_m1, qa['id']),
                    answer_model_2=safe_dict_access(predictions_m2, qa['id']))
                questions[current_question.id] = current_question
                total += 1
    print('Read in {0} questions'.format(total))

    # Aggregate scores
    aggregate_metrics(questions)

    # Venn diagram
    correct_model1, correct_model2, correct_model1_and_model2, correct_model1_and_not_model2, correct_model2_and_not_model1 = venn_diagram(questions, output_dir=output_dir)

    # Head unigram (num_grams=1) and bigram (num_grams=2) statistics
    for num_grams in (1, 2):
        get_head_ngram_statistics(questions, correct_model1, correct_model2, correct_model1_and_model2, correct_model1_and_not_model2,
                                  correct_model2_and_not_model1, output_dir, num_grams=num_grams, top_count=10)
# Command-line entry point for comparing two prediction files.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Compare two QA models')
    parser.add_argument('-dataset', action='store', dest='dataset', required=True, help='Dataset file')
    parser.add_argument('-model1', action='store', dest='predictions_m1', required=True, help='Prediction file for model 1')
    parser.add_argument('-model2', action='store', dest='predictions_m2', required=True, help='Prediction file for model 2')
    parser.add_argument('-name1', action='store', dest='name_m1', help='Name for model 1')
    parser.add_argument('-name2', action='store', dest='name_m2', help='Name for model 2')
    # Default to the current directory: the plots are always written, and a
    # missing -output used to reach os.path.join() as None and crash there.
    parser.add_argument('-output', action='store', dest='output_dir', default='.', help='Output directory for visualizations (default: current directory)')
    results = parser.parse_args()

    # Pass on only the names that were actually given, so compare_models keeps
    # its own default for the other one (a lone -name1 or -name2 used to be
    # silently dropped).
    name_overrides = {}
    if results.name_m1 is not None:
        name_overrides['name_m1'] = results.name_m1
    if results.name_m2 is not None:
        name_overrides['name_m2'] = results.name_m2

    compare_models(dataset_file=results.dataset, predictions_m1_file=results.predictions_m1, predictions_m2_file=results.predictions_m2, output_dir=results.output_dir, **name_overrides)