dawn-bench-models/tensorflow/SQuAD/squad/eda_aug_train.ipynb

315 lines
7.6 KiB
Text
Raw Permalink Normal View History

2017-08-17 12:43:17 -06:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import json\n",
"\n",
"aug_data_path = \"/Users/minjoons/data/squad/train-v1.0-aug.json\"\n",
"aug_data = json.load(open(aug_data_path, 'r'))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(['Saint', 'Bernadette', 'Soubirous'], 'Saint Bernadette Soubirous')\n",
"(['a', 'copper', 'statue', 'of', 'Christ'], 'a copper statue of Christ')\n",
"(['the', 'Main', 'Building'], 'the Main Building')\n",
"(['a', 'Marian', 'place', 'of', 'prayer', 'and', 'reflection'], 'a Marian place of prayer and reflection')\n"
]
}
],
"source": [
"def compare_answers():\n",
" for article in aug_data['data']:\n",
" for para in article['paragraphs']:\n",
" deps = para['deps']\n",
" nodess = []\n",
" for dep in deps:\n",
" nodes, edges = dep\n",
" if dep is not None:\n",
" nodess.append(nodes)\n",
" else:\n",
" nodess.append([])\n",
" wordss = [[node[0] for node in nodes] for nodes in nodess]\n",
" for qa in para['qas']:\n",
" for answer in qa['answers']:\n",
" text = answer['text']\n",
" word_start = answer['answer_word_start']\n",
" word_stop = answer['answer_word_stop']\n",
" answer_words = wordss[word_start[0]][word_start[1]:word_stop[1]]\n",
" yield answer_words, text\n",
"\n",
"ca = compare_answers()\n",
"print(next(ca))\n",
"print(next(ca))\n",
"print(next(ca))\n",
"print(next(ca))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"q: k\n",
"q: j\n",
"q: n\n",
"q: b\n",
"q: v\n",
"x: .\n",
"x: :208\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"q: dd\n",
"q: dd\n",
"q: dd\n",
"q: dd\n",
"q: d\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: :411\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: :40\n",
"x: .\n",
"x: *\n",
"x: :14\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: :131\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"x: .\n",
"53 10\n"
]
}
],
"source": [
"def nodep_counter():\n",
" x_count = 0\n",
" q_count = 0\n",
" for article in aug_data['data']:\n",
" for para in article['paragraphs']:\n",
" deps = para['deps']\n",
" nodess = []\n",
" for sent, dep in zip(para['sents'], deps):\n",
" if dep is None:\n",
" print(\"x:\", sent)\n",
" x_count += 1\n",
" for qa in para['qas']:\n",
" if qa['dep'] is None:\n",
" print(\"q:\", qa['question'])\n",
" q_count += 1\n",
" print(x_count, q_count)\n",
"nodep_counter()\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n"
]
}
],
"source": [
"def bad_node_counter():\n",
" count = 0\n",
" for article in aug_data['data']:\n",
" for para in article['paragraphs']:\n",
" sents = para['sents']\n",
" deps = para['deps']\n",
" nodess = []\n",
" for dep in deps:\n",
" if dep is not None:\n",
" nodes, edges = dep\n",
" for node in nodes:\n",
" if len(node) != 5:\n",
" count += 1\n",
" print(count)\n",
"bad_node_counter() "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"36\n"
]
}
],
"source": [
"def noanswer_counter():\n",
" count = 0\n",
" for article in aug_data['data']:\n",
" for para in article['paragraphs']:\n",
" deps = para['deps']\n",
" nodess = []\n",
" for dep in deps:\n",
" if dep is not None:\n",
" nodes, edges = dep\n",
" nodess.append(nodes)\n",
" else:\n",
" nodess.append([])\n",
" wordss = [[node[0] for node in nodes] for nodes in nodess]\n",
" for qa in para['qas']:\n",
" for answer in qa['answers']:\n",
" text = answer['text']\n",
" word_start = answer['answer_word_start']\n",
" word_stop = answer['answer_word_stop']\n",
" if word_start is None:\n",
" count += 1\n",
" print(count)\n",
"noanswer_counter()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"106\n"
]
}
],
"source": [
"def mult_sent_answer_counter():\n",
" count = 0\n",
" for article in aug_data['data']:\n",
" for para in article['paragraphs']:\n",
" for qa in para['qas']:\n",
" for answer in qa['answers']:\n",
" text = answer['text']\n",
" word_start = answer['answer_word_start']\n",
" word_stop = answer['answer_word_stop']\n",
" if word_start is not None and word_start[0] != word_stop[0]:\n",
" count += 1\n",
" print(count)\n",
"mult_sent_answer_counter()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}