315 lines
7.6 KiB
Text
315 lines
7.6 KiB
Text
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 1,
|
||
|
"metadata": {
|
||
|
"collapsed": true
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import json\n",
|
||
|
"\n",
|
||
"aug_data_path = \"/Users/minjoons/data/squad/train-v1.0-aug.json\"\n",
"\n",
"# Parse the JSON file at aug_data_path. The with-block closes the file handle\n",
"# as soon as parsing is done instead of leaving it open until garbage collection.\n",
"# NOTE(review): the path above is machine-specific; adjust it for your checkout.\n",
"with open(aug_data_path, 'r') as aug_file:\n",
"    aug_data = json.load(aug_file)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 2,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"(['Saint', 'Bernadette', 'Soubirous'], 'Saint Bernadette Soubirous')\n",
|
||
|
"(['a', 'copper', 'statue', 'of', 'Christ'], 'a copper statue of Christ')\n",
|
||
|
"(['the', 'Main', 'Building'], 'the Main Building')\n",
|
||
|
"(['a', 'Marian', 'place', 'of', 'prayer', 'and', 'reflection'], 'a Marian place of prayer and reflection')\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
"def compare_answers():\n",
"    \"\"\"Yield (answer_words, text) for every answer that has a word span.\n",
"\n",
"    answer_words is the list of node[0] values selected by the answer's\n",
"    'answer_word_start' / 'answer_word_stop' positions, and text is the raw\n",
"    answer['text'], so the two can be compared side by side.\n",
"\n",
"    Entries of para['deps'] that are None contribute an empty word list, and\n",
"    answers whose start or stop position is None are skipped.\n",
"    \"\"\"\n",
"    for article in aug_data['data']:\n",
"        for para in article['paragraphs']:\n",
"            wordss = []\n",
"            for dep in para['deps']:\n",
"                # Test for None before unpacking: unpacking None raises TypeError.\n",
"                if dep is None:\n",
"                    wordss.append([])\n",
"                else:\n",
"                    nodes, _edges = dep\n",
"                    wordss.append([node[0] for node in nodes])\n",
"            for qa in para['qas']:\n",
"                for answer in qa['answers']:\n",
"                    word_start = answer['answer_word_start']\n",
"                    word_stop = answer['answer_word_stop']\n",
"                    if word_start is None or word_stop is None:\n",
"                        # No word span recorded for this answer, so nothing to slice.\n",
"                        continue\n",
"                    # NOTE(review): positions are read as (sentence index, word index)\n",
"                    # with an exclusive stop word index -- confirm for multi-sentence spans.\n",
"                    start_sent, start_word = word_start[0], word_start[1]\n",
"                    stop_sent, stop_word = word_stop[0], word_stop[1]\n",
"                    if start_sent == stop_sent:\n",
"                        answer_words = wordss[start_sent][start_word:stop_word]\n",
"                    else:\n",
"                        # Span crosses sentences: tail of the first sentence, every\n",
"                        # sentence in between, then the head of the last sentence.\n",
"                        answer_words = list(wordss[start_sent][start_word:])\n",
"                        for mid_words in wordss[start_sent + 1:stop_sent]:\n",
"                            answer_words.extend(mid_words)\n",
"                        answer_words.extend(wordss[stop_sent][:stop_word])\n",
"                    yield answer_words, answer['text']\n",
"\n",
"ca = compare_answers()\n",
"# Show the first four (words, text) pairs as a quick sanity check.\n",
"for _ in range(4):\n",
"    print(next(ca))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 11,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"q: k\n",
|
||
|
"q: j\n",
|
||
|
"q: n\n",
|
||
|
"q: b\n",
|
||
|
"q: v\n",
|
||
|
"x: .\n",
|
||
|
"x: :208\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"q: dd\n",
|
||
|
"q: dd\n",
|
||
|
"q: dd\n",
|
||
|
"q: dd\n",
|
||
|
"q: d\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: :411\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: :40\n",
|
||
|
"x: .\n",
|
||
|
"x: *\n",
|
||
|
"x: :14\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: :131\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"x: .\n",
|
||
|
"53 10\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
"def nodep_counter():\n",
"    \"\"\"Print every sentence and question whose dep entry is None, then the totals.\n",
"\n",
"    A sentence is printed as 'x: <sent>' when the entry of para['deps'] paired\n",
"    with it is None; a question is printed as 'q: <question>' when qa['dep'] is\n",
"    None. The final line printed holds the two counts, sentences first.\n",
"    \"\"\"\n",
"    x_count = 0\n",
"    q_count = 0\n",
"    for article in aug_data['data']:\n",
"        for para in article['paragraphs']:\n",
"            # zip pairs sentences and deps by position and stops at the shorter list.\n",
"            for sent, dep in zip(para['sents'], para['deps']):\n",
"                if dep is None:\n",
"                    print(\"x:\", sent)\n",
"                    x_count += 1\n",
"            for qa in para['qas']:\n",
"                if qa['dep'] is None:\n",
"                    print(\"q:\", qa['question'])\n",
"                    q_count += 1\n",
"    print(x_count, q_count)\n",
"\n",
"nodep_counter()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 4,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"0\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
"def bad_node_counter(expected_len=5):\n",
"    \"\"\"Print how many nodes do not have exactly expected_len elements.\n",
"\n",
"    Every non-None entry of para['deps'] in aug_data is unpacked as a\n",
"    (nodes, edges) pair, and each node whose len() differs from expected_len\n",
"    is counted.\n",
"\n",
"    Args:\n",
"        expected_len: required number of elements per node. Defaults to 5,\n",
"            the value this check has always used.\n",
"    \"\"\"\n",
"    count = 0\n",
"    for article in aug_data['data']:\n",
"        for para in article['paragraphs']:\n",
"            for dep in para['deps']:\n",
"                if dep is None:\n",
"                    # Nothing to inspect for this entry.\n",
"                    continue\n",
"                nodes, _edges = dep\n",
"                count += sum(1 for node in nodes if len(node) != expected_len)\n",
"    print(count)\n",
"\n",
"bad_node_counter()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 5,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"36\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
"def noanswer_counter():\n",
"    \"\"\"Print the number of answers whose 'answer_word_start' is None.\"\"\"\n",
"    count = 0\n",
"    for article in aug_data['data']:\n",
"        for para in article['paragraphs']:\n",
"            # Only the answer records matter here, so the per-sentence word\n",
"            # lists are not built.\n",
"            for qa in para['qas']:\n",
"                for answer in qa['answers']:\n",
"                    if answer['answer_word_start'] is None:\n",
"                        count += 1\n",
"    print(count)\n",
"\n",
"noanswer_counter()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 14,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"106\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
"def mult_sent_answer_counter():\n",
"    \"\"\"Print the number of answers whose start and stop positions differ at index 0.\n",
"\n",
"    Answers with 'answer_word_start' set to None are ignored.\n",
"    NOTE(review): index 0 looks like the sentence index, judging by the function\n",
"    name and by how compare_answers indexes its word lists -- confirm.\n",
"    \"\"\"\n",
"    count = 0\n",
"    for article in aug_data['data']:\n",
"        for para in article['paragraphs']:\n",
"            for qa in para['qas']:\n",
"                for answer in qa['answers']:\n",
"                    word_start = answer['answer_word_start']\n",
"                    if word_start is None:\n",
"                        # No span recorded, so it cannot cross a boundary.\n",
"                        continue\n",
"                    word_stop = answer['answer_word_stop']\n",
"                    if word_start[0] != word_stop[0]:\n",
"                        count += 1\n",
"    print(count)\n",
"\n",
"mult_sent_answer_counter()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"collapsed": true
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": []
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"collapsed": true
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": []
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"collapsed": true
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": []
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.5.1"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 0
|
||
|
}
|