{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sanity checks for the augmented SQuAD training file\n",
    "\n",
    "Loads the augmented SQuAD JSON and checks how its per-sentence dependency annotations line up with the answers:\n",
    "\n",
    "1. answer text vs. the words selected by the answer's word span;\n",
    "2. sentences and questions that have no dependency parse;\n",
    "3. parse nodes that do not have the expected number of fields;\n",
    "4. answers without a word span;\n",
    "5. answers whose word span crosses a sentence boundary.\n",
    "\n",
    "Every check accepts an optional `data` argument and falls back to the loaded `aug_data`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import itertools\n",
    "import json\n",
    "import os\n",
    "\n",
    "# Default location of the augmented SQuAD file; set SQUAD_AUG_PATH to point elsewhere.\n",
    "aug_data_path = os.environ.get(\n",
    "    \"SQUAD_AUG_PATH\", \"/Users/minjoons/data/squad/train-v1.0-aug.json\")\n",
    "\n",
    "# The context manager closes the handle promptly; JSON text is read as UTF-8\n",
    "# instead of whatever the platform locale happens to be.\n",
    "with open(aug_data_path, 'r', encoding='utf-8') as aug_file:\n",
    "    aug_data = json.load(aug_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def iter_paragraphs(data=None):\n",
    "    \"\"\"Yield each paragraph dict of each article in ``data``.\n",
    "\n",
    "    ``data`` defaults to the module-level ``aug_data``.\n",
    "    \"\"\"\n",
    "    if data is None:\n",
    "        data = aug_data\n",
    "    for article in data['data']:\n",
    "        yield from article['paragraphs']\n",
    "\n",
    "\n",
    "def iter_answers(data=None):\n",
    "    \"\"\"Yield each answer dict of each question of each paragraph in ``data``.\"\"\"\n",
    "    for para in iter_paragraphs(data):\n",
    "        for qa in para['qas']:\n",
    "            yield from qa['answers']\n",
    "\n",
    "\n",
    "def sentence_words(para):\n",
    "    \"\"\"Return one word list per entry of ``para['deps']``.\n",
    "\n",
    "    Each non-None entry is a ``(nodes, edges)`` pair and the first field of a\n",
    "    node is taken as its word. A None entry (no parse) becomes an empty list\n",
    "    so that sentence indices stay aligned with ``para['deps']``.\n",
    "    \"\"\"\n",
    "    wordss = []\n",
    "    for dep in para['deps']:\n",
    "        if dep is None:\n",
    "            wordss.append([])\n",
    "        else:\n",
    "            nodes, _edges = dep\n",
    "            wordss.append([node[0] for node in nodes])\n",
    "    return wordss\n",
    "\n",
    "\n",
    "def span_words(wordss, start, stop):\n",
    "    \"\"\"Return the words of ``wordss`` lying between ``start`` and ``stop``.\n",
    "\n",
    "    Both positions are indexed as ``[sentence index, word index]``. ``stop`` is\n",
    "    used as an exclusive bound, matching the single-sentence slice this\n",
    "    notebook has always used.\n",
    "    NOTE(review): the multi-sentence branch assumes the stop word index is\n",
    "    relative to the stop sentence -- confirm against the code that wrote the file.\n",
    "    \"\"\"\n",
    "    start_sent, start_word = start[0], start[1]\n",
    "    stop_sent, stop_word = stop[0], stop[1]\n",
    "    if start_sent == stop_sent:\n",
    "        return wordss[start_sent][start_word:stop_word]\n",
    "    # Tail of the first sentence, every sentence in between, head of the last.\n",
    "    words = list(wordss[start_sent][start_word:])\n",
    "    for sent_idx in range(start_sent + 1, stop_sent):\n",
    "        words.extend(wordss[sent_idx])\n",
    "    words.extend(wordss[stop_sent][:stop_word])\n",
    "    return words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(['Saint', 'Bernadette', 'Soubirous'], 'Saint Bernadette Soubirous')\n",
      "(['a', 'copper', 'statue', 'of', 'Christ'], 'a copper statue of Christ')\n",
      "(['the', 'Main', 'Building'], 'the Main Building')\n",
      "(['a', 'Marian', 'place', 'of', 'prayer', 'and', 'reflection'], 'a Marian place of prayer and reflection')\n"
     ]
    }
   ],
   "source": [
    "def compare_answers(data=None):\n",
    "    \"\"\"Yield ``(answer_words, text)`` for every answer that has a word span.\n",
    "\n",
    "    ``answer_words`` are the parsed words covered by the answer's\n",
    "    ``answer_word_start`` / ``answer_word_stop`` positions and ``text`` is the\n",
    "    original answer string, so the two can be compared by eye.\n",
    "\n",
    "    Sentences without a parse contribute no words (previously they raised a\n",
    "    TypeError while unpacking None), answers without a span are skipped\n",
    "    (previously a TypeError on ``None[0]``), and spans that cross a sentence\n",
    "    boundary collect words from every sentence they touch.\n",
    "    \"\"\"\n",
    "    for para in iter_paragraphs(data):\n",
    "        wordss = sentence_words(para)\n",
    "        for qa in para['qas']:\n",
    "            for answer in qa['answers']:\n",
    "                word_start = answer['answer_word_start']\n",
    "                word_stop = answer['answer_word_stop']\n",
    "                if word_start is None or word_stop is None:\n",
    "                    continue  # nothing to compare for this answer\n",
    "                yield span_words(wordss, word_start, word_stop), answer['text']\n",
    "\n",
    "\n",
    "# islice stops quietly if the data holds fewer than four answers.\n",
    "for pair in itertools.islice(compare_answers(), 4):\n",
    "    print(pair)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "q: k\n",
      "q: j\n",
      "q: n\n",
      "q: b\n",
      "q: v\n",
      "x: .\n",
      "x: :208\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "q: dd\n",
      "q: dd\n",
      "q: dd\n",
      "q: dd\n",
      "q: d\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: :411\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: :40\n",
      "x: .\n",
      "x: *\n",
      "x: :14\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: :131\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "x: .\n",
      "53 10\n"
     ]
    }
   ],
   "source": [
    "def nodep_counter(data=None):\n",
    "    \"\"\"Print every sentence and question whose dependency parse is None.\n",
    "\n",
    "    Sentences are printed with an ``x:`` prefix and questions with ``q:``.\n",
    "    The last line printed is the sentence count followed by the question count.\n",
    "    \"\"\"\n",
    "    x_count = 0\n",
    "    q_count = 0\n",
    "    for para in iter_paragraphs(data):\n",
    "        for sent, dep in zip(para['sents'], para['deps']):\n",
    "            if dep is None:\n",
    "                print(\"x:\", sent)\n",
    "                x_count += 1\n",
    "        for qa in para['qas']:\n",
    "            if qa['dep'] is None:\n",
    "                print(\"q:\", qa['question'])\n",
    "                q_count += 1\n",
    "    print(x_count, q_count)\n",
    "\n",
    "\n",
    "nodep_counter()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    }
   ],
   "source": [
    "def bad_node_counter(data=None, node_len=5):\n",
    "    \"\"\"Print how many parse nodes do not have exactly ``node_len`` fields.\n",
    "\n",
    "    Sentences whose parse is None are skipped.\n",
    "    \"\"\"\n",
    "    count = 0\n",
    "    for para in iter_paragraphs(data):\n",
    "        for dep in para['deps']:\n",
    "            if dep is None:\n",
    "                continue\n",
    "            nodes, _edges = dep\n",
    "            count += sum(1 for node in nodes if len(node) != node_len)\n",
    "    print(count)\n",
    "\n",
    "\n",
    "bad_node_counter()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "36\n"
     ]
    }
   ],
   "source": [
    "def noanswer_counter(data=None):\n",
    "    \"\"\"Print how many answers have ``answer_word_start`` set to None.\"\"\"\n",
    "    count = sum(1 for answer in iter_answers(data)\n",
    "                if answer['answer_word_start'] is None)\n",
    "    print(count)\n",
    "\n",
    "\n",
    "noanswer_counter()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "106\n"
     ]
    }
   ],
   "source": [
    "def mult_sent_answer_counter(data=None):\n",
    "    \"\"\"Print how many answers start and stop in different sentences.\n",
    "\n",
    "    Answers whose start or stop position is None are not counted.\n",
    "    \"\"\"\n",
    "    count = 0\n",
    "    for answer in iter_answers(data):\n",
    "        word_start = answer['answer_word_start']\n",
    "        word_stop = answer['answer_word_stop']\n",
    "        if word_start is None or word_stop is None:\n",
    "            continue\n",
    "        # Index 0 of a position is the sentence index.\n",
    "        if word_start[0] != word_stop[0]:\n",
    "            count += 1\n",
    "    print(count)\n",
    "\n",
    "\n",
    "mult_sent_answer_counter()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}