360 lines
62 KiB
Text
360 lines
62 KiB
Text
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 22,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stderr",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"100%|██████████| 3198/3198 [00:12<00:00, 264.62it/s]\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"import os\n",
|
||
|
"import nltk\n",
|
||
|
"import re\n",
|
||
|
"\n",
|
||
|
"from collections import Counter\n",
|
||
|
"\n",
|
||
|
"from tqdm import tqdm\n",
|
||
|
"\n",
|
||
|
"root_dir = \"/Users/minjoons/data/cnn/questions\"\n",
|
||
|
"data_dir = os.path.join(root_dir, \"test\")\n",
|
||
|
"\n",
|
||
|
"char_counter = Counter()\n",
|
||
|
"word_counter = Counter()\n",
|
||
|
"ent_counter = Counter()\n",
|
||
|
"max_num_words = 0\n",
|
||
|
"max_num_ques_words = 0\n",
|
||
|
"max_num_sents = 0\n",
|
||
|
"max_num_words_per_sent = 0\n",
|
||
|
"max_num_chars = 0\n",
|
||
|
"\n",
|
||
|
"nums_words = []\n",
|
||
|
"nums_ques_words = []\n",
|
||
|
"nums_sents = []\n",
|
||
|
"nums_words_per_sent = []\n",
|
||
|
"nums_chars = []\n",
|
||
|
"nums_entities = []\n",
|
||
|
"\n",
|
||
|
"sent_tokenize = lambda x: re.split(\"[.!?]\", x)\n",
|
||
|
"sent_tokenize = nltk.sent_tokenize\n",
|
||
|
"\n",
|
||
|
"num_ques = len(list(os.listdir(data_dir)))\n",
|
||
|
"\n",
|
||
|
"cand_set= set()\n",
|
||
|
"\n",
|
||
|
"for path in tqdm(os.listdir(data_dir), total=num_ques):\n",
|
||
|
" if path.endswith(\".question\"):\n",
|
||
|
" with open(os.path.join(data_dir, path), 'r') as fh:\n",
|
||
|
" url = fh.readline().strip()\n",
|
||
|
" _ = fh.readline()\n",
|
||
|
" para = fh.readline().strip()\n",
|
||
|
" _ = fh.readline()\n",
|
||
|
" ques = fh.readline().strip()\n",
|
||
|
" _ = fh.readline()\n",
|
||
|
" answer = fh.readline().strip()\n",
|
||
|
" _ = fh.readline()\n",
|
||
|
" cands = list(line.strip() for line in fh)\n",
|
||
|
" cand_ents, cand_names = zip(*[cand.split(\":\") for cand in cands])\n",
|
||
|
" cand_set = cand_set | set(cand_names)\n",
|
||
|
" words = para.split(\" \")\n",
|
||
|
" sents = sent_tokenize(para)\n",
|
||
|
" wordss = list(sent.split(\" \") for sent in sents)\n",
|
||
|
" ques_words = ques.split(\" \")\n",
|
||
|
" \n",
|
||
|
" ents = [word for word in words if word.startswith(\"@\")]\n",
|
||
|
" num_ents = len(ents)\n",
|
||
|
" \n",
|
||
|
" nums_entities.append(num_ents)\n",
|
||
|
" nums_words.append(len(words))\n",
|
||
|
" nums_ques_words.append(len(ques_words))\n",
|
||
|
" nums_sents.append(len(sents))\n",
|
||
|
" nums_words_per_sent.extend(map(len, wordss))\n",
|
||
|
" nums_chars.extend(map(len, words))\n",
|
||
|
" \n",
|
||
|
" for word in ques_words:\n",
|
||
|
" if word.startswith(\"@\"):\n",
|
||
|
" ent_counter[word] += 1\n",
|
||
|
" else:\n",
|
||
|
" word_counter[word] += 1\n",
|
||
|
" for c in word:\n",
|
||
|
" char_counter[c] += 1\n",
|
||
|
" \n",
|
||
|
" for word in words:\n",
|
||
|
" if word.startswith(\"@\"):\n",
|
||
|
" ent_counter[word] += 1\n",
|
||
|
" else:\n",
|
||
|
" word_counter[word] += 1\n",
|
||
|
" for c in word:\n",
|
||
|
" char_counter[c] += 1"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 23,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"(22747, 465, 77, 12465)"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 23,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"len(word_counter), len(ent_counter), len(char_counter), len(cand_set)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 7,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"(1989, 37, 122, 443, 24)"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 7,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"max_num_words, max_num_ques_words, max_num_sents, max_num_words_per_sent, max_num_chars"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 4,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"['hello', ' Wow', ' Hmm', '']"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 4,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"import re\n",
|
||
|
"re.split(\"[.!?]\", \"hello. Wow! Hmm?\")\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 22,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAEACAYAAABfxaZOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAG9NJREFUeJzt3Xt4VNW5x/HvC6IckXoBi8hVEYt4KRZFVJC0iEZqC2Kr\nWErFpxaeg1zaqhVO29PYxxs9RytqvWAjCmqRgop6sCDoWFA01IIgEC4ql0RAwEq5ySWs88caYQyB\nTMjMrJnZv8/z7CczOzuTN5vJm8W71n63OecQEZFoqBM6ABERyRwlfRGRCFHSFxGJECV9EZEIUdIX\nEYkQJX0RkQipNumbWbGZrTezBYc45gEzW25m882sQ2pDFBGRVElmpD8WuPxgnzSzK4A2zrm2wCDg\n0RTFJiIiKVZt0nfOzQb+dYhDegHj4se+CxxrZk1SE56IiKRSKmr6zYA1Cc/L4/tERCTLaCJXRCRC\njkjBa5QDLRKeN4/vO4CZqdGPiMhhcM5ZKl4n2aRv8a0qLwE3Ac+ZWWfgc+fc+oO9kBq8pU5RURFF\nRUWhw8gbOp+pk6vn0jnYtg327IGKiv0ft2+H1ath5UpYtQo2bYLNm2HrVti1y287d/qPX3zhP79x\nI9SpAyeeuH9r0gTOPBPOOcdvTZqAJZHKLZmDklRt0jezZ4ECoJGZrQZ+BxwJOOfcGOfcVDPraWYr\ngG3ADSmLTkQkxZzziXnLFp+0t23z2/r1cPvtsGgR1K8PRxwBdev6rX59aNUKWreGli3hG9+AY4+F\nY46Bo46CI4/86sdGjaBxYzj66NA/7YGqTfrOuR8lccyQ1IQjIpIamzfDm2/CwoX+4+LFPslv3Qr1\n6vmE/eV29NHQoAEMHQrXX+9H6PkqFTV9CaSgoCB0CHlF5zN10n0uN2+Gn//cl1B27vTbF1/sf7xz\nJ2zYAJ07Q8eO0K8fdOu2f3Rer15aw8tqlskau5k51fRFpCacg/JyGDcOSkp8+WXFCvjOd2D4cF9O\nSdzq1/cfmzXzj/OBmaVsIldJX0SyUnk5jBgBr7ziJ1SvvBKuuspPgJ56qq+fR0Uqk77KOyISzPbt\nfhJ11SpYutSP4P/5T1+2WbgQBgzwtfiTTkpulYtUTyN9EcmYNWtg0iQoLYXly2HOHF9jb9rUL2Vs\n3RrOPdc/b9fOL3MUlXdEJIs5B599BmVl+7c1a/zHGTN8Lb5zZ5/gL7nEJ305NCV9EckqL78MDz4I\nH3/sk3v9+tC8ObRo4T9+uZ1xBlx4Yehoc49q+iIShHOwbJmvwX85gv/4Y3jxRSgu9iWa5s01es9m\nGumLSFLeegvuv9/X4du12z96b9HCj97POSd0hPlLI30RSYvXXoOZM+HTT+Hf/96/bdjg6/SDB8NT\nT2VnewFJjkb6IhG2cye88QZ88IFfTfPCC/6Cp5NO8levNmwIX/ua31q29M8l8zSRKyKHrbgYpk/3\nbQsWLfLLIi+8ENq0gYICX5eX7KLyjogkzTk/6TpzJrz3nh/N/+EPvkTTtCl06qQLn6JESV8kjzjn\nV9N89plfYfPMM75O37AhnHceXHyxf37GGaEjlVBU3hHJA5s3w9Sp8Nhjvp3BySf7G3T06eP71TRq\nFDpCqQ3V9EVknzvugP/9X3916xVXwMCB/sYfkj9U0xeJuK1b4f33YdYsv3Z+0SLfSlikOnl8fxiR\n/LJ7t18v/9vf+nLNL37h79kaiynhS/I00hfJYp9/Dn/9K4wfv78jZdeusGSJ7ykvUlOq6YtkmbIy\nv+pmzBhYt87X6fv39x+jdOMQ2U81fZE8NWMG9O0LPXv6vvNnnw1H6LdUUkhvJ5HAnPNdKkeN8l0r\nJ070PedF0kETuSKBTZgAt97qSzirVyvhS3pppC8SyI4dvnY/cqS/CUnnzqEjkihQ0hcJoKwMrrwS\njjsOpkxRwpfMUXlHJEOc86txnnvO98ApKPBtjS+6KHRkEiUa6Yuk2a5dMGSIX2vfsCG0bw9PPAHd\nu4eOTKJISV8kjUpKfC+cU06BtWt9OUckJJV3RNLkxRf9evvhw+H555XwJTvoilyRNFi92t+c5Mkn\nobAwdDSS69RaWSSL7d4NP/mJL+ncdVfoaCQfpDLpq7wjkkKzZ/tGaMuW+bKOSLZR0hdJgbVrfV/7\nPn3gvvv8vWibNAkdlciBtHpHpJbWrIHzz/fr7qdNg3PPDR2RyMGppi9SC875lsddusBvfhM6GslX\nqumLZIniYvjkE7jtttCRiCQnqaRvZoVmVmpmy8zsgLe3mTUys1fNbL6ZLTSzASmPVCTLvPMO/PrX\n8PjjUK9e6GhEklNtecfM6gDLgO7AJ8BcoK9zrjThmN8B9Z1zI82sMbAUaOKc21PptVTekZy3YYOf\nsP3wQ3j4YejdO3REku8yXd7pBCx3zq1yzu0GJgC9Kh2zDmgYf9wQ2FQ54YvkOuegtBR++ENo29bf\nlFwJX3JNMkm/GbAm4XlZfF+ix4EzzewT4H1AK5Qlb+zaBYMHw8knQ48ecMklvqSj+9VKLkrVks2R\nwPvOuW+bWRvgNTM7xzm3tfKBRUVF+x4XFBRQUFCQohBEUs85+OUv/ah+zhxo1QosJf/JFjm4WCxG\nLBZLy2snU9PvDBQ55wrjz0cAzjk3KuGYqcCdzrm34s9nArc55/5R6bVU05ecMmqUb4n8+uvw9a+H\njkaiKpU1/WRG+nOB08ysFbAW6AtcV+mYJcClwFtm1gQ4HfgoFQGKhLBqle95/9hjfpWOEr7ki2pr\n+s65CmAIMB1YBExwzi0xs0FmNjB+2N3AeWb2PvAa8Cvn3GfpCloknd56y19hu2kTxGLQunXoiERS\nR1fkiiRYvNhfXfvYY36Vjkg2UGtlkTTYsgWuvtqvzlFLBckmSvoiadCjBzRuDGPHQv36oaMR2S/T\nE7kiee/++2HBAn/Hq6OOCh2NSPoo6Uvkvf463HEHvP22Er7kP3XZlMjats33zrn2WvjLX+D000NH\nJJJ+SvoSOc7BlClw1lnwf/8H06f7er5IFKi8I5FSXg79+8PGjXDPPXDNNWqrINGikb5EgnMwZIgv\n4XTqBPPm+bKOEr5EjUb6EgkPPAAlJbBiBTRtGjoakXC0Tl/y3rp1cOaZMHcunHpq6GhEak73yBVJ\nUmkpXHYZ/PjHSvgioKQveezRR6FjR7jxRn/xlYiopi956r774MEH/VW2bdqEjkYke6imL3ln2jQY\nMMBP3LZoEToakdpTTV+kCs75G59ceSU89JASvkhVVN6RvLByJfzsZ/Dpp76Hzvnnh45IJDsp6UvO\nW7UKvv99uOACeOUVNU0TORSVdySnrVwJHTpAnz7+bldK+CKHpolcyWl33AEffeRr+SL5SjdREQFm\nzfLr7+fMCR2JSO5QeUdy0u23+/vZPvQQtG0bOhqR3KHyjuSct96Cyy/3vXTOOCN0NCLpp3X6EknO\nwW9+A717w+TJSvgih0M1fckJFRXwpz/B+PF+pK9bG4ocHiV9yXrbt8PFF/sbnjz7rBK+SG0o6UtW\n27QJvvtdOOccePJJ3elKpLY0kStZq7zct0XeuxemToW6dUNHJBKGJnIlb+3Y4ev2ffr4u121bAnP\nPKOEL5IqGulLVrn+en+3q4EDfT+dE08MHZFIeLoiV/LOtm3w3/8NM2b4G580ahQ6IpH8pKQvWeGW\nW2DRIn8DFCV8kfRR0pfgZs+GiRPhgw+gadPQ0YjkN03kSlCbNsGgQTB6tBK+SCZoIleC2bMHunb1\nF1uNHQt1NAQRqZKWbEpe+OMfoUEDJXyRTErqV83MCs2s1MyWmdltBzmmwMzmmdkHZvZGasOUfDN9\nOvzhD/DII0r4IplUbXnHzOoAy4DuwCfAXKCvc6404ZhjgbeBy5xz5WbW2Dm3sYrXUnlHWL0aevTw\nSb9Xr9DRiGS/TJd3OgHLnXOrnHO7gQlA5V/VHwGTnXPlAFUlfBGAefP8PW379fMXX4lIZiWT9JsB\naxKel8X3JTodOMHM3jC
|
||
|
"text/plain": [
|
||
|
"<matplotlib.figure.Figure at 0x10c247128>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"%matplotlib inline\n",
|
||
|
"import matplotlib.pyplot as plt\n",
|
||
|
"import numpy as np\n",
|
||
|
"counter = Counter(nums_words)\n",
|
||
|
"values = list(counter.values())\n",
|
||
|
"plt.plot(list(counter.keys()), np.cumsum(values)/sum(values))\n",
|
||
|
"plt.show()\n",
|
||
|
"# plt.hist(nums_words)\n",
|
||
|
"# plt.show()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 8,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAAEACAYAAACznAEdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHDZJREFUeJzt3X2QVPWd7/H3BxCRBxGDDMqAoIjixtzIKl7Xq/ZGAz5U\nqZWtNSS5q66uWxtNtO5ucgP+I1ZlNVYlu2Zz1arcZBVz3bAkt6KkpBC5pK1KsioaVAIIKIIMyvgA\novi0IN/7xzkj7dA93TPTffr0zOdVNTVnfnMevnMY5tO/3++c04oIzMzMhjS7ADMzywcHgpmZAQ4E\nMzNLORDMzAxwIJiZWcqBYGZmQI2BIGmspF9I2iBpnaSzJI2TtELSRkmPShpbsv4CSZvT9eeUtM+S\n9LykTZLuasQPZGZmfVNrD+GHwLKImAn8F+AFYD6wMiJOBlYBCwAknQpcCcwELgbukaR0P/cC10XE\nDGCGpLl1+0nMzKxfqgaCpCOBcyPiPoCI2B8Re4DLgUXpaouAK9Lly4DF6Xpbgc3AbEkTgTERsTpd\n74GSbczMrMlq6SFMA96UdJ+kP0j6saSRQFtEdAJExE5gQrr+JGB7yfY70rZJQEdJe0faZmZmOVBL\nIAwDZgF3R8Qs4D2S4aLuz7zwMzDMzFrYsBrW6QC2R8TT6df/lyQQOiW1RURnOhz0evr9HcDkku3b\n07ZK7YeQ5HAxM+uDiFD1tcqr2kNIh4W2S5qRNl0ArAOWAtekbVcDD6fLS4F5koZLmgZMB55Kh5X2\nSJqdTjJfVbJNuePm6uPWW29teg2uaWDV5ZpcU70/+quWHgLATcCDkg4DtgB/DQwFlki6FthGcmUR\nEbFe0hJgPbAPuCEOVnojcD8wguSqpeX9/gnMzKwuagqEiHgOOLPMty6ssP4dwB1l2p8BTutNgWZm\nlg3fqVyjQqHQ7BIO4Zpql8e6XFNtXFN2VI9xp3qTFHmsy8wszyQRjZxUNjOzwcGBYGZmgAPBzMxS\nDgQzMwMcCGZmlnIgmJkZ4EAwM7OUA8HMzAAHgpmZpRwIZmYGOBDMzCzlQDAzM8CBYGZmKQeCmZkB\nDgQzM0s5EMzMDHAgmJlZyoFgZmaAA8HMzFIOBDMzAxwIZmaWciCYmRngQDAzs5QDwczMAAeCmZml\nHAhmZgbUGAiStkp6TtIaSU+lbeMkrZC0UdKjksaWrL9A0mZJGyTNKWmfJel5SZsk3VX/H8fMzPqq\n1h7CAaAQEadHxOy0bT6wMiJOBlYBCwAknQpcCcwELgbukaR0m3uB6yJiBjBD0tw6/RxmZtZPtQaC\nyqx7ObAoXV4EXJEuXwYsjoj9EbEV2AzMljQRGBMRq9P1HijZxszMmqzWQAjgMUmrJf1N2tYWEZ0A\nEbETmJC2TwK2l2y7I22bBHSUtHekbWZmlgPDalzvnIh4TdIxwApJG0lColT3r83MrIXUFAgR8Vr6\n+Q1JDwGzgU5JbRHRmQ4HvZ6uvgOYXLJ5e9pWqb2shQsXfrJcKBQoFAq1lGpmNmgUi0WKxWLd9qeI\nnl/YSxoJDImIvZJGASuA24ALgF0Rcaek7wDjImJ+Oqn8IHAWyZDQY8BJERGSngBuAlYDjwD/EhHL\nyxwzqtVlZmafJomIUPU1y6ulh9AG/EpSpOs/GBErJD0NLJF0LbCN5MoiImK9pCXAemAfcEPJX/cb\ngfuBEcCycmFgZmbNUbWH0AzuIZiZ9V5/ewi+U9nMzIAcB4I7CGZm2XIgmJkZkONAOHCg2RWYmQ0u\nuQ0E9xDMzLLlQDAzM8CBYGZmKQeCmZkBOQ4ETyqbmWUrt4HgHoKZWbYcCGZmBjgQzMwsldtA8ByC\nmVm2chsI7iGYmWXLgWBmZoADwczMUg4EMzMDchwInlQ2M8tWbgPBPQQzs2w5EMzMDHAgmJlZyoFg\nZmZAjgPBk8pmZtnKbSC4h2Bmli0HgpmZAQ4EMzNL5TYQPIdgZpat3AaCewhmZtlyIJiZGdCLQJA0\nRNIfJC1Nvx4naYWkjZIelTS2ZN0FkjZL2iBpTkn7LEnPS9ok6a6ejudAMDPLVm96CDcD60u+ng+s\njIiTgVXAAgBJpwJXAjOBi4F7JCnd5l7guoiYAcyQNLfSwRwIZmbZqikQJLUDlwA/KWm+HFiULi8C\nrkiXLwMWR8T+iNgKbAZmS5oIjImI1el6D5RscwhPKpuZZavWHsI/A98GSl+3t0VEJ0BE7AQmpO2T\ngO0l6+1I2yYBHSXtHWlbWe4hmJlla1i1FSRdCnRGxLOSCj2sWtc/4T/60ULGj0+WC4UChUJPhzYz\nG3yKxSLFYrFu+1NUeSku6XbgvwP7gSOAMcCvgDOAQkR0psNBv4mImZLmAxERd6bbLwduBbZ1rZO2\nzwPOj4ivlzlmrF8fzJxZ2w8RkXwMye01U2ZmjSeJiFD1Ncur+ic0Im6JiCkRcQIwD1gVEX8F/Bq4\nJl3tauDhdHkpME/ScEnTgOnAU+mw0h5Js9NJ5qtKtilz3Np/iEWL4Fvfqn19MzM7VNUhox58D1gi\n6VqSV/9XAkTEeklLSK5I2gfcEAe7ITcC9wMjgGURsbzSznszqdzRATt29OEnMDOzT/QqECLiceDx\ndHkXcGGF9e4A7ijT/gxwWm3Hqr2u3buTDzMz67vcjrr3NhDefrtxtZiZDQb9GTJqqFoDoVBIhpfc\nQzAz65+WDoSPPoLHH4dRo+CIIxpfk5nZQJbbQKhlUnl7evvbe+/Bhx8mIaI+X3BlZja4tfQcwtat\nB5c//hj27m1YOWZmA17LBsLSpXD99Qe/PvJITyybmfVHywbCU08lPYSpU2HoUJg82RPLZmb9kdtA\nqDaH8NJLyWTyF74A48YlH+4hmJn1XW4nlav1ELZsgUcegeOPT8JhzBjYtSub2szMBqLc9hCqBcJL\nL8EppyRDRsUifPaz8PTT8OCD8P3vZ1GhmdnA0pI9hD17kstMJ0w42HbJJXDzzfDcc3DYYY2vz8xs\noGnJQHj55aRnUHrPwdlnw6uvwvr1cFpNT0syM7NSuR0y6mlSeetWmDbt023DhsEvfgEXXwyvvNLQ\n0szMBqSW7CGUCwSAc8+Fc85JHmPx4YcwYkTDyjMzG3By20OoZcionCFDYNKk5D0SzMysdi0ZCJV6\nCF2mTPGwkZlZb7VkIPTUQwCYMQM2bKh7SWZmA1puA6GnSeXXXoPjjqv8/dNPhzVr6l+TmdlAlttA\nqNRD6HoznKOPrrytA8HMrPdaLhDefhtGj+755rPPfS4ZMtq3rzG1mZkNRC0XCG+9BePH97ztyJFw\nzDG+0sjMrDdyGwiV5hDefBM+85nq20+efPAd1czMrLrcBkJPPYRaAsGXnpqZ9U5LBkK1ISNwD8HM\nrLdaLhBqHTJyD8HMrHdaLhDcQzAza4zcBkKlSeU33qith3D88ckjLszMrDa5DYRKPYRNm+Ckk6pv\nf+KJybuqVXtvZjMzS1QNBEmHS3pS0hpJ6yTdnraPk7RC0kZJj0oaW7LNAkmbJW2QNKekfZak5yVt\nknRXT8ctFwgRsHZtbW+AM3p0cjezh43MzGpTNRAi4iPgzyPidOBzwBcknQPMB1ZGxMnAKmABgKRT\ngSuBmcDFwD3SJ+9tdi9wXUTMAGZImlv5uIe2vfpqcody6Vtn9mTGDNi8ubZ1zcwGu5qGjCLi/XTx\n8HSb3cDlwKK0fRFwRbp8GbA4IvZHxFZgMzBb0kRgTESsTtd7oGSbMsc8tG3tWvjsZ2upODFjRjLE\nZGZm1dUUCJKGSFoD7ASKEbEeaIuIToCI2Al0vW6fBJQO1OxI2yYBpQ+T6Ejbyio39v/yyzB9ei0V\nJ048EbZsqX19M7PBrKa30IyIA8Dpko4EHpVUALq/hu/hHQx679//fSF//GOyXCgUKBQKdHZCW1vt\n+xg/3u+LYGYDV7FYpFgs1m1/vXpP5Yh4R9Iy4AygU1JbRHSmw0Gvp6vtACaXbNaetlVqL+sv/3Ih\nV1756badO5Mnmdbq6KN
|
||
|
"text/plain": [
|
||
|
"<matplotlib.figure.Figure at 0x10e956438>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"\n",
|
||
|
"counter = Counter(nums_words_per_sent)\n",
|
||
|
"plt.plot(list(counter.keys()), list(counter.values()))\n",
|
||
|
"plt.show()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 10,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAAEACAYAAACznAEdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAEMNJREFUeJzt3W+sXHWdx/H3Rwv4l253k95mKRYMioWsf8im4BLjVVwE\nTQqPCGhckA1P0Eh0/dO6D6iPEBPjwq5sYtRuJSgpyoaaEKlNd0w0cVGRhdBaatyW0t1e4kow8KC0\n8N0H8wOHyy1/ZubOzB3er2SSc373nDm/L0Pm09/vnDMnVYUkSa8adwckSZPBQJAkAQaCJKkxECRJ\ngIEgSWoMBEkS8BICIcm3kswlua+nbUWS7Un2JLkryfKev21MsjfJ7iTn97SfleS+JA8m+afhlyJJ\nGsRLGSFsBj44r20DsKOqTgd2AhsBkpwBXAKsBS4EbkqSts+/An9fVW8F3ppk/ntKksboRQOhqn4K\nPDqv+SJgS1veAlzcltcDt1bV0araB+wF1iVZBbyxqn7RtvtOzz6SpAnQ7zmElVU1B1BVh4CVrf0k\n4EDPdgdb20nAwz3tD7c2SdKEGNZJZX//QpKWuGV97jeXZKaq5tp00COt/SBwcs92q1vbsdoXlMSA\nkaQ+VFVefKuFvdQRQtrrGduAK9ry5cAdPe2XJjk+yanAacDdbVrpsSTr2knmv+vZZ0FVNbWva6+9\ndux9sDbrs77pew3qRUcISb4LzAJ/keQh4Frgy8BtSa4E9tO9soiq2pVkK7ALOAJcXX/q5SeAfwNe\nA9xZVT8auPeSpKF50UCoqo8c408fOMb21wHXLdD+K+CvXlbvJEkj453KYzA7OzvuLiyaaa4NrG+p\nm/b6BpVhzDsNW5KaxH5J0iRLQo3gpLIkacoZCJIkwECQJDUGgiQJMBAkSY2BIEkCDARJUmMgSJIA\nA0GS1BgIkiTAQJAkNQaCJAkwECRJjYEgSQIMhJdl1apTSDLy16pVp4y7dEmvAD4P4WXoPg56HP3K\nUJ6XKmm6+TwESdJQGAiSJMBAkCQ1BoIkCTAQJEmNgSBJAgwESVJjIEiSAANBktQYCJIkwECQJDUG\ngiQJMBAkSY2BIEkCDARJUmMgSJIAA0GS1BgIkiRgwEBIsjHJA0nuS3JLkuOTrEiyPcmeJHclWT5v\n+71Jdic5f/DuS5KGpe9ASLIGuAp4V1W9HVgGXAZsAHZU1enATmBj2/4M4BJgLXAhcFO6DymWJE2A\nQUYIfwSeBF6fZBnwWuAgcBGwpW2zBbi4La8Hbq2qo1W1D9gLrBvg+JKkIeo7EKrqUeCrwEN0g+Cx\nqtoBzFTVXNvmELCy7XIScKDnLQ62NknSBFjW745J3gx8GlgDPAbcluSjQM3bdP76S7Jp06Znl2dn\nZ5mdne2rn5I0rTqdDp1OZ2jvl6q+vq9Jcgnwt1V1VVv/GHAO8H5gtqrmkqwC/qOq1ibZAFRVXd+2\n/xFwbVX95wLvXf32azF1T3mMo19hEv97SJosSaiqvs/NDnIOYQ9wTpLXtJPD5wG7gG3AFW2by4E7\n2vI24NJ2JdKpwGnA3QMcX5I0RH1PGVXVfyX5DvAr4Cng18A3gDcCW5NcCeyne2URVbUryVa6oXEE\nuHoihwGS9ArV95TRYnLK6HlHdspI0osa55SRJGmKGAiSJMBAkCQ1BoIkCTAQJEmNgSBJAgwESVJj\nIEiSAANBktQYCJIkwECQJDUGgiQJMBAkSY2BIEkCBngegkbphPbT26M1M7OGQ4f2jfy4ksbD5yG8\nDON8HoLPYZD0YnwegiRpKAwESRJgIEiSGgNBkgQYCJKkxkCQJAEGgiSpMRAkSYCBIElqDARJEmAg\nSJIaA0GSBBgIkqTGQJAkAQaCJKkxECRJgIEgSWoMBEkSYCBIkpqBAiHJ8iS3Jdmd5IEkZydZkWR7\nkj1J7kqyvGf7jUn2tu3PH7z7kqRhGXSEcANwZ1WtBd4B/AbYAOyoqtOBncBGgCRnAJcAa4ELgZvS\nfWq9JGkC9B0ISU4E3lNVmwGq6mhVPQZcBGxpm20BLm7L64Fb23b7gL3Aun6PL0karkFGCKcCv0+y\nOck9Sb6R5HXATFXNAVTVIWBl2/4k4EDP/gdbmyRpAgwSCMuAs4CvV9VZwBN0p4tq3nbz1yVJE2jZ\nAPs+DByoql+29R/QDYS5JDNVNZdkFfBI+/tB4OSe/Ve3tgVt2rTp2eXZ2VlmZ2cH6KokTZ9Op0On\n0xna+6Wq/3/AJ/kJcFVVPZjkWuB17U9/qKrrk3wBWFFVG9pJ5VuAs+lOFf0YeEst0IEkCzWPXfcc\n+Dj6Nb7jTuLnIGlhSaiqvi/WGWSEAPAp4JYkxwG/Az4OvBrYmuRKYD/dK4uoql1JtgK7gCPA1RP5\nrS9Jr1ADjRAWiyOE5x15bMedxM9B0sIGHSF4p7IkCTAQJEmNgSBJAgwESVJjIEiSAANBktQYCJIk\nwECQJDUGgiQJMBAkSY2BIEkCDARJUmMgSJIAA0GS1BgIkiTAQJAkNQaCJAkwECRJjYEgSQIMBElS\nYyBIkgADQZLUGAiSJMBAkCQ1BoIkCTAQJEmNgSBJAgwESVKzbNwdeLmeeOIJbr/99nF3Q5KmzpIL\nhM2bN/PZz/4zxx23bqTHPXLk7pEeT5JGbckFwtNPPw18kMcfv3Gkxz3hhE8CD470mJI0Sp5DkCQB\nBoIkqVlyU0YapRNIMtIjzsys4dChfSM9pqQuA0Ev4DBQIz3i3NxoA0jSnzhlJEkChhAISV6V5J4k\n29r6iiTbk+xJcleS5T3bbkyyN8nuJOcPemxJ0vAMY4RwDbCrZ30DsKOqTgd2AhsBkpwBXAKsBS4E\nbsqoJ6glScc0UCAkWQ18CPhmT/NFwJa2vAW4uC2vB26tqqNVtQ/YC4z27jJJ0jENOkL4GvA5nnvm\ncaaq5gCq6hCwsrWfBBzo2e5ga5MkTYC+AyHJh4G5qroXeKGpn9FepiJJ6ssgl52eC6xP8iHgtcAb\nk9wMHEoyU1VzSVYBj7TtDwIn9+y/urUtaNOmTc8uz87OMjs7O0BXJWn6dDodOp3O0N4vVYP/Az7J\ne4F/qKr1Sb4C/F9VXZ/kC8CKqtrQTirfApxNd6rox8BbaoEOJFmoGYAbb7yRz3/+txw+PPrfMjp8\n+OuMZ8CTV9BxwzD+n5ReiZJQVX1frLMYN6Z9Gdia5EpgP90ri6iqXUm20r0i6Qhw9TG/9SVJIzeU\nQKiqnwA/act/AD5wjO2uA64bxjElScPlncqSJMBAkCQ1BoIkCTAQJEmNgSBJAnwegibO6B/KAz6Y\nRwIDQRNn9A/lAR/MI4FTRpKkxkCQJAEGgiSpMRAkSYCBIElqDARJEmAgSJIaA0GSBBgIkqTGQJAk\nAQaCJKkxECRJgIEgSWoMBEkSYCBIkhoDQZIEGAiSpMZAkCQBBoIkqTEQJEmAgSBJagwESRJgIEiS\nGgNBkgQYCJKkxkCQJAEGgiSpMRAkSYCBIElq+g6EJKuT7EzyQJL7k3yqta9Isj3JniR3JVnes8/G\nJHuT7E5y/jAKkCQNxyAjhKPAZ6rqTODdwCeSvA3YAOyoqtOBncBGgCRnAJcAa4ELgZuSZJDOS5KG\np+9AqKpDVXVvW34c2A2sBi4CtrTNtgAXt+X1wK1VdbSq9gF7gXX9Hl+SNFxDOYeQ5BTgncDPgZmq\nmoNuaAAr22YnAQd6djvY2iRJE2DZoG+Q5A3A94FrqurxJDVvk/nrL8mmTZueXZ6dnWV2drbfLkrS\nVOp0OnQ6naG930CBkGQZ3TC4uaruaM1zSWaqai7JKuCR1n4QOLln99WtbUG9gSBJer75/1j+0pe+\nNND7DTpl9G1gV1Xd0NO2DbiiLV8O3NHTfmmS45OcCpwG3D3g8SVJQ9L3CCHJucBHgfuT/Jru1NAX\ngeuBrUmuBPbTvbKIqtqVZCuwCzgCXF1VfU0nSZKGr+9AqKqfAa8+xp8/cIx9rgOu6/eYkqTF453K\nkiTAQJAkNQNfdipNhxM
|
||
|
"text/plain": [
|
||
|
"<matplotlib.figure.Figure at 0x10e94a6d8>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"plt.hist(nums_sents)\n",
|
||
|
"plt.show()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 14,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAEACAYAAABfxaZOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAH9VJREFUeJzt3XuQXGWd//H3d+6TSRiSQIibkAAikLAgCKIrqzSCgLoS\n1gsKKz8ChbWKi9buekn4aTHZxQXdcvmxWtbWCmK8LQvrhbBSlZgKzcpyR2IiCUMUSEJMBkhCIBeS\nmcn398dzOt3T0z3T1+nT3Z9X1VSfPn366W9Oks955jlPn2PujoiINIeWWhcgIiITR6EvItJEFPoi\nIk1EoS8i0kQU+iIiTUShLyLSRMYNfTO73cwGzGxN1vrrzGy9ma01s5sz1i82sw3RaxdUo2gRESlN\nWwHb3AF8C/hBaoWZJYAPAae4+5CZHRGtnwdcCswDZgMrzewtri8DiIjEwrg9fXd/ENiZtfozwM3u\nPhRt80q0fgFwp7sPufsLwAbgrMqVKyIi5Sh1TP8E4D1m9oiZ3W9mZ0TrZwGbM7bbEq0TEZEYKGR4\nJ9/7prr7O83s7cDdwHGVK0tERKqh1NDfDPwMwN0fN7NhM5tO6NnPydhudrRuFDPTOL+ISAnc3Up9\nb6HDOxb9pPwCeC+AmZ0AdLj7dmAZ8HEz6zCzY4HjgcfyNerusf+54YYbal6D6lSd9VxnPdRYT3WW\na9yevpn9BEgA081sE3AD8D3gDjNbC+wH/k8U4uvM7C5gHTAIXOuVqFJERCpi3NB398vzvHRFnu1v\nAm4qpygREakOfSN3HIlEotYlFER1VpbqrJx6qBHqp85yWa1GX8xMIz8iIkUyM3wCTuSKiEgDUOiL\niDQRhb6ISBNR6IuINBGFvohIE1Hoi4g0EYW+iEgTUeiLiDSR2If+tm3w3/9d6ypERBpD7EP/scfg\n1ltrXYWISGOIfegPDcHu3bWuQkSkMcQ+9IeHFfoiIpUS+9BXT19EpHLqIvT37Kl1FSIijaEuQn/3\nbrj+erj33lpXIyJS3+oi9Pftg7Vr4fnn4d/+LYzzi4hI8eoi9AFefBFefx2uuw4GBmpbk4hIvRo3\n9M3sdjMbMLM1OV77ezM7aGbTMtYtNrMNZrbezC4ot8BUr37zZnj55XAQeOmlclsVEWlOhfT07wAu\nzF5pZrOB9wEbM9bNAy4F5gHvB75jZiXf1gvSPf3t22HLlrCs0BcRKc24oe/uDwI7c7x0C/DFrHUL\ngDvdfcjdXwA2AGeVU2Aq9EGhLyJSrpLG9M3sYmCzu6/NemkWsDnj+ZZoXckU+iIildNW7BvMrBu4\nnjC0U3WZob91a3hU6IuIlKbo0AfeDBwD/DYar58N/MbMziL07OdkbDs7WpdTX1/foeVEIkEikRi1\nTWboDw6GR4W+iDSLZDJJMpmsWHvm7uNvZHYMcK+7n5LjteeBt7n7TjObD/wYeAdhWOdXwFs8x4eY\nWa7Vo3z1q3DjjdDTE76Ze9hh8O5363LLItKczAx3L3mCTCFTNn8CPAScYGabzOyqrE0cMAB3Xwfc\nBawD7gOuLSjZx5Dq6c+Jfn94y1vU0xcRKdW4wzvufvk4rx+X9fwm4KYy6zpkaAhaWkLor18Pb34z\nPPpopVoXEWkudfGN3N5eOPro8HzGDNi/v7Y1iYjUq7oI/Q9/GBYuhPZ2mDZt5MldEREpXF2E/hln\nwNlnw5QpCn0RkXLEPvSHh6EtOvOg0BcRKU/sQ39oCFpbw7JCX0SkPHUR+qme/kc+AiefrNAXESlV\nKd/InVCZod/XBwcPhnXuUN71O0VEmk9d9fQhzNlvaQnhLyIixam70IfwXEM8IiLFi33oZ87eSWlr\ng+9+N1yXR0RECldXY/opbW2wbVv6UssiIlKY2Pf0M6dsprS1wb59sHt3bWoSEalXdRH6uXr6Cn0R\nkeIp9EVEmkhdh/6ePbWpSUSkXsU+9PPN3lFPX0SkeLEPfQ3viIhUjkJfRKSJ1EXo55uyuWcPPP+8\nvp0rIlKoQm6MfruZDZjZmox13zCz9Wa22sx+amaHZby22Mw2RK9fUG6BY/X0h4bgL/8SHn643E8R\nEWkOhfT07wAuzFq3AjjZ3U8DNgCLAcxsPnApMA94P/Ads/KuhTlW6AP098PeveV8gohI8xg39N39\nQWBn1rqV7p66zuUjwOxo+WLgTncfcvcXCAeEs8opcKzZOwBvvKEbpYuIFKoSY/pXA/dFy7OAzRmv\nbYnWlSxXT7+9PR36EIJfRETGV9YF18zs/wKD7v4fpby/r6/v0HIikSCRSIzaJt/wTuaQjkJfRBpV\nMpkkmUxWrL2SQ9/MFgIfAN6bsXoLcHTG89nRupwyQz+ffLN3BgfTzzW8IyKNKrtDvGTJkrLaK3R4\nx6Kf8MTsIuCLwMXunhm5y4BPmFmHmR0LHA88Vk6B+Xr6AJMnh0f19EVECjNuT9/MfgIkgOlmtgm4\nAbge6AB+FU3OecTdr3X3dWZ2F7AOGASudXcvp8CxQn/q1PAFLYW+iEhhxg19d788x+o7xtj+JuCm\ncorKNFboz5oV7pWr4R0RkcLE/s5Z+aZsAnzrW3Dfferpi4gUqm4vwwDQ2Qnd3Qp9EZFCxTr0Dx4E\nM2jJqjIV+u3t0NWl4R0RkULFOvRz9fJhdOirpy8iUpjYh372eD6MDP3OToW+iEih6j70NbwjIlK4\nWId+rpk7oOEdEZFSxTr0C+3pK/RFRApT96Hf2anhHRGRQtV16Le1qacvIlKM2Ie+pmyKiFROrEN/\n377wjdtsbW3hYGCm4R0RkWLEOvT37IGentHr29pCLx/U0xcRKYZCX0SkiTRE6Gt4R0SkMHUf+roM\ng4hI4Roi9Pfvh/LuzyUi0hzqPvRbWsJy5o3SRUQkt7oPfdDJXBGRQo0b+mZ2u5kNmNmajHVTzWyF\nmfWb2XIz6814bbGZbTCz9WZ2QTnFFRr6nZ1hTr+IiIytkJ7+HcCFWesWASvd/URgFbAYwMzmA5cC\n84D3A98xMyu1uHyh394+MvR1y0QRkcKMG/ru/iCwM2v1AmBptLwUuCRavhi4092H3P0FYANwVqnF\naXhHRKSySh3Tn+HuAwDuvg2YEa2fBWzO2G5LtK4khYZ+dzfs3Qvbt5f6SSIizSHHNSxLUtKEyb6+\nvkPLiUSCRCIx4vVievorV8Ly5eFRRKRRJJNJkslkxdorNfQHzOwodx8ws5nAS9H6LcDRGdvNjtbl\nlBn6ueQL/UmTRq7v7oatW+H11wusXkSkTmR3iJcsWVJWe4UO71j0k7IMWBgtXwnck7H+E2bWYWbH\nAscDj5Va3J49IeCznXMO/OhH6efd3bBjhy7HICIynnF7+mb2EyABTDezTcANwM3A3WZ2NbCRMGMH\nd19nZncB64BB4Fr30r8rm6+n39ICU6emn3d1hfF8hb6IyNjGDX13vzzPS+fn2f4m4KZyikrJF/rZ\nurth2zbN4BERGU9dfiM3m3r6IiKFqdTsnaoopqe/fbsuuiYiMp7Y9vSffDL04KdMGX/brq5wIlfD\nOyIiY4tt6C9eDF/7Wvom6GPp7g69fA3viIiMLbah/+yzcO65hW3b1RUeh4fDj4iI5Bbb0N++HaZP\nL2zb7u70snr7IiL5xTL0DxwI4/OHHVbY9qmePmhcX0RkLLEM/e3bYdo0KPSizOrpi4gUJrahX+jQ\nDozs6Sv0RUTyi23oH3FE4dtn9vQ1vCMikl9sQ7+Unv6kSerpi4iMJZah/8orxYV+qqc/dapCX0Rk\nLLEM/VJ7+lOnanhHRGQsDRH6qZ7+tGnq6YuIjKUhQr+rK1yuYcoUhb6IyFhiGfr79uW+Y1Y+3d1h\n+87OMLyzaRMMDVWvPhGRehXL0B8cHHnj8/EceSR85jMh9Pfvh6uvhv/5n+rVJyJSrxoi9Lu64Oab\n06G/f3/4bUFEREZqiNBP6eoKgT84GK7fIyIiI5UV+ma22MyeNrM1ZvZjM+sws6lmtsLM+s1suZn1\nFttuqaGfGtMfGtIJXRG
|
||
|
"text/plain": [
|
||
|
"<matplotlib.figure.Figure at 0x107dc1a58>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"import matplotlib.pyplot as plt\n",
|
||
|
"%matplotlib inline\n",
|
||
|
"counter = Counter(nums_entities)\n",
|
||
|
"plt.plot(list(counter.keys()), list(counter.values()))\n",
|
||
|
"plt.show()\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 16,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEACAYAAAC9Gb03AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFtpJREFUeJzt3XuQHeV55/HvIwlkxEUIExR0DSBAWMZcDAIHcMawBgFV\nUUIlMYJysBM7pAp2Xa7aDdgV184f63WcimtZjL22Ykxsp4JcGygbjLAFmCnCiouwdeGiuwTSSEIy\nmEsCMh6JZ//oIzyMJc2ZmTOnz+nz/VSdmu4+7/R5Xkb8puftt7sjM5EkVdeYsguQJI0ug16SKs6g\nl6SKM+glqeIMekmqOINekipu0KCPiNsjYmdErDpIm1sjYn1ErIiIMxtboiRpJOo5or8DuOxAb0bE\n5cBJmXkycD3wjQbVJklqgEGDPjMfBV45SJP5wHdrbZ8AJkbE5MaUJ0kaqUaM0U8FtvZb31bbJklq\nAZ6MlaSKG9eAfWwDpvdbn1bb9lsiwhvrSNIwZGYM93vrDfqovfbnHuAG4PsRcT7wambuPNCOqnwT\nte7ubrq7u5v+uS+/DKtXw3PPwY4d8Npr8Prrxdf+r74+uP56OPzw+l7jx0P0+6mX1b9mqXL/qtw3\nqH7/Ioad8UAdQR8R/wJ0Ae+NiC3AfwcOBTIzF2bm4oi4IiI2AG8AnxxRRdqvTNi+vQj0faG+7+uv\nfw2nnVa8pk2DqVOL5YkTi9dRRxVfZ84sAlxSZxk06DPzmjra3NiYcjrbnj2wZg3s2lWE+M9+Blu2\nFEfpW7fCYYcVAf6+9xWvP/mTYv3449995C1J/TVijF41XV1dQ/6eV16BH/0IFi+GJUvg2GNhyhSY\nNQvOOw+uvrpYnzoVJk1qfM1DMZz+tZMq96/KfYPq92+koplj5hGRVR6jH6of/AA+9Sm46CK48kq4\n/PIi0CWpv4hoyslYNVAmfOYzcO+9xZH83LllVySpypxH32Rf+hK85z2wYgWsXGnISxp9Dt00UW8v\nnHEGLF8OM2aUXY2kdjHSoRuDvknefhuuuw6OOw6+8pWyq5HUTgz6NnD//fDFL8KYMcWY/BFHlF2R\npHYy0qB3jH4U7d1bzKq54Qb49Kehp8eQl9R8zroZBb/+Ndx0U3H0PmVKceL1qKPKrkpSpzLoG+yt\nt2D+/OI+MYsWFVeuvuc9ZVclqZM5dNMge/fCrbfC5Mlw9NFw991w1lmGvKTyeUTfAG++WVzVuns3\nPPVUcfsCSWoVzroZoUz48z8vpk9+73vFzBpJaiRvgVCyhQuLK1wff9yQl9SaPKIfgb4+mD4dHngA\nTj+97GokVZXz6EuSCf/wD3DKKYa8pNbm0M0w9PXBX/xFMT/+vvvKrkaSDs6gH6JM+Ou/Lp7T+sQT\nMGFC2RVJ0sEZ9EN0993FiVdDXlK78GTsEOzeDXPmwLe+BRdfXHY1kjqFd69sovnzi2e63n572ZVI\n6iTOo2+Sn/60OPm6YUPZlUjS0Bj0dejrg2uuge9+Fw45pOxqJGlonEc/iLffhi98AU44AS69tOxq\nJGnoPKIfxC23FFe+fv3rZVciScPjydhBnHkm3HYbXHhh2ZVI6lTeAmEUrVwJO3fChz5UdiWSNHwG\n/QFkwmc/W4zPjx1bdjWSNHwG/QE88ghs3w5/9VdlVyJJI2PQH8CiRfCJT8A4T1dLanOejN2P7duL\nWw8vXw4zZpRdjaRO58nYUfCd78Cf/ZkhL6kaDPr9uPde+OM/LrsKSWoMh24G2LWreGrUzp0wfnzZ\n1UiSQzcNd8cd8NGPGvKSqsMj+n5efBFOO614sMipp5ZdjSQVmnJEHxHzImJNRKyLiJv28/57I+L+\niFgREU9HxCeGW1CZfvzj4mjekJdUJYMGfUSMAW4DLgPmAAsiYvaAZjcCKzLzTOAjwFciou1moN93\nH1xxRdlVSFJj1XNEPxdYn5kvZGYfsAiYP6DNi8CRteUjgZczc0/jyhx9fX3w4INw+eVlVyJJjVXP\nUfdUYGu/9V6K8O/vH4GHImI7cATwscaU1zxLl8KsWTB5ctmVSFJjNWp45XPAysz8SEScBDwQER/I\nzP8Y2LC7u/ud5a6uLrq6uhpUwsjcdx9ceWXZVUgS9PT00NPT07D9DTrrJiLOB7ozc15t/WYgM/PL\n/dosBr6Ymf+vtv4QcFNmPjVgXy076+b974dvfxvmDvxbRZJK1oxZN8uAWRExMyIOBa4G7hnQZjXw\nn2oFTQZOATYNt6hme/754kKpc84puxJJarxBh24yc29E3AgsofjFcHtmro6I64u3cyHwJeCOiFgJ\nBPA3mfnL0Sy8kb72NViwAMZ4+ZikCur4C6ZefhlOPrl4mtT06WVXI0m/zVsgjNCtt8JVVxnykqqr\n7S5qaqR/+7di2Oaxx8quRJJGT0cP3Vx3HZx7Ltx4Y9mVSNKBOXQzAo8+ChdfXHYVkjS6Ojbo16+H\nN96A2QPv2iNJFdOxQX/nncXjAp1SKanqOjLmMougX7Cg7EokafR1ZNCvWgW/+hWcf37ZlUjS6OvI\noL/zTrj6aohhn8OWpPbRkUH/0EPeqVJS5+i4efR79sDEicXzYY88cvD2klQ259EP0bp1MGWKIS+p\nc3Rc0D/4IFxwQdlVSFLzdFzQ33UX/Omfll2FJDVPR43Rv/FG8UzYXbtgwoTSypCkIXGMfgiWLoWz\nzzbkJXWWjgr6Zcu8SEpS5+mooF+1Cj7wgbKrkKTm6qigf/ppOP30squQpObqmKB/6SXo7YXTTiu7\nEklqro4J+sWL4ZJL4NBDy65EkpqrI4I+E776Vbj22rIrkaTm64h59Bs3woc/DFu3+qARSe3HefR1\neOqp4iHghrykTtQR0fezn8E555RdhSSVo2OC/oMfLLsKSSpH5cfoM+GYY2DtWjjuuKZ+tCQ1hGP0\ng9i0CY44wpCX1LkqH/QO20jqdJUP+qeeMugldbbKB/0jj8DcuWVXIUnlqXTQr1oF27YVtz6QpE5V\n6aC//3646ioYN67sSiSpPJUO+p4e6OoquwpJKldl59H39cF73wubNxdfJaldNWUefUTMi4g1EbEu\nIm46QJuuiFgeEc9ExMPDLahRfv5zOOEEQ16SBh29jogxwG3AJcB2YFlE/DAz1/RrMxH4GnBpZm6L\niGNHq+B6LV0KF15YdhWSVL56jujnAusz84XM7AMWAfMHtLkGuCsztwFk5kuNLXPonnnGxwZKEtQX\n9FOBrf3We2vb+jsFOCYiHo6IZRHx8UYVOFzPPgvvf3/ZVUhS+Ro18XAccDZwMXA48FhEPJaZGxq0\n/yHJLIJ+zpwyPl2SWks9Qb8NmNFvfVptW3+9wEuZ+SvgVxHxCHAG8FtB393d/c5yV1cXXaMw/3HL\nFjjqKJg0qeG7lqRR19PTQ09PT8P2N+j0yogYC6ylOBm7A3gSWJCZq/u1mQ18FZgHjAeeAD6Wmc8N\n2FdTplcuXgy33AJLloz6R0nSqBvp9MpBj+gzc29E3AgsoRjTvz0zV0fE9cXbuTAz10TET4BVwF5g\n4cCQb6ZnnnHYRpL2qeQFU9ddVzwM/C//ctQ/SpJGnQ8e2Q9PxErSb1TuiP7tt+HII2HHjuKErCS1\nO4/oB9i8GY491pCXpH0qF/QO20jSu1Uu6J1xI0nvVrmg99YHkvRulQx6j+gl6TcqNetmz57iJOwv\nfgGHHz5qHyNJTeWsm342bIDjjzfkJam/SgX9E0/AueeWXYUktZZKBf3SpfD7v192FZLUWioV9MuX\nwznnlF2FJLWWSgX9xo0wa1bZVUhSa6lM0L/6Krz1FvzO75RdiSS1lsoE/ebNcNJJEMOegCRJ1VSZ\noF+/vgh6SdK7VSbovceNJO1fpYL+9NPLrkKSWk9lgt573EjS/lXiXjeZcNhh8PLL3v5AUvV4rxvg\npZdgwgRDXpL2pxJBv3UrTJ9edhWS1JoMekmquEoE/caNMHNm2VVIUmuqRNA/+CB0dZVdhSS1praf\ndfP223DkkdDbC5MmNXT
|
||
|
"text/plain": [
|
||
|
"<matplotlib.figure.Figure at 0x10fb574a8>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"import matplotlib.pyplot as plt\n",
|
||
|
"import numpy as np\n",
|
||
|
"%matplotlib inline\n",
|
||
|
"counter = Counter(nums_entities)\n",
|
||
|
"keys = sorted(counter.keys())\n",
|
||
|
"values = [counter[key] for key in keys]\n",
|
||
|
"plt.plot(keys, np.cumsum(values)/sum(values))\n",
|
||
|
"plt.show()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 17,
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEACAYAAABI5zaHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGeVJREFUeJzt3XuYVNWZ7/HvC0YEQQQciYD3RElQgaA84o1GvKAnERNN\nRBKjPsbwGLzFxANh5kzaiXow4/1oDDgELwgS0QhGjBiHjvEKKipC040abo3ijOAgCNo07/ljFdA0\n3fStqtauXb/P89TTVdXb8ucS3l797rXXNndHRETSpU3sACIikn0q7iIiKaTiLiKSQiruIiIppOIu\nIpJCKu4iIinUaHE3s0lmtsbM3tnNMXeb2VIze8vM+mU3ooiINFdTZu6TgTMb+qaZnQUc7u5fB0YB\nv89SNhERaaFGi7u7vwis280hw4GHMse+BnQ2s+7ZiSciIi2RjZ57T2BlrddVmfdERCQSnVAVEUmh\nPbLwGVXAgbVe98q8twsz00Y2IiIt4O7WnOObWtwt86jPLGA0MN3Mjgc+dfc1uwnYnHxRlJaWUlpa\nGjtGo5Qzuwoh57hxpVxxRSnr1rHTY+3aHc83bICNG8Pj8893fb5+Pey7L/TqFR49e+543qsX9OgB\nHTvCXntB+/bh6x7NnAYWwlhC4eQ0a1ZdB5pQ3M1sKlACdDOzFcCvgT0Bd/eJ7j7bzM42s/eAjcCl\nzU4hImzaBJWVUFUFq1bteNR+vXEjPPggdOkCXbuGr7UfvXtDp06w997QoUP4Wvt5hw7QuTO0axf7\nv1ZyrdHi7u4jm3DMldmJI1I8Nm2CV1+FsrLweOMNOOSQnWfRJ5yw88z6zjvhhhsiB5eCkI2ee+qU\nlJTEjtAkyplduc5ZXzE/+mgoKYFx40Ih79Rp958xZEhuM2aL/p/HZ/nsgZuZF0LPXSRb1q+HP/8Z\nHn8cnnsOvvnNUMxLSuDEExsv5iIQeu7NPaGq4i6SZWvXwqxZoaD/7W9w8slw3nkwfDh06xY7nRQi\nFXeRCLZsgYoKeOmlUNBfeQWGDg0F/dvfDitTRFpDxV0kxzZtgoULYcGCHY933w3LBwcOhHPPhbPO\nCksJRbJFxV0ky7ZuDTPxadPCSdD334cjj4T+/Xc8+vaFffaJnVTSTMVdJEsWLoSpU0NR79ABRo4M\nM/KjjtIaccm/lhR3LYUUyfjHP0IxnzoVPvsMRoyAmTPhmGOgBRcIikSlmbsUvTfegOuug/JyOP/8\nMEs/4QRoo231JCE0cxdphs8+g3/91zBTHz8efvQj+MpXYqcSyQ7NTaQozZwJffrAp5/CokVw6aUq\n7JIumrlLUVm5Eq6+GhYvDhtwDRkSO5FIbmjmLkWhpgbuuissXezXD955R4Vd0k0zd0m9igr44Q/D\nhUUvvhi2xRVJO83cJdX+9jc45RS47DKYO1eFXYqHZu6SWg8/DL/8ZVgNM3Ro7DQi+aXiLqnjDqWl\nobjPnRu22RUpNirukipffBFaMO+9F/aE6d49diKRONRzl9T45BM4/fRQ4OfOVWGX4qbiLqmwdCkM\nGhS2DZg+Hdq3j51IJC4Vdyl4L78c7nZ0/fVhGwHtCSOijcOkwH38cdhPfdIkOPvs2GlEckP7uUtR\ncYfvfjesXR8/PnYakdzRrpBSVB58MOzBPn167CQiyaOZuxSk5cvh2GPh+efDzTRE0qwlM3edepKC\ns3UrXHJJuPpUhV2kfiruUnDuuguqq0NxF5H6qS0jBWXxYhg8GF59FQ4/PHYakfxQW0ZS7csv4aKL\n4OabVdhFGqPiLgXjxhvhq1+Fn/wkdhKR5NNSSCkIr70GEyfCggVgzfrlVKQ4aeYuiff556Edc889\ncMABsdOIFAadUJXEu+oqWLcOpkyJnUQkDl2hKqlTVgZPPgkLF8ZOIlJY1JaRxNq0CS6/HO69F/bd\nN3YakcKitowk1pgxsGyZ9o4Rydk6dzMbZmZLzKzSzMbU8/1uZvaMmb1lZgvN7JLmhBCp68034YEH\n4O67YycRKUyNztzNrA1QCQwFVgPzgRHuvqTWMb8G9nL3X5nZfkAF0N3dt9T5LM3cpVHV1TBwIFx7\nLVx8cew0IvHlauY+EFjq7svdvRp4FBhe55iPgE6Z552AT+oWdpGmuu022H9/+PGPYycRKVxNWS3T\nE1hZ6/UqQsGv7X7geTNbDXQELshOPCk2lZVw663w+uu6WEmkNbK1FPJXwNvuPsTMDgeeM7Nj3H1D\n3QNLS0u3Py8pKaGkpCRLEaTQbd0aVsf8y7/AIYfETiMST1lZGWVlZa36jKb03I8HSt19WOb1WMDd\n/ZZax8wGbnL3lzKvnwfGuPvrdT5LPXdp0IQJMHkyvPQStG0bO41IcuSq5z4f+JqZHWxmewIjgFl1\njikHTsuE6A4cAXzQnCBS3Kqqwox90iQVdpFsaLQt4+41ZnYlMIfww2CSu5eb2ajwbZ8I/F9gspm9\nDRjwv919bS6DS3q4w89+BqNHQ58+sdOIpIMuYpLo/vhHuOGGsLa9XbvYaUSSpyVtGRV3ieqTT+Co\no+CJJ2DQoNhpRJJJxV0Kzs9/Hu6wdO+9sZOIJJeKuxSU1avDrH3x4nCHJRGpn4q7FJSrrgo99ltv\njZ1EJNlU3KVgrFwJ/fpBeXnYakBEGpazXSFFsu2mm8LVqCrsIrmhmbvk3bJlMGAAVFTAfvvFTiOS\nfJq5S0G48Ua44goVdpFc0j1UJa/efz/cE3Xp0thJRNJNM3fJq9/8JqyS6dIldhKRdNPMXfKmshKe\nfhreey92EpH008xd8ubf/i3cOq9z59hJRNJPq2UkL8rLYfDg0HPv1Knx40VkB62WkcQqLYVf/EKF\nXSRfNHOXnFu4EE4/PfTaO3aMnUak8GjmLolUWgrXX6/CLpJPmrlLTi1YAGefHXrtHTrETiNSmDRz\nl8S54QYYO1aFXSTfNHOXnFm4EM44Az74ANq3j51GpHBp5i6JcvPN4U5LKuwi+aeZu+TE0qVwwglh\n1q7ljyKto5m7JMb48TB6tAq7SCyauUvWrVgR7rL03nvQtWvsNCKFTzN3SYRbb4XLLlNhF4lJM3fJ\nqjVr4BvfgEWL4IADYqcRSQfN3CW6O+6ACy9UYReJTTN3yZp16+BrX4M334SDD46dRiQ9NHOXqO65\nB845R4VdJAk0c5es2LABDjsM/v53OPLI2GlE0kUzd4lmwgQYMkSFXSQpNHOXVtu8OczaZ88O69tF\nJLs0c5coJk+Gb31LhV0kSTRzl1aproYjjoCpU2HQoNhpRNJJM3fJu2nT4NBDVdhFkkYzd2mxrVuh\nT5+wBHLo0NhpRNIrZzN3MxtmZkvMrNLMxjRwTImZLTCzd81sbnNCSGF66qmw6+Opp8ZOIiJ1NTpz\nN7M2QCUwFFgNzAdGuPuSWsd0Bl4GznD3KjPbz93/u57P0sw9RU45JWzre8EFsZOIpFuuZu4DgaXu\nvtzdq4FHgeF1jhkJPO7uVQD1FXZJl/nzYflyOO+82ElEpD5NKe49gZW1Xq/KvFfbEUBXM5trZvPN\n7KJsBZRkuv12uOYa2GOP2ElEpD7Z+qu5B/At4FRgb+AVM3vF3d/L0udLgqxYAXPmhKtSRSSZmlLc\nq4CDar3ulXmvtlXAf7v7ZmCzmb0A9AV2Ke6lpaXbn5eUlFBSUtK8xBLd3XfDpZfCPvvETiKSTmVl\nZZSVlbXqM5pyQrUtUEE4ofohMA+40N3Lax3TG/h/wDCgHfAacIG7L67zWTqhWuDWrw/r2hcsgIMO\navx4EWm9lpxQbXTm7u41ZnYlMIfQo5/k7uVmNip82ye6+xIzexZ4B6gBJtYt7JIOkybBGWeosIsk\nnS5ikibbsiXcjOOxx+C442KnESke2n5AcuqJJ8KMXYVdJPlU3KVJ3OG22+C662InEZGmUHGXJnn5\nZVi7Fr7zndhJRKQpVNylSW67Da69Ftq2jZ1ERJpCJ1SlUe+/D8cfD8uWwd57x04jUnx0QlVy4s47\n4fLLVdhFColm7rJb69b
|
||
|
"text/plain": [
|
||
|
"<matplotlib.figure.Figure at 0x10e922668>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"import matplotlib.pyplot as plt\n",
|
||
|
"import numpy as np\n",
|
||
|
"%matplotlib inline\n",
|
||
|
"counter = Counter(nums_ques_words)\n",
|
||
|
"keys = sorted(counter.keys())\n",
|
||
|
"values = [counter[key] for key in keys]\n",
|
||
|
"plt.plot(keys, np.cumsum(values)/sum(values))\n",
|
||
|
"plt.show()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"collapsed": true
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": []
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.5.1"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 0
|
||
|
}
|