26 lines
695 B
Bash
26 lines
695 B
Bash
|
#!/usr/bin/env bash
#
# Download the external resources used by this project into the user's
# home directory:
#   - SQuAD v1.1 train/dev JSON  -> $HOME/data/squad
#   - GloVe (glove.6B.zip)       -> $HOME/data/glove (downloaded and unzipped)
#   - NLTK "punkt" data          -> $HOME/nltk_data
#
# Requires: wget, unzip, and python3 with the nltk package installed.
# The script is safe to re-run: directories are created with `mkdir -p`
# and archives are extracted with overwrite enabled.

# Abort on the first failing command, on use of an unset variable, and on
# a failure in any pipeline stage, so a failed download is never followed
# by an unzip of a missing/partial archive.
set -euo pipefail

DATA_DIR="$HOME/data"
# -p: do not fail when the directory already exists (re-runs).
mkdir -p -- "$DATA_DIR"

# Download SQuAD
SQUAD_DIR="$DATA_DIR/squad"
mkdir -p -- "$SQUAD_DIR"
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json \
  -O "$SQUAD_DIR/train-v1.1.json"
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json \
  -O "$SQUAD_DIR/dev-v1.1.json"

# Download CNN and DailyMail
# Not automated by this script.
# Download at: http://cs.nyu.edu/~kcho/DMQA/

# Download GloVe
GLOVE_DIR="$DATA_DIR/glove"
mkdir -p -- "$GLOVE_DIR"
wget http://nlp.stanford.edu/data/glove.6B.zip -O "$GLOVE_DIR/glove.6B.zip"
# -o: overwrite previously extracted files instead of prompting, so the
# script stays non-interactive when run a second time.
unzip -o "$GLOVE_DIR/glove.6B.zip" -d "$GLOVE_DIR"

# Download NLTK (for tokenizer)
# Make sure that nltk is installed!
python3 -m nltk.downloader -d "$HOME/nltk_data" punkt
|