From a3a7a85c829a2d9014d1d7f0b32dcfe9a23557e5 Mon Sep 17 00:00:00 2001 From: Sean Steinle Date: Thu, 28 Jan 2021 09:54:36 -0500 Subject: [PATCH 1/8] updated local directory --- .gitignore | 4 ++++ requirements.txt | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore index a55e00c..3fe0e8e 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,7 @@ venv.bak/ .DS_Store .idea/ + +#sean_fork +sean_notes.txt +results/* \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f9d410c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,46 @@ +absl-py==0.11.0 +astunparse==1.6.3 +cachetools==4.2.0 +certifi==2020.12.5 +chardet==4.0.0 +click==7.1.2 +Cython==0.29.14 +flatbuffers==1.12 +gast==0.3.3 +gensim==3.8.3 +google-auth==1.24.0 +google-auth-oauthlib==0.4.2 +google-pasta==0.2.0 +grpcio==1.32.0 +h5py==2.10.0 +idna==2.10 +joblib==1.0.0 +Keras-Preprocessing==1.1.2 +Markdown==3.3.3 +nltk==3.5 +numpy==1.19.5 +oauthlib==3.1.0 +opt-einsum==3.3.0 +protobuf==3.14.0 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +regex==2020.11.13 +requests==2.25.1 +requests-oauthlib==1.3.0 +rsa==4.7 +scikit-learn==0.24.1 +scipy==1.6.0 +six==1.15.0 +sklearn==0.0 +smart-open==4.1.2 +tensorboard==2.4.1 +tensorboard-plugin-wit==1.8.0 +tensorflow==2.4.0 +tensorflow-estimator==2.4.0 +termcolor==1.1.0 +threadpoolctl==2.1.0 +tqdm==4.56.0 +typing-extensions==3.7.4.3 +urllib3==1.26.2 +Werkzeug==1.0.1 +wrapt==1.12.1 From e59ff1a7be894b522732a5e62d6bcf17921b3547 Mon Sep 17 00:00:00 2001 From: Sean Steinle Date: Thu, 28 Jan 2021 10:26:43 -0500 Subject: [PATCH 2/8] explicitly passing encoding argument to open GloVe file. 
shouldn't affect Linux/MacOS --- CO_ATTN/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CO_ATTN/utils.py b/CO_ATTN/utils.py index a4755d9..c330c07 100644 --- a/CO_ATTN/utils.py +++ b/CO_ATTN/utils.py @@ -75,7 +75,7 @@ def load_word_embedding_dict(embedding, embedding_path, word_alphabet, logger, e logger.info("Loading GloVe ...") embedd_dim = -1 embedd_dict = dict() - with open(embedding_path, 'r') as file: + with open(embedding_path, 'r', encoding='utf-8') as file: for line in file: line = line.strip() if len(line) == 0: From 7ab81b22b5479d1639a6947d79ee870bda0be60a Mon Sep 17 00:00:00 2001 From: Sean Steinle Date: Thu, 28 Jan 2021 10:26:56 -0500 Subject: [PATCH 3/8] add README.md --- README.md | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index e7d6276..06e7f5c 100644 --- a/README.md +++ b/README.md @@ -3,20 +3,31 @@ Code for BEA 13 paper "Co-Attention Based Neural Network for Source-Dependent Es ## Dependencies -python 2 for data/preprocess_asap.py (will be upgraded to python 3) +Python 2 for data/preprocess_asap.py (will be upgraded to Python 3). + - I recommend that on installation, you *do not add to your PATH variable*. This way, it doesn't interfere with your current Python workflow. + - Then, when you need to run the preprocessing script, you'll run it something like: + - *c:/Python27/python.exe preprocess_asap.py* -python 3 for the rest +Python 3 for the rest * tensorflow 2.0.0 beta * gensim * nltk * sklearn -run python2 data/preprocess_asap.py for data splitting. -Download Glove pretrained embedding from https://nlp.stanford.edu/projects/glove -Extract glove.6B.50d.txt to the glove folder -run python3 attn_network.py [options] for training and evaluation +## Running on Linux, MacOS +1. Run python2 data/preprocess_asap.py for data splitting. +2. Download GloVe pretrained embedding from *https://nlp.stanford.edu/projects/glove* +3. 
Extract *glove.6B.50d.txt* to the glove folder +4. Run *python3 attn_network.py* [options] for training and evaluation + +## Running on Windows + +To run on Windows, do all of the commands for Linux/MacOS. Then you'll need to remove two "\n" symbols from the preprocessing script. + 1. open "data/preprocess_asap.py" in your preferred text editor + 2. on lines 28 and 31 in the preprocessing script, you'll find: *f_write.write("\r\n")* + 3. remove the *\n* from both lines ## Cite If you use the code, please cite the following paper: From bea2a0860eb02efa2d72485fc359dda6153504f3 Mon Sep 17 00:00:00 2001 From: Sean Steinle Date: Thu, 28 Jan 2021 10:29:24 -0500 Subject: [PATCH 4/8] updated README --- README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 06e7f5c..ba0ca4b 100644 --- a/README.md +++ b/README.md @@ -4,23 +4,24 @@ Code for BEA 13 paper "Co-Attention Based Neural Network for Source-Dependent Es ## Dependencies Python 2 for data/preprocess_asap.py (will be upgraded to Python 3). - - I recommend that on installation, you *do not add to your PATH variable*. This way, it doesn't interfere with your current Python workflow. - - Then, when you need to run the preprocessing script, you'll run it something like: - - *c:/Python27/python.exe preprocess_asap.py* + * I recommend that on installation, you *do not add to your PATH variable*. This way, it doesn't interfere with your current Python workflow. + * Then, when you need to run the preprocessing script, you'll run it something like: + * *c:/Python27/python.exe preprocess_asap.py* Python 3 for the rest * tensorflow 2.0.0 beta * gensim + * gensim may have more dependencies, such as VS tools * nltk * sklearn ## Running on Linux, MacOS 1. Run python2 data/preprocess_asap.py for data splitting. -2. Download GloVe pretrained embedding from *https://nlp.stanford.edu/projects/glove* +2. 
Download GloVe pretrained embedding from: *https://nlp.stanford.edu/projects/glove* 3. Extract *glove.6B.50d.txt* to the glove folder -4. Run *python3 attn_network.py* [options] for training and evaluation +4. Run *python3 attn_network.py [options]* for training and evaluation ## Running on Windows From 7392f781b76c5f99155ac6b7e0f48b0daebef726 Mon Sep 17 00:00:00 2001 From: Sean Steinle Date: Thu, 28 Jan 2021 10:30:46 -0500 Subject: [PATCH 5/8] updated README --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ba0ca4b..606dd1c 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,10 @@ Code for BEA 13 paper "Co-Attention Based Neural Network for Source-Dependent Es ## Dependencies Python 2 for data/preprocess_asap.py (will be upgraded to Python 3). - * I recommend that on installation, you *do not add to your PATH variable*. This way, it doesn't interfere with your current Python workflow. - * Then, when you need to run the preprocessing script, you'll run it something like: - * *c:/Python27/python.exe preprocess_asap.py* + +* I recommend that on installation, you *do not add to your PATH variable*. This way, it doesn't interfere with your current Python workflow. +* Then, when you need to run the preprocessing script, you'll run it something like: + * *c:/Python27/python.exe preprocess_asap.py* Python 3 for the rest From f53b3a68659d91287fc0545f7a929c8d1a4191d4 Mon Sep 17 00:00:00 2001 From: Sean Steinle Date: Thu, 28 Jan 2021 13:40:12 -0500 Subject: [PATCH 6/8] updated README --- README.md | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 606dd1c..1b3713f 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# co-attention +# Co-Attention Code for BEA 13 paper "Co-Attention Based Neural Network for Source-Dependent Essay Scoring" ## Dependencies @@ -23,6 +23,7 @@ Python 3 for the rest 2. 
Download GloVe pretrained embedding from: *https://nlp.stanford.edu/projects/glove* 3. Extract *glove.6B.50d.txt* to the glove folder 4. Run *python3 attn_network.py [options]* for training and evaluation +5. T ## Running on Windows @@ -31,6 +32,23 @@ To run on Windows, do all of the commands for Linux/MacOS. Then you'll need to r 2. on lines 28 and 31 in the preprocessing script, you'll find: *f_write.write("\r\n")* 3. remove the *\n* from both lines +## Results + +After preprocessing the data, the program will start the training process. At the end of each epoch, the logger will +output the development and testing set scores. The highest will be kept and outputted after all epochs are complete. You can toggle which +task (the default is ASAP3), the number of epochs (the default is 50), and more by looking at the arguments in lines 21-51 of *attn_network.py*. + +Additionally, if you want to look at specific essays with their predicted and actual scores: + 1. go to the "checkpoints folder" + 2. after training, there should be a text file with one number per line. the line number corresponds to the essay number in the test data. + 3. in the various fold directories, open the test.tsv file and compare with the predicted scored in from step 2. + +## Coming Soon + +1. making specific essays, as well as their predicted scores and real scores, more accessible. + - likely a Python script +2. updating *preprocess_asap.py* to be compatible with Python 3 + ## Cite If you use the code, please cite the following paper: ``` From 8d8642347dd5b08935dc687a8d8b2d84d67f01b6 Mon Sep 17 00:00:00 2001 From: Sean Steinle Date: Sun, 31 Jan 2021 20:33:04 -0500 Subject: [PATCH 7/8] edited README --- README.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 1b3713f..dd73151 100644 --- a/README.md +++ b/README.md @@ -23,14 +23,14 @@ Python 3 for the rest 2.
Download GloVe pretrained embedding from: *https://nlp.stanford.edu/projects/glove* 3. Extract *glove.6B.50d.txt* to the glove folder 4. Run *python3 attn_network.py [options]* for training and evaluation -5. T ## Running on Windows To run on Windows, do all of the commands for Linux/MacOS. Then you'll need to remove two "\n" symbols from the preprocessing script. - 1. open "data/preprocess_asap.py" in your preferred text editor - 2. on lines 28 and 31 in the preprocessing script, you'll find: *f_write.write("\r\n")* - 3. remove the *\n* from both lines + +1. open "data/preprocess_asap.py" in your preferred text editor +2. on lines 28 and 31 in the preprocessing script, you'll find: *f_write.write("\r\n")* +3. remove the *\n* from both lines ## Results @@ -39,9 +39,10 @@ output the development and testing set scores. The highest will be kept and outp task (the default is ASAP3), the number of epochs (the default is 50), and more by looking at the arguments in lines 21-51 of *attn_network.py*. Additionally, if you want to look at specific essays with their predicted and actual scores: - 1. go to the "checkpoints folder" - 2. after training, there should be a text file with one number per line. the line number corresponds to the essay number in the test data. - 3. in the various fold directories, open the test.tsv file and compare with the predicted scored in from step 2. + +1. go to the "checkpoints folder" +2. after training, there should be a text file with one number per line. the line number corresponds to the essay number in the test data. +3. in the various fold directories, open the test.tsv file and compare with the predicted scored in from step 2. 
## Coming Soon From d3960d14fc3dd79bdb0b9c0e2d8fceead30c424c Mon Sep 17 00:00:00 2001 From: Sean Steinle Date: Sun, 31 Jan 2021 20:34:46 -0500 Subject: [PATCH 8/8] edited README --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index dd73151..a0371e9 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ Python 3 for the rest ## Running on Linux, MacOS -1. Run python2 data/preprocess_asap.py for data splitting. +1. Run *python2 data/preprocess_asap.py* for data splitting. 2. Download GloVe pretrained embedding from: *https://nlp.stanford.edu/projects/glove* 3. Extract *glove.6B.50d.txt* to the glove folder 4. Run *python3 attn_network.py [options]* for training and evaluation @@ -28,7 +28,7 @@ Python 3 for the rest To run on Windows, do all of the commands for Linux/MacOS. Then you'll need to remove two "\n" symbols from the preprocessing script. -1. open "data/preprocess_asap.py" in your preferred text editor +1. open *data/preprocess_asap.py* in your preferred text editor 2. on lines 28 and 31 in the preprocessing script, you'll find: *f_write.write("\r\n")* 3. remove the *\n* from both lines @@ -40,7 +40,7 @@ task (the default is ASAP3), the number of epochs (the default is 50), and more Additionally, if you want to look at specific essays with their predicted and actual scores: -1. go to the "checkpoints folder" +1. go to the *checkpoints* folder 2. after training, there should be a text file with one number per line. the line number corresponds to the essay number in the test data. 3. in the various fold directories, open the test.tsv file and compare with the predicted scored in from step 2.