diff --git a/allennlp_config/ner.jsonnet b/allennlp_config/ner.jsonnet
index e8b908d..61d400a 100644
--- a/allennlp_config/ner.jsonnet
+++ b/allennlp_config/ner.jsonnet
@@ -15,6 +15,12 @@
                 "do_lowercase": std.extVar("is_lowercase"),
                 "use_starting_offsets": true
             },
+            "bert2": {
+                "type": "bert-pretrained",
+                "pretrained_model": std.extVar("BERT_VOCAB2"),
+                "do_lowercase": std.extVar("is_lowercase2"),
+                "use_starting_offsets": true
+            },
             "token_characters": {
                 "type": "characters",
                 "min_padding_length": 3
@@ -36,6 +42,7 @@
             "allow_unmatched_keys": true,
             "embedder_to_indexer_map": {
                 "bert": ["bert", "bert-offsets"],
+                "bert2": ["bert2", "bert2-offsets"],
                 "token_characters": ["token_characters"],
             },
             "token_embedders": {
@@ -43,6 +50,11 @@
                     "type": "bert-pretrained",
                     "pretrained_model": std.extVar("BERT_WEIGHTS")
                 },
+                "bert2": {
+                    "type": "bert-pretrained",
+                    "pretrained_model": std.extVar("BERT_WEIGHTS2"),
+                    "requires_grad": false
+                },
                 "token_characters": {
                     "type": "character_encoding",
                     "embedding": {
@@ -60,7 +72,7 @@
         },
         "encoder": {
             "type": "lstm",
-            "input_size": 768 + 128,
+            "input_size": 768 + 768 + 128,
             "hidden_size": 200,
             "num_layers": 2,
             "dropout": 0.5,
diff --git a/allennlp_config/text_classification.jsonnet b/allennlp_config/text_classification.jsonnet
index 3a7155a..3fee85d 100644
--- a/allennlp_config/text_classification.jsonnet
+++ b/allennlp_config/text_classification.jsonnet
@@ -11,6 +11,12 @@
                 "do_lowercase": std.extVar("is_lowercase"),
                 "use_starting_offsets": true
             },
+            "bert2": {
+                "type": "bert-pretrained",
+                "pretrained_model": std.extVar("BERT_VOCAB2"),
+                "do_lowercase": std.extVar("is_lowercase2"),
+                "use_starting_offsets": true
+            },
             "token_characters": {
                 "type": "characters",
                 "min_padding_length": 3
@@ -23,17 +29,24 @@
     "evaluate_on_test": true,
     "model": {
         "type": "text_classifier",
-        "verbose_metrics": true,
+        "verbose_metrics": false,
         "text_field_embedder": {
            "allow_unmatched_keys": true,
            "embedder_to_indexer_map": {
                "bert": ["bert", "bert-offsets"],
+               "bert2": ["bert2", "bert2-offsets"],
                "token_characters": ["token_characters"],
            },
            "token_embedders": {
                "bert": {
                    "type": "bert-pretrained",
-                   "pretrained_model": std.extVar("BERT_WEIGHTS")
+                   "pretrained_model": std.extVar("BERT_WEIGHTS"),
+                   "requires_grad": false
+               },
+               "bert2": {
+                   "type": "bert-pretrained",
+                   "pretrained_model": std.extVar("BERT_WEIGHTS2"),
+                   "requires_grad": false
                },
                "token_characters": {
                    "type": "character_encoding",
@@ -53,7 +66,7 @@
        "text_encoder": {
            "type": "lstm",
            "bidirectional": true,
-           "input_size": 768 + 128,
+           "input_size": 768 + 768 + 128,
            "hidden_size": 200,
            "num_layers": 2,
            "dropout": 0.5
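Note on the two config changes above: each adds a second, independently loaded BERT ("bert2") whose output is concatenated with the first BERT and the character encoder, which is why the encoder input_size grows from 768 + 128 to 768 + 768 + 128. Before launching a job, the configs can be rendered locally to confirm that every std.extVar resolves. The sketch below is illustrative, not part of the diff: it assumes the jsonnet CLI is installed, reuses the vocab/weight paths from scripts/train.sh, and assumes the configs also read the data paths via std.extVar (they are exported by the scripts for that reason); with `--ext-str NAME` and no value, jsonnet takes the value from the environment variable NAME.

    # Sketch: render ner.jsonnet to eyeball the merged bert/bert2 blocks.
    # Paths mirror scripts/train.sh; any further extVar the config references
    # (e.g. the seeds) must be exported and passed the same way.
    export BERT_VOCAB=vocab/s2vocab_cased.vocab
    export BERT_WEIGHTS=pytorch_models/s2bert_s2vocab_cased_512.tar.gz
    export is_lowercase=false
    export BERT_VOCAB2=vocab/s2vocab_uncased.vocab
    export BERT_WEIGHTS2=pytorch_models/s2bert_s2vocab_uncased_512.tar.gz
    export is_lowercase2=true
    export TRAIN_PATH=data/ner/sciie/train.txt
    export DEV_PATH=data/ner/sciie/dev.txt
    export TEST_PATH=data/ner/sciie/test.txt
    jsonnet --ext-str BERT_VOCAB --ext-str BERT_WEIGHTS --ext-str is_lowercase \
            --ext-str BERT_VOCAB2 --ext-str BERT_WEIGHTS2 --ext-str is_lowercase2 \
            --ext-str TRAIN_PATH --ext-str DEV_PATH --ext-str TEST_PATH \
            allennlp_config/ner.jsonnet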
diff --git a/scripts/exp.sh b/scripts/exp.sh
index 90b6a8f..25a4bb1 100755
--- a/scripts/exp.sh
+++ b/scripts/exp.sh
@@ -4,14 +4,19 @@
 bertvocab="ds_dpsaxi4ltpw9:/bert_vocab/"
 bertweights="ds_jda1d19zqy6z:/bert_weights/"
 
-for task in text_classification
+for dataset in NCBI-disease bc5cdr JNLPBA sciie chemprot citation_intent mag rct-20k sciie-relation-extraction # pico
 do
-    for dataset in chemprot
+    for SEED in 13370 13570 14680
     do
-        for SEED in 13370 13570 14680
-        do
-        for model in bertbase_basevocab_cased biobert_pmc_basevocab_cased biobert_pubmed_pmc_basevocab_cased s2bert_basevocab_uncased_512 s2bert_s2vocab_uncased_512 bertbase_basevocab_uncased biobert_pubmed_basevocab_cased s2bert_basevocab_cased_512 s2bert_s2vocab_cased_512
-        do
+        # for model in s2bert_basevocab_cased_512 s2bert_s2vocab_cased_512 # bertbase_basevocab_cased biobert_pmc_basevocab_cased biobert_pubmed_pmc_basevocab_cased s2bert_basevocab_uncased_512 s2bert_s2vocab_uncased_512 bertbase_basevocab_uncased biobert_pubmed_basevocab_cased s2bert_basevocab_cased_512 s2bert_s2vocab_cased_512
+        # do
+
+if [[ 'NCBI-diseasebc5cdrJNLPBAsciie' =~ $dataset ]];
+then
+    task='ner'
+else
+    task='text_classification'
+fi
 
 PYTORCH_SEED=`expr $SEED / 10`
 NUMPY_SEED=`expr $PYTORCH_SEED / 10`
@@ -39,8 +44,28 @@ fi
 
 config_file=allennlp_config/"$task".jsonnet
 
-export BERT_VOCAB=/bert_vocab/"$vocab_file".vocab
-export BERT_WEIGHTS=/bert_weights/"$model".tar.gz
+# vocab='basevocab'
+# model='bertbase'
+# export BERT_VOCAB=/bert_vocab/"$vocab"_uncased.vocab
+# export BERT_WEIGHTS=/bert_weights/"$model"_"$vocab"_uncased.tar.gz
+# export is_lowercase=true
+# export BERT_VOCAB2=/bert_vocab/"$vocab"_cased.vocab
+# export BERT_WEIGHTS2=/bert_weights/"$model"_"$vocab"_cased.tar.gz
+# export is_lowercase2=false
+
+
+export BERT_VOCAB=/bert_vocab/basevocab_cased.vocab
+export BERT_WEIGHTS=/bert_weights/bertbase_basevocab_cased.tar.gz
+export is_lowercase=false
+
+export BERT_VOCAB2=/bert_vocab/s2vocab_cased.vocab
+export BERT_WEIGHTS2=/bert_weights/s2bert_s2vocab_cased_512.tar.gz
+export is_lowercase2=false
+
+
+
+# export BERT_VOCAB=/bert_vocab/"$vocab_file".vocab
+# export BERT_WEIGHTS=/bert_weights/"$model".tar.gz
 export TRAIN_PATH=data/$task/$dataset/train.txt
 export DEV_PATH=data/$task/$dataset/dev.txt
 export TEST_PATH=data/$task/$dataset/test.txt
@@ -51,11 +76,11 @@ echo "$BERT_VOCAB", "$BERT_WEIGHTS", "$is_lowercase", "$TRAIN_PATH", "$config_fi
 # remember to change the desc below
 python scripts/run_with_beaker.py $config_file --source $bertvocab --source $bertweights \
     --desc 's2-bert' \
-    --env "BERT_VOCAB=$BERT_VOCAB" --env "BERT_WEIGHTS=$BERT_WEIGHTS" \
+    --env "BERT_VOCAB=$BERT_VOCAB" --env "BERT_WEIGHTS=$BERT_WEIGHTS" --env "is_lowercase=$is_lowercase" \
+    --env "BERT_VOCAB2=$BERT_VOCAB2" --env "BERT_WEIGHTS2=$BERT_WEIGHTS2" --env "is_lowercase2=$is_lowercase2" \
     --env "TRAIN_PATH=$TRAIN_PATH" --env "DEV_PATH=$DEV_PATH" --env "TEST_PATH=$TEST_PATH" \
-    --env "is_lowercase=$is_lowercase" \
     --env "SEED=$SEED" --env "PYTORCH_SEED=$PYTORCH_SEED" --env "NUMPY_SEED=$NUMPY_SEED"
-        done
-        done
+# done
+# done
     done
 done
diff --git a/scripts/train.sh b/scripts/train.sh
index 2145d38..8e4f049 100755
--- a/scripts/train.sh
+++ b/scripts/train.sh
@@ -1,7 +1,7 @@
 # Run allennlp training locally
 
-dataset='chemprot'
-task='text_classification'
+dataset='sciie'
+task='ner'
 
 config_file=allennlp_config/"$task".jsonnet
 SEED=13270
@@ -15,6 +15,9 @@ export NUMPY_SEED=$NUMPY_SEED
 export BERT_VOCAB=vocab/s2vocab_cased.vocab
 export BERT_WEIGHTS=pytorch_models/s2bert_s2vocab_cased_512.tar.gz
 export is_lowercase=false
+export BERT_VOCAB2=vocab/s2vocab_uncased.vocab
+export BERT_WEIGHTS2=pytorch_models/s2bert_s2vocab_uncased_512.tar.gz
+export is_lowercase2=true
 export TRAIN_PATH=data/$task/$dataset/train.txt
 export DEV_PATH=data/$task/$dataset/dev.txt
 export TEST_PATH=data/$task/$dataset/test.txt
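One caveat on the dataset-to-task dispatch added in exp.sh: [[ 'NCBI-diseasebc5cdrJNLPBAsciie' =~ $dataset ]] treats the dataset name as a regex and tests whether it occurs inside the concatenated string. That happens to be correct for the current list (sciie matches, while sciie-relation-extraction does not and falls through to text_classification), but it would silently misroute any future dataset whose name is a substring of one of the NER names. A case statement over the same four NER datasets, sketched below as a drop-in replacement for the if/else block, states the intent explicitly:

    # Sketch of a less fragile replacement for the substring test in exp.sh:
    # exact-match dispatch instead of matching against a fused string.
    case "$dataset" in
        NCBI-disease|bc5cdr|JNLPBA|sciie)
            task='ner'
            ;;
        *)
            task='text_classification'
            ;;
    esac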