From 8eecb195ee2f5ba3f95ce97a8c731a3838f71ffe Mon Sep 17 00:00:00 2001
From: Pascal van Kooten
Date: Tue, 9 Jul 2019 20:56:38 +0200
Subject: [PATCH] use tok as fallback tokenizer

---
 README.md        | 5 +++--
 bpe/encoder.py   | 4 ++--
 requirements.txt | 4 ++--
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 069adf0..95ef975 100644
--- a/README.md
+++ b/README.md
@@ -27,8 +27,9 @@ encoder.fit(test_corpus.split('\n'))
 example = "Vizzini: He didn't fall? INCONCEIVABLE!"
 print(encoder.tokenize(example))
 # ['__sow', 'vi', 'z', 'zi', 'ni', '__eow', '__sow', ':', '__eow', 'he', 'didn', "'", 't', 'fall', '__sow', '?', '__eow', '__sow', 'in', 'co', 'n', 'ce', 'iv', 'ab', 'le', '__eow', '__sow', '!', '__eow']
+# ['__sow', 'vi', 'z', 'zi', 'ni', '__eow', '__sow', ':', '__eow', 'he', 'did', 'not', 'fall', '__sow', '?', '__eow', '__sow', 'in', 'co', 'n', 'ce', 'iv', 'ab', 'le', '__eow', '__sow', '!', '__eow']
 print(next(encoder.transform([example])))
-# [26, 108, 79, 104, 72, 24, 26, 117, 24, 9, 11, 8, 12, 10, 26, 90, 24, 26, 154, 56, 37, 149, 80, 169, 84, 24, 26, 156, 24]
+# [25, 102, 76, 77, 68, 24, 25, 149, 24, 13, 10, 11, 12, 25, 79, 24, 25, 135, 58, 37, 152, 81, 160, 108, 24, 25, 143, 24]
 print(next(encoder.inverse_transform(encoder.transform([example]))))
-# vizzini : he didn ' t fall ? inconceivable !
+# vizzini : he did not fall ? inconceivable !
 ```
diff --git a/bpe/encoder.py b/bpe/encoder.py
index 2c9b07a..4ae60d1 100644
--- a/bpe/encoder.py
+++ b/bpe/encoder.py
@@ -7,7 +7,7 @@
 except ImportError:
     pass
 
-from nltk.tokenize import wordpunct_tokenize
+from tok import word_tokenize
 from tqdm import tqdm
 import toolz
 import json
@@ -39,7 +39,7 @@ def __init__(self, vocab_size=8192, pct_bpe=0.2, word_tokenizer=None,
         self.pct_bpe = pct_bpe
         self.word_vocab_size = max([int(vocab_size * (1 - pct_bpe)), len(self.required_tokens or [])])
         self.bpe_vocab_size = vocab_size - self.word_vocab_size
-        self.word_tokenizer = word_tokenizer if word_tokenizer is not None else wordpunct_tokenize
+        self.word_tokenizer = word_tokenizer if word_tokenizer is not None else word_tokenize
         self.custom_tokenizer = word_tokenizer is not None
         self.word_vocab = {}  # type: Dict[str, int]
         self.bpe_vocab = {}  # type: Dict[str, int]
diff --git a/requirements.txt b/requirements.txt
index daf5913..7d91d02 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
-nltk
+tok
 tqdm
 pytest
 hypothesis
 toolz
-mypy
\ No newline at end of file
+mypy
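
Note (not part of the patch): a minimal sketch contrasting the two fallback tokenizers on the README example, assuming `nltk` and `tok` are both installed. NLTK's `wordpunct_tokenize` splits purely on word/punctuation boundaries, which is why the old README output fragments "didn't" into 'didn', "'", 't'; `tok`'s `word_tokenize` normalizes the contraction, which is where the new 'did', 'not' tokens come from.

```python
# Sketch only, not part of the patch. Assumes `pip install nltk tok`.
from nltk.tokenize import wordpunct_tokenize  # old fallback tokenizer
from tok import word_tokenize                 # new fallback tokenizer

example = "Vizzini: He didn't fall? INCONCEIVABLE!"

# wordpunct_tokenize splits on word/punctuation boundaries, so the
# contraction comes apart as 'didn', "'", 't':
print(wordpunct_tokenize(example))
# ['Vizzini', ':', 'He', 'didn', "'", 't', 'fall', '?', 'INCONCEIVABLE', '!']

# tok's word_tokenize handles the contraction, yielding 'did' and 'not'
# (matching the updated README output; exact tokens may vary by tok version):
print(word_tokenize(example))
```

This also explains why the `transform` ids in the README change: the word-level vocabulary learned during `fit` differs once the fallback tokenizer produces different word boundaries.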