README.md (3 additions, 2 deletions)

````diff
@@ -27,8 +27,9 @@ encoder.fit(test_corpus.split('\n'))
 example = "Vizzini: He didn't fall? INCONCEIVABLE!"
 print(encoder.tokenize(example))
-# ['__sow', 'vi', 'z', 'zi', 'ni', '__eow', '__sow', ':', '__eow', 'he', 'didn', "'", 't', 'fall', '__sow', '?', '__eow', '__sow', 'in', 'co', 'n', 'ce', 'iv', 'ab', 'le', '__eow', '__sow', '!', '__eow']
+# ['__sow', 'vi', 'z', 'zi', 'ni', '__eow', '__sow', ':', '__eow', 'he', 'did', 'not', 'fall', '__sow', '?', '__eow', '__sow', 'in', 'co', 'n', 'ce', 'iv', 'ab', 'le', '__eow', '__sow', '!', '__eow']
 print(next(encoder.transform([example])))
-# [26, 108, 79, 104, 72, 24, 26, 117, 24, 9, 11, 8, 12, 10, 26, 90, 24, 26, 154, 56, 37, 149, 80, 169, 84, 24, 26, 156, 24]
+# [25, 102, 76, 77, 68, 24, 25, 149, 24, 13, 10, 11, 12, 25, 79, 24, 25, 135, 58, 37, 152, 81, 160, 108, 24, 25, 143, 24]
 print(next(encoder.inverse_transform(encoder.transform([example]))))
 # vizzini : he didn ' t fall ? inconceivable !
+# vizzini : he did not fall ? inconceivable !
 ```
````

> **Owner:** So using tok would change the default tokenization?
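Per the README output above, the answer to the reviewer's question is yes: the default word-level tokenization changes. A minimal side-by-side sketch, assuming both `nltk` and `tok` are installed; the `tok` output described here is inferred from the README comments above rather than verified:

```python
# Compare the old default (nltk's wordpunct_tokenize, which splits words
# from punctuation) with the new default (tok's word_tokenize).
from nltk.tokenize import wordpunct_tokenize
from tok import word_tokenize

example = "Vizzini: He didn't fall? INCONCEIVABLE!"

print(wordpunct_tokenize(example))
# ['Vizzini', ':', 'He', 'didn', "'", 't', 'fall', '?', 'INCONCEIVABLE', '!']

print(word_tokenize(example))
# Per the README diff, tok appears to expand the contraction:
# "didn't" becomes 'did', 'not', so round-tripped text reads "did not".
```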
bpe/encoder.py (2 additions, 2 deletions)

```diff
@@ -7,7 +7,7 @@
 except ImportError:
     pass

-from nltk.tokenize import wordpunct_tokenize
+from tok import word_tokenize
 from tqdm import tqdm
 import toolz
 import json
@@ -39,7 +39,7 @@ def __init__(self, vocab_size=8192, pct_bpe=0.2, word_tokenizer=None,
         self.pct_bpe = pct_bpe
         self.word_vocab_size = max([int(vocab_size * (1 - pct_bpe)), len(self.required_tokens or [])])
         self.bpe_vocab_size = vocab_size - self.word_vocab_size
-        self.word_tokenizer = word_tokenizer if word_tokenizer is not None else wordpunct_tokenize
+        self.word_tokenizer = word_tokenizer if word_tokenizer is not None else word_tokenize
         self.custom_tokenizer = word_tokenizer is not None
         self.word_vocab = {}  # type: Dict[str, int]
         self.bpe_vocab = {}  # type: Dict[str, int]
```
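Because `word_tokenizer` remains a constructor argument, callers can pin the old behavior explicitly; the `tok` fallback only applies when no tokenizer is passed. A sketch, with the `Encoder` import path assumed from the repo layout:

```python
from nltk.tokenize import wordpunct_tokenize
from bpe.encoder import Encoder  # import path assumed from bpe/encoder.py

# Supplying the old default preserves pre-PR tokenization and sets
# custom_tokenizer = True, since word_tokenizer is not None.
encoder = Encoder(vocab_size=8192, pct_bpe=0.2, word_tokenizer=wordpunct_tokenize)

# With the defaults above (and assuming required_tokens is empty):
#   word_vocab_size = max(int(8192 * (1 - 0.2)), 0) = 6553
#   bpe_vocab_size  = 8192 - 6553                    = 1639
```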
requirements.txt (2 additions, 2 deletions)

```diff
@@ -1,6 +1,6 @@
-nltk
+tok
 tqdm
 pytest
 hypothesis
 toolz
-mypy
\ No newline at end of file
+mypy
```
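A quick post-install check, assuming the packages resolve from PyPI under these names, that the swapped requirement provides the import `encoder.py` now expects:

```python
# Run after `pip install -r requirements.txt`: nltk is no longer pulled in,
# and tok must expose word_tokenize for bpe/encoder.py to import.
from tok import word_tokenize

print(callable(word_tokenize))  # True once the new requirement resolves
```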