-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvisual.py
More file actions
67 lines (52 loc) · 2.16 KB
/
visual.py
File metadata and controls
67 lines (52 loc) · 2.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from rich.console import Console
from rich.text import Text
from basic import BasicTokenizer
from reg import RegexTokenizer
# Function to print tokens with colors
def print_colored_tokens(tokens):
    """Print each token to the terminal, color-coded per token.

    Tokens are rendered space-separated on one line; consecutive tokens
    take consecutive palette entries, wrapping when the palette runs out.
    """
    # Palette cycled over by token position.
    palette = [
        "red", "green", "blue", "yellow", "magenta", "cyan", "white",
        "bright_red", "bright_green", "bright_blue", "bright_yellow",
        "bright_magenta", "bright_cyan",
    ]
    styled = Text()
    for idx, tok in enumerate(tokens):
        styled.append(f"{tok} ", style=palette[idx % len(palette)])
    Console().print(styled)
# Function to print decoded text with colored tokens
def print_colored_decoded_text(tokens, vocab):
    """Decode each token through *vocab* and print the text, one color per token.

    Each token id is looked up in *vocab* (id -> bytes) and decoded as UTF-8
    with undecodable bytes replaced rather than raising.
    NOTE(review): assumes every token id is present in vocab — a missing id
    raises KeyError; confirm against the tokenizer that produced the tokens.
    """
    palette = ("red", "green", "blue", "yellow", "magenta", "cyan")
    styled = Text()
    for idx, tok in enumerate(tokens):
        piece = vocab[tok].decode("utf-8", errors="replace")
        styled.append(piece, style=palette[idx % len(palette)])
    Console().print(styled)
def _main():
    """Train a RegexTokenizer on taylorswift.txt, then encode interactive
    input and display the token ids and decoded pieces, color-coded per token.
    """
    # Read the training corpus.
    # NOTE(review): path is hard-coded and resolved relative to the current
    # working directory — confirm that is intended before shipping.
    with open('taylorswift.txt', 'r', encoding='utf-8') as file:
        text = file.read()

    # Train the regex-based tokenizer on the corpus.
    # (The previously constructed-but-unused BasicTokenizer and the
    # commented-out basic-tokenizer path have been removed as dead code.)
    regex_tokenizer = RegexTokenizer("GPT2")
    regex_tokenizer.train(text, vocab_size=10000, verbose=False)

    # Encode one line of user input.
    txt = input("\nInput: ")
    tokens = regex_tokenizer.encode(txt)

    # Print encoded tokens with colors.
    print("\nEncoded Tokens:")
    print_colored_tokens(tokens)

    # Print decoded text with colored tokens.
    print("\nDecoded Text with Colors:")
    print_colored_decoded_text(tokens, regex_tokenizer.vocab)


if __name__ == "__main__":
    _main()