-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvisual.py
More file actions
67 lines (52 loc) · 2.16 KB
/
visual.py
File metadata and controls
67 lines (52 loc) · 2.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from rich.console import Console
from rich.text import Text
from basic import BasicTokenizer
from reg import RegexTokenizer
# Function to print tokens with colors
def print_colored_tokens(tokens):
    """Print each token to the terminal, color-coded per token.

    Tokens are rendered space-separated on one line; consecutive tokens
    take consecutive palette entries, wrapping when the palette runs out.
    """
    # Palette cycled over by token position.
    palette = [
        "red", "green", "blue", "yellow", "magenta", "cyan", "white",
        "bright_red", "bright_green", "bright_blue", "bright_yellow",
        "bright_magenta", "bright_cyan",
    ]
    styled = Text()
    for idx, tok in enumerate(tokens):
        styled.append(f"{tok} ", style=palette[idx % len(palette)])
    Console().print(styled)
# Function to print decoded text with colored tokens
def print_colored_decoded_text(tokens, vocab):
    """Decode each token through *vocab* and print the text, one color per token.

    Each token id is looked up in *vocab* (id -> bytes) and decoded as UTF-8
    with undecodable bytes replaced rather than raising.
    NOTE(review): assumes every token id is present in vocab — a missing id
    raises KeyError; confirm against the tokenizer that produced the tokens.
    """
    palette = ("red", "green", "blue", "yellow", "magenta", "cyan")
    styled = Text()
    for idx, tok in enumerate(tokens):
        piece = vocab[tok].decode("utf-8", errors="replace")
        styled.append(piece, style=palette[idx % len(palette)])
    Console().print(styled)
def _main():
    """Train a RegexTokenizer on taylorswift.txt, then encode interactive
    input and display the token ids and decoded pieces, color-coded per token.
    """
    # Read the training corpus.
    # NOTE(review): path is hard-coded and resolved relative to the current
    # working directory — confirm that is intended before shipping.
    with open('taylorswift.txt', 'r', encoding='utf-8') as file:
        text = file.read()

    # Train the regex-based tokenizer on the corpus.
    # (The previously constructed-but-unused BasicTokenizer and the
    # commented-out basic-tokenizer path have been removed as dead code.)
    regex_tokenizer = RegexTokenizer("GPT2")
    regex_tokenizer.train(text, vocab_size=10000, verbose=False)

    # Encode one line of user input.
    txt = input("\nInput: ")
    tokens = regex_tokenizer.encode(txt)

    # Print encoded tokens with colors.
    print("\nEncoded Tokens:")
    print_colored_tokens(tokens)

    # Print decoded text with colored tokens.
    print("\nDecoded Text with Colors:")
    print_colored_decoded_text(tokens, regex_tokenizer.vocab)


if __name__ == "__main__":
    _main()