-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtokenizer.js
More file actions
108 lines (102 loc) · 2.89 KB
/
tokenizer.js
File metadata and controls
108 lines (102 loc) · 2.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
/**
 * Load the word → id vocabulary from localStorage.
 *
 * Falls back to a deep copy of DEFAULT_VOCAB (and tries to re-seed storage
 * with it) when nothing is stored, the stored text is not valid JSON, the
 * JSON is not a plain object, or storage itself is unavailable.
 *
 * Ids are coerced with Number(); entries whose id is not a non-negative safe
 * integer are dropped, because a NaN id would otherwise poison every id that
 * is allocated afterwards.
 *
 * @returns {Object} Mapping from token string to numeric id. Never throws.
 */
function loadVocab() {
  try {
    const raw = localStorage.getItem(STORAGE_KEY);
    if (raw) {
      const parsed = JSON.parse(raw);
      const isPlainObject =
        parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
      if (isPlainObject) {
        // Object.entries() returns a snapshot, so editing `parsed` here is safe.
        for (const [word, rawId] of Object.entries(parsed)) {
          const id = Number(rawId);
          if (Number.isSafeInteger(id) && id >= 0) {
            parsed[word] = id;
          } else {
            console.warn("dropping vocab entry with invalid id", word, rawId);
            delete parsed[word];
          }
        }
        return parsed;
      }
      console.warn(
        "failed to load vocab, using default",
        new TypeError("stored vocab is not a JSON object"),
      );
    }
  } catch (e) {
    console.warn("failed to load vocab, using default", e);
  }

  // Re-seeding storage is best-effort: setItem can throw (quota exceeded,
  // storage disabled) and that must not prevent the script from starting.
  try {
    localStorage.setItem(STORAGE_KEY, JSON.stringify(DEFAULT_VOCAB));
  } catch (e) {
    console.warn("failed to persist default vocab", e);
  }
  return structuredClone(DEFAULT_VOCAB);
}
/**
 * Persist a vocabulary to localStorage under STORAGE_KEY as JSON.
 *
 * Persistence is best-effort: a storage failure (quota exceeded, storage
 * disabled, unserialisable value) is logged and reported through the return
 * value instead of being thrown, so a caller that is in the middle of
 * tokenizing is not aborted by it.
 *
 * @param {Object} vocab Mapping from token string to numeric id.
 * @returns {boolean} true when the vocabulary was written, false otherwise.
 */
function saveVocab(vocab) {
  try {
    localStorage.setItem(STORAGE_KEY, JSON.stringify(vocab));
    return true;
  } catch (e) {
    console.warn("failed to save vocab", e);
    return false;
  }
}
// In-memory word → id table, loaded once when the script is evaluated.
// encodeText() adds entries to this object in place and persists it with
// saveVocab(); decodeTokens() reads it to map ids back to words.
let vocab = loadVocab();
// Shared codecs: encodeText() uses `encoder` to turn text into UTF-8 byte
// values, and decodeTokens() uses `decoder` to turn byte runs back into text.
const encoder = new TextEncoder();
const decoder = new TextDecoder();
/**
 * Largest of the given values after Number() coercion, never lower than 255.
 *
 * Values that do not coerce to a finite number (NaN, ±Infinity, non-numeric
 * strings) are ignored; a single bad value would otherwise turn the whole
 * result into NaN or Infinity.
 *
 * @param {...*} arr Values to compare.
 * @returns {number} The maximum finite value, or 255 when none exceeds it.
 */
function numericMax(...arr) {
  let highest = 255;
  for (const raw of arr) {
    const value = Number(raw);
    if (Number.isFinite(value) && value > highest) highest = value;
  }
  return highest;
}
/**
 * Compute the next unused vocabulary id.
 *
 * Ids 0–255 are used for raw byte tokens, so the result is always at least
 * 256. The vocabulary is scanned with a plain loop rather than by spreading
 * every id into a function call, which throws a RangeError once the
 * vocabulary grows past the engine's argument limit. Ids that do not coerce
 * to a finite number are ignored so one corrupt entry cannot make every
 * subsequent id NaN.
 *
 * @returns {number} One more than the highest finite id in `vocab`
 *   (minimum 256).
 */
function getNextId() {
  let highest = 255;
  for (const raw of Object.values(vocab)) {
    const id = Number(raw);
    if (Number.isFinite(id) && id > highest) highest = id;
  }
  return highest + 1;
}
/**
 * Break text into tokenizable parts, keeping every character.
 *
 * The separators — whitespace runs, digit runs, and single characters that
 * are neither word characters nor whitespace — are captured by the pattern,
 * so they appear in the output alongside the letter/underscore runs found
 * between them. `\w` is ASCII-only, so a non-ASCII letter is emitted as its
 * own single-character part.
 *
 * @param {string} text Input text.
 * @returns {string[]} Non-empty parts in their original order.
 */
function splitTextIntoParts(text) {
  const pieces = [];
  for (const piece of text.split(/(\s+|[0-9]+|[^\w\s])/u)) {
    // split() yields empty strings between adjacent separators; skip them.
    if (piece) pieces.push(piece);
  }
  return pieces;
}
/**
 * Encode text into token ids, growing the shared vocabulary as needed.
 *
 * Each part produced by splitTextIntoParts() is handled as follows:
 *  - a part already in `vocab` emits its id (type "existing");
 *  - whitespace runs and digit runs emit their UTF-8 bytes (type "byte");
 *  - any other part containing a non-word character emits its UTF-8 bytes
 *    (type "punct");
 *  - everything else is a new word: it is given the next free id, added to
 *    `vocab`, and emitted (type "new").
 *
 * The vocabulary is persisted once, after the loop, and only when it changed.
 *
 * @param {string} text Text to encode.
 * @returns {{tokens: number[], details: {part: string, type: string, ids: number[]}[]}}
 *   The flat token list plus one detail record per part.
 */
function encodeText(text) {
  const tokens = [];
  const details = [];
  // Resolved on the first new word, then incremented locally, so the
  // vocabulary is scanned at most once per call.
  let nextId = null;
  let vocabChanged = false;

  for (const part of splitTextIntoParts(text)) {
    // Own-property check: `vocab` is a plain object, so a bare
    // `vocab[part] !== undefined` also matches inherited members such as
    // "constructor" or "toString" and would emit NaN for those words.
    if (Object.hasOwn(vocab, part)) {
      const id = Number(vocab[part]);
      tokens.push(id);
      details.push({ part, type: "existing", ids: [id] });
      continue;
    }

    const isSpaceOrDigits = /^\s+$/.test(part) || /^[0-9]+$/.test(part);
    if (isSpaceOrDigits || /[^\w\s]/u.test(part)) {
      const bytes = Array.from(encoder.encode(part));
      // Loop instead of push(...bytes): a very long run would otherwise
      // exceed the engine's argument limit.
      for (const byte of bytes) tokens.push(byte);
      details.push({ part, type: isSpaceOrDigits ? "byte" : "punct", ids: bytes });
      continue;
    }

    nextId ??= getNextId();
    const nid = nextId;
    nextId += 1;
    // defineProperty rather than assignment: `vocab["__proto__"] = nid` would
    // hit the prototype setter and silently store nothing.
    Object.defineProperty(vocab, part, {
      value: nid,
      writable: true,
      enumerable: true,
      configurable: true,
    });
    vocabChanged = true;
    tokens.push(nid);
    details.push({ part, type: "new", ids: [nid] });
  }

  if (vocabChanged) saveVocab(vocab);
  return { tokens, details };
}
/**
 * Decode a sequence of token ids back into text.
 *
 * Integer tokens in 0–255 are raw bytes: consecutive ones are buffered and
 * decoded together as UTF-8, so multi-byte characters survive. Any other
 * token is looked up in `vocab` by id; tokens with no matching entry —
 * including NaN, negative and fractional values — are rendered as "<?>".
 *
 * Only integers in 0–255 count as bytes because Uint8Array reduces
 * out-of-range values modulo 256, which would silently turn an invalid token
 * such as -1 into byte 255.
 *
 * @param {Iterable<number>} tokensArr Token ids to decode.
 * @returns {{text: string, details: {part: string, type: string, ids: Array}[]}}
 *   The decoded text plus one detail record per decoded run or token.
 */
function decodeTokens(tokensArr) {
  const reverse = new Map();
  for (const [word, id] of Object.entries(vocab)) reverse.set(Number(id), word);

  let out = "";
  const details = [];
  let byteBuffer = [];

  // Emit the buffered byte run (if any) as one decoded chunk.
  const flushBytes = () => {
    if (byteBuffer.length === 0) return;
    const part = decoder.decode(new Uint8Array(byteBuffer));
    out += part;
    details.push({ part, type: "byte", ids: byteBuffer });
    // Start a fresh array; the old one now belongs to the detail record.
    byteBuffer = [];
  };

  for (const t of tokensArr) {
    const id = Number(t);
    if (Number.isInteger(id) && id >= 0 && id <= 255) {
      byteBuffer.push(id);
      continue;
    }

    flushBytes();
    const word = Number.isNaN(id) ? undefined : reverse.get(id);
    if (word !== undefined) {
      details.push({ part: word, type: "existing", ids: [t] });
      out += word;
    } else {
      details.push({ part: "<?>", type: "unknown", ids: [t] });
      out += "<?>";
    }
  }

  flushBytes();
  return { text: out, details };
}