@@ -57,6 +57,52 @@ let next_state = index.next_state(&initial_state, token_id);
5757let final_states = index . final_states ();
5858```
5959
60+ ### Vocabulary
61+
62+ You can create a `Vocabulary` in three ways: by loading it from a pretrained model, or by building it manually with one of two constructors:
63+
64+ 1. **`Vocabulary::from_pretrained(model, parameters)`** - Loads the vocabulary from a pretrained model (as in the example above).
65+
66+ 2. **Manual creation** - You can build a vocabulary yourself from token mappings, using either of the following two constructors:
67+
68+ 1. **`Vocabulary::new(eos_token_id)`** - Creates an empty vocabulary, to which you then add tokens with `try_insert()`:
69+
70+ ``` rust
71+ let mut vocabulary = Vocabulary :: new (50256 );
72+ vocabulary . try_insert (" hello" , 0 )? ;
73+ vocabulary . try_insert (vec! [32 ], 1 )? ;
74+ ```
75+
76+ 2. **`Vocabulary::try_from((eos_token_id, tokens))`** - Creates a vocabulary by directly providing the token mappings.
77+
78+ - This can be done either with the tokens as `String` keys:
79+
80+ ```rust
81+ use rustc_hash :: FxHashMap as HashMap ;
82+
83+ let eos_token_id : u32 = 50256 ;
84+ let mut tokens : HashMap <String , Vec <u32 >> = HashMap :: default ();
85+ tokens . insert (" hello" . to_string (), vec! [0 ]);
86+ tokens . insert (" world" . to_string (), vec! [1 ]);
87+
88+ let vocabulary = Vocabulary :: try_from ((eos_token_id , tokens ))? ;
89+ ```
90+
91+ - Or with the tokens as byte-vector (`Vec<u8>`) keys:
92+
93+ ```rust
94+ use rustc_hash :: FxHashMap as HashMap ;
95+
96+ let eos_token_id : u32 = 50256 ;
97+ let mut tokens : HashMap <Vec <u8 >, Vec <u32 >> = HashMap :: default ();
98+ tokens . insert (b" hello" . to_vec (), vec! [0 ]);
99+ tokens . insert (b" world" . to_vec (), vec! [1 ]);
100+
101+ let vocabulary = Vocabulary :: try_from ((eos_token_id , tokens ))? ;
102+ ```
103+
104+ **Important**: When creating a `Vocabulary` manually from tokenizer data, make sure each token is first converted to its string representation; this replaces special tokens that the DFA would otherwise not recognize.
105+
60106## Python Bindings
61107
62108Additionally, the project provides interfaces to integrate the crate's functionality with Python.
0 commit comments