From 07ba41228949b155b041891af676f001c0998c4f Mon Sep 17 00:00:00 2001
From: thespino <2001giorgio@gmail.com>
Date: Tue, 23 May 2023 20:27:52 +0200
Subject: [PATCH 1/2] Add support for fine-tuned models in encoding_for_model

Identify models that can be fine-tuned in encoding_for_model.
- See https://platform.openai.com/docs/models/model-endpoint-compatibility
- See https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
---
 tiktoken/model.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tiktoken/model.py b/tiktoken/model.py
index 26201ce2..399df3bd 100644
--- a/tiktoken/model.py
+++ b/tiktoken/model.py
@@ -9,6 +9,11 @@
     "gpt-4-": "cl100k_base",  # e.g., gpt-4-0314, etc., plus gpt-4-32k
     "gpt-3.5-turbo-": "cl100k_base",  # e.g, gpt-3.5-turbo-0301, -0401, etc.
     "gpt-35-turbo": "cl100k_base",  # Azure deployment name
+    # fine-tuned models
+    "davinci": "r50k_base",
+    "curie": "r50k_base",
+    "babbage": "r50k_base",
+    "ada": "r50k_base",
 }
 
 MODEL_TO_ENCODING: dict[str, str] = {

From 96b2158475de96726c94717b9592da0b6c367352 Mon Sep 17 00:00:00 2001
From: thespino <2001giorgio@gmail.com>
Date: Tue, 23 May 2023 20:29:38 +0200
Subject: [PATCH 2/2] Tests for fine-tuned models in encoding_for_model

---
 tests/test_simple_public.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tests/test_simple_public.py b/tests/test_simple_public.py
index 8458c126..c6e67870 100644
--- a/tests/test_simple_public.py
+++ b/tests/test_simple_public.py
@@ -32,6 +32,16 @@ def test_encoding_for_model():
     enc = tiktoken.encoding_for_model("gpt-3.5-turbo-0301")
     assert enc.name == "cl100k_base"
 
+    # fine-tuned models
+    enc = tiktoken.encoding_for_model("davinci:ft-personal:finetunedmodel-2023-05-23-20-00-00")
+    assert enc.name == "r50k_base"
+    enc = tiktoken.encoding_for_model("curie:ft-personal:finetunedmodel-2023-05-23-20-00-00")
+    assert enc.name == "r50k_base"
+    enc = tiktoken.encoding_for_model("babbage:ft-personal:finetunedmodel-2023-05-23-20-00-00")
+    assert enc.name == "r50k_base"
+    enc = tiktoken.encoding_for_model("ada:ft-personal:finetunedmodel-2023-05-23-20-00-00")
+    assert enc.name == "r50k_base"
+
 
 def test_optional_blobfile_dependency():
     prog = """