-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathutils.py
More file actions
173 lines (138 loc) · 5.86 KB
/
utils.py
File metadata and controls
173 lines (138 loc) · 5.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
"""
Utility functions for the Token Prediction Model.
This module contains shared functions for feature extraction, token counting,
and text analysis used across the token prediction pipeline.
"""
import functools
import math

import tiktoken
import keras_nlp
@functools.lru_cache(maxsize=1)
def _gpt4_encoding():
    """Load and cache the GPT-4 tiktoken encoding.

    Loading the encoding is comparatively expensive, and the result is
    immutable, so it is fetched once and reused across all calls.
    """
    return tiktoken.encoding_for_model("gpt-4")


def count_tokens_gpt4(text: str) -> int:
    """
    Count tokens in text using OpenAI's GPT-4 tokenizer.

    Args:
        text: The input text to tokenize

    Returns:
        Number of tokens in the text
    """
    # The encoding is cached; only the encode() call runs per invocation.
    return len(_gpt4_encoding().encode(text))
@functools.lru_cache(maxsize=1)
def _mistral_tokenizer():
    """Load and cache the Mistral tokenizer.

    `from_preset` may download and deserialize the tokenizer assets, which
    is far too expensive to repeat on every call; load once and reuse.
    """
    return keras_nlp.models.MistralTokenizer.from_preset(
        "mistral_instruct_7b_en",
    )


def count_tokens_mistral(text: str) -> int:
    """
    Count tokens in text using Mistral's tokenizer.

    Args:
        text: The input text to tokenize

    Returns:
        Number of tokens in the text
    """
    # Tokenizer is cached; tokenization itself is the only per-call work.
    tokens = _mistral_tokenizer()(text)
    return len(tokens)
def extract_text_features(text: str) -> tuple:
    """
    Extract linguistic features from text for token prediction.

    Analyzes the input and produces 7 features that correlate with token
    count across different tokenizers.

    Args:
        text: The input text to analyze

    Returns:
        A tuple containing:
            - text_length: Total character count
            - word_count: Number of space-separated words
            - punctuation_count: Count of punctuation marks (.,;:!?'")
            - number_count: Count of digit characters
            - whitespace_count: Count of all whitespace characters
            - line_count: Number of newline characters
            - sentence_count: Approximate count of sentences (based on .!?)
    """
    punctuation_chars = '.,;:!?\'"'
    punctuation_count = 0
    number_count = 0
    whitespace_count = 0
    # Single pass over the text for the per-character counters.
    for ch in text:
        if ch in punctuation_chars:
            punctuation_count += 1
        if ch.isdigit():
            number_count += 1
        if ch.isspace():
            whitespace_count += 1
    # Sentences are approximated by sentence-ending punctuation marks.
    sentence_count = text.count('.') + text.count('?') + text.count('!')
    return (len(text), len(text.split()), punctuation_count, number_count,
            whitespace_count, text.count('\n'), sentence_count)
def predict_tokens_gpt4(text_length: int, words: int, punctuations: int,
                        numbers: int, whitespaces: int, lines: int,
                        sentences: int) -> int:
    """
    Predict GPT-4 token count using a pre-trained linear regression model.

    Uses coefficients from a model trained on a diverse dataset of ABAP
    and Markdown files. The model achieves R² > 0.997.

    Args:
        text_length: Total character count
        words: Number of words
        punctuations: Count of punctuation marks
        numbers: Count of digit characters
        whitespaces: Count of whitespace characters
        lines: Number of newline characters
        sentences: Approximate sentence count

    Returns:
        Predicted token count (rounded up to nearest integer)
    """
    # (feature value, trained coefficient) pairs for the linear model.
    weighted_features = [
        (text_length, 0.09962379),
        (words, 0.41045692),
        (punctuations, 0.94309167),
        (numbers, 1.1346075),
        (whitespaces, -0.05270366),
        (lines, 3.94760669),
        (sentences, -1.49832381),
    ]
    prediction = -37.79047528292131  # trained intercept
    for value, weight in weighted_features:
        prediction += value * weight
    return math.ceil(prediction)
def predict_tokens_mistral(text_length: int, words: int, punctuations: int,
                           numbers: int, whitespaces: int, lines: int,
                           sentences: int) -> int:
    """
    Predict Mistral token count using a pre-trained linear regression model.

    Uses coefficients from a model trained on a diverse dataset of ABAP
    and Markdown files. The model achieves R² > 0.997.

    Args:
        text_length: Total character count
        words: Number of words
        punctuations: Count of punctuation marks
        numbers: Count of digit characters
        whitespaces: Count of whitespace characters
        lines: Number of newline characters
        sentences: Approximate sentence count

    Returns:
        Predicted token count (rounded up to nearest integer)
    """
    # (feature value, trained coefficient) pairs for the linear model.
    weighted_features = [
        (text_length, 0.21993344),
        (words, -0.30867016),
        (punctuations, 0.53583886),
        (numbers, 1.75223606),
        (whitespaces, -0.06621309),
        (lines, 5.21460546),
        (sentences, 0.05222465),
    ]
    prediction = -60.45156056006181  # trained intercept
    for value, weight in weighted_features:
        prediction += value * weight
    return math.ceil(prediction)
def predict_tokens_from_text(text: str, model: str = "gpt4") -> dict:
    """
    Predict token count directly from text input.

    Args:
        text: The input text to analyze
        model: Target model - "gpt4" or "mistral"

    Returns:
        Dictionary containing:
            - predicted_tokens: The predicted token count
            - features: Dictionary of extracted features
            - model: The model used for prediction

    Raises:
        ValueError: If *model* is neither "gpt4" nor "mistral".
    """
    # Map model keys to their predictor functions (case-insensitive lookup).
    predictors = {
        "gpt4": predict_tokens_gpt4,
        "mistral": predict_tokens_mistral,
    }
    predictor = predictors.get(model.lower())
    if predictor is None:
        raise ValueError(f"Unknown model: {model}. Choose 'gpt4' or 'mistral'")
    feature_values = extract_text_features(text)
    feature_names = ("text_length", "word_count", "punctuation_count",
                     "number_count", "whitespace_count", "line_count",
                     "sentence_count")
    return {
        "predicted_tokens": predictor(*feature_values),
        "features": dict(zip(feature_names, feature_values)),
        "model": model,
    }