3535
3636/* Definitions: */
3737
38- #define MAX_REGEXP_OBJECTS 30 /* Max number of regex symbols in expression. */
39- #define MAX_CHAR_CLASS_LEN 40 /* Max length of character-class buffer in. */
38+ #define MAX_REGEXP_LEN 70 /* Size in bytes of the static buffer (re_data in re_compile) that holds the compiled regex. */
4039
4140
/* Node type tags stored in regex_t.type. UNUSED is written after the last node as the
   end-of-pattern sentinel. re_print indexes its types[] name table with these values, so
   the enumerator order here and the string order there have to stay in step. */
4241enum { UNUSED , DOT , BEGIN , END , QUESTIONMARK , STAR , PLUS , CHAR , CHAR_CLASS , INV_CHAR_CLASS , DIGIT , NOT_DIGIT , ALPHA , NOT_ALPHA , WHITESPACE , NOT_WHITESPACE , /* BRANCH */ };
4342
/*
 * One node of a compiled pattern.
 *
 * Nodes are stored back to back in a byte buffer: a two-byte header
 * (type, data_len) followed directly by data_len payload bytes. The next
 * node begins right after the payload.
 *
 * Payload as written by re_compile:
 *   - CHAR:                      one byte, the literal character;
 *   - CHAR_CLASS/INV_CHAR_CLASS: the class characters plus a terminating
 *                                NUL (the NUL is counted in data_len);
 *   - every other type:          no payload (data_len == 0).
 *
 * `data` is a C99 flexible array member rather than a zero-length array,
 * which is a compiler extension and not valid ISO C.
 */
typedef struct regex_t
{
  unsigned char type;     /* CHAR, STAR, etc. */
  unsigned char data_len; /* number of payload bytes in data[] */
  unsigned char data[];   /* payload bytes, data_len of them */
} regex_t;
5349
50+ static re_t getnext (regex_t * pattern )
51+ {
52+ return (re_t )(((unsigned char * )pattern ) + 2 + pattern -> data_len );
53+ }
54+
5455
5556
5657/* Private function declarations: */
5758static int matchpattern (regex_t * pattern , const char * text , int * matchlength );
5859static int matchcharclass (char c , const char * str );
59- static int matchstar (regex_t p , regex_t * pattern , const char * text , int * matchlength );
60- static int matchplus (regex_t p , regex_t * pattern , const char * text , int * matchlength );
61- static int matchone (regex_t p , char c );
60+ static int matchstar (regex_t * p , regex_t * pattern , const char * text , int * matchlength );
61+ static int matchplus (regex_t * p , regex_t * pattern , const char * text , int * matchlength );
62+ static int matchone (regex_t * p , char c );
6263static int matchdigit (char c );
6364static int matchalpha (char c );
6465static int matchwhitespace (char c );
@@ -80,9 +81,9 @@ int re_matchp(re_t pattern, const char* text, int* matchlength)
8081 * matchlength = 0 ;
8182 if (pattern != 0 )
8283 {
83- if (pattern [ 0 ]. type == BEGIN )
84+ if (pattern -> type == BEGIN )
8485 {
85- return ((matchpattern (& pattern [ 1 ] , text , matchlength )) ? 0 : -1 );
86+ return ((matchpattern (getnext ( pattern ) , text , matchlength )) ? 0 : -1 );
8687 }
8788 else
8889 {
@@ -106,33 +107,37 @@ int re_matchp(re_t pattern, const char* text, int* matchlength)
106107 return -1 ;
107108}
108109
/* Return the smaller of a and b; when the two are equal, a is returned. */
110+ static int min (int a , int b )
111+ {
112+ return (a <= b ) ? a : b ;
113+ }
114+
109115re_t re_compile (const char * pattern )
110116{
111- /* The sizes of the two static arrays below substantiates the static RAM usage of this module.
112- MAX_REGEXP_OBJECTS is the max number of symbols in the expression.
113- MAX_CHAR_CLASS_LEN determines the size of buffer for chars in all char-classes in the expression. */
114- static regex_t re_compiled [MAX_REGEXP_OBJECTS ];
115- static unsigned char ccl_buf [MAX_CHAR_CLASS_LEN ];
116- int ccl_bufidx = 1 ;
117+ /* The size of this static array substantiates the static RAM usage of this module.
118+ MAX_REGEXP_LEN is the max number of bytes in the expression. */
119+ static unsigned char re_data [MAX_REGEXP_LEN ];
117120
118121 char c ; /* current char in pattern */
119122 int i = 0 ; /* index into pattern */
120- int j = 0 ; /* index into re_compiled */
123+ int j = 0 ; /* index into re_data */
121124
122- while (pattern [i ] != '\0' && (j + 1 < MAX_REGEXP_OBJECTS ))
125+ while (pattern [i ] != '\0' && (j + 3 < MAX_REGEXP_LEN ))
123126 {
124127 c = pattern [i ];
128+ regex_t * re_compiled = (regex_t * )(re_data + j );
129+ re_compiled -> data_len = 0 ;
125130
126131 switch (c )
127132 {
128133 /* Meta-characters: */
129- case '^' : { re_compiled [ j ]. type = BEGIN ; } break ;
130- case '$' : { re_compiled [ j ]. type = END ; } break ;
131- case '.' : { re_compiled [ j ]. type = DOT ; } break ;
132- case '*' : { re_compiled [ j ]. type = STAR ; } break ;
133- case '+' : { re_compiled [ j ]. type = PLUS ; } break ;
134- case '?' : { re_compiled [ j ]. type = QUESTIONMARK ; } break ;
135- /* case '|': { re_compiled[j]. type = BRANCH; } break; <-- not working properly */
134+ case '^' : { re_compiled -> type = BEGIN ; } break ;
135+ case '$' : { re_compiled -> type = END ; } break ;
136+ case '.' : { re_compiled -> type = DOT ; } break ;
137+ case '*' : { re_compiled -> type = STAR ; } break ;
138+ case '+' : { re_compiled -> type = PLUS ; } break ;
139+ case '?' : { re_compiled -> type = QUESTIONMARK ; } break ;
140+ /* case '|': { re_compiled-> type = BRANCH; } break; <-- not working properly */
136141
137142 /* Escaped character-classes (\s \w ...): */
138143 case '\\' :
@@ -145,41 +150,42 @@ re_t re_compile(const char* pattern)
145150 switch (pattern [i ])
146151 {
147152 /* Meta-character: */
148- case 'd' : { re_compiled [ j ]. type = DIGIT ; } break ;
149- case 'D' : { re_compiled [ j ]. type = NOT_DIGIT ; } break ;
150- case 'w' : { re_compiled [ j ]. type = ALPHA ; } break ;
151- case 'W' : { re_compiled [ j ]. type = NOT_ALPHA ; } break ;
152- case 's' : { re_compiled [ j ]. type = WHITESPACE ; } break ;
153- case 'S' : { re_compiled [ j ]. type = NOT_WHITESPACE ; } break ;
153+ case 'd' : { re_compiled -> type = DIGIT ; } break ;
154+ case 'D' : { re_compiled -> type = NOT_DIGIT ; } break ;
155+ case 'w' : { re_compiled -> type = ALPHA ; } break ;
156+ case 'W' : { re_compiled -> type = NOT_ALPHA ; } break ;
157+ case 's' : { re_compiled -> type = WHITESPACE ; } break ;
158+ case 'S' : { re_compiled -> type = NOT_WHITESPACE ; } break ;
154159
155160 /* Escaped character, e.g. '.' or '$' */
156161 default :
157162 {
158- re_compiled [j ].type = CHAR ;
159- re_compiled [j ].u .ch = pattern [i ];
163+ re_compiled -> type = CHAR ;
164+ re_compiled -> data_len = 1 ;
165+ re_compiled -> data [0 ] = pattern [i ];
160166 } break ;
161167 }
162168 }
163169 /* '\\' as last char in pattern -> invalid regular expression. */
164170/*
165171 else
166172 {
167- re_compiled[j].type = CHAR;
168- re_compiled[j].ch = pattern[i];
173+ re_compiled->type = CHAR;
174+ re_compiled->data_len = 1;
175+ re_compiled->data[0] = pattern[i];
169176 }
170177*/
171178 } break ;
172179
173180 /* Character class: */
174181 case '[' :
175182 {
176- /* Remember where the char-buffer starts. */
177- int buf_begin = ccl_bufidx ;
183+ int char_limit = min (0xff , MAX_REGEXP_LEN - j - 4 ); // 4 for this object and UNUSED at the minimum
178184
179185 /* Look-ahead to determine if negated */
180186 if (pattern [i + 1 ] == '^' )
181187 {
182- re_compiled [ j ]. type = INV_CHAR_CLASS ;
188+ re_compiled -> type = INV_CHAR_CLASS ;
183189 i += 1 ; /* Increment i to avoid including '^' in the char-buffer */
184190 if (pattern [i + 1 ] == 0 ) /* incomplete pattern, missing non-zero char after '^' */
185191 {
@@ -188,7 +194,7 @@ re_t re_compile(const char* pattern)
188194 }
189195 else
190196 {
191- re_compiled [ j ]. type = CHAR_CLASS ;
197+ re_compiled -> type = CHAR_CLASS ;
192198 }
193199
194200 /* Copy characters inside [..] to buffer */
@@ -197,7 +203,7 @@ re_t re_compile(const char* pattern)
197203 {
198204 if (pattern [i ] == '\\' )
199205 {
200- if (ccl_bufidx >= MAX_CHAR_CLASS_LEN - 1 )
206+ if (re_compiled -> data_len >= char_limit )
201207 {
202208 //fputs("exceeded internal buffer!\n", stderr);
203209 return 0 ;
@@ -206,31 +212,32 @@ re_t re_compile(const char* pattern)
206212 {
207213 return 0 ;
208214 }
209- ccl_buf [ ccl_bufidx ++ ] = pattern [i ++ ];
215+ re_compiled -> data [ re_compiled -> data_len ++ ] = pattern [i ++ ];
210216 }
211- else if (ccl_bufidx >= MAX_CHAR_CLASS_LEN )
217+ // TODO: I think this "else if" is a bug, should just be "if"
218+ else if (re_compiled -> data_len >= char_limit )
212219 {
213220 //fputs("exceeded internal buffer!\n", stderr);
214221 return 0 ;
215222 }
216- ccl_buf [ ccl_bufidx ++ ] = pattern [i ];
223+ re_compiled -> data [ re_compiled -> data_len ++ ] = pattern [i ];
217224 }
218- if (ccl_bufidx >= MAX_CHAR_CLASS_LEN )
225+ if (re_compiled -> data_len >= char_limit )
219226 {
220227 /* Catches cases such as [00000000000000000000000000000000000000][ */
221228 //fputs("exceeded internal buffer!\n", stderr);
222229 return 0 ;
223230 }
224231 /* Null-terminate string end */
225- ccl_buf [ccl_bufidx ++ ] = 0 ;
226- re_compiled [j ].u .ccl = & ccl_buf [buf_begin ];
232+ re_compiled -> data [re_compiled -> data_len ++ ] = 0 ;
227233 } break ;
228234
229235 /* Other characters: */
230236 default :
231237 {
232- re_compiled [j ].type = CHAR ;
233- re_compiled [j ].u .ch = c ;
238+ re_compiled -> type = CHAR ;
239+ re_compiled -> data_len = 1 ;
240+ re_compiled -> data [0 ] = c ;
234241 } break ;
235242 }
236243 /* no buffer-out-of-bounds access on invalid patterns - see https://github.com/kokke/tiny-regex-c/commit/1a279e04014b70b0695fba559a7c05d55e6ee90b */
@@ -240,35 +247,39 @@ re_t re_compile(const char* pattern)
240247 }
241248
242249 i += 1 ;
243- j += 1 ;
250+ j += 2 + re_compiled -> data_len ;
251+ }
252+ if (j + 1 >= MAX_REGEXP_LEN ) {
253+ //fputs("exceeded internal buffer!\n", stderr);
254+ return 0 ;
244255 }
245256 /* 'UNUSED' is a sentinel used to indicate end-of-pattern */
246- re_compiled [j ].type = UNUSED ;
257+ re_data [j ] = UNUSED ;
258+ re_data [j + 1 ] = 0 ;
247259
248- return (re_t ) re_compiled ;
260+ return (re_t ) re_data ;
249261}
250262
251263void re_print (regex_t * pattern )
252264{
253265 const char * types [] = { "UNUSED" , "DOT" , "BEGIN" , "END" , "QUESTIONMARK" , "STAR" , "PLUS" , "CHAR" , "CHAR_CLASS" , "INV_CHAR_CLASS" , "DIGIT" , "NOT_DIGIT" , "ALPHA" , "NOT_ALPHA" , "WHITESPACE" , "NOT_WHITESPACE" , "BRANCH" };
254266
255- int i ;
256267 int j ;
257268 char c ;
258- for (i = 0 ; i < MAX_REGEXP_OBJECTS ; ++ i )
269+ for (;; pattern = getnext ( pattern ) )
259270 {
260- if (pattern [ i ]. type == UNUSED )
271+ if (pattern -> type == UNUSED )
261272 {
262273 break ;
263274 }
264275
265- printf ("type: %s" , types [pattern [ i ]. type ]);
266- if (pattern [ i ]. type == CHAR_CLASS || pattern [ i ]. type == INV_CHAR_CLASS )
276+ printf ("type: %s" , types [pattern -> type ]);
277+ if (pattern -> type == CHAR_CLASS || pattern -> type == INV_CHAR_CLASS )
267278 {
268279 printf (" [" );
269- for (j = 0 ; j < MAX_CHAR_CLASS_LEN ; ++ j )
280+ for (j = 0 ; j < pattern -> data_len ; ++ j )
270281 {
271- c = pattern [ i ]. u . ccl [j ];
282+ c = pattern -> data [j ];
272283 if ((c == '\0' ) || (c == ']' ))
273284 {
274285 break ;
@@ -277,9 +288,9 @@ void re_print(regex_t* pattern)
277288 }
278289 printf ("]" );
279290 }
280- else if (pattern [ i ]. type == CHAR )
291+ else if (pattern -> type == CHAR )
281292 {
282- printf (" '%c'" , pattern [ i ]. u . ch );
293+ printf (" '%c'" , pattern -> data [ 0 ] );
283294 }
284295 printf ("\n" );
285296 }
@@ -380,24 +391,25 @@ static int matchcharclass(char c, const char* str)
380391 return 0 ;
381392}
382393
383- static int matchone (regex_t p , char c )
394+ static int matchone (regex_t * p , char c )
384395{
385- switch (p . type )
396+ switch (p -> type )
386397 {
387398 case DOT : return matchdot (c );
388- case CHAR_CLASS : return matchcharclass (c , (const char * )p . u . ccl );
389- case INV_CHAR_CLASS : return !matchcharclass (c , (const char * )p . u . ccl );
399+ case CHAR_CLASS : return matchcharclass (c , (const char * )p -> data );
400+ case INV_CHAR_CLASS : return !matchcharclass (c , (const char * )p -> data );
390401 case DIGIT : return matchdigit (c );
391402 case NOT_DIGIT : return !matchdigit (c );
392403 case ALPHA : return matchalphanum (c );
393404 case NOT_ALPHA : return !matchalphanum (c );
394405 case WHITESPACE : return matchwhitespace (c );
395406 case NOT_WHITESPACE : return !matchwhitespace (c );
396- default : return (p .u .ch == c );
407+ case BEGIN : return 0 ;
408+ default : return (p -> data [0 ] == c );
397409 }
398410}
399411
400- static int matchstar (regex_t p , regex_t * pattern , const char * text , int * matchlength )
412+ static int matchstar (regex_t * p , regex_t * pattern , const char * text , int * matchlength )
401413{
402414 int prelen = * matchlength ;
403415 const char * prepoint = text ;
@@ -417,7 +429,7 @@ static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchle
417429 return 0 ;
418430}
419431
420- static int matchplus (regex_t p , regex_t * pattern , const char * text , int * matchlength )
432+ static int matchplus (regex_t * p , regex_t * pattern , const char * text , int * matchlength )
421433{
422434 const char * prepoint = text ;
423435 while ((text [0 ] != '\0' ) && matchone (p , * text ))
@@ -435,10 +447,8 @@ static int matchplus(regex_t p, regex_t* pattern, const char* text, int* matchle
435447 return 0 ;
436448}
437449
438- static int matchquestion (regex_t p , regex_t * pattern , const char * text , int * matchlength )
450+ static int matchquestion (regex_t * p , regex_t * pattern , const char * text , int * matchlength )
439451{
440- if (p .type == UNUSED )
441- return 1 ;
442452 if (matchpattern (pattern , text , matchlength ))
443453 return 1 ;
444454 if (* text && matchone (p , * text ++ ))
@@ -493,33 +503,42 @@ static int matchpattern(regex_t* pattern, const char* text, int *matchlength)
493503static int matchpattern (regex_t * pattern , const char * text , int * matchlength )
494504{
495505 int pre = * matchlength ;
496- do
506+ while ( 1 )
497507 {
498- if (( pattern [ 0 ]. type == UNUSED ) || ( pattern [ 1 ]. type == QUESTIONMARK ) )
508+ if (pattern -> type == UNUSED )
499509 {
500- return matchquestion ( pattern [ 0 ], & pattern [ 2 ], text , matchlength ) ;
510+ return 1 ;
501511 }
502- else if (pattern [1 ].type == STAR )
512+ regex_t * next_pattern = getnext (pattern );
513+ if (next_pattern -> type == QUESTIONMARK )
503514 {
504- return matchstar (pattern [ 0 ], & pattern [ 2 ] , text , matchlength );
515+ return matchquestion (pattern , getnext ( next_pattern ) , text , matchlength );
505516 }
506- else if (pattern [ 1 ]. type == PLUS )
517+ else if (next_pattern -> type == STAR )
507518 {
508- return matchplus (pattern [ 0 ], & pattern [ 2 ] , text , matchlength );
519+ return matchstar (pattern , getnext ( next_pattern ) , text , matchlength );
509520 }
510- else if ((pattern [0 ].type == END ) && pattern [1 ].type == UNUSED )
521+ else if (next_pattern -> type == PLUS )
522+ {
523+ return matchplus (pattern , getnext (next_pattern ), text , matchlength );
524+ }
525+ else if ((pattern -> type == END ) && next_pattern -> type == UNUSED )
511526 {
512527 return (text [0 ] == '\0' );
513528 }
514529/* Branching is not working properly
515- else if (pattern[1]. type == BRANCH)
530+ else if (pattern-> type == BRANCH)
516531 {
517- return (matchpattern(pattern, text) || matchpattern(&pattern[2] , text));
532+ return (matchpattern(pattern, text) || matchpattern(getnext(next_pattern) , text));
518533 }
519534*/
520535 (* matchlength )++ ;
536+ if (text [0 ] == '\0' )
537+ break ;
538+ if (!matchone (pattern , * text ++ ))
539+ break ;
540+ pattern = next_pattern ;
521541 }
522- while ((text [0 ] != '\0' ) && matchone (* pattern ++ , * text ++ ));
523542
524543 * matchlength = pre ;
525544 return 0 ;
0 commit comments