Skip to content
This repository was archived by the owner on Jun 1, 2023. It is now read-only.

Commit df5243a

Browse files
demerphq authored and rurban committed
fix #131649 - extended charclass can trigger assert
The extended charclass parser makes some assumptions during the first pass which are only true on well-structured input, and it does not properly catch various errors. Later on, the code assumes that things the first pass lets through are valid, when in fact they should trigger errors. (cherry picked from commit 19a498a) Deleted duplicate, wrong regex-sets tests: RT #126181 (\cX behaves strangely inside (?[])) and RT #126481 (!! with syntax error panics).
1 parent 071c7ff commit df5243a

File tree

6 files changed

+68
-64
lines changed

6 files changed

+68
-64
lines changed

pod/perldiag.pod

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6085,7 +6085,7 @@ yourself.
60856085
a perl4 interpreter, especially if the next 2 tokens are "use strict"
60866086
or "my $var" or "our $var".
60876087

6088-
=item Syntax error in (?[...]) in regex m/%s/
6088+
=item Syntax error in (?[...]) in regex; marked by <-- HERE in m/%s/
60896089

60906090
(F) Perl could not figure out what you meant inside this construct; this
60916091
notifies you that it is giving up trying.
@@ -6577,6 +6577,31 @@ to find out why that isn't happening.
65776577
(F) The unexec() routine failed for some reason. See your local FSF
65786578
representative, who probably put it there in the first place.
65796579

6580+
=item Unexpected ']' with no following ')' in (?[... in regex; marked by <-- HERE in m/%s/
6581+
6582+
(F) While parsing an extended character class a ']' character was encountered
6583+
at a point in the definition where the only legal use of ']' is to close the
6584+
character class definition as part of a '])'.  You may have forgotten the close
6585+
paren, or otherwise confused the parser.
6586+
6587+
=item Expecting close paren for nested extended charclass in regex; marked by <-- HERE in m/%s/
6588+
6589+
(F) While parsing a nested extended character class like:
6590+
6591+
(?[ ... (?flags:(?[ ... ])) ... ])
6592+
                         ^
6593+
6594+
we expected to see a close paren ')' (marked by ^) but did not.
6595+
6596+
=item Expecting close paren for wrapper for nested extended charclass in regex; marked by <-- HERE in m/%s/
6597+
6598+
(F) While parsing a nested extended character class like:
6599+
6600+
(?[ ... (?flags:(?[ ... ])) ... ])
6601+
                          ^
6602+
6603+
we expected to see a close paren ')' (marked by ^) but did not.
6604+
65806605
=item Unexpected binary operator '%c' with no preceding operand in regex;
65816606
marked by S<<-- HERE> in m/%s/
65826607

pod/perlrecharclass.pod

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1128,8 +1128,8 @@ hence both of the following work:
11281128
Any contained POSIX character classes, including things like C<\w> and C<\D>
11291129
respect the C<E<sol>a> (and C<E<sol>aa>) modifiers.
11301130

1131-
C<< (?[ ]) >> is a regex-compile-time construct. Any attempt to use
1132-
something which isn't knowable at the time the containing regular
1131+
Note that C<< (?[ ]) >> is a regex-compile-time construct. Any attempt
1132+
to use something which isn't knowable at the time the containing regular
11331133
expression is compiled is a fatal error. In practice, this means
11341134
just three limitations:
11351135

regcomp.c

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14909,8 +14909,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
1490914909
TRUE /* Force /x */ );
1491014910

1491114911
switch (*RExC_parse) {
14912-
case '?':
14913-
if (RExC_parse[1] == '[') depth++, RExC_parse++;
14912+
case '(':
14913+
if (RExC_parse[1] == '?' && RExC_parse[2] == '[')
14914+
depth++, RExC_parse+=2;
1491414915
/* FALLTHROUGH */
1491514916
default:
1491614917
break;
@@ -14967,9 +14968,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
1496714968
}
1496814969

1496914970
case ']':
14970-
if (depth--) break;
14971-
RExC_parse++;
14972-
if (*RExC_parse == ')') {
14971+
if (RExC_parse[1] == ')') {
14972+
RExC_parse++;
14973+
if (depth--) break;
1497314974
node = reganode(pRExC_state, ANYOF, 0);
1497414975
RExC_size += ANYOF_SKIP;
1497514976
nextchar(pRExC_state);
@@ -14981,20 +14982,25 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
1498114982

1498214983
return node;
1498314984
}
14984-
goto no_close;
14985+
/* We output the messages even if warnings are off, because we'll fail
14986+
* the very next thing, and these give a likely diagnosis for that */
14987+
if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
14988+
output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
14989+
}
14990+
RExC_parse++;
14991+
vFAIL("Unexpected ']' with no following ')' in (?[...");
1498514992
}
1498614993

1498714994
RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
1498814995
}
1498914996

14990-
no_close:
1499114997
/* We output the messages even if warnings are off, because we'll fail
1499214998
* the very next thing, and these give a likely diagnosis for that */
1499314999
if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
1499415000
output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
1499515001
}
1499615002

14997-
FAIL("Syntax error in (?[...])");
15003+
vFAIL("Syntax error in (?[...])");
1499815004
}
1499915005

1500015006
/* Pass 2 only after this. */
@@ -15174,12 +15180,14 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
1517415180
* inversion list, and RExC_parse points to the trailing
1517515181
* ']'; the next character should be the ')' */
1517615182
RExC_parse++;
15177-
assert(UCHARAT(RExC_parse) == ')');
15183+
if (UCHARAT(RExC_parse) != ')')
15184+
vFAIL("Expecting close paren for nested extended charclass");
1517815185

1517915186
/* Then the ')' matching the original '(' handled by this
1518015187
* case: statement */
1518115188
RExC_parse++;
15182-
assert(UCHARAT(RExC_parse) == ')');
15189+
if (UCHARAT(RExC_parse) != ')')
15190+
vFAIL("Expecting close paren for wrapper for nested extended charclass");
1518315191

1518415192
RExC_parse++;
1518515193
RExC_flags = save_flags;

t/lib/warnings/regcomp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,21 +66,21 @@ Unmatched [ in regex; marked by <-- HERE in m/abc[ <-- HERE fi[.00./ at - line
6666
qr/(?[[[:word]]])/;
6767
EXPECT
6868
Assuming NOT a POSIX class since there is no terminating ':' in regex; marked by <-- HERE in m/(?[[[:word <-- HERE ]]])/ at - line 2.
69-
syntax error in (?[...]) in regex m/(?[[[:word]]])/ at - line 2.
69+
Unexpected ']' with no following ')' in (?[... in regex; marked by <-- HERE in m/(?[[[:word]] <-- HERE ])/ at - line 2.
7070
########
7171
# NAME qr/(?[ [[:digit: ])/
7272
# OPTION fatal
7373
qr/(?[[[:digit: ])/;
7474
EXPECT
7575
Assuming NOT a POSIX class since no blanks are allowed in one in regex; marked by <-- HERE in m/(?[[[:digit: ] <-- HERE )/ at - line 2.
76-
syntax error in (?[...]) in regex m/(?[[[:digit: ])/ at - line 2.
76+
syntax error in (?[...]) in regex; marked by <-- HERE in m/(?[[[:digit: ]) <-- HERE / at - line 2.
7777
########
7878
# NAME qr/(?[ [:digit: ])/
7979
# OPTION fatal
8080
qr/(?[[:digit: ])/
8181
EXPECT
8282
Assuming NOT a POSIX class since no blanks are allowed in one in regex; marked by <-- HERE in m/(?[[:digit: ] <-- HERE )/ at - line 2.
83-
syntax error in (?[...]) in regex m/(?[[:digit: ])/ at - line 2.
83+
syntax error in (?[...]) in regex; marked by <-- HERE in m/(?[[:digit: ]) <-- HERE / at - line 2.
8484
########
8585
# NAME [perl #126141]
8686
# OPTION fatal

t/re/reg_mesg.t

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -213,8 +213,9 @@ my @death =
213213
'/\b{gc}/' => "'gc' is an unknown bound type {#} m/\\b{gc{#}}/",
214214
'/\B{gc}/' => "'gc' is an unknown bound type {#} m/\\B{gc{#}}/",
215215

216-
'/(?[[[::]]])/' => "Syntax error in (?[...]) in regex m/(?[[[::]]])/",
217-
'/(?[[[:w:]]])/' => "Syntax error in (?[...]) in regex m/(?[[[:w:]]])/",
216+
217+
'/(?[[[::]]])/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[[[::]]{#}])/",
218+
'/(?[[[:w:]]])/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[[[:w:]]{#}])/",
218219
'/(?[[:w:]])/' => "",
219220
'/([.].*)[.]/' => "", # [perl #127582]
220221
'/[.].*[.]/' => "", # [perl #127604]
@@ -237,11 +238,12 @@ my @death =
237238
'/(?[ \p{foo} ])/' => 'Can\'t find Unicode property definition "foo" {#} m/(?[ \p{foo}{#} ])/',
238239
'/(?[ \p{ foo = bar } ])/' => 'Can\'t find Unicode property definition "foo = bar" {#} m/(?[ \p{ foo = bar }{#} ])/',
239240
'/(?[ \8 ])/' => 'Unrecognized escape \8 in character class {#} m/(?[ \8{#} ])/',
240-
'/(?[ \t ]/' => 'Syntax error in (?[...]) in regex m/(?[ \t ]/',
241-
'/(?[ [ \t ]/' => 'Syntax error in (?[...]) in regex m/(?[ [ \t ]/',
242-
'/(?[ \t ] ]/' => 'Syntax error in (?[...]) in regex m/(?[ \t ] ]/',
243-
'/(?[ [ ] ]/' => 'Syntax error in (?[...]) in regex m/(?[ [ ] ]/',
244-
'/(?[ \t + \e # This was supposed to be a comment ])/' => 'Syntax error in (?[...]) in regex m/(?[ \t + \e # This was supposed to be a comment ])/',
241+
'/(?[ \t ]/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[ \\t ]{#}/",
242+
'/(?[ [ \t ]/' => "Syntax error in (?[...]) {#} m/(?[ [ \\t ]{#}/",
243+
'/(?[ \t ] ]/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[ \\t ]{#} ]/",
244+
'/(?[ [ ] ]/' => "Syntax error in (?[...]) {#} m/(?[ [ ] ]{#}/",
245+
'/(?[ \t + \e # This was supposed to be a comment ])/' =>
246+
"Syntax error in (?[...]) {#} m/(?[ \\t + \\e # This was supposed to be a comment ]){#}/",
245247
'/(?[ ])/' => 'Incomplete expression within \'(?[ ])\' {#} m/(?[ {#}])/',
246248
'm/(?[[a-\d]])/' => 'False [] range "a-\d" {#} m/(?[[a-\d{#}]])/',
247249
'm/(?[[\w-x]])/' => 'False [] range "\w-" {#} m/(?[[\w-{#}x]])/',
@@ -427,10 +429,10 @@ my @death_utf8 = mark_as_utf8(
427429

428430
'/ネ\p{}ネ/' => 'Empty \p{} {#} m/ネ\p{{#}}ネ/',
429431

430-
'/ネ(?[[[:ネ]]])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[:ネ]]])ネ/",
431-
'/ネ(?[[[:ネ: ])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[:ネ: ])ネ/",
432-
'/ネ(?[[[::]]])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[::]]])ネ/",
433-
'/ネ(?[[[:ネ:]]])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[:ネ:]]])ネ/",
432+
'/ネ(?[[[:ネ]]])ネ/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[[[:ネ]]{#}])ネ/",
433+
'/ネ(?[[[:ネ: ])ネ/' => "Syntax error in (?[...]) {#} m/ネ(?[[[:ネ: ])ネ{#}/",
434+
'/ネ(?[[[::]]])ネ/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[[[::]]{#}])ネ/",
435+
'/ネ(?[[[:ネ:]]])ネ/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[[[:ネ:]]{#}])ネ/",
434436
'/ネ(?[[:ネ:]])ネ/' => "",
435437
'/ネ(?[ネ])ネ/' => 'Unexpected character {#} m/ネ(?[ネ{#}])ネ/',
436438
'/ネ(?[ + [ネ] ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/ネ(?[ +{#} [ネ] ])/',
@@ -443,8 +445,9 @@ my @death_utf8 = mark_as_utf8(
443445
'/(?[ \x{ネ} ])ネ/' => 'Non-hex character {#} m/(?[ \x{ネ{#}} ])ネ/',
444446
'/(?[ \p{ネ} ])/' => 'Can\'t find Unicode property definition "ネ" {#} m/(?[ \p{ネ}{#} ])/',
445447
'/(?[ \p{ ネ = bar } ])/' => 'Can\'t find Unicode property definition "ネ = bar" {#} m/(?[ \p{ ネ = bar }{#} ])/',
446-
'/ネ(?[ \t ]/' => 'Syntax error in (?[...]) in regex m/ネ(?[ \t ]/',
447-
'/(?[ \t + \e # ネ This was supposed to be a comment ])/' => 'Syntax error in (?[...]) in regex m/(?[ \t + \e # ネ This was supposed to be a comment ])/',
448+
'/ネ(?[ \t ]/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[ \\t ]{#}/",
449+
'/(?[ \t + \e # ネ This was supposed to be a comment ])/' =>
450+
"Syntax error in (?[...]) {#} m/(?[ \\t + \\e # ネ This was supposed to be a comment ]){#}/",
448451
'm/(*ネ)ネ/' => q<Unknown verb pattern 'ネ' {#} m/(*ネ){#}ネ/>,
449452
'/\cネ/' => "Character following \"\\c\" must be printable ASCII",
450453
'/\b{ネ}/' => "'ネ' is an unknown bound type {#} m/\\b{ネ{#}}/",

t/re/regex_sets.t

Lines changed: 3 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -99,38 +99,6 @@ is($@, "", 'qr/(?[ [a] ])/ can be interpolated');
9999

100100
like("B", qr/(?[ [B] | ! ( [^B] ) ])/, "[perl #125892]");
101101

102-
# RT #126181: \cX behaves strangely inside (?[])
103-
{
104-
no warnings qw(syntax regexp);
105-
106-
eval { $_ = '/(?[(\c]) /'; qr/$_/ };
107-
like($@, qr/^Syntax error/, '/(?[(\c]) / should not panic');
108-
eval { $_ = '(?[\c#]' . "\n])"; qr/$_/ };
109-
like($@, qr/^Syntax error/, '/(?[(\c]) / should not panic');
110-
eval { $_ = '(?[(\c])'; qr/$_/ };
111-
like($@, qr/^Syntax error/, '/(?[(\c])/ should be a syntax error');
112-
eval { $_ = '(?[(\c]) ]\b'; qr/$_/ };
113-
like($@, qr/^Syntax error/, '/(?[(\c]) ]\b/ should be a syntax error');
114-
eval { $_ = '(?[\c[]](])'; qr/$_/ };
115-
like($@, qr/^Syntax error/, '/(?[\c[]](])/ should be a syntax error');
116-
like("\c#", qr/(?[\c#])/, '\c# should match itself');
117-
like("\c[", qr/(?[\c[])/, '\c[ should match itself');
118-
like("\c\ ", qr/(?[\c\])/, '\c\ should match itself');
119-
like("\c]", qr/(?[\c]])/, '\c] should match itself');
120-
}
121-
122-
# RT #126481 !! with syntax error panics
123-
{
124-
fresh_perl_like('no warnings "experimental::regex_sets"; qr/(?[ ! ! (\w])/',
125-
qr/^Unmatched \(/, {},
126-
'qr/(?[ ! ! (\w])/ doesnt panic');
127-
# The following didn't panic before, but easy to add this here with a
128-
# paren between the !!
129-
fresh_perl_like('no warnings "experimental::regex_sets";qr/(?[ ! ( ! (\w)])/',
130-
qr/^Unmatched \(/, {},
131-
'qr/(?[ ! ( ! (\w)])/ neither');
132-
}
133-
134102
like("a", qr/(?[ (?#comment) [a]])/, "Can have (?#comments)");
135103

136104
if (! is_miniperl() && locales_enabled('LC_CTYPE')) {
@@ -188,13 +156,13 @@ for my $char ("٠", "٥", "٩") {
188156
eval { $_ = '/(?[(\c]) /'; qr/$_/ };
189157
like($@, qr/^Syntax error/, '/(?[(\c]) / should not panic');
190158
eval { $_ = '(?[\c#]' . "\n])"; qr/$_/ };
191-
like($@, qr/^Syntax error/, '/(?[(\c]) / should not panic');
159+
like($@, qr/^Unexpected/, '/(?[(\c]) / should not panic');
192160
eval { $_ = '(?[(\c])'; qr/$_/ };
193161
like($@, qr/^Syntax error/, '/(?[(\c])/ should be a syntax error');
194162
eval { $_ = '(?[(\c]) ]\b'; qr/$_/ };
195-
like($@, qr/^Syntax error/, '/(?[(\c]) ]\b/ should be a syntax error');
163+
like($@, qr/^Unexpected/, '/(?[(\c]) ]\b/ should be a syntax error');
196164
eval { $_ = '(?[\c[]](])'; qr/$_/ };
197-
like($@, qr/^Syntax error/, '/(?[\c[]](])/ should be a syntax error');
165+
like($@, qr/^Unexpected/, '/(?[\c[]](])/ should be a syntax error');
198166
like("\c#", qr/(?[\c#])/, '\c# should match itself');
199167
like("\c[", qr/(?[\c[])/, '\c[ should match itself');
200168
like("\c\ ", qr/(?[\c\])/, '\c\ should match itself');

0 commit comments

Comments
 (0)