Skip to content

Commit 0020946

Browse files
authored
Fix RegExp.toString generating invalid RE for CHAR and CHAR_RANGE (#14493)
1 parent 18b70d2 commit 0020946

File tree

3 files changed

+35
-18
lines changed

3 files changed

+35
-18
lines changed

lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -854,6 +854,19 @@ public String toString() {
854854
return b.toString();
855855
}
856856

857+
StringBuilder escapeCharIfNeeded(StringBuilder b, int codePoint) {
858+
// From https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#bs
859+
// "It is an error to use a backslash prior to any alphabetic character that does not denote
860+
// an escaped
861+
// construct;"
862+
// Plus, ASCII characters conflict with character classes.
863+
// Escape only characters which are NOT in [A-Za-z]
864+
if (!((codePoint >= 65 && codePoint <= 90) || (codePoint >= 97 && codePoint <= 122))) {
865+
b.append("\\");
866+
}
867+
return b.appendCodePoint(codePoint);
868+
}
869+
857870
void toStringBuilder(StringBuilder b) {
858871
switch (kind) {
859872
case REGEXP_UNION:
@@ -901,10 +914,14 @@ void toStringBuilder(StringBuilder b) {
901914
b.append(")");
902915
break;
903916
case REGEXP_CHAR:
904-
b.append("\\").appendCodePoint(c);
917+
escapeCharIfNeeded(b, c);
905918
break;
906919
case REGEXP_CHAR_RANGE:
907-
b.append("[\\").appendCodePoint(from[0]).append("-\\").appendCodePoint(to[0]).append("]");
920+
b.append("[");
921+
escapeCharIfNeeded(b, from[0]);
922+
b.append("-");
923+
escapeCharIfNeeded(b, to[0]);
924+
b.append("]");
908925
break;
909926
case REGEXP_CHAR_CLASS:
910927
b.append("[");

lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExpParsing.java

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ public void testAnyString() {
5757

5858
public void testChar() {
5959
RegExp re = new RegExp("c");
60-
assertEquals("\\c", re.toString());
60+
assertEquals("c", re.toString());
6161
assertEquals("REGEXP_CHAR char=c\n", re.toStringTree());
6262

6363
Automaton actual = re.toAutomaton();
@@ -69,7 +69,7 @@ public void testChar() {
6969

7070
public void testCaseInsensitiveChar() {
7171
RegExp re = new RegExp("c", RegExp.NONE, RegExp.ASCII_CASE_INSENSITIVE);
72-
assertEquals("\\c", re.toString());
72+
assertEquals("c", re.toString());
7373
assertEquals("REGEXP_CHAR char=c\n", re.toStringTree());
7474

7575
Automaton actual = re.toAutomaton();
@@ -113,7 +113,7 @@ public void testCaseInsensitiveClassRangeCompression() {
113113

114114
public void testCaseInsensitiveCharUpper() {
115115
RegExp re = new RegExp("C", RegExp.NONE, RegExp.ASCII_CASE_INSENSITIVE);
116-
assertEquals("\\C", re.toString());
116+
assertEquals("C", re.toString());
117117
assertEquals("REGEXP_CHAR char=C\n", re.toStringTree());
118118

119119
Automaton actual = re.toAutomaton();
@@ -174,7 +174,7 @@ public void testCaseInsensitiveCharUnicodeSigma() {
174174
public void testNegatedChar() {
175175
RegExp re = new RegExp("[^c]");
176176
// TODO: would be nice to emit negated class rather than this
177-
assertEquals("(.&~(\\c))", re.toString());
177+
assertEquals("(.&~(c))", re.toString());
178178
assertEquals(
179179
String.join(
180180
"\n",
@@ -211,7 +211,7 @@ public void testNegatedClass() {
211211

212212
public void testCharRange() {
213213
RegExp re = new RegExp("[b-d]");
214-
assertEquals("[\\b-\\d]", re.toString());
214+
assertEquals("[b-d]", re.toString());
215215
assertEquals("REGEXP_CHAR_RANGE from=b to=d\n", re.toStringTree());
216216

217217
Automaton actual = re.toAutomaton();
@@ -224,7 +224,7 @@ public void testCharRange() {
224224
public void testNegatedCharRange() {
225225
RegExp re = new RegExp("[^b-d]");
226226
// TODO: would be nice to emit negated class rather than this
227-
assertEquals("(.&~([\\b-\\d]))", re.toString());
227+
assertEquals("(.&~([b-d]))", re.toString());
228228
assertEquals(
229229
String.join(
230230
"\n",
@@ -514,7 +514,7 @@ public void testEmptyInterval() {
514514

515515
public void testOptional() {
516516
RegExp re = new RegExp("a?");
517-
assertEquals("(\\a)?", re.toString());
517+
assertEquals("(a)?", re.toString());
518518
assertEquals(String.join("\n", "REGEXP_OPTIONAL", " REGEXP_CHAR char=a\n"), re.toStringTree());
519519

520520
Automaton actual = re.toAutomaton();
@@ -526,7 +526,7 @@ public void testOptional() {
526526

527527
public void testRepeat0() {
528528
RegExp re = new RegExp("a*");
529-
assertEquals("(\\a)*", re.toString());
529+
assertEquals("(a)*", re.toString());
530530
assertEquals(String.join("\n", "REGEXP_REPEAT", " REGEXP_CHAR char=a\n"), re.toStringTree());
531531

532532
Automaton actual = re.toAutomaton();
@@ -538,7 +538,7 @@ public void testRepeat0() {
538538

539539
public void testRepeat1() {
540540
RegExp re = new RegExp("a+");
541-
assertEquals("(\\a){1,}", re.toString());
541+
assertEquals("(a){1,}", re.toString());
542542
assertEquals(
543543
String.join("\n", "REGEXP_REPEAT_MIN min=1", " REGEXP_CHAR char=a\n"), re.toStringTree());
544544

@@ -553,7 +553,7 @@ public void testRepeat1() {
553553

554554
public void testRepeatN() {
555555
RegExp re = new RegExp("a{5}");
556-
assertEquals("(\\a){5,5}", re.toString());
556+
assertEquals("(a){5,5}", re.toString());
557557
assertEquals(
558558
String.join("\n", "REGEXP_REPEAT_MINMAX min=5 max=5", " REGEXP_CHAR char=a\n"),
559559
re.toStringTree());
@@ -567,7 +567,7 @@ public void testRepeatN() {
567567

568568
public void testRepeatNPlus() {
569569
RegExp re = new RegExp("a{5,}");
570-
assertEquals("(\\a){5,}", re.toString());
570+
assertEquals("(a){5,}", re.toString());
571571
assertEquals(
572572
String.join("\n", "REGEXP_REPEAT_MIN min=5", " REGEXP_CHAR char=a\n"), re.toStringTree());
573573

@@ -582,7 +582,7 @@ public void testRepeatNPlus() {
582582

583583
public void testRepeatMN() {
584584
RegExp re = new RegExp("a{5,8}");
585-
assertEquals("(\\a){5,8}", re.toString());
585+
assertEquals("(a){5,8}", re.toString());
586586
assertEquals(
587587
String.join("\n", "REGEXP_REPEAT_MINMAX min=5 max=8", " REGEXP_CHAR char=a\n"),
588588
re.toStringTree());
@@ -659,7 +659,7 @@ public void testNotTerminatedString() {
659659

660660
public void testConcatenation() {
661661
RegExp re = new RegExp("[b-c][e-f]");
662-
assertEquals("[\\b-\\c][\\e-\\f]", re.toString());
662+
assertEquals("[b-c][e-f]", re.toString());
663663
assertEquals(
664664
String.join(
665665
"\n",
@@ -679,7 +679,7 @@ public void testConcatenation() {
679679

680680
public void testIntersection() {
681681
RegExp re = new RegExp("[b-f]&[e-f]");
682-
assertEquals("([\\b-\\f]&[\\e-\\f])", re.toString());
682+
assertEquals("([b-f]&[e-f])", re.toString());
683683
assertEquals(
684684
String.join(
685685
"\n",
@@ -714,7 +714,7 @@ public void testTruncatedIntersectionParens() {
714714

715715
public void testUnion() {
716716
RegExp re = new RegExp("[b-c]|[e-f]");
717-
assertEquals("([\\b-\\c]|[\\e-\\f])", re.toString());
717+
assertEquals("([b-c]|[e-f])", re.toString());
718718
assertEquals(
719719
String.join(
720720
"\n",

lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1224,7 +1224,7 @@ public void testMultiTerm() throws IOException {
12241224
s.intervals("field1", ctx);
12251225
}
12261226
});
1227-
assertEquals("Automaton [\\p(.)*\\e] expanded to too many terms (limit 1)", e.getMessage());
1227+
assertEquals("Automaton [p(.)*e] expanded to too many terms (limit 1)", e.getMessage());
12281228

12291229
checkVisits(source, 1);
12301230
}

0 commit comments

Comments
 (0)