Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 145 additions & 0 deletions src/regex/escape.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
use regex_syntax::hir::{Hir, HirKind, Look};

/// Replaces every start (`^`) and end (`$`) anchor in `hir` with a literal.
///
/// The tree is consumed, walked recursively and rebuilt through the `Hir`
/// constructors:
///
/// * `Look::Start` becomes the one-byte literal `^`.
/// * `Look::End` becomes the one-byte literal `$`.
/// * Every other look-around assertion is kept as is.
/// * Alternations, concatenations, captures and repetitions are rebuilt
///   around their rewritten sub-expressions.
/// * Empty nodes, literals and classes are rebuilt from their own contents.
pub(super) fn escape_start_and_end_anchors(hir: Hir) -> Hir {
    // Applies the rewrite to each child of a multi-child node, keeping order.
    fn escape_each(children: Vec<Hir>) -> Vec<Hir> {
        let mut rewritten = Vec::with_capacity(children.len());
        for child in children {
            rewritten.push(escape_start_and_end_anchors(child));
        }
        rewritten
    }

    match hir.into_kind() {
        // Leaf nodes: nothing to recurse into.
        HirKind::Empty => Hir::empty(),
        HirKind::Literal(bytes) => Hir::literal(bytes.0),
        HirKind::Class(class) => Hir::class(class),

        // The only nodes that are actually rewritten.
        HirKind::Look(assertion) => match assertion {
            Look::Start => Hir::literal([b'^']),
            Look::End => Hir::literal([b'$']),
            other => Hir::look(other),
        },

        // Single-child wrappers: swap the rewritten child back in place.
        HirKind::Repetition(mut repetition) => {
            repetition.sub = Box::new(escape_start_and_end_anchors(*repetition.sub));
            Hir::repetition(repetition)
        }
        HirKind::Capture(mut capture) => {
            capture.sub = Box::new(escape_start_and_end_anchors(*capture.sub));
            Hir::capture(capture)
        }

        // Multi-child nodes.
        HirKind::Concat(parts) => Hir::concat(escape_each(parts)),
        HirKind::Alternation(branches) => Hir::alternation(escape_each(branches)),
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use regex_syntax::hir::HirKind;

    /// Parses `pattern` with a default `regex_syntax` parser, panicking if
    /// the pattern is rejected.
    fn parse(pattern: &str) -> Hir {
        regex_syntax::Parser::new().parse(pattern).unwrap()
    }

    /// Returns the bytes of `hir` when it is a literal node; panics otherwise.
    #[track_caller]
    fn literal_bytes(hir: &Hir) -> &[u8] {
        match hir.kind() {
            HirKind::Literal(literal) => &literal.0[..],
            _ => panic!("Expected Literal"),
        }
    }

    /// Asserts that `hir` is a capture group around an alternation whose
    /// first branch is a class and whose second branch is a literal.
    #[track_caller]
    fn assert_class_or_literal_group(hir: &Hir) {
        let HirKind::Capture(group) = hir.kind() else {
            panic!("Expected Capture")
        };
        let HirKind::Alternation(branches) = group.sub.kind() else {
            panic!("Expected Alternation")
        };
        assert!(matches!(branches[0].kind(), HirKind::Class(_)));
        assert!(matches!(branches[1].kind(), HirKind::Literal(_)));
    }

    /// A bare start or end anchor turns into the matching one-byte literal.
    #[test]
    fn test_escape_start_and_end_anchors() {
        let escaped_start = escape_start_and_end_anchors(Hir::look(Look::Start));
        assert_eq!(literal_bytes(&escaped_start), b"^");

        let escaped_end = escape_start_and_end_anchors(Hir::look(Look::End));
        assert_eq!(literal_bytes(&escaped_end), b"$");
    }

    /// Anchors nested inside alternations (and capture groups) are rewritten.
    #[test]
    fn test_inside_alternation() {
        // Both branches end up as the same literal, so a single literal is expected.
        let escaped = escape_start_and_end_anchors(parse(r"\^|^"));
        assert_eq!(literal_bytes(&escaped), b"^");

        let escaped = escape_start_and_end_anchors(parse(r"(\W|^)hello(\W|$)"));
        let HirKind::Concat(parts) = escaped.kind() else {
            panic!("Expected Concat")
        };
        assert_eq!(parts.len(), 3);
        assert_class_or_literal_group(&parts[0]);
        assert_eq!(literal_bytes(&parts[1]), b"hello");
        assert_class_or_literal_group(&parts[2]);
    }

    /// An escaped anchor next to literals yields one combined literal.
    #[test]
    fn test_inside_adjacent_literal() {
        let escaped = escape_start_and_end_anchors(parse("ab^cd"));
        assert_eq!(literal_bytes(&escaped), b"ab^cd");

        let escaped = escape_start_and_end_anchors(parse("abcd$"));
        assert_eq!(literal_bytes(&escaped), b"abcd$");
    }

    /// Patterns without unescaped `^`/`$` anchors must come back unchanged.
    #[test]
    fn test_escape_side_effects() {
        let patterns = [
            "hello",
            r"\^",
            r"hello\$",
            r"(\W|a)hello(b|\W)",
            r"ab\^cd",
            r"[^abc]",
        ];
        for regex_str in patterns {
            let original = parse(regex_str);
            let escaped = escape_start_and_end_anchors(original.clone());
            assert_eq!(original, escaped, "failed for regex: {}", regex_str);
        }
    }
}
6 changes: 4 additions & 2 deletions src/regex/mod.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
use crate::Automaton;
use crate::{regex::escape::escape_start_and_end_anchors, Automaton};
use regex_syntax;
use std::fmt;

mod compile;
mod dfa;
mod error;
mod escape;
mod sparse;

pub use self::error::Error;
Expand Down Expand Up @@ -83,7 +84,8 @@ impl Regex {

fn with_size_limit(size: usize, re: &str) -> Result<Regex, Error> {
let hir = regex_syntax::Parser::new().parse(re)?;
let insts = self::compile::Compiler::new(size).compile(&hir)?;
let escaped_hir = escape_start_and_end_anchors(hir);
let insts = self::compile::Compiler::new(size).compile(&escaped_hir)?;
let dfa = self::dfa::DfaBuilder::new(insts).build()?;
Ok(Regex {
original: re.to_owned(),
Expand Down