Skip to content

Commit 665c7f8

Browse files
committed
add regex sets and more docs
1 parent b61478d commit 665c7f8

File tree

1 file changed

+90
-10
lines changed

1 file changed

+90
-10
lines changed

src/lib.rs

Lines changed: 90 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
11
use pyo3::prelude::*;
22
use pyo3::wrap_pyfunction;
3-
use regex::Regex;
3+
use regex::{Regex, RegexSet};
44

55
use mimalloc::MiMalloc;
6+
use pyo3::exceptions::PyValueError;
67

78
/// Faster memory allocator in Pyo3 context
89
#[global_allocator]
910
static GLOBAL: MiMalloc = MiMalloc;
1011

11-
12+
/// Compiles a pattern and produces a regex class for matching strings against
13+
/// it. It is recommended to use this class over the module-level functions, as
14+
/// compiling takes a while and recompiling on every call hurts performance.
1215
#[pyclass(name=Regex)]
1316
pub struct PyRegex {
1417
regex: Regex,
@@ -130,26 +133,103 @@ impl PyRegex {
130133
Some(new)
131134
}
132135

133-
/// Function that given returns a vector of tuples that contain (start_match, end_match+1)
134-
/// according to the compiled regex.
135-
///
136+
/// Given a string, returns a vector of tuples that contain
137+
/// (start_match, end_match+1) according to the compiled regex.
136138
/// Args:
137139
/// other:
138140
/// The other string to be matched against the compiled regex.
139141
///
140142
/// Returns:
141-
/// A list with containing grouped matches relating
142-
/// to the compiled regex.
143-
fn matches(regex_pattern: &str, other: &str) -> Vec<(usize, usize)> {
144-
let re = Regex::new(regex_pattern).unwrap();
143+
/// A vector of tuples that contain (start_match, end_match+1).
144+
fn matches(&self, other: &str) -> Vec<(usize, usize)> {
145145
let mut matches = Vec::new();
146-
for m in re.find_iter(input_str) {
146+
for m in self.regex.find_iter(other) {
147147
matches.push((m.start(), m.end()));
148148
}
149149
matches
150150
}
151151
}
152152

153+
/// Compile several regex patterns into a RegexSet. This will match all patterns
154+
/// in a single match. If you have several patterns you want to check on the
155+
/// same string, this will be the most performant and efficient method.
156+
///
157+
///
158+
/// # Limitations
159+
/// Regex sets are limited to answering the following two questions:
160+
///
161+
/// 1. Does any regex in the set match?
162+
/// 2. If so, which regexes in the set match?
163+
///
164+
/// As with the main Regex type, it is cheaper to ask (1) instead of (2)
165+
/// since the matching engines can stop after the first match is found.
166+
///
167+
/// Other features like finding the location of successive matches or their
168+
/// sub-captures aren't supported. If you need this functionality, the
169+
/// recommended approach is to compile each regex in the set independently
170+
/// and selectively match them based on which regexes in the set matched.
171+
///
172+
///
173+
/// # Performance
174+
/// A RegexSet has the same performance characteristics as Regex. Namely,
175+
/// search takes O(mn) time, where m is proportional to the size of the regex
176+
/// set and n is proportional to the length of the search text.
177+
#[pyclass(name=RegexSet)]
178+
struct PyRegexSet {
179+
set: RegexSet,
180+
}
181+
182+
#[pymethods]
183+
impl PyRegexSet {
184+
#[new]
185+
fn new(pattern: Vec<&str>) -> PyResult<Self> {
186+
let set = RegexSet::new(pattern);
187+
188+
let set = match set {
189+
Ok(s) => s,
190+
Err(e) => return Err(PyValueError::new_err(format!("{:?}", e)))
191+
};
192+
193+
Ok(PyRegexSet {
194+
set,
195+
})
196+
}
197+
198+
/// Checks if any of the compiled regex patterns in the set match.
199+
///
200+
/// Args:
201+
/// other:
202+
/// The other string to be matched against the compiled set.
203+
///
204+
/// Returns:
205+
/// A bool signifying if any patterns in the set match.
206+
fn is_match(&self, other: &str) -> bool {
207+
self.set.is_match(other)
208+
}
209+
210+
/// Matches the string against the compiled set which will give a list of
211+
/// numbers representing which pattern(s) match the string.
212+
///
213+
/// Args:
214+
/// other:
215+
/// The other string to be matched against the compiled set.
216+
///
217+
/// Returns:
218+
/// A list of ints which relate to the index of each pattern that was
219+
/// matched. The order of patterns is the same order as added.
220+
fn find(&self, other: &str) -> Vec<usize> {
221+
let matches = self.set.matches(other);
222+
223+
let mut out_matches = Vec::with_capacity(self.set.len());
224+
for match_ in matches.iter() {
225+
out_matches.push(match_)
226+
}
227+
228+
out_matches
229+
}
230+
}
231+
232+
153233
fn list_captures(capture: regex::Captures) ->Vec<Option<String>> {
154234
let mut new: Vec<Option<String>> = capture
155235
.iter()

0 commit comments

Comments
 (0)