|
1 | 1 | use pyo3::prelude::*; |
2 | 2 | use pyo3::wrap_pyfunction; |
3 | | -use regex::Regex; |
| 3 | +use regex::{Regex, RegexSet}; |
4 | 4 |
|
5 | 5 | use mimalloc::MiMalloc; |
| 6 | +use pyo3::exceptions::PyValueError; |
6 | 7 |
|
7 | 8 | /// Faster memory allocator in Pyo3 context |
8 | 9 | #[global_allocator] |
9 | 10 | static GLOBAL: MiMalloc = MiMalloc; |
10 | 11 |
|
11 | | - |
| 12 | +/// Compiles a regex pattern into a reusable class for matching strings. |
| 13 | +/// Prefer this over the standalone functions: compiling a pattern takes a |
| 14 | +/// while, so recompiling it on every call hurts performance. |
12 | 15 | #[pyclass(name=Regex)] |
13 | 16 | pub struct PyRegex { |
14 | 17 | regex: Regex, |
@@ -130,26 +133,103 @@ impl PyRegex { |
130 | 133 | Some(new) |
131 | 134 | } |
132 | 135 |
|
133 | | - /// Function that given returns a vector of tuples that contain (start_match, end_match+1) |
134 | | - /// according to the compiled regex. |
135 | | - /// |
| 136 | + /// Given a string, returns a vector of tuples containing |
| 137 | + /// (start_match, end_match+1) for each match of the compiled regex. |
136 | 138 | /// Args: |
137 | 139 | /// other: |
138 | 140 | /// The other string to be matched against the compiled regex. |
139 | 141 | /// |
140 | 142 | /// Returns: |
141 | | - /// A list with containing grouped matches relating |
142 | | - /// to the compiled regex. |
143 | | - fn matches(regex_pattern: &str, other: &str) -> Vec<(usize, usize)> { |
144 | | - let re = Regex::new(regex_pattern).unwrap(); |
| 143 | + /// A vector of tuples that contain (start_match, end_match+1). |
| 144 | + fn matches(&self, other: &str) -> Vec<(usize, usize)> { |
145 | 145 | let mut matches = Vec::new(); |
146 | | - for m in re.find_iter(input_str) { |
| 146 | + for m in self.regex.find_iter(other) { |
147 | 147 | matches.push((m.start(), m.end())); |
148 | 148 | } |
149 | 149 | matches |
150 | 150 | } |
151 | 151 | } |
152 | 152 |
|
| 153 | +/// Compiles several regex patterns into a RegexSet, which tests all patterns |
| 154 | +/// in a single search. If you have several patterns to check against the |
| 155 | +/// same string, this is the most performant and efficient method. |
| 156 | +/// |
| 157 | +/// |
| 158 | +/// # Limitations |
| 159 | +/// Regex sets are limited to answering the following two questions: |
| 160 | +/// |
| 161 | +/// 1. Does any regex in the set match? |
| 162 | +/// 2. If so, which regexes in the set match? |
| 163 | +/// |
| 164 | +/// As with the main Regex type, it is cheaper to ask (1) instead of (2) |
| 165 | +/// since the matching engines can stop after the first match is found. |
| 166 | +/// |
| 167 | +/// Other features like finding the location of successive matches or their |
| 168 | +/// sub-captures aren't supported. If you need this functionality, the |
| 169 | +/// recommended approach is to compile each regex in the set independently |
| 170 | +/// and selectively match them based on which regexes in the set matched. |
| 171 | +/// |
| 172 | +/// |
| 173 | +/// # Performance |
| 174 | +/// A RegexSet has the same performance characteristics as Regex. Namely, |
| 175 | +/// search takes O(mn) time, where m is proportional to the size of the regex |
| 176 | +/// set and n is proportional to the length of the search text. |
| 177 | +#[pyclass(name=RegexSet)] |
| 178 | +struct PyRegexSet { |
| 179 | + set: RegexSet, |
| 180 | +} |
| 181 | + |
| 182 | +#[pymethods] |
| 183 | +impl PyRegexSet { |
| 184 | + #[new] |
| 185 | + fn new(pattern: Vec<&str>) -> PyResult<Self> { |
| 186 | + let set = RegexSet::new(pattern); |
| 187 | + |
| 188 | + let set = match set { |
| 189 | + Ok(s) => s, |
| 190 | + Err(e) => return Err(PyValueError::new_err(format!("{:?}", e))) |
| 191 | + }; |
| 192 | + |
| 193 | + Ok(PyRegexSet { |
| 194 | + set, |
| 195 | + }) |
| 196 | + } |
| 197 | + |
| 198 | + /// Checks if any of the compiled regex patterns in the set match. |
| 199 | + /// |
| 200 | + /// Args: |
| 201 | + /// other: |
| 202 | + /// The other string to be matched against the compiled set. |
| 203 | + /// |
| 204 | + /// Returns: |
| 205 | + /// A bool signifying if any patterns in the set match. |
| 206 | + fn is_match(&self, other: &str) -> bool { |
| 207 | + self.set.is_match(other) |
| 208 | + } |
| 209 | + |
| 210 | + /// Matches the string against the compiled set, giving a list of |
| 211 | + /// numbers representing which pattern(s) match the string. |
| 212 | + /// |
| 213 | + /// Args: |
| 214 | + /// other: |
| 215 | + /// The other string to be matched against the compiled set. |
| 216 | + /// |
| 217 | + /// Returns: |
| 218 | + /// A list of ints, each being the index of a pattern that matched. |
| 219 | + /// Patterns are indexed in the same order as they were added. |
| 220 | + fn find(&self, other: &str) -> Vec<usize> { |
| 221 | + let matches = self.set.matches(other); |
| 222 | + |
| 223 | + let mut out_matches = Vec::with_capacity(self.set.len()); |
| 224 | + for match_ in matches.iter() { |
| 225 | + out_matches.push(match_) |
| 226 | + } |
| 227 | + |
| 228 | + out_matches |
| 229 | + } |
| 230 | +} |
| 231 | + |
| 232 | + |
153 | 233 | fn list_captures(capture: regex::Captures) ->Vec<Option<String>> { |
154 | 234 | let mut new: Vec<Option<String>> = capture |
155 | 235 | .iter() |
|
0 commit comments