From fef69dce65847c3e67059f97ec44d5bacaac9db6 Mon Sep 17 00:00:00 2001 From: Matjaz Domen Pecan Date: Tue, 7 Apr 2026 17:40:35 +0100 Subject: [PATCH] feat: recognize brace expansion patterns in Word.parts Add post-hoc brace expansion detection so downstream consumers (e.g. security tools) can identify {a,b,c} and {1..10} patterns in Word.parts without changing s-expression output. - Add WordSpanKind::BraceExpansion and detect_brace_expansions() - Add NodeKind::BraceExpansion { content } to AST - Add WordSegment::BraceExpansion and wire through segment pipeline - Extract span_to_segment() helper, collapse duplicate match arms - 15 new unit tests covering comma, range, nested, edge cases Co-Authored-By: Claude Opus 4.6 (1M context) --- src/ast.rs | 3 + src/format/mod.rs | 4 +- src/lexer/brace_expansion.rs | 189 +++++++++++++++++++++++++++++++++++ src/lexer/mod.rs | 1 + src/lexer/word_builder.rs | 2 + src/lexer/words.rs | 2 + src/parser/word_parts.rs | 41 ++++++++ src/sexp/mod.rs | 5 +- src/sexp/word.rs | 102 +++++++++++-------- 9 files changed, 304 insertions(+), 45 deletions(-) create mode 100644 src/lexer/brace_expansion.rs diff --git a/src/ast.rs b/src/ast.rs index 3fff3d4..e774a33 100644 --- a/src/ast.rs +++ b/src/ast.rs @@ -224,6 +224,9 @@ pub enum NodeKind { /// Locale string: `$"..."` LocaleString { content: String }, + /// Brace expansion: `{a,b,c}` or `{1..10}`. 
+ BraceExpansion { content: String }, + /// Arithmetic expansion: `$(( expr ))` ArithmeticExpansion { expression: Option<Box<Node>> }, diff --git a/src/format/mod.rs b/src/format/mod.rs index 06be22a..fa2fe46 100644 --- a/src/format/mod.rs +++ b/src/format/mod.rs @@ -605,7 +605,9 @@ fn process_word_value(value: &str, spans: &[crate::lexer::word_builder::WordSpan] } result.push(')'); } - WordSegment::ParamExpansion(text) | WordSegment::SimpleVar(text) => { + WordSegment::ParamExpansion(text) + | WordSegment::SimpleVar(text) + | WordSegment::BraceExpansion(text) => { result.push_str(text); } } diff --git a/src/lexer/brace_expansion.rs b/src/lexer/brace_expansion.rs new file mode 100644 index 0000000..4bf1753 --- /dev/null +++ b/src/lexer/brace_expansion.rs @@ -0,0 +1,189 @@ +//! Post-hoc brace expansion detection. +//! +//! After a word is fully built by the lexer, scans the word value +//! to identify brace expansion patterns (`{a,b,c}`, `{1..10}`) and +//! records `WordSpanKind::BraceExpansion` spans. Existing spans +//! (quotes, escapes, parameter expansions) are used to skip +//! protected regions. + +use super::word_builder::{QuotingContext, WordBuilder, WordSpan, WordSpanKind}; + +/// Scans the completed word value for brace expansion patterns and +/// records `BraceExpansion` spans for each one found. +pub(super) fn detect_brace_expansions(wb: &mut WordBuilder) { + let value = wb.value.as_bytes(); + let spans = &wb.spans; + let mut new_spans: Vec<WordSpan> = Vec::new(); + let mut i = 0; + + while i < value.len() { + if value[i] != b'{' { + i += 1; + continue; + } + + // Skip if preceded by $ (parameter expansion) + if i > 0 && value[i - 1] == b'$' { + i += 1; + continue; + } + + // Skip if inside an existing span + if span_end_at(i, spans).is_some() { + i += 1; + continue; + } + + // Try to find matching } with a comma or .. 
separator + if let Some(close) = find_brace_close(value, i, spans) { + new_spans.push(WordSpan { + start: i, + end: close + 1, + kind: WordSpanKind::BraceExpansion, + context: QuotingContext::None, + }); + i = close + 1; + } else { + i += 1; + } + } + + wb.spans.extend(new_spans); +} + +/// Returns the byte index of the matching `}` if the content between +/// `{` and `}` contains a `,` or `..` at depth 1. Returns `None` if +/// no valid brace expansion is found. +fn find_brace_close(value: &[u8], open: usize, spans: &[WordSpan]) -> Option<usize> { + let mut depth: i32 = 1; + let mut has_comma = false; + let mut has_dotdot = false; + let mut j = open + 1; + + while j < value.len() { + // Skip positions inside existing spans + if let Some(end) = span_end_at(j, spans) { + j = end; + continue; + } + + match value[j] { + b'{' => depth += 1, + b'}' => { + depth -= 1; + if depth == 0 { + if has_comma || has_dotdot { + return Some(j); + } + return None; + } + } + b',' if depth == 1 => has_comma = true, + b'.' if depth == 1 && j + 1 < value.len() && value[j + 1] == b'.' => { + has_dotdot = true; + } + _ => {} + } + j += 1; + } + + None +} + +/// If `pos` falls inside an existing span, returns the span's end offset. +fn span_end_at(pos: usize, spans: &[WordSpan]) -> Option<usize> { + spans + .iter() + .find(|s| pos >= s.start && pos < s.end) + .map(|s| s.end) +} + +#[cfg(test)] +mod tests { + use crate::lexer::word_builder::WordSpanKind; + + /// Helper: lex a single word and check for `BraceExpansion` spans. 
+ #[allow(clippy::unwrap_used)] + fn brace_spans(source: &str) -> Vec<(usize, usize)> { + let mut lexer = crate::lexer::Lexer::new(source, false); + let tok = lexer.next_token().unwrap(); + tok.spans + .iter() + .filter(|s| s.kind == WordSpanKind::BraceExpansion) + .map(|s| (s.start, s.end)) + .collect() + } + + #[test] + fn comma_form() { + let spans = brace_spans("{a,b,c}"); + assert_eq!(spans, vec![(0, 7)]); + } + + #[test] + fn range_form() { + let spans = brace_spans("{1..10}"); + assert_eq!(spans, vec![(0, 7)]); + } + + #[test] + fn mid_word() { + let spans = brace_spans("file{1,2}.txt"); + assert_eq!(spans, vec![(4, 9)]); + } + + #[test] + fn nested_braces() { + let spans = brace_spans("{a,{b,c}}"); + assert_eq!(spans, vec![(0, 9)]); + } + + #[test] + fn empty_braces_not_expansion() { + let spans = brace_spans("{}"); + assert!(spans.is_empty()); + } + + #[test] + fn single_element_not_expansion() { + let spans = brace_spans("{a}"); + assert!(spans.is_empty()); + } + + #[test] + fn trailing_comma() { + let spans = brace_spans("{a,}"); + assert_eq!(spans, vec![(0, 4)]); + } + + #[test] + fn leading_comma() { + let spans = brace_spans("{,a}"); + assert_eq!(spans, vec![(0, 4)]); + } + + #[test] + fn param_expansion_not_brace() { + // ${foo} should NOT produce a BraceExpansion span + let spans = brace_spans("${foo}"); + assert!(spans.is_empty()); + } + + #[test] + fn adjacent_brace_expansions() { + let spans = brace_spans("{a,b}{c,d}"); + assert_eq!(spans, vec![(0, 5), (5, 10)]); + } + + #[test] + fn alpha_range() { + let spans = brace_spans("{a..z}"); + assert_eq!(spans, vec![(0, 6)]); + } + + #[test] + fn range_with_step() { + let spans = brace_spans("{1..10..2}"); + assert_eq!(spans, vec![(0, 10)]); + } +} diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 3334d61..5778e37 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -1,6 +1,7 @@ use crate::error::{RableError, Result}; use crate::token::{Token, TokenType}; +mod brace_expansion; mod expansions; 
mod heredoc; mod operators; diff --git a/src/lexer/word_builder.rs b/src/lexer/word_builder.rs index a8298d1..491a2c3 100644 --- a/src/lexer/word_builder.rs +++ b/src/lexer/word_builder.rs @@ -148,4 +148,6 @@ pub enum WordSpanKind { DeprecatedArith, /// Backslash escape: `\X` (not `\` line continuations). Escape, + /// Brace expansion: `{a,b,c}` or `{1..10}`. + BraceExpansion, } diff --git a/src/lexer/words.rs b/src/lexer/words.rs index 0bfb3e1..5c6c6d6 100644 --- a/src/lexer/words.rs +++ b/src/lexer/words.rs @@ -68,6 +68,8 @@ impl Lexer { } } + super::brace_expansion::detect_brace_expansions(&mut wb); + if wb.is_empty() { return Err(RableError::parse("unexpected character", start, line)); } diff --git a/src/parser/word_parts.rs b/src/parser/word_parts.rs index ba438ce..5ab7fee 100644 --- a/src/parser/word_parts.rs +++ b/src/parser/word_parts.rs @@ -67,6 +67,9 @@ fn segment_to_node(seg: WordSegment) -> Node { } WordSegment::SimpleVar(text) => parse_simple_var(&text), WordSegment::ParamExpansion(text) => parse_braced_param(&text), + WordSegment::BraceExpansion(text) => { + Node::empty(NodeKind::BraceExpansion { content: text }) + } } } @@ -517,4 +520,42 @@ mod tests { NodeKind::LocaleString { content } if content == "\"hello\"" )); } + + #[test] + fn brace_expansion_comma() { + let parts = decompose("{a,b,c}"); + assert_eq!(parts.len(), 1); + assert!(matches!( + &parts[0].kind, + NodeKind::BraceExpansion { content } if content == "{a,b,c}" + )); + } + + #[test] + fn brace_expansion_range() { + let parts = decompose("{1..10}"); + assert_eq!(parts.len(), 1); + assert!(matches!( + &parts[0].kind, + NodeKind::BraceExpansion { content } if content == "{1..10}" + )); + } + + #[test] + fn brace_expansion_mid_word() { + let parts = decompose("file{1,2}.txt"); + assert_eq!(parts.len(), 3); + assert!(matches!( + &parts[0].kind, + NodeKind::WordLiteral { value } if value == "file" + )); + assert!(matches!( + &parts[1].kind, + NodeKind::BraceExpansion { content } if content == 
"{1,2}" + )); + assert!(matches!( + &parts[2].kind, + NodeKind::WordLiteral { value } if value == ".txt" + )); + } } diff --git a/src/sexp/mod.rs b/src/sexp/mod.rs index 5ad6077..7d98d43 100644 --- a/src/sexp/mod.rs +++ b/src/sexp/mod.rs @@ -160,6 +160,7 @@ impl fmt::Display for NodeKind { } Self::AnsiCQuote { content } => write!(f, "$'{content}'"), Self::LocaleString { content } => write!(f, "$\"{content}\""), + Self::BraceExpansion { content } => write!(f, "{content}"), Self::ArithmeticExpansion { expression } => { write_arith_wrapper(f, "arith", expression.as_deref()) } @@ -735,7 +736,9 @@ fn write_redirect_segments( } write!(f, ")")?; } - word::WordSegment::ParamExpansion(text) | word::WordSegment::SimpleVar(text) => { + word::WordSegment::ParamExpansion(text) + | word::WordSegment::SimpleVar(text) + | word::WordSegment::BraceExpansion(text) => { write!(f, "{text}")?; } } diff --git a/src/sexp/word.rs b/src/sexp/word.rs index 33bacb4..66bfc79 100644 --- a/src/sexp/word.rs +++ b/src/sexp/word.rs @@ -30,6 +30,8 @@ pub enum WordSegment { ParamExpansion(String), /// Simple variable `$var` — raw text includes `$` prefix. SimpleVar(String), + /// Brace expansion `{a,b,c}` or `{1..10}` — raw text includes braces. + BraceExpansion(String), } /// Formats word segments into S-expression output. 
@@ -38,7 +40,8 @@ pub fn write_word_segments(f: &mut fmt::Formatter<'_>, segments: &[WordSegment]) match seg { WordSegment::Literal(text) | WordSegment::ParamExpansion(text) - | WordSegment::SimpleVar(text) => { + | WordSegment::SimpleVar(text) + | WordSegment::BraceExpansion(text) => { for ch in text.chars() { write_escaped_char(f, ch)?; } @@ -121,7 +124,6 @@ fn build_segments( spans: &[crate::lexer::word_builder::WordSpan], filter: fn(&crate::lexer::word_builder::WordSpanKind) -> bool, ) -> Vec<WordSegment> { - use crate::lexer::word_builder::{QuotingContext, WordSpanKind}; let top_level = collect_filtered_spans(spans, filter); let mut segments = Vec::new(); let mut pos = 0; @@ -131,55 +133,66 @@ { segments.push(WordSegment::Literal(text.to_string())); } - match &span.kind { - WordSpanKind::CommandSub => { - if let Some(c) = value.get(span.start + 2..span.end - 1) { - segments.push(WordSegment::CommandSubstitution(c.to_string())); - } - } - WordSpanKind::ProcessSub(dir) => { - if let Some(c) = value.get(span.start + 2..span.end - 1) { - segments.push(WordSegment::ProcessSubstitution(*dir, c.to_string())); - } + span_to_segment(&mut segments, value, span); + pos = span.end; + } + if pos < value.len() + && let Some(text) = value.get(pos..) + { + segments.push(WordSegment::Literal(text.to_string())); + } + segments +} + +/// Converts a single span into the appropriate `WordSegment` and appends it. 
+fn span_to_segment( + segments: &mut Vec<WordSegment>, + value: &str, + span: &crate::lexer::word_builder::WordSpan, +) { + use crate::lexer::word_builder::{QuotingContext, WordSpanKind}; + match &span.kind { + WordSpanKind::CommandSub => { + if let Some(c) = value.get(span.start + 2..span.end - 1) { + segments.push(WordSegment::CommandSubstitution(c.to_string())); } - WordSpanKind::AnsiCQuote => { - push_ansi_c_span(&mut segments, value, span); + } + WordSpanKind::ProcessSub(dir) => { + if let Some(c) = value.get(span.start + 2..span.end - 1) { + segments.push(WordSegment::ProcessSubstitution(*dir, c.to_string())); } - WordSpanKind::LocaleString => { - match span.context { - QuotingContext::DoubleQuote => { - // $"..." inside "..." is literal (not a locale string) - if let Some(text) = value.get(span.start..span.end) { - push_literal(&mut segments, text); - } - } - _ => { - if let Some(c) = value.get(span.start + 1..span.end) { - segments.push(WordSegment::LocaleString(c.to_string())); - } + } + WordSpanKind::AnsiCQuote => { + push_ansi_c_span(segments, value, span); + } + WordSpanKind::LocaleString => { + match span.context { + QuotingContext::DoubleQuote => { + // $"..." inside "..." 
is literal (not a locale string) + if let Some(text) = value.get(span.start..span.end) { + push_literal(segments, text); } } - } - WordSpanKind::ParamExpansion => { - if let Some(text) = value.get(span.start..span.end) { - segments.push(WordSegment::ParamExpansion(text.to_string())); + _ => { + if let Some(c) = value.get(span.start + 1..span.end) { + segments.push(WordSegment::LocaleString(c.to_string())); + } } } - WordSpanKind::SimpleVar => { - if let Some(text) = value.get(span.start..span.end) { - segments.push(WordSegment::SimpleVar(text.to_string())); - } + } + WordSpanKind::ParamExpansion | WordSpanKind::SimpleVar | WordSpanKind::BraceExpansion => { + if let Some(text) = value.get(span.start..span.end) { + let seg = match &span.kind { + WordSpanKind::ParamExpansion => WordSegment::ParamExpansion, + WordSpanKind::SimpleVar => WordSegment::SimpleVar, + WordSpanKind::BraceExpansion => WordSegment::BraceExpansion, + _ => unreachable!(), + }; + segments.push(seg(text.to_string())); } - _ => {} // filtered out by span filter } - pos = span.end; - } - if pos < value.len() - && let Some(text) = value.get(pos..) - { - segments.push(WordSegment::Literal(text.to_string())); + _ => {} // filtered out by span filter } - segments } /// Handles `$'...'` spans with context-sensitive behavior: @@ -243,7 +256,10 @@ const fn is_decomposable(kind: &crate::lexer::word_builder::WordSpanKind) -> boo if is_sexp_relevant(kind) { return true; } - matches!(kind, WordSpanKind::ParamExpansion | WordSpanKind::SimpleVar) + matches!( + kind, + WordSpanKind::ParamExpansion | WordSpanKind::SimpleVar | WordSpanKind::BraceExpansion + ) } /// Collects top-level spans matching `filter`, sorted by start offset.